shithub: libvpx

ref: 4b2c2b9aa4a273a23d90ddb3bbf6dfb3482e0b8f
parent: 6c280c2299f078a475dc87e7615fdf1a4998cd31
author: Ronald S. Bultje <rbultje@google.com>
date: Thu Nov 1 07:09:58 EDT 2012

Rename vp8/ codec directory to vp9/.

Change-Id: Ic084c475844b24092a433ab88138cf58af3abbe4
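
The change is almost entirely mechanical: the vp8/ source tree moves to vp9/, and the build references in configure, docs.mk, examples.mk, libs.mk, the MSVS batch script, and the unit tests are rewritten from vp8/VP8_ to vp9/VP9_, while the installed vpx/vp8*.h headers keep their old names. For illustration only, a rename of this shape could be scripted roughly as below; this is a minimal sketch assuming GNU sed and a git checkout, not the tool actually used to produce this commit.

    # Hypothetical sketch of the bulk of this rename; not the script
    # used to generate the patch below.
    git mv vp8 vp9
    git mv vp9/vp8cx.mk vp9/vp9cx.mk
    git mv vp9/vp8dx.mk vp9/vp9dx.mk
    # Rewrite build-system and test references. The public vpx/vp8*.h
    # header names are deliberately left untouched.
    sed -i -e 's/VP8_PREFIX/VP9_PREFIX/g' \
           -e 's/CONFIG_VP8/CONFIG_VP9/g' \
           configure docs.mk examples.mk libs.mk example_xma.c test/*.cc
    sed -i 's,"vp8/,"vp9/,g' test/*.cc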

--- a/build/x86-msvs/obj_int_extract.bat
+++ b/build/x86-msvs/obj_int_extract.bat
@@ -7,9 +7,9 @@
 REM   be found in the AUTHORS file in the root of the source tree.
 echo on
 
-cl /I "./" /I "%1" /nologo /c "%1/vp8/common/asm_com_offsets.c"
-cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/asm_dec_offsets.c"
-cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/asm_enc_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp9/common/asm_com_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp9/decoder/asm_dec_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp9/encoder/asm_enc_offsets.c"
 obj_int_extract.exe rvds "asm_com_offsets.obj" > "asm_com_offsets.asm"
 obj_int_extract.exe rvds "asm_dec_offsets.obj" > "asm_dec_offsets.asm"
 obj_int_extract.exe rvds "asm_enc_offsets.obj" > "asm_enc_offsets.asm"
--- a/configure
+++ b/configure
@@ -31,7 +31,7 @@
   ${toggle_debug_libs}            in/exclude debug version of libraries
   ${toggle_md5}                   support for output of checksum data
   ${toggle_static_msvcrt}         use static MSVCRT (VS builds only)
-  ${toggle_vp8}                   VP8 codec support
+  ${toggle_vp9}                   VP9 codec support
   ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
   ${toggle_mem_tracker}           track memory usage
   ${toggle_postproc}              postprocessing
@@ -161,17 +161,17 @@
 enable os_support
 
 [ -d ${source_path}/../include ] && enable alt_tree_layout
-for d in vp8; do
+for d in vp9; do
     [ -d ${source_path}/${d} ] && disable alt_tree_layout;
 done
 
 if ! enabled alt_tree_layout; then
 # development environment
-[ -d ${source_path}/vp8 ] && CODECS="${CODECS} vp8_encoder vp8_decoder"
+[ -d ${source_path}/vp9 ] && CODECS="${CODECS} vp9_encoder vp9_decoder"
 else
 # customer environment
-[ -f ${source_path}/../include/vpx/vp8cx.h ] && CODECS="${CODECS} vp8_encoder"
-[ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp8_decoder"
+[ -f ${source_path}/../include/vpx/vp8cx.h ] && CODECS="${CODECS} vp9_encoder"
+[ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp9_decoder"
 
 [ -f ${source_path}/../lib/*/*mt.lib ] && soft_enable static_msvcrt
 fi
--- a/docs.mk
+++ b/docs.mk
@@ -21,7 +21,7 @@
 		usage_dx.dox \
 
 # Other doxy files sourced in Markdown
-TXT_DOX-$(CONFIG_VP8)          += vp8_api1_migration.dox
+TXT_DOX-$(CONFIG_VP9)          += vp8_api1_migration.dox
 vp8_api1_migration.dox.DESC     = VP8 API 1.x Migration
 
 TXT_DOX = $(call enabled,TXT_DOX)
--- a/example_xma.c
+++ b/example_xma.c
@@ -18,7 +18,7 @@
 #include "vpx_config.h"
 #include "vpx/vpx_decoder.h"
 #include "vpx/vpx_integer.h"
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
 #include "vpx/vp8dx.h"
 #endif
 
@@ -29,8 +29,8 @@
   const char *name;
   const vpx_codec_iface_t *iface;
 } ifaces[] = {
-#if CONFIG_VP8_DECODER
-  {"vp8",  &vpx_codec_vp8_dx_algo},
+#if CONFIG_VP9_DECODER
+  {"vp9",  &vpx_codec_vp8_dx_algo},
 #endif
 };
 
--- a/examples.mk
+++ b/examples.mk
@@ -81,13 +81,13 @@
 error_resilient.GUID             = DF5837B9-4145-4F92-A031-44E4F832E00C
 error_resilient.DESCRIPTION      = Error Resiliency Feature
 
-GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8_scalable_patterns.c
+GEN_EXAMPLES-$(CONFIG_VP9_ENCODER) += vp8_scalable_patterns.c
 vp8_scalable_patterns.GUID          = 0D6A210B-F482-4D6F-8570-4A9C01ACC88C
 vp8_scalable_patterns.DESCRIPTION   = VP8 Scalable Bitstream Patterns
-GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8_set_maps.c
+GEN_EXAMPLES-$(CONFIG_VP9_ENCODER) += vp8_set_maps.c
 vp8_set_maps.GUID                   = ECB2D24D-98B8-4015-A465-A4AF3DCC145F
 vp8_set_maps.DESCRIPTION            = VP8 set active and ROI maps
-GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8cx_set_ref.c
+GEN_EXAMPLES-$(CONFIG_VP9_ENCODER) += vp8cx_set_ref.c
 vp8cx_set_ref.GUID                  = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
 vp8cx_set_ref.DESCRIPTION           = VP8 set encoder reference frame
 
@@ -97,10 +97,10 @@
 # We should not link to math library (libm) on RVCT
 # when building for bare-metal targets
 ifeq ($(CONFIG_OS_SUPPORT), yes)
-CODEC_EXTRA_LIBS-$(CONFIG_VP8)         += m
+CODEC_EXTRA_LIBS-$(CONFIG_VP9)         += m
 else
     ifeq ($(CONFIG_GCC), yes)
-    CODEC_EXTRA_LIBS-$(CONFIG_VP8)         += m
+    CODEC_EXTRA_LIBS-$(CONFIG_VP9)         += m
     endif
 endif
 #
@@ -117,8 +117,8 @@
     INC_PATH := $(SRC_PATH_BARE)/../include
 else
     LIB_PATH-yes                     += $(if $(BUILD_PFX),$(BUILD_PFX),.)
-    INC_PATH-$(CONFIG_VP8_DECODER)   += $(SRC_PATH_BARE)/vp8
-    INC_PATH-$(CONFIG_VP8_ENCODER)   += $(SRC_PATH_BARE)/vp8
+    INC_PATH-$(CONFIG_VP9_DECODER)   += $(SRC_PATH_BARE)/vp9
+    INC_PATH-$(CONFIG_VP9_ENCODER)   += $(SRC_PATH_BARE)/vp9
     LIB_PATH := $(call enabled,LIB_PATH)
     INC_PATH := $(call enabled,INC_PATH)
 endif
--- a/examples/decoder_tmpl.txt
+++ b/examples/decoder_tmpl.txt
@@ -1,7 +1,7 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEC_INCLUDES
 #define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_decoder.h"
-#include "vpx/vp8dx.h"
+#include "vpx/vp9dx.h"
 #define interface (vpx_codec_vp8_dx())
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEC_INCLUDES
 
--- a/examples/encoder_tmpl.txt
+++ b/examples/encoder_tmpl.txt
@@ -1,7 +1,7 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ENC_INCLUDES
 #define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_encoder.h"
-#include "vpx/vp8cx.h"
+#include "vpx/vp9cx.h"
 #define interface (vpx_codec_vp8_cx())
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ENC_INCLUDES
 
--- a/examples/postproc.txt
+++ b/examples/postproc.txt
@@ -51,7 +51,7 @@
 postprocessors. VP8 is one example. The following sample code toggles
 postprocessing on and off every 15 frames.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PRE_DECODE
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
 if(frame_cnt%30 == 1) {
     vp8_postproc_cfg_t  pp = {0, 0, 0};
 
--- a/libs.mk
+++ b/libs.mk
@@ -30,29 +30,29 @@
 CODEC_SRCS-yes += $(addprefix vpx_scale/,$(call enabled,SCALE_SRCS))
 
 
-ifeq ($(CONFIG_VP8_ENCODER),yes)
-  VP8_PREFIX=vp8/
-  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx.mk
-  CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
-  CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS))
-  CODEC_SRCS-yes += $(VP8_PREFIX)vp8cx.mk vpx/vp8.h vpx/vp8cx.h vpx/vp8e.h
-  CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp8cx_arm.mk
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+  VP9_PREFIX=vp9/
+  include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9cx.mk
+  CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS))
+  CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS))
+  CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h vpx/vp8e.h
+  CODEC_SRCS-$(ARCH_ARM) += $(VP9_PREFIX)vp98cx_arm.mk
   INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8e.h include/vpx/vp8cx.h
-  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
+  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
   CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
-  CODEC_DOC_SECTIONS += vp8 vp8_encoder
+  CODEC_DOC_SECTIONS += vp9 vp9_encoder
 endif
 
-ifeq ($(CONFIG_VP8_DECODER),yes)
-  VP8_PREFIX=vp8/
-  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8dx.mk
-  CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_DX_SRCS))
-  CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_DX_EXPORTS))
-  CODEC_SRCS-yes += $(VP8_PREFIX)vp8dx.mk vpx/vp8.h vpx/vp8dx.h
+ifeq ($(CONFIG_VP9_DECODER),yes)
+  VP9_PREFIX=vp9/
+  include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9dx.mk
+  CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_DX_SRCS))
+  CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_DX_EXPORTS))
+  CODEC_SRCS-yes += $(VP9_PREFIX)vp9dx.mk vpx/vp8.h vpx/vp8dx.h
   INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
-  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
+  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
   CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
-  CODEC_DOC_SECTIONS += vp8 vp8_decoder
+  CODEC_DOC_SECTIONS += vp9 vp9_decoder
 endif
 
 
@@ -305,46 +305,46 @@
 OFFSET_PATTERN:='^[a-zA-Z0-9_]* EQU'
 
 ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
-    $(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
+    $(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S
 	@echo "    [CREATE] $@"
 	$(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
-    $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S: $(VP8_PREFIX)common/asm_com_offsets.c
-    CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
+    $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S: $(VP9_PREFIX)common/asm_com_offsets.c
+    CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)common/asm_com_offsets.c.S
 
-    $(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
+    $(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S
 	@echo "    [CREATE] $@"
 	$(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
-    $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S: $(VP8_PREFIX)encoder/asm_enc_offsets.c
-    CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
+    $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S: $(VP9_PREFIX)encoder/asm_enc_offsets.c
+    CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)encoder/asm_enc_offsets.c.S
 
-    $(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
+    $(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S
 	@echo "    [CREATE] $@"
 	$(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
-    $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S: $(VP8_PREFIX)decoder/asm_dec_offsets.c
-    CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
+    $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S: $(VP9_PREFIX)decoder/asm_dec_offsets.c
+    CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP9_PREFIX)decoder/asm_dec_offsets.c.S
 else
   ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC))
     asm_com_offsets.asm: obj_int_extract
-    asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o
+    asm_com_offsets.asm: $(VP9_PREFIX)common/asm_com_offsets.c.o
 	@echo "    [CREATE] $@"
 	$(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@
-    OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
+    OBJS-yes += $(VP9_PREFIX)common/asm_com_offsets.c.o
     CLEAN-OBJS += asm_com_offsets.asm
     $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
 
     asm_enc_offsets.asm: obj_int_extract
-    asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+    asm_enc_offsets.asm: $(VP9_PREFIX)encoder/asm_enc_offsets.c.o
 	@echo "    [CREATE] $@"
 	$(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@
-    OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+    OBJS-yes += $(VP9_PREFIX)encoder/asm_enc_offsets.c.o
     CLEAN-OBJS += asm_enc_offsets.asm
     $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
 
     asm_dec_offsets.asm: obj_int_extract
-    asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+    asm_dec_offsets.asm: $(VP9_PREFIX)decoder/asm_dec_offsets.c.o
 	@echo "    [CREATE] $@"
 	$(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@
-    OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+    OBJS-yes += $(VP9_PREFIX)decoder/asm_dec_offsets.c.o
     CLEAN-OBJS += asm_dec_offsets.asm
     $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm
   endif
--- a/test/boolcoder_test.cc
+++ b/test/boolcoder_test.cc
@@ -15,8 +15,8 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
-#include "vp8/encoder/boolhuff.h"
-#include "vp8/decoder/dboolhuff.h"
+#include "vp9/encoder/boolhuff.h"
+#include "vp9/decoder/dboolhuff.h"
 }
 
 #include "acm_random.h"
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -15,9 +15,9 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
-#include "vp8/common/entropy.h"
-#include "vp8/common/idct.h"
-#include "vp8/encoder/dct.h"
+#include "vp9/common/entropy.h"
+#include "vp9/common/idct.h"
+#include "vp9/encoder/dct.h"
 }
 
 #include "acm_random.h"
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -15,8 +15,8 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
-#include "vp8/common/idct.h"
-#include "vp8/encoder/dct.h"
+#include "vp9/common/idct.h"
+#include "vp9/encoder/dct.h"
 }
 
 #include "acm_random.h"
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -15,8 +15,8 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
-#include "vp8/encoder/dct.h"
-#include "vp8/common/idct.h"
+#include "vp9/encoder/dct.h"
+#include "vp9/common/idct.h"
 }
 
 #include "acm_random.h"
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -15,8 +15,8 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
-#include "vp8/encoder/dct.h"
-#include "vp8/common/idct.h"
+#include "vp9/encoder/dct.h"
+#include "vp9/common/idct.h"
 }
 
 #include "acm_random.h"
--- a/vp8/common/alloccommon.c
+++ /dev/null
@@ -1,220 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "blockd.h"
-#include "vpx_mem/vpx_mem.h"
-#include "onyxc_int.h"
-#include "findnearmv.h"
-#include "entropymode.h"
-#include "entropymv.h"
-#include "systemdependent.h"
-
-
-void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base) {
-  int stride = cpi->mode_info_stride;
-  int i;
-
-  // Clear down top border row
-  vpx_memset(mi_base, 0, sizeof(MODE_INFO) * cpi->mode_info_stride);
-
-  // Clear left border column
-  for (i = 1; i < cpi->mb_rows + 1; i++) {
-    vpx_memset(&mi_base[i * stride], 0, sizeof(MODE_INFO));
-  }
-}
-
-void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi) {
-  int i, j;
-
-  // For each in image mode_info element set the in image flag to 1
-  for (i = 0; i < cpi->mb_rows; i++) {
-    for (j = 0; j < cpi->mb_cols; j++) {
-      mi->mbmi.mb_in_image = 1;
-      mi++;   // Next element in the row
-    }
-
-    mi++;       // Step over border element at start of next row
-  }
-}
-
-void vp9_de_alloc_frame_buffers(VP9_COMMON *oci) {
-  int i;
-
-  for (i = 0; i < NUM_YV12_BUFFERS; i++)
-    vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
-
-  vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
-  vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
-
-  vpx_free(oci->above_context);
-  vpx_free(oci->mip);
-  vpx_free(oci->prev_mip);
-
-  oci->above_context = 0;
-  oci->mip = 0;
-  oci->prev_mip = 0;
-
-}
-
-int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
-  int i;
-
-  vp9_de_alloc_frame_buffers(oci);
-
-  /* our internal buffers are always multiples of 16 */
-  if ((width & 0xf) != 0)
-    width += 16 - (width & 0xf);
-
-  if ((height & 0xf) != 0)
-    height += 16 - (height & 0xf);
-
-
-  for (i = 0; i < NUM_YV12_BUFFERS; i++) {
-    oci->fb_idx_ref_cnt[i] = 0;
-    oci->yv12_fb[i].flags = 0;
-    if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0) {
-      vp9_de_alloc_frame_buffers(oci);
-      return 1;
-    }
-  }
-
-  oci->new_fb_idx = 0;
-  oci->lst_fb_idx = 1;
-  oci->gld_fb_idx = 2;
-  oci->alt_fb_idx = 3;
-
-  oci->fb_idx_ref_cnt[0] = 1;
-  oci->fb_idx_ref_cnt[1] = 1;
-  oci->fb_idx_ref_cnt[2] = 1;
-  oci->fb_idx_ref_cnt[3] = 1;
-
-  if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame,   width, 16, VP8BORDERINPIXELS) < 0) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
-
-  if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
-
-  oci->mb_rows = height >> 4;
-  oci->mb_cols = width >> 4;
-  oci->MBs = oci->mb_rows * oci->mb_cols;
-  oci->mode_info_stride = oci->mb_cols + 1;
-  oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
-
-  if (!oci->mip) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
-
-  oci->mi = oci->mip + oci->mode_info_stride + 1;
-
-  /* allocate memory for last frame MODE_INFO array */
-
-  oci->prev_mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
-
-  if (!oci->prev_mip) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
-
-  oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1;
-
-  oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
-
-  if (!oci->above_context) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
-
-  vp9_update_mode_info_border(oci, oci->mip);
-  vp9_update_mode_info_in_image(oci, oci->mi);
-
-  return 0;
-}
-void vp9_setup_version(VP9_COMMON *cm) {
-  if (cm->version & 0x4) {
-    if (!CONFIG_EXPERIMENTAL)
-      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
-                         "Bitstream was created by an experimental "
-                         "encoder");
-    cm->experimental = 1;
-  }
-
-  switch (cm->version & 0x3) {
-    case 0:
-      cm->no_lpf = 0;
-      cm->filter_type = NORMAL_LOOPFILTER;
-      cm->use_bilinear_mc_filter = 0;
-      cm->full_pixel = 0;
-      break;
-    case 1:
-      cm->no_lpf = 0;
-      cm->filter_type = SIMPLE_LOOPFILTER;
-      cm->use_bilinear_mc_filter = 1;
-      cm->full_pixel = 0;
-      break;
-    case 2:
-    case 3:
-      cm->no_lpf = 1;
-      cm->filter_type = NORMAL_LOOPFILTER;
-      cm->use_bilinear_mc_filter = 1;
-      cm->full_pixel = 0;
-      break;
-      // Full pel only code deprecated in experimental code base
-      // case 3:
-      //    cm->no_lpf = 1;
-      //    cm->filter_type = SIMPLE_LOOPFILTER;
-      //    cm->use_bilinear_mc_filter = 1;
-      //    cm->full_pixel = 1;
-      //    break;
-  }
-}
-void vp9_create_common(VP9_COMMON *oci) {
-  vp9_machine_specific_config(oci);
-
-  vp9_init_mbmode_probs(oci);
-
-  vp9_default_bmode_probs(oci->fc.bmode_prob);
-
-  oci->txfm_mode = ONLY_4X4;
-  oci->mb_no_coeff_skip = 1;
-  oci->comp_pred_mode = HYBRID_PREDICTION;
-  oci->no_lpf = 0;
-  oci->filter_type = NORMAL_LOOPFILTER;
-  oci->use_bilinear_mc_filter = 0;
-  oci->full_pixel = 0;
-  oci->clr_type = REG_YUV;
-  oci->clamp_type = RECON_CLAMP_REQUIRED;
-
-  /* Initialise reference frame sign bias structure to defaults */
-  vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
-
-  /* Default disable buffer to buffer copying */
-  oci->copy_buffer_to_gf = 0;
-  oci->copy_buffer_to_arf = 0;
-  oci->kf_ymode_probs_update = 0;
-}
-
-void vp9_remove_common(VP9_COMMON *oci) {
-  vp9_de_alloc_frame_buffers(oci);
-}
-
-void vp9_initialize_common() {
-  vp9_coef_tree_initialize();
-
-  vp9_entropy_mode_init();
-
-  vp9_entropy_mv_init();
-}
--- a/vp8/common/alloccommon.h
+++ /dev/null
@@ -1,26 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ALLOCCOMMON_H
-#define __INC_ALLOCCOMMON_H
-
-#include "onyxc_int.h"
-
-void vp9_create_common(VP9_COMMON *oci);
-void vp9_remove_common(VP9_COMMON *oci);
-void vp9_de_alloc_frame_buffers(VP9_COMMON *oci);
-int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height);
-void vp9_setup_version(VP9_COMMON *oci);
-
-void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base);
-void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi);
-
-#endif
--- a/vp8/common/arm/arm_systemdependent.c
+++ /dev/null
@@ -1,92 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_ports/arm.h"
-#include "vp8/common/pragmas.h"
-#include "vp8/common/subpixel.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/recon.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/onyxc_int.h"
-
-void vp9_arch_arm_common_init(VP9_COMMON *ctx) {
-#if CONFIG_RUNTIME_CPU_DETECT
-  VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
-  int flags = arm_cpu_caps();
-  rtcd->flags = flags;
-
-  /* Override default functions with fastest ones for this CPU. */
-#if HAVE_ARMV5TE
-  if (flags & HAS_EDSP) {
-  }
-#endif
-
-// The commented functions need to be re-written for vpx.
-#if HAVE_ARMV6
-  if (flags & HAS_MEDIA) {
-    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_armv6;
-    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_armv6;
-    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_armv6;
-    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict_armv6;
-
-    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_armv6;
-    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_armv6;
-    rtcd->subpix.bilinear8x4   = vp9_bilinear_predict8x4_armv6;
-    rtcd->subpix.bilinear4x4   = vp9_bilinear_predict4x4_armv6;
-
-    // rtcd->idct.idct1        = vp9_short_idct4x4llm_1_v6;
-    // rtcd->idct.idct16       = vp9_short_idct4x4llm_v6_dual;
-    // rtcd->idct.iwalsh1      = vp9_short_inv_walsh4x4_1_v6;
-    // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_v6;
-
-    rtcd->recon.copy16x16   = vp9_copy_mem16x16_v6;
-    rtcd->recon.copy8x8     = vp9_copy_mem8x8_v6;
-    rtcd->recon.copy8x4     = vp9_copy_mem8x4_v6;
-    rtcd->recon.recon       = vp9_recon_b_armv6;
-    rtcd->recon.recon2      = vp9_recon2b_armv6;
-    rtcd->recon.recon4      = vp9_recon4b_armv6;
-  }
-#endif
-
-#if HAVE_ARMV7
-  if (flags & HAS_NEON) {
-    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_neon;
-    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_neon;
-    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_neon;
-    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict_neon;
-
-    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_neon;
-    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_neon;
-    rtcd->subpix.bilinear8x4   = vp9_bilinear_predict8x4_neon;
-    rtcd->subpix.bilinear4x4   = vp9_bilinear_predict4x4_neon;
-
-    // rtcd->idct.idct1        = vp9_short_idct4x4llm_1_neon;
-    // rtcd->idct.idct16       = vp9_short_idct4x4llm_neon;
-    // rtcd->idct.iwalsh1      = vp9_short_inv_walsh4x4_1_neon;
-    // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_neon;
-
-    rtcd->recon.copy16x16   = vp9_copy_mem16x16_neon;
-    rtcd->recon.copy8x8     = vp9_copy_mem8x8_neon;
-    rtcd->recon.copy8x4     = vp9_copy_mem8x4_neon;
-    rtcd->recon.recon       = vp9_recon_b_neon;
-    rtcd->recon.recon2      = vp9_recon2b_neon;
-    rtcd->recon.recon4      = vp9_recon4b_neon;
-    rtcd->recon.recon_mb    = vp9_recon_mb_neon;
-    rtcd->recon.build_intra_predictors_mby =
-      vp9_build_intra_predictors_mby_neon;
-    rtcd->recon.build_intra_predictors_mby_s =
-      vp9_build_intra_predictors_mby_s_neon;
-  }
-#endif
-
-#endif
-}
--- a/vp8/common/arm/armv6/bilinearfilter_v6.asm
+++ /dev/null
@@ -1,237 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_filter_block2d_bil_first_pass_armv6|
-    EXPORT  |vp9_filter_block2d_bil_second_pass_armv6|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-;-------------------------------------
-; r0    unsigned char  *src_ptr,
-; r1    unsigned short *dst_ptr,
-; r2    unsigned int    src_pitch,
-; r3    unsigned int    height,
-; stack unsigned int    width,
-; stack const short    *vp9_filter
-;-------------------------------------
-; The output is transposed and stored in the output array to make second pass filtering easier.
-|vp9_filter_block2d_bil_first_pass_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r11, [sp, #40]                  ; vp9_filter address
-    ldr     r4, [sp, #36]                   ; width
-
-    mov     r12, r3                         ; outer-loop counter
-
-    add     r7, r2, r4                      ; preload next row
-    pld     [r0, r7]
-
-    sub     r2, r2, r4                      ; src increment for height loop
-
-    ldr     r5, [r11]                       ; load up filter coefficients
-
-    mov     r3, r3, lsl #1                  ; height*2
-    add     r3, r3, #2                      ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)
-
-    mov     r11, r1                         ; save dst_ptr for each row
-
-    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
-    beq     bil_null_1st_filter
-
-|bil_height_loop_1st_v6|
-    ldrb    r6, [r0]                        ; load source data
-    ldrb    r7, [r0, #1]
-    ldrb    r8, [r0, #2]
-    mov     lr, r4, lsr #2                  ; 4-in-parallel loop counter
-
-|bil_width_loop_1st_v6|
-    ldrb    r9, [r0, #3]
-    ldrb    r10, [r0, #4]
-
-    pkhbt   r6, r6, r7, lsl #16             ; src[1] | src[0]
-    pkhbt   r7, r7, r8, lsl #16             ; src[2] | src[1]
-
-    smuad   r6, r6, r5                      ; apply the filter
-    pkhbt   r8, r8, r9, lsl #16             ; src[3] | src[2]
-    smuad   r7, r7, r5
-    pkhbt   r9, r9, r10, lsl #16            ; src[4] | src[3]
-
-    smuad   r8, r8, r5
-    smuad   r9, r9, r5
-
-    add     r0, r0, #4
-    subs    lr, lr, #1
-
-    add     r6, r6, #0x40                   ; round_shift_and_clamp
-    add     r7, r7, #0x40
-    usat    r6, #16, r6, asr #7
-    usat    r7, #16, r7, asr #7
-
-    strh    r6, [r1], r3                    ; result is transposed and stored
-
-    add     r8, r8, #0x40                   ; round_shift_and_clamp
-    strh    r7, [r1], r3
-    add     r9, r9, #0x40
-    usat    r8, #16, r8, asr #7
-    usat    r9, #16, r9, asr #7
-
-    strh    r8, [r1], r3                    ; result is transposed and stored
-
-    ldrneb  r6, [r0]                        ; load source data
-    strh    r9, [r1], r3
-
-    ldrneb  r7, [r0, #1]
-    ldrneb  r8, [r0, #2]
-
-    bne     bil_width_loop_1st_v6
-
-    add     r0, r0, r2                      ; move to next input row
-    subs    r12, r12, #1
-
-    add     r9, r2, r4, lsl #1              ; adding back block width
-    pld     [r0, r9]                        ; preload next row
-
-    add     r11, r11, #2                    ; move over to next column
-    mov     r1, r11
-
-    bne     bil_height_loop_1st_v6
-
-    ldmia   sp!, {r4 - r11, pc}
-
-|bil_null_1st_filter|
-|bil_height_loop_null_1st|
-    mov     lr, r4, lsr #2                  ; loop counter
-
-|bil_width_loop_null_1st|
-    ldrb    r6, [r0]                        ; load data
-    ldrb    r7, [r0, #1]
-    ldrb    r8, [r0, #2]
-    ldrb    r9, [r0, #3]
-
-    strh    r6, [r1], r3                    ; store it to immediate buffer
-    add     r0, r0, #4
-    strh    r7, [r1], r3
-    subs    lr, lr, #1
-    strh    r8, [r1], r3
-    strh    r9, [r1], r3
-
-    bne     bil_width_loop_null_1st
-
-    subs    r12, r12, #1
-    add     r0, r0, r2                      ; move to next input line
-    add     r11, r11, #2                    ; move over to next column
-    mov     r1, r11
-
-    bne     bil_height_loop_null_1st
-
-    ldmia   sp!, {r4 - r11, pc}
-
-    ENDP  ; |vp9_filter_block2d_bil_first_pass_armv6|
-
-
-;---------------------------------
-; r0    unsigned short *src_ptr,
-; r1    unsigned char  *dst_ptr,
-; r2    int             dst_pitch,
-; r3    unsigned int    height,
-; stack unsigned int    width,
-; stack const short    *vp9_filter
-;---------------------------------
-|vp9_filter_block2d_bil_second_pass_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r11, [sp, #40]                  ; vp9_filter address
-    ldr     r4, [sp, #36]                   ; width
-
-    ldr     r5, [r11]                       ; load up filter coefficients
-    mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
-    mov     r11, r1
-
-    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
-    beq     bil_null_2nd_filter
-
-|bil_height_loop_2nd|
-    ldr     r6, [r0]                        ; load the data
-    ldr     r8, [r0, #4]
-    ldrh    r10, [r0, #8]
-    mov     lr, r3, lsr #2                  ; loop counter
-
-|bil_width_loop_2nd|
-    pkhtb   r7, r6, r8                      ; src[1] | src[2]
-    pkhtb   r9, r8, r10                     ; src[3] | src[4]
-
-    smuad   r6, r6, r5                      ; apply filter
-    smuad   r8, r8, r5                      ; apply filter
-
-    subs    lr, lr, #1
-
-    smuadx  r7, r7, r5                      ; apply filter
-    smuadx  r9, r9, r5                      ; apply filter
-
-    add     r0, r0, #8
-
-    add     r6, r6, #0x40                   ; round_shift_and_clamp
-    add     r7, r7, #0x40
-    usat    r6, #8, r6, asr #7
-    usat    r7, #8, r7, asr #7
-    strb    r6, [r1], r2                    ; the result is transposed back and stored
-
-    add     r8, r8, #0x40                   ; round_shift_and_clamp
-    strb    r7, [r1], r2
-    add     r9, r9, #0x40
-    usat    r8, #8, r8, asr #7
-    usat    r9, #8, r9, asr #7
-    strb    r8, [r1], r2                    ; the result is transposed back and stored
-
-    ldrne   r6, [r0]                        ; load data
-    strb    r9, [r1], r2
-    ldrne   r8, [r0, #4]
-    ldrneh  r10, [r0, #8]
-
-    bne     bil_width_loop_2nd
-
-    subs    r12, r12, #1
-    add     r0, r0, #4                      ; update src for next row
-    add     r11, r11, #1
-    mov     r1, r11
-
-    bne     bil_height_loop_2nd
-    ldmia   sp!, {r4 - r11, pc}
-
-|bil_null_2nd_filter|
-|bil_height_loop_null_2nd|
-    mov     lr, r3, lsr #2
-
-|bil_width_loop_null_2nd|
-    ldr     r6, [r0], #4                    ; load data
-    subs    lr, lr, #1
-    ldr     r8, [r0], #4
-
-    strb    r6, [r1], r2                    ; store data
-    mov     r7, r6, lsr #16
-    strb    r7, [r1], r2
-    mov     r9, r8, lsr #16
-    strb    r8, [r1], r2
-    strb    r9, [r1], r2
-
-    bne     bil_width_loop_null_2nd
-
-    subs    r12, r12, #1
-    add     r0, r0, #4
-    add     r11, r11, #1
-    mov     r1, r11
-
-    bne     bil_height_loop_null_2nd
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP  ; |vp9_filter_block2d_second_pass_armv6|
-
-    END
--- a/vp8/common/arm/armv6/copymem16x16_v6.asm
+++ /dev/null
@@ -1,186 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_copy_mem16x16_v6|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem16x16_v6| PROC
-    stmdb       sp!, {r4 - r7}
-    ;push   {r4-r7}
-
-    ;preload
-    pld     [r0, #31]                ; preload for next 16x16 block
-
-    ands    r4, r0, #15
-    beq     copy_mem16x16_fast
-
-    ands    r4, r0, #7
-    beq     copy_mem16x16_8
-
-    ands    r4, r0, #3
-    beq     copy_mem16x16_4
-
-    ;copy one byte each time
-    ldrb    r4, [r0]
-    ldrb    r5, [r0, #1]
-    ldrb    r6, [r0, #2]
-    ldrb    r7, [r0, #3]
-
-    mov     r12, #16
-
-copy_mem16x16_1_loop
-    strb    r4, [r2]
-    strb    r5, [r2, #1]
-    strb    r6, [r2, #2]
-    strb    r7, [r2, #3]
-
-    ldrb    r4, [r0, #4]
-    ldrb    r5, [r0, #5]
-    ldrb    r6, [r0, #6]
-    ldrb    r7, [r0, #7]
-
-    subs    r12, r12, #1
-
-    strb    r4, [r2, #4]
-    strb    r5, [r2, #5]
-    strb    r6, [r2, #6]
-    strb    r7, [r2, #7]
-
-    ldrb    r4, [r0, #8]
-    ldrb    r5, [r0, #9]
-    ldrb    r6, [r0, #10]
-    ldrb    r7, [r0, #11]
-
-    strb    r4, [r2, #8]
-    strb    r5, [r2, #9]
-    strb    r6, [r2, #10]
-    strb    r7, [r2, #11]
-
-    ldrb    r4, [r0, #12]
-    ldrb    r5, [r0, #13]
-    ldrb    r6, [r0, #14]
-    ldrb    r7, [r0, #15]
-
-    add     r0, r0, r1
-
-    strb    r4, [r2, #12]
-    strb    r5, [r2, #13]
-    strb    r6, [r2, #14]
-    strb    r7, [r2, #15]
-
-    add     r2, r2, r3
-
-    ldrneb  r4, [r0]
-    ldrneb  r5, [r0, #1]
-    ldrneb  r6, [r0, #2]
-    ldrneb  r7, [r0, #3]
-
-    pld     [r0, #31]               ; preload for next 16x16 block
-
-    bne     copy_mem16x16_1_loop
-
-    ldmia       sp!, {r4 - r7}
-    ;pop        {r4-r7}
-    mov     pc, lr
-
-;copy 4 bytes each time
-copy_mem16x16_4
-    ldr     r4, [r0]
-    ldr     r5, [r0, #4]
-    ldr     r6, [r0, #8]
-    ldr     r7, [r0, #12]
-
-    mov     r12, #16
-
-copy_mem16x16_4_loop
-    subs    r12, r12, #1
-    add     r0, r0, r1
-
-    str     r4, [r2]
-    str     r5, [r2, #4]
-    str     r6, [r2, #8]
-    str     r7, [r2, #12]
-
-    add     r2, r2, r3
-
-    ldrne   r4, [r0]
-    ldrne   r5, [r0, #4]
-    ldrne   r6, [r0, #8]
-    ldrne   r7, [r0, #12]
-
-    pld     [r0, #31]               ; preload for next 16x16 block
-
-    bne     copy_mem16x16_4_loop
-
-    ldmia       sp!, {r4 - r7}
-    ;pop        {r4-r7}
-    mov     pc, lr
-
-;copy 8 bytes each time
-copy_mem16x16_8
-    sub     r1, r1, #16
-    sub     r3, r3, #16
-
-    mov     r12, #16
-
-copy_mem16x16_8_loop
-    ldmia   r0!, {r4-r5}
-    ;ldm        r0, {r4-r5}
-    ldmia   r0!, {r6-r7}
-
-    add     r0, r0, r1
-
-    stmia   r2!, {r4-r5}
-    subs    r12, r12, #1
-    ;stm        r2, {r4-r5}
-    stmia   r2!, {r6-r7}
-
-    add     r2, r2, r3
-
-    pld     [r0, #31]               ; preload for next 16x16 block
-    bne     copy_mem16x16_8_loop
-
-    ldmia       sp!, {r4 - r7}
-    ;pop        {r4-r7}
-    mov     pc, lr
-
-;copy 16 bytes each time
-copy_mem16x16_fast
-    ;sub        r1, r1, #16
-    ;sub        r3, r3, #16
-
-    mov     r12, #16
-
-copy_mem16x16_fast_loop
-    ldmia   r0, {r4-r7}
-    ;ldm        r0, {r4-r7}
-    add     r0, r0, r1
-
-    subs    r12, r12, #1
-    stmia   r2, {r4-r7}
-    ;stm        r2, {r4-r7}
-    add     r2, r2, r3
-
-    pld     [r0, #31]               ; preload for next 16x16 block
-    bne     copy_mem16x16_fast_loop
-
-    ldmia       sp!, {r4 - r7}
-    ;pop        {r4-r7}
-    mov     pc, lr
-
-    ENDP  ; |vp9_copy_mem16x16_v6|
-
-    END
--- a/vp8/common/arm/armv6/copymem8x4_v6.asm
+++ /dev/null
@@ -1,128 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_copy_mem8x4_v6|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void vp9_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem8x4_v6| PROC
-    ;push   {r4-r5}
-    stmdb  sp!, {r4-r5}
-
-    ;preload
-    pld     [r0]
-    pld     [r0, r1]
-    pld     [r0, r1, lsl #1]
-
-    ands    r4, r0, #7
-    beq     copy_mem8x4_fast
-
-    ands    r4, r0, #3
-    beq     copy_mem8x4_4
-
-    ;copy 1 byte each time
-    ldrb    r4, [r0]
-    ldrb    r5, [r0, #1]
-
-    mov     r12, #4
-
-copy_mem8x4_1_loop
-    strb    r4, [r2]
-    strb    r5, [r2, #1]
-
-    ldrb    r4, [r0, #2]
-    ldrb    r5, [r0, #3]
-
-    subs    r12, r12, #1
-
-    strb    r4, [r2, #2]
-    strb    r5, [r2, #3]
-
-    ldrb    r4, [r0, #4]
-    ldrb    r5, [r0, #5]
-
-    strb    r4, [r2, #4]
-    strb    r5, [r2, #5]
-
-    ldrb    r4, [r0, #6]
-    ldrb    r5, [r0, #7]
-
-    add     r0, r0, r1
-
-    strb    r4, [r2, #6]
-    strb    r5, [r2, #7]
-
-    add     r2, r2, r3
-
-    ldrneb  r4, [r0]
-    ldrneb  r5, [r0, #1]
-
-    bne     copy_mem8x4_1_loop
-
-    ldmia       sp!, {r4 - r5}
-    ;pop        {r4-r5}
-    mov     pc, lr
-
-;copy 4 bytes each time
-copy_mem8x4_4
-    ldr     r4, [r0]
-    ldr     r5, [r0, #4]
-
-    mov     r12, #4
-
-copy_mem8x4_4_loop
-    subs    r12, r12, #1
-    add     r0, r0, r1
-
-    str     r4, [r2]
-    str     r5, [r2, #4]
-
-    add     r2, r2, r3
-
-    ldrne   r4, [r0]
-    ldrne   r5, [r0, #4]
-
-    bne     copy_mem8x4_4_loop
-
-    ldmia  sp!, {r4-r5}
-    ;pop        {r4-r5}
-    mov     pc, lr
-
-;copy 8 bytes each time
-copy_mem8x4_fast
-    ;sub        r1, r1, #8
-    ;sub        r3, r3, #8
-
-    mov     r12, #4
-
-copy_mem8x4_fast_loop
-    ldmia   r0, {r4-r5}
-    ;ldm        r0, {r4-r5}
-    add     r0, r0, r1
-
-    subs    r12, r12, #1
-    stmia   r2, {r4-r5}
-    ;stm        r2, {r4-r5}
-    add     r2, r2, r3
-
-    bne     copy_mem8x4_fast_loop
-
-    ldmia  sp!, {r4-r5}
-    ;pop        {r4-r5}
-    mov     pc, lr
-
-    ENDP  ; |vp9_copy_mem8x4_v6|
-
-    END
--- a/vp8/common/arm/armv6/copymem8x8_v6.asm
+++ /dev/null
@@ -1,128 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_copy_mem8x8_v6|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem8x8_v6| PROC
-    ;push   {r4-r5}
-    stmdb  sp!, {r4-r5}
-
-    ;preload
-    pld     [r0]
-    pld     [r0, r1]
-    pld     [r0, r1, lsl #1]
-
-    ands    r4, r0, #7
-    beq     copy_mem8x8_fast
-
-    ands    r4, r0, #3
-    beq     copy_mem8x8_4
-
-    ;copy 1 byte each time
-    ldrb    r4, [r0]
-    ldrb    r5, [r0, #1]
-
-    mov     r12, #8
-
-copy_mem8x8_1_loop
-    strb    r4, [r2]
-    strb    r5, [r2, #1]
-
-    ldrb    r4, [r0, #2]
-    ldrb    r5, [r0, #3]
-
-    subs    r12, r12, #1
-
-    strb    r4, [r2, #2]
-    strb    r5, [r2, #3]
-
-    ldrb    r4, [r0, #4]
-    ldrb    r5, [r0, #5]
-
-    strb    r4, [r2, #4]
-    strb    r5, [r2, #5]
-
-    ldrb    r4, [r0, #6]
-    ldrb    r5, [r0, #7]
-
-    add     r0, r0, r1
-
-    strb    r4, [r2, #6]
-    strb    r5, [r2, #7]
-
-    add     r2, r2, r3
-
-    ldrneb  r4, [r0]
-    ldrneb  r5, [r0, #1]
-
-    bne     copy_mem8x8_1_loop
-
-    ldmia       sp!, {r4 - r5}
-    ;pop        {r4-r5}
-    mov     pc, lr
-
-;copy 4 bytes each time
-copy_mem8x8_4
-    ldr     r4, [r0]
-    ldr     r5, [r0, #4]
-
-    mov     r12, #8
-
-copy_mem8x8_4_loop
-    subs    r12, r12, #1
-    add     r0, r0, r1
-
-    str     r4, [r2]
-    str     r5, [r2, #4]
-
-    add     r2, r2, r3
-
-    ldrne   r4, [r0]
-    ldrne   r5, [r0, #4]
-
-    bne     copy_mem8x8_4_loop
-
-    ldmia       sp!, {r4 - r5}
-    ;pop        {r4-r5}
-    mov     pc, lr
-
-;copy 8 bytes each time
-copy_mem8x8_fast
-    ;sub        r1, r1, #8
-    ;sub        r3, r3, #8
-
-    mov     r12, #8
-
-copy_mem8x8_fast_loop
-    ldmia   r0, {r4-r5}
-    ;ldm        r0, {r4-r5}
-    add     r0, r0, r1
-
-    subs    r12, r12, #1
-    stmia   r2, {r4-r5}
-    ;stm        r2, {r4-r5}
-    add     r2, r2, r3
-
-    bne     copy_mem8x8_fast_loop
-
-    ldmia  sp!, {r4-r5}
-    ;pop        {r4-r5}
-    mov     pc, lr
-
-    ENDP  ; |vp9_copy_mem8x8_v6|
-
-    END
--- a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
+++ /dev/null
@@ -1,67 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-    EXPORT  |vp8_dc_only_idct_add_v6|
-
-    AREA    |.text|, CODE, READONLY
-
-;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
-;                             unsigned char *dst_ptr, int pitch, int stride)
-; r0  input_dc
-; r1  pred_ptr
-; r2  dest_ptr
-; r3  pitch
-; sp  stride
-
-|vp8_dc_only_idct_add_v6| PROC
-    stmdb       sp!, {r4 - r7, lr}
-
-    add         r0, r0, #4                ; input_dc += 4
-    ldr         r12, c0x0000FFFF
-    ldr         r4, [r1], r3
-    ldr         r6, [r1], r3
-    and         r0, r12, r0, asr #3       ; input_dc >> 3 + mask
-    ldr         lr, [sp, #20]
-    orr         r0, r0, r0, lsl #16       ; a1 | a1
-
-    uxtab16     r5, r0, r4                ; a1+2 | a1+0
-    uxtab16     r4, r0, r4, ror #8        ; a1+3 | a1+1
-    uxtab16     r7, r0, r6
-    uxtab16     r6, r0, r6, ror #8
-    usat16      r5, #8, r5
-    usat16      r4, #8, r4
-    usat16      r7, #8, r7
-    usat16      r6, #8, r6
-    orr         r5, r5, r4, lsl #8
-    orr         r7, r7, r6, lsl #8
-    ldr         r4, [r1], r3
-    ldr         r6, [r1]
-    str         r5, [r2], lr
-    str         r7, [r2], lr
-
-    uxtab16     r5, r0, r4
-    uxtab16     r4, r0, r4, ror #8
-    uxtab16     r7, r0, r6
-    uxtab16     r6, r0, r6, ror #8
-    usat16      r5, #8, r5
-    usat16      r4, #8, r4
-    usat16      r7, #8, r7
-    usat16      r6, #8, r6
-    orr         r5, r5, r4, lsl #8
-    orr         r7, r7, r6, lsl #8
-    str         r5, [r2], lr
-    str         r7, [r2]
-
-    ldmia       sp!, {r4 - r7, pc}
-
-    ENDP  ; |vp8_dc_only_idct_add_v6|
-
-; Constant Pool
-c0x0000FFFF DCD 0x0000FFFF
-    END
--- a/vp8/common/arm/armv6/filter_v6.asm
+++ /dev/null
@@ -1,624 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_filter_block2d_first_pass_armv6|
-    EXPORT  |vp9_filter_block2d_first_pass_16x16_armv6|
-    EXPORT  |vp9_filter_block2d_first_pass_8x8_armv6|
-    EXPORT  |vp9_filter_block2d_second_pass_armv6|
-    EXPORT  |vp9_filter4_block2d_second_pass_armv6|
-    EXPORT  |vp9_filter_block2d_first_pass_only_armv6|
-    EXPORT  |vp9_filter_block2d_second_pass_only_armv6|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-;-------------------------------------
-; r0    unsigned char *src_ptr
-; r1    short         *output_ptr
-; r2    unsigned int src_pixels_per_line
-; r3    unsigned int output_width
-; stack unsigned int output_height
-; stack const short *vp9_filter
-;-------------------------------------
-; vp9_filter the input and put in the output array.  Apply the 6 tap FIR filter with
-; the output being a 2 byte value and the input being a 1 byte value.
-|vp9_filter_block2d_first_pass_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r11, [sp, #40]                  ; vp9_filter address
-    ldr     r7, [sp, #36]                   ; output height
-
-    sub     r2, r2, r3                      ; inside loop increments input array,
-                                            ; so the height loop only needs to add
-                                            ; r2 - width to the input pointer
-
-    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
-    add     r12, r3, #16                    ; square off the output
-    sub     sp, sp, #4
-
-    ldr     r4, [r11]                       ; load up packed filter coefficients
-    ldr     r5, [r11, #4]
-    ldr     r6, [r11, #8]
-
-    str     r1, [sp]                        ; push destination to stack
-    mov     r7, r7, lsl #16                 ; height is top part of counter
-
-; six tap filter
-|height_loop_1st_6|
-    ldrb    r8, [r0, #-2]                   ; load source data
-    ldrb    r9, [r0, #-1]
-    ldrb    r10, [r0], #2
-    orr     r7, r7, r3, lsr #2              ; construct loop counter
-
-|width_loop_1st_6|
-    ldrb    r11, [r0, #-1]
-
-    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
-    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
-
-    ldrb    r9, [r0]
-
-    smuad   lr, lr, r4                      ; apply the filter
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-    smuad   r8, r8, r4
-    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
-
-    smlad   lr, r10, r5, lr
-    ldrb    r10, [r0, #1]
-    smlad   r8, r11, r5, r8
-    ldrb    r11, [r0, #2]
-
-    sub     r7, r7, #1
-
-    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-
-    smlad   lr, r9, r6, lr
-    smlad   r11, r10, r6, r8
-
-    ands    r10, r7, #0xff                  ; test loop counter
-
-    add     lr, lr, #0x40                   ; round_shift_and_clamp
-    ldrneb  r8, [r0, #-2]                   ; load data for next loop
-    usat    lr, #8, lr, asr #7
-    add     r11, r11, #0x40
-    ldrneb  r9, [r0, #-1]
-    usat    r11, #8, r11, asr #7
-
-    strh    lr, [r1], r12                   ; result is transposed and stored, which
-                                            ; will make second pass filtering easier.
-    ldrneb  r10, [r0], #2
-    strh    r11, [r1], r12
-
-    bne     width_loop_1st_6
-
-    ldr     r1, [sp]                        ; load and update dst address
-    subs    r7, r7, #0x10000
-    add     r0, r0, r2                      ; move to next input line
-
-    add     r1, r1, #2                      ; move over to next column
-    str     r1, [sp]
-
-    bne     height_loop_1st_6
-
-    add     sp, sp, #4
-    ldmia   sp!, {r4 - r11, pc}
-
-    ENDP
-
-; --------------------------
-; 16x16 version
-; -----------------------------
-|vp9_filter_block2d_first_pass_16x16_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r11, [sp, #40]                  ; vp9_filter address
-    ldr     r7, [sp, #36]                   ; output height
-
-    add     r4, r2, #18                     ; preload next row
-    pld     [r0, r4]
-
-    sub     r2, r2, r3                      ; inside loop increments input array,
-                                            ; so the height loop only needs to add
-                                            ; r2 - width to the input pointer
-
-    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
-    add     r12, r3, #16                    ; square off the output
-    sub     sp, sp, #4
-
-    ldr     r4, [r11]                       ; load up packed filter coefficients
-    ldr     r5, [r11, #4]
-    ldr     r6, [r11, #8]
-
-    str     r1, [sp]                        ; push destination to stack
-    mov     r7, r7, lsl #16                 ; height is top part of counter
-
-; six tap filter
-|height_loop_1st_16_6|
-    ldrb    r8, [r0, #-2]                   ; load source data
-    ldrb    r9, [r0, #-1]
-    ldrb    r10, [r0], #2
-    orr     r7, r7, r3, lsr #2              ; construct loop counter
-
-|width_loop_1st_16_6|
-    ldrb    r11, [r0, #-1]
-
-    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
-    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
-
-    ldrb    r9, [r0]
-
-    smuad   lr, lr, r4                      ; apply the filter
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-    smuad   r8, r8, r4
-    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
-
-    smlad   lr, r10, r5, lr
-    ldrb    r10, [r0, #1]
-    smlad   r8, r11, r5, r8
-    ldrb    r11, [r0, #2]
-
-    sub     r7, r7, #1
-
-    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-
-    smlad   lr, r9, r6, lr
-    smlad   r11, r10, r6, r8
-
-    ands    r10, r7, #0xff                  ; test loop counter
-
-    add     lr, lr, #0x40                   ; round_shift_and_clamp
-    ldrneb  r8, [r0, #-2]                   ; load data for next loop
-    usat    lr, #8, lr, asr #7
-    add     r11, r11, #0x40
-    ldrneb  r9, [r0, #-1]
-    usat    r11, #8, r11, asr #7
-
-    strh    lr, [r1], r12                   ; result is transposed and stored, which
-                                            ; will make second pass filtering easier.
-    ldrneb  r10, [r0], #2
-    strh    r11, [r1], r12
-
-    bne     width_loop_1st_16_6
-
-    ldr     r1, [sp]                        ; load and update dst address
-    subs    r7, r7, #0x10000
-    add     r0, r0, r2                      ; move to next input line
-
-    add     r11, r2, #34                    ; adding back block width(=16)
-    pld     [r0, r11]                       ; preload next row
-
-    add     r1, r1, #2                      ; move over to next column
-    str     r1, [sp]
-
-    bne     height_loop_1st_16_6
-
-    add     sp, sp, #4
-    ldmia   sp!, {r4 - r11, pc}
-
-    ENDP
-
-; --------------------------
-; 8x8 version
-; -----------------------------
-|vp9_filter_block2d_first_pass_8x8_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r11, [sp, #40]                  ; vp9_filter address
-    ldr     r7, [sp, #36]                   ; output height
-
-    add     r4, r2, #10                     ; preload next row
-    pld     [r0, r4]
-
-    sub     r2, r2, r3                      ; inside loop increments input array,
-                                            ; so the height loop only needs to add
-                                            ; r2 - width to the input pointer
-
-    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
-    add     r12, r3, #16                    ; square off the output
-    sub     sp, sp, #4
-
-    ldr     r4, [r11]                       ; load up packed filter coefficients
-    ldr     r5, [r11, #4]
-    ldr     r6, [r11, #8]
-
-    str     r1, [sp]                        ; push destination to stack
-    mov     r7, r7, lsl #16                 ; height is top part of counter
-
-; six tap filter
-|height_loop_1st_8_6|
-    ldrb    r8, [r0, #-2]                   ; load source data
-    ldrb    r9, [r0, #-1]
-    ldrb    r10, [r0], #2
-    orr     r7, r7, r3, lsr #2              ; construct loop counter
-
-|width_loop_1st_8_6|
-    ldrb    r11, [r0, #-1]
-
-    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
-    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
-
-    ldrb    r9, [r0]
-
-    smuad   lr, lr, r4                      ; apply the filter
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-    smuad   r8, r8, r4
-    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
-
-    smlad   lr, r10, r5, lr
-    ldrb    r10, [r0, #1]
-    smlad   r8, r11, r5, r8
-    ldrb    r11, [r0, #2]
-
-    sub     r7, r7, #1
-
-    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-
-    smlad   lr, r9, r6, lr
-    smlad   r11, r10, r6, r8
-
-    ands    r10, r7, #0xff                  ; test loop counter
-
-    add     lr, lr, #0x40                   ; round_shift_and_clamp
-    ldrneb  r8, [r0, #-2]                   ; load data for next loop
-    usat    lr, #8, lr, asr #7
-    add     r11, r11, #0x40
-    ldrneb  r9, [r0, #-1]
-    usat    r11, #8, r11, asr #7
-
-    strh    lr, [r1], r12                   ; result is transposed and stored, which
-                                            ; will make second pass filtering easier.
-    ldrneb  r10, [r0], #2
-    strh    r11, [r1], r12
-
-    bne     width_loop_1st_8_6
-
-    ldr     r1, [sp]                        ; load and update dst address
-    subs    r7, r7, #0x10000
-    add     r0, r0, r2                      ; move to next input line
-
-    add     r11, r2, #18                    ; adding back block width (=8)
-    pld     [r0, r11]                       ; preload next row
-
-    add     r1, r1, #2                      ; move over to next column
-    str     r1, [sp]
-
-    bne     height_loop_1st_8_6
-
-    add     sp, sp, #4
-    ldmia   sp!, {r4 - r11, pc}
-
-    ENDP
-
-;---------------------------------
-; r0    short         *src_ptr,
-; r1    unsigned char *output_ptr,
-; r2    unsigned int output_pitch,
-; r3    unsigned int cnt,
-; stack const short *vp9_filter
-;---------------------------------
-|vp9_filter_block2d_second_pass_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r11, [sp, #36]                  ; vp9_filter address
-    sub     sp, sp, #4
-    mov     r7, r3, lsl #16                 ; height is top part of counter
-    str     r1, [sp]                        ; push destination to stack
-
-    ldr     r4, [r11]                       ; load up packed filter coefficients
-    ldr     r5, [r11, #4]
-    ldr     r6, [r11, #8]
-
-    pkhbt   r12, r5, r4                     ; pack the filter differently
-    pkhbt   r11, r6, r5
-
-    sub     r0, r0, #4                      ; offset input buffer
-
-|height_loop_2nd|
-    ldr     r8, [r0]                        ; load the data
-    ldr     r9, [r0, #4]
-    orr     r7, r7, r3, lsr #1              ; loop counter
-
-|width_loop_2nd|
-    smuad   lr, r4, r8                      ; apply filter
-    sub     r7, r7, #1
-    smulbt  r8, r4, r8
-
-    ldr     r10, [r0, #8]
-
-    smlad   lr, r5, r9, lr
-    smladx  r8, r12, r9, r8
-
-    ldrh    r9, [r0, #12]
-
-    smlad   lr, r6, r10, lr
-    smladx  r8, r11, r10, r8
-
-    add     r0, r0, #4
-    smlatb  r10, r6, r9, r8
-
-    add     lr, lr, #0x40                   ; round_shift_and_clamp
-    ands    r8, r7, #0xff
-    usat    lr, #8, lr, asr #7
-    add     r10, r10, #0x40
-    strb    lr, [r1], r2                    ; the result is transposed back and stored
-    usat    r10, #8, r10, asr #7
-
-    ldrne   r8, [r0]                        ; load data for next loop
-    ldrne   r9, [r0, #4]
-    strb    r10, [r1], r2
-
-    bne     width_loop_2nd
-
-    ldr     r1, [sp]                        ; update dst for next loop
-    subs    r7, r7, #0x10000
-    add     r0, r0, #16                     ; update src for next loop
-    add     r1, r1, #1
-    str     r1, [sp]
-
-    bne     height_loop_2nd
-
-    add     sp, sp, #4
-    ldmia   sp!, {r4 - r11, pc}
-
-    ENDP
-
-;---------------------------------
-; r0    short         *src_ptr,
-; r1    unsigned char *output_ptr,
-; r2    unsigned int output_pitch,
-; r3    unsigned int cnt,
-; stack const short *vp9_filter
-;---------------------------------
-|vp9_filter4_block2d_second_pass_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r11, [sp, #36]                  ; vp9_filter address
-    mov     r7, r3, lsl #16                 ; height is top part of counter
-
-    ldr     r4, [r11]                       ; load up packed filter coefficients
-    add     lr, r1, r3                      ; save final destination pointer
-    ldr     r5, [r11, #4]
-    ldr     r6, [r11, #8]
-
-    pkhbt   r12, r5, r4                     ; pack the filter differently
-    pkhbt   r11, r6, r5
-    mov     r4, #0x40                       ; rounding factor (for smlad{x})
-
-|height_loop_2nd_4|
-    ldrd    r8, [r0, #-4]                   ; load the data
-    orr     r7, r7, r3, lsr #1              ; loop counter
-
-|width_loop_2nd_4|
-    ldr     r10, [r0, #4]!
-    smladx  r6, r9, r12, r4                 ; apply filter
-    pkhbt   r8, r9, r8
-    smlad   r5, r8, r12, r4
-    pkhbt   r8, r10, r9
-    smladx  r6, r10, r11, r6
-    sub     r7, r7, #1
-    smlad   r5, r8, r11, r5
-
-    mov     r8, r9                          ; shift the data for the next loop
-    mov     r9, r10
-
-    usat    r6, #8, r6, asr #7              ; shift and clamp
-    usat    r5, #8, r5, asr #7
-
-    strb    r5, [r1], r2                    ; the result is transposed back and stored
-    tst     r7, #0xff
-    strb    r6, [r1], r2
-
-    bne     width_loop_2nd_4
-
-    subs    r7, r7, #0x10000
-    add     r0, r0, #16                     ; update src for next loop
-    sub     r1, lr, r7, lsr #16             ; update dst for next loop
-
-    bne     height_loop_2nd_4
-
-    ldmia   sp!, {r4 - r11, pc}
-
-    ENDP
-
-;------------------------------------
-; r0    unsigned char *src_ptr
-; r1    unsigned char *output_ptr,
-; r2    unsigned int src_pixels_per_line
-; r3    unsigned int cnt,
-; stack unsigned int output_pitch,
-; stack const short *vp9_filter
-;------------------------------------
-|vp9_filter_block2d_first_pass_only_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    add     r7, r2, r3                      ; preload next row
-    add     r7, r7, #2
-    pld     [r0, r7]
-
-    ldr     r4, [sp, #36]                   ; output pitch
-    ldr     r11, [sp, #40]                  ; HFilter address
-    sub     sp, sp, #8
-
-    mov     r7, r3
-    sub     r2, r2, r3                      ; inside loop increments input array,
-                                            ; so the height loop only needs to add
-                                            ; r2 - width to the input pointer
-
-    sub     r4, r4, r3
-    str     r4, [sp]                        ; save modified output pitch
-    str     r2, [sp, #4]
-
-    mov     r2, #0x40
-
-    ldr     r4, [r11]                       ; load up packed filter coefficients
-    ldr     r5, [r11, #4]
-    ldr     r6, [r11, #8]
-
-; six tap filter
-|height_loop_1st_only_6|
-    ldrb    r8, [r0, #-2]                   ; load data
-    ldrb    r9, [r0, #-1]
-    ldrb    r10, [r0], #2
-
-    mov     r12, r3, lsr #1                 ; loop counter
-
-|width_loop_1st_only_6|
-    ldrb    r11, [r0, #-1]
-
-    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
-    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
-
-    ldrb    r9, [r0]
-
-;;  smuad   lr, lr, r4
-    smlad   lr, lr, r4, r2
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-;;  smuad   r8, r8, r4
-    smlad   r8, r8, r4, r2
-    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
-
-    smlad   lr, r10, r5, lr
-    ldrb    r10, [r0, #1]
-    smlad   r8, r11, r5, r8
-    ldrb    r11, [r0, #2]
-
-    subs    r12, r12, #1
-
-    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-
-    smlad   lr, r9, r6, lr
-    smlad   r10, r10, r6, r8
-
-;;  add     lr, lr, #0x40                   ; round_shift_and_clamp
-    ldrneb  r8, [r0, #-2]                   ; load data for next loop
-    usat    lr, #8, lr, asr #7
-;;  add     r10, r10, #0x40
-    strb    lr, [r1], #1                    ; store the result
-    usat    r10, #8, r10, asr #7
-
-    ldrneb  r9, [r0, #-1]
-    strb    r10, [r1], #1
-    ldrneb  r10, [r0], #2
-
-    bne     width_loop_1st_only_6
-
-    ldr     lr, [sp]                        ; load back output pitch
-    ldr     r12, [sp, #4]                   ; load back modified src stride
-    subs    r7, r7, #1
-    add     r0, r0, r12                     ; update src for next loop
-
-    add     r11, r12, r3                    ; preload next row
-    add     r11, r11, #2
-    pld     [r0, r11]
-
-    add     r1, r1, lr                      ; update dst for next loop
-
-    bne     height_loop_1st_only_6
-
-    add     sp, sp, #8
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP  ; |vp9_filter_block2d_first_pass_only_armv6|
-
-
-;------------------------------------
-; r0    unsigned char *src_ptr,
-; r1    unsigned char *output_ptr,
-; r2    unsigned int src_pixels_per_line
-; r3    unsigned int cnt,
-; stack unsigned int output_pitch,
-; stack const short *vp9_filter
-;------------------------------------
-|vp9_filter_block2d_second_pass_only_armv6| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r11, [sp, #40]                  ; VFilter address
-    ldr     r12, [sp, #36]                  ; output pitch
-
-    mov     r7, r3, lsl #16                 ; height is top part of counter
-    sub     r0, r0, r2, lsl #1              ; need 6 elements for filtering, 2 before, 3 after
-
-    sub     sp, sp, #8
-
-    ldr     r4, [r11]                       ; load up packed filter coefficients
-    ldr     r5, [r11, #4]
-    ldr     r6, [r11, #8]
-
-    str     r0, [sp]                        ; save r0 to stack
-    str     r1, [sp, #4]                    ; save dst to stack
-
-; six tap filter
-|width_loop_2nd_only_6|
-    ldrb    r8, [r0], r2                    ; load data
-    orr     r7, r7, r3                      ; loop counter
-    ldrb    r9, [r0], r2
-    ldrb    r10, [r0], r2
-
-|height_loop_2nd_only_6|
-    ; filter first column in this inner loop, then move to next column.
-    ldrb    r11, [r0], r2
-
-    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
-    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
-
-    ldrb    r9, [r0], r2
-
-    smuad   lr, lr, r4
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-    smuad   r8, r8, r4
-    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
-
-    smlad   lr, r10, r5, lr
-    ldrb    r10, [r0], r2
-    smlad   r8, r11, r5, r8
-    ldrb    r11, [r0]
-
-    sub     r7, r7, #2
-    sub     r0, r0, r2, lsl #2
-
-    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
-    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
-
-    smlad   lr, r9, r6, lr
-    smlad   r10, r10, r6, r8
-
-    ands    r9, r7, #0xff
-
-    add     lr, lr, #0x40                   ; round_shift_and_clamp
-    ldrneb  r8, [r0], r2                    ; load data for next loop
-    usat    lr, #8, lr, asr #7
-    add     r10, r10, #0x40
-    strb    lr, [r1], r12                   ; store the result for the column
-    usat    r10, #8, r10, asr #7
-
-    ldrneb  r9, [r0], r2
-    strb    r10, [r1], r12
-    ldrneb  r10, [r0], r2
-
-    bne     height_loop_2nd_only_6
-
-    ldr     r0, [sp]
-    ldr     r1, [sp, #4]
-    subs    r7, r7, #0x10000
-    add     r0, r0, #1                      ; move to filter next column
-    str     r0, [sp]
-    add     r1, r1, #1
-    str     r1, [sp, #4]
-
-    bne     width_loop_2nd_only_6
-
-    add     sp, sp, #8
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP  ; |vp9_filter_block2d_second_pass_only_armv6|
-
-    END
--- a/vp8/common/arm/armv6/idct_v6.asm
+++ /dev/null
@@ -1,345 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-;                   r0  r1  r2  r3  r4  r5  r6  r7  r8  r9  r10 r11 r12     r14
-    EXPORT  |vp8_short_idct4x4llm_1_v6|
-    EXPORT  |vp8_short_idct4x4llm_v6|
-    EXPORT  |vp8_short_idct4x4llm_v6_scott|
-    EXPORT  |vp8_short_idct4x4llm_v6_dual|
-
-    AREA    |.text|, CODE, READONLY
-
-;********************************************************************************
-;*  void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
-;*      r0  INT16 * input
-;*      r1  INT16 * output
-;*      r2  INT32 pitch
-;*  bench:  3/5
-;********************************************************************************
-
-|vp8_short_idct4x4llm_1_v6| PROC         ;   cycles  in  out pit
-            ;
-    ldrsh   r0, [r0]    ; load input[0] 1, r0 un 2
-    add r0, r0, #4  ;   1   +4
-    stmdb   sp!, {r4, r5, lr}   ; make room for wide writes 1                   backup
-    mov r0, r0, asr #3  ; (input[0] + 4) >> 3   1, r0 req`d ^1  >> 3
-    pkhbt   r4, r0, r0, lsl #16 ; pack r0 into r4   1, r0 req`d ^1                  pack
-    mov r5, r4  ; expand                        expand
-
-    strd    r4, [r1], r2    ; *output = r0, post inc    1
-    strd    r4, [r1], r2    ;   1
-    strd    r4, [r1], r2    ;   1
-    strd    r4, [r1]    ;   1
-            ;
-    ldmia   sp!, {r4, r5, pc}   ; replace vars, return                      restore
-    ENDP        ; |vp8_short_idct4x4llm_1_v6|
-;********************************************************************************
-;********************************************************************************
-;********************************************************************************
-
-;********************************************************************************
-;*  void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
-;*      r0  INT16 * input
-;*      r1  INT16 * output
-;*      r2  INT32 pitch
-;*  bench:
-;********************************************************************************
-
-|vp8_short_idct4x4llm_v6| PROC           ;   cycles  in  out pit
-            ;
-    stmdb   sp!, {r4-r11, lr}   ; backup registers  1                   backup
-            ;
-    mov r4, #0x00004E00 ;   1                   cst
-    orr r4, r4, #0x0000007B ; cospi8sqrt2minus1
-    mov r5, #0x00008A00 ;   1                       cst
-    orr r5, r5, #0x0000008C ; sinpi8sqrt2
-            ;
-    mov r6, #4  ; i=4   1                           i
-loop1           ;
-    ldrsh   r12, [r0, #8]   ; input[4]  1, r12 unavail 2                                                    [4]
-    ldrsh   r3, [r0, #24]   ; input[12] 1, r3 unavail 2             [12]
-    ldrsh   r8, [r0, #16]   ; input[8]  1, r8 unavail 2                                 [8]
-    ldrsh   r7, [r0], #0x2  ; input[0]  1, r7 unavail 2 ++                          [0]
-    smulwb  r10, r5, r12    ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1                                          t1
-    smulwb  r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16  1, r11 un 2, r3/r4 ^1                                               t2
-    add r9, r7, r8  ; a1 = [0] + [8]    1                                       a1
-    sub r7, r7, r8  ; b1 = [0] - [8]    1                               b1
-    add r11, r3, r11    ; temp2 1
-    rsb r11, r11, r10   ; c1 = temp1 - temp2    1                                               c1
-    smulwb  r3, r5, r3  ; ([12] * sinpi8sqrt2) >> 16    1, r3 un 2, r3/r5 ^ 1               t2
-    smulwb  r10, r4, r12    ; ([4] * cospi8sqrt2minus1) >> 16   1, r10 un 2, r12/r4 ^1                                          t1
-    add r8, r7, r11 ; b1 + c1   1                                   b+c
-    strh    r8, [r1, r2]    ; out[pitch] = b1+c1    1
-    sub r7, r7, r11 ; b1 - c1   1                               b-c
-    add r10, r12, r10   ; temp1 1
-    add r3, r10, r3 ; d1 = temp1 + temp2    1               d1
-    add r10, r9, r3 ; a1 + d1   1                                           a+d
-    sub r3, r9, r3  ; a1 - d1   1               a-d
-    add r8, r2, r2  ; pitch * 2 1                                   p*2
-    strh    r7, [r1, r8]    ; out[pitch*2] = b1-c1  1
-    add r7, r2, r2, lsl #1  ; pitch * 3 1                               p*3
-    strh    r3, [r1, r7]    ; out[pitch*3] = a1-d1  1
-    subs    r6, r6, #1  ; i--   1                           --
-    strh    r10, [r1], #0x2 ; out[0] = a1+d1    1       ++
-    bne loop1   ; if i>0, continue
-            ;
-    sub r1, r1, #8  ; set up out for next loop  1       -4
-            ; for this iteration, input=prev output
-    mov r6, #4  ; i=4   1                           i
-;   b   returnfull
-loop2           ;
-    ldrsh   r11, [r1, #2]   ; input[1]  1, r11 un 2                                             [1]
-    ldrsh   r8, [r1, #6]    ; input[3]  1, r8 un 2                                  [3]
-    ldrsh   r3, [r1, #4]    ; input[2]  1, r3 un 2              [2]
-    ldrsh   r0, [r1]    ; input[0]  1, r0 un 2  [0]
-    smulwb  r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1                                       t1
-    smulwb  r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16   1, r10 un 2, r4/r8 ^1                                           t2
-    add r7, r0, r3  ; a1 = [0] + [2]    1                               a1
-    sub r0, r0, r3  ; b1 = [0] - [2]    1   b1
-    add r10, r8, r10    ; temp2 1
-    rsb r9, r10, r9 ; c1 = temp1 - temp2    1                                       c1
-    smulwb  r8, r5, r8  ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1                                    t2
-    smulwb  r10, r4, r11    ; ([1] * cospi8sqrt2minus1) >> 16   1, r10 un 2, r4/r11 ^1                                          t1
-    add r3, r0, r9  ; b1+c1 1               b+c
-    add r3, r3, #4  ; b1+c1+4   1               +4
-    add r10, r11, r10   ; temp1 1
-    mov r3, r3, asr #3  ; b1+c1+4 >> 3  1, r3 ^1                >>3
-    strh    r3, [r1, #2]    ; out[1] = b1+c1    1
-    add r10, r10, r8    ; d1 = temp1 + temp2    1                                           d1
-    add r3, r7, r10 ; a1+d1 1               a+d
-    add r3, r3, #4  ; a1+d1+4   1               +4
-    sub r7, r7, r10 ; a1-d1 1                               a-d
-    add r7, r7, #4  ; a1-d1+4   1                               +4
-    mov r3, r3, asr #3  ; a1+d1+4 >> 3  1, r3 ^1                >>3
-    mov r7, r7, asr #3  ; a1-d1+4 >> 3  1, r7 ^1                                >>3
-    strh    r7, [r1, #6]    ; out[3] = a1-d1    1
-    sub r0, r0, r9  ; b1-c1 1   b-c
-    add r0, r0, #4  ; b1-c1+4   1   +4
-    subs    r6, r6, #1  ; i--   1                           --
-    mov r0, r0, asr #3  ; b1-c1+4 >> 3  1, r0 ^1    >>3
-    strh    r0, [r1, #4]    ; out[2] = b1-c1    1
-    strh    r3, [r1], r2    ; out[0] = a1+d1    1
-;   add r1, r1, r2  ; out += pitch  1       ++
-    bne loop2   ; if i>0, continue
-returnfull          ;
-    ldmia   sp!, {r4 - r11, pc} ; replace vars, return                      restore
-    ENDP
-
-;********************************************************************************
-;********************************************************************************
-;********************************************************************************
-
-;********************************************************************************
-;*  void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
-;*      r0  INT16 * input
-;*      r1  INT16 * output
-;*      r2  INT32 pitch
-;*  bench:
-;********************************************************************************
-
-|vp8_short_idct4x4llm_v6_scott| PROC         ;   cycles  in  out pit
-;   mov r0, #0  ;
-;   ldr r0, [r0]    ;
-    stmdb   sp!, {r4 - r11, lr} ; backup registers  1                   backup
-            ;
-    mov r3, #0x00004E00 ;                   cos
-    orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
-    mov r4, #0x00008A00 ;                       sin
-    orr r4, r4, #0x0000008C ; sinpi8sqrt2
-            ;
-    mov r5, #0x2    ; i                         i
-            ;
-short_idct4x4llm_v6_scott_loop1          ;
-    ldr r10, [r0, #(4*2)]   ; i5 | i4                                               5,4
-    ldr r11, [r0, #(12*2)]  ; i13 | i12                                                 13,12
-            ;
-    smulwb  r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16)                             lt1
-    smulwb  r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16)                                  lt2
-            ;
-    smulwb  r12, r3, r10    ; ((ip[4] * cospi8sqrt2minus1) >> 16)                                                      l2t2
-    smulwb  r14, r4, r11    ; ((ip[12] * sinpi8sqrt2) >> 16)                                                                l2t1
-            ;
-    add r6, r6, r7  ; partial c1                                lt1-lt2
-    add r12, r12, r14   ; partial d1                                                        l2t2+l2t1
-            ;
-    smulwt  r14, r4, r10    ; ((ip[5] * sinpi8sqrt2) >> 16)                                                             ht1
-    smulwt  r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16)                                  ht2
-            ;
-    smulwt  r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16)                                       h2t1
-    smulwt  r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16)                                            h2t2
-            ;
-    add r7, r14, r7 ; partial c1_2                                  ht1+ht2
-    sub r8, r8, r9  ; partial d1_2                                      h2t1-h2t2
-            ;
-    pkhbt   r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1                               pack
-    pkhbt   r12, r12, r8, lsl #16   ; partial d1_2 | partial d1_1                                                       pack
-            ;
-    usub16  r6, r6, r10 ; c1_2 | c1_1                               c
-    uadd16  r12, r12, r11   ; d1_2 | d1_1                                                       d
-            ;
-    ldr r10, [r0, #0]   ; i1 | i0                                               1,0
-    ldr r11, [r0, #(8*2)]   ; i9 | i8                                                   9,8
-            ;
-;;;;;;  add r0, r0, #0x4    ;       +4
-;;;;;;  add r1, r1, #0x4    ;           +4
-            ;
-    uadd16  r8, r10, r11    ; i1 + i9 | i0 + i8 aka a1                                      a
-    usub16  r9, r10, r11    ; i1 - i9 | i0 - i8 aka b1                                          b
-            ;
-    uadd16  r7, r8, r12 ; a1 + d1 pair                                  a+d
-    usub16  r14, r8, r12    ; a1 - d1 pair                                                              a-d
-            ;
-    str r7, [r1]    ; op[0] = a1 + d1
-    str r14, [r1, r2]   ; op[pitch*3] = a1 - d1
-            ;
-    add r0, r0, #0x4    ; op[pitch] = b1 + c1       ++
-    add r1, r1, #0x4    ; op[pitch*2] = b1 - c1         ++
-            ;
-    subs    r5, r5, #0x1    ;                           --
-    bne short_idct4x4llm_v6_scott_loop1  ;
-            ;
-    sub r1, r1, #16 ; reset output ptr
-    mov r5, #0x4    ;
-    mov r0, r1  ; input = output
-            ;
-short_idct4x4llm_v6_scott_loop2          ;
-            ;
-    subs    r5, r5, #0x1    ;
-    bne short_idct4x4llm_v6_scott_loop2  ;
-            ;
-    ldmia   sp!, {r4 - r11, pc} ;
-    ENDP        ;
-            ;
-;********************************************************************************
-;********************************************************************************
-;********************************************************************************
-
-;********************************************************************************
-;*  void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
-;*      r0  INT16 * input
-;*      r1  INT16 * output
-;*      r2  INT32 pitch
-;*  bench:
-;********************************************************************************
-
-|vp8_short_idct4x4llm_v6_dual| PROC          ;   cycles  in  out pit
-            ;
-    stmdb   sp!, {r4-r11, lr}   ; backup registers  1                   backup
-    mov r3, #0x00004E00 ;                   cos
-    orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
-    mov r4, #0x00008A00 ;                       sin
-    orr r4, r4, #0x0000008C ; sinpi8sqrt2
-    mov r5, #0x2    ; i=2                           i
-loop1_dual
-    ldr r6, [r0, #(4*2)]    ; i5 | i4                               5|4
-    ldr r12, [r0, #(12*2)]  ; i13 | i12                                                     13|12
-    ldr r14, [r0, #(8*2)]   ; i9 | i8                                                               9|8
-
-    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16                                         5c
-    smulwb  r7, r3, r6  ; (ip[4] * cospi8sqrt2minus1) >> 16                                 4c
-    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16                                               5s
-    smulwb  r8, r4, r6  ; (ip[4] * sinpi8sqrt2) >> 16                                       4s
-    pkhbt   r7, r7, r9, lsl #16 ; 5c | 4c
-    smulwt  r11, r3, r12    ; (ip[13] * cospi8sqrt2minus1) >> 16                                                    13c
-    pkhbt   r8, r8, r10, lsl #16    ; 5s | 4s
-    uadd16  r6, r6, r7  ; 5c+5 | 4c+4
-    smulwt  r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16                                  13s
-    smulwb  r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16                                            12c
-    smulwb  r10, r4, r12    ; (ip[12] * sinpi8sqrt2) >> 16                                              12s
-    subs    r5, r5, #0x1    ; i--                           --
-    pkhbt   r9, r9, r11, lsl #16    ; 13c | 12c
-    ldr r11, [r0], #0x4 ; i1 | i0       ++                                          1|0
-    pkhbt   r10, r10, r7, lsl #16   ; 13s | 12s
-    uadd16  r7, r12, r9 ; 13c+13 | 12c+12
-    usub16  r7, r8, r7  ; c                                 c
-    uadd16  r6, r6, r10 ; d                             d
-    uadd16  r10, r11, r14   ; a                                             a
-    usub16  r8, r11, r14    ; b                                     b
-    uadd16  r9, r10, r6 ; a+d                                           a+d
-    usub16  r10, r10, r6    ; a-d                                               a-d
-    uadd16  r6, r8, r7  ; b+c                               b+c
-    usub16  r7, r8, r7  ; b-c                                   b-c
-    str r6, [r1, r2]    ; o5 | o4
-    add r6, r2, r2  ; pitch * 2                             p2
-    str r7, [r1, r6]    ; o9 | o8
-    add r6,  r6, r2 ; pitch * 3                             p3
-    str r10, [r1, r6]   ; o13 | o12
-    str r9, [r1], #0x4  ; o1 | o0           ++
-    bne loop1_dual  ;
-    mov r5, #0x2    ; i=2                           i
-    sub r0, r1, #8  ; reset input/output        i/o
-loop2_dual
-    ldr r6, [r0, r2]    ; i5 | i4                               5|4
-    ldr r1, [r0]    ; i1 | i0           1|0
-    ldr r12, [r0, #0x4] ; i3 | i2                                                       3|2
-    add r14, r2, #0x4   ; pitch + 2                                                             p+2
-    ldr r14, [r0, r14]  ; i7 | i6                                                               7|6
-    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16                                         5c
-    smulwt  r7, r3, r1  ; (ip[1] * cospi8sqrt2minus1) >> 16                                 1c
-    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16                                               5s
-    smulwt  r8, r4, r1  ; (ip[1] * sinpi8sqrt2) >> 16                                       1s
-    pkhbt   r11, r6, r1, lsl #16    ; i0 | i4                                                   0|4
-    pkhbt   r7, r9, r7, lsl #16 ; 1c | 5c
-    pkhbt   r8, r10, r8, lsl #16    ; 1s | 5s = temp1 (c)                                       tc1
-    pkhtb   r1, r1, r6, asr #16 ; i1 | i5           1|5
-    uadd16  r1, r7, r1  ; 1c+1 | 5c+5 = temp2 (d)           td2
-    pkhbt   r9, r14, r12, lsl #16   ; i2 | i6                                           2|6
-    uadd16  r10, r11, r9    ; a                                             a
-    usub16  r9, r11, r9 ; b                                         b
-    pkhtb   r6, r12, r14, asr #16   ; i3 | i7                               3|7
-    subs    r5, r5, #0x1    ; i--                           --
-    smulwt  r7, r3, r6  ; (ip[3] * cospi8sqrt2minus1) >> 16                                 3c
-    smulwt  r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16                                                   3s
-    smulwb  r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16                                                     7c
-    smulwb  r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16                                                               7s
-
-    pkhbt   r7, r12, r7, lsl #16    ; 3c | 7c
-    pkhbt   r11, r14, r11, lsl #16  ; 3s | 7s = temp1 (d)                                                   td1
-    uadd16  r6, r7, r6  ; 3c+3 | 7c+7 = temp2  (c)                              tc2
-    usub16  r12, r8, r6 ; c (o1 | o5)                                                       c
-    uadd16  r6, r11, r1 ; d (o3 | o7)                               d
-    uadd16  r7, r10, r6 ; a+d                                   a+d
-    mov r8, #0x4    ; set up 4's                                        4
-    orr r8, r8, #0x40000    ;                                       4|4
-    usub16  r6, r10, r6 ; a-d                               a-d
-    uadd16  r6, r6, r8  ; a-d+4                             3|7
-    uadd16  r7, r7, r8  ; a+d+4                                 0|4
-    uadd16  r10, r9, r12    ; b+c                                               b+c
-    usub16  r1, r9, r12 ; b-c           b-c
-    uadd16  r10, r10, r8    ; b+c+4                                             1|5
-    uadd16  r1, r1, r8  ; b-c+4         2|6
-    mov r8, r10, asr #19    ; o1 >> 3
-    strh    r8, [r0, #2]    ; o1
-    mov r8, r1, asr #19 ; o2 >> 3
-    strh    r8, [r0, #4]    ; o2
-    mov r8, r6, asr #19 ; o3 >> 3
-    strh    r8, [r0, #6]    ; o3
-    mov r8, r7, asr #19 ; o0 >> 3
-    strh    r8, [r0], r2    ; o0        +p
-    sxth    r10, r10    ;
-    mov r8, r10, asr #3 ; o5 >> 3
-    strh    r8, [r0, #2]    ; o5
-    sxth    r1, r1  ;
-    mov r8, r1, asr #3  ; o6 >> 3
-    strh    r8, [r0, #4]    ; o6
-    sxth    r6, r6  ;
-    mov r8, r6, asr #3  ; o7 >> 3
-    strh    r8, [r0, #6]    ; o7
-    sxth    r7, r7  ;
-    mov r8, r7, asr #3  ; o4 >> 3
-    strh    r8, [r0], r2    ; o4        +p
-;;;;;   subs    r5, r5, #0x1    ; i--                           --
-    bne loop2_dual  ;
-            ;
-    ldmia   sp!, {r4 - r11, pc} ; replace vars, return                      restore
-    ENDP
-
-    END
--- a/vp8/common/arm/armv6/iwalsh_v6.asm
+++ /dev/null
@@ -1,152 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT |vp8_short_inv_walsh4x4_v6|
-    EXPORT |vp8_short_inv_walsh4x4_1_v6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
-|vp8_short_inv_walsh4x4_v6| PROC
-
-    stmdb       sp!, {r4 - r11, lr}
-
-    ldr         r2, [r0], #4         ; [1  |  0]
-    ldr         r3, [r0], #4         ; [3  |  2]
-    ldr         r4, [r0], #4         ; [5  |  4]
-    ldr         r5, [r0], #4         ; [7  |  6]
-    ldr         r6, [r0], #4         ; [9  |  8]
-    ldr         r7, [r0], #4         ; [11 | 10]
-    ldr         r8, [r0], #4         ; [13 | 12]
-    ldr         r9, [r0]             ; [15 | 14]
-
-    qadd16      r10, r2, r8          ; a1 [1+13  |  0+12]
-    qadd16      r11, r4, r6          ; b1 [5+9   |  4+8]
-    qsub16      r12, r4, r6          ; c1 [5-9   |  4-8]
-    qsub16      lr, r2, r8           ; d1 [1-13  |  0-12]
-
-    qadd16      r2, r10, r11         ; a1 + b1 [1  |  0]
-    qadd16      r4, r12, lr          ; c1 + d1 [5  |  4]
-    qsub16      r6, r10, r11         ; a1 - b1 [9  |  8]
-    qsub16      r8, lr, r12          ; d1 - c1 [13 | 12]
-
-    qadd16      r10, r3, r9          ; a1 [3+15  |  2+14]
-    qadd16      r11, r5, r7          ; b1 [7+11  |  6+10]
-    qsub16      r12, r5, r7          ; c1 [7-11  |  6-10]
-    qsub16      lr, r3, r9           ; d1 [3-15  |  2-14]
-
-    qadd16      r3, r10, r11         ; a1 + b1 [3  |  2]
-    qadd16      r5, r12, lr          ; c1 + d1 [7  |  6]
-    qsub16      r7, r10, r11         ; a1 - b1 [11 | 10]
-    qsub16      r9, lr, r12          ; d1 - c1 [15 | 14]
-
-    ; first transform complete
-
-    qsubaddx    r10, r2, r3          ; [c1|a1] [1-2   |   0+3]
-    qaddsubx    r11, r2, r3          ; [b1|d1] [1+2   |   0-3]
-    qsubaddx    r12, r4, r5          ; [c1|a1] [5-6   |   4+7]
-    qaddsubx    lr, r4, r5           ; [b1|d1] [5+6   |   4-7]
-
-    qaddsubx    r2, r10, r11         ; [b2|c2] [c1+d1 | a1-b1]
-    qaddsubx    r3, r11, r10         ; [a2|d2] [b1+a1 | d1-c1]
-    ldr         r10, c0x00030003
-    qaddsubx    r4, r12, lr          ; [b2|c2] [c1+d1 | a1-b1]
-    qaddsubx    r5, lr, r12          ; [a2|d2] [b1+a1 | d1-c1]
-
-    qadd16      r2, r2, r10          ; [b2+3|c2+3]
-    qadd16      r3, r3, r10          ; [a2+3|d2+3]
-    qadd16      r4, r4, r10          ; [b2+3|c2+3]
-    qadd16      r5, r5, r10          ; [a2+3|d2+3]
-
-    asr         r12, r2, #3          ; [1  |  x]
-    pkhtb       r12, r12, r3, asr #19; [1  |  0]
-    lsl         lr, r3, #16          ; [~3 |  x]
-    lsl         r2, r2, #16          ; [~2 |  x]
-    asr         lr, lr, #3           ; [3  |  x]
-    pkhtb       lr, lr, r2, asr #19  ; [3  |  2]
-
-    asr         r2, r4, #3           ; [5  |  x]
-    pkhtb       r2, r2, r5, asr #19  ; [5  |  4]
-    lsl         r3, r5, #16          ; [~7 |  x]
-    lsl         r4, r4, #16          ; [~6 |  x]
-    asr         r3, r3, #3           ; [7  |  x]
-    pkhtb       r3, r3, r4, asr #19  ; [7  |  6]
-
-    str         r12, [r1], #4
-    str         lr, [r1], #4
-    str         r2, [r1], #4
-    str         r3, [r1], #4
-
-    qsubaddx    r2, r6, r7           ; [c1|a1] [9-10  |  8+11]
-    qaddsubx    r3, r6, r7           ; [b1|d1] [9+10  |  8-11]
-    qsubaddx    r4, r8, r9           ; [c1|a1] [13-14 | 12+15]
-    qaddsubx    r5, r8, r9           ; [b1|d1] [13+14 | 12-15]
-
-    qaddsubx    r6, r2, r3           ; [b2|c2] [c1+d1 | a1-b1]
-    qaddsubx    r7, r3, r2           ; [a2|d2] [b1+a1 | d1-c1]
-    qaddsubx    r8, r4, r5           ; [b2|c2] [c1+d1 | a1-b1]
-    qaddsubx    r9, r5, r4           ; [a2|d2] [b1+a1 | d1-c1]
-
-    qadd16      r6, r6, r10          ; [b2+3|c2+3]
-    qadd16      r7, r7, r10          ; [a2+3|d2+3]
-    qadd16      r8, r8, r10          ; [b2+3|c2+3]
-    qadd16      r9, r9, r10          ; [a2+3|d2+3]
-
-    asr         r2, r6, #3           ; [9  |  x]
-    pkhtb       r2, r2, r7, asr #19  ; [9  |  8]
-    lsl         r3, r7, #16          ; [~11|  x]
-    lsl         r4, r6, #16          ; [~10|  x]
-    asr         r3, r3, #3           ; [11 |  x]
-    pkhtb       r3, r3, r4, asr #19  ; [11 | 10]
-
-    asr         r4, r8, #3           ; [13 |  x]
-    pkhtb       r4, r4, r9, asr #19  ; [13 | 12]
-    lsl         r5, r9, #16          ; [~15|  x]
-    lsl         r6, r8, #16          ; [~14|  x]
-    asr         r5, r5, #3           ; [15 |  x]
-    pkhtb       r5, r5, r6, asr #19  ; [15 | 14]
-
-    str         r2, [r1], #4
-    str         r3, [r1], #4
-    str         r4, [r1], #4
-    str         r5, [r1]
-
-    ldmia       sp!, {r4 - r11, pc}
-    ENDP        ; |vp8_short_inv_walsh4x4_v6|
-
-
-;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
-|vp8_short_inv_walsh4x4_1_v6| PROC
-
-    ldrsh       r2, [r0]             ; [0]
-    add         r2, r2, #3           ; [0] + 3
-    asr         r2, r2, #3           ; a1 ([0]+3) >> 3
-    lsl         r2, r2, #16          ; [a1 |  x]
-    orr         r2, r2, r2, lsr #16  ; [a1 | a1]
-
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1], #4
-    str         r2, [r1]
-
-    bx          lr
-    ENDP        ; |vp8_short_inv_walsh4x4_1_v6|
-
-; Constant Pool
-c0x00030003 DCD 0x00030003
-    END
--- a/vp8/common/arm/armv6/loopfilter_v6.asm
+++ /dev/null
@@ -1,1282 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp9_loop_filter_horizontal_edge_armv6|
-    EXPORT |vp9_mbloop_filter_horizontal_edge_armv6|
-    EXPORT |vp9_loop_filter_vertical_edge_armv6|
-    EXPORT |vp9_mbloop_filter_vertical_edge_armv6|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-    MACRO
-    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
-    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
-    ; a0: 03 02 01 00
-    ; a1: 13 12 11 10
-    ; a2: 23 22 21 20
-    ; a3: 33 32 31 30
-    ;     b3 b2 b1 b0
-
-    uxtb16      $b1, $a1                    ; xx 12 xx 10
-    uxtb16      $b0, $a0                    ; xx 02 xx 00
-    uxtb16      $b3, $a3                    ; xx 32 xx 30
-    uxtb16      $b2, $a2                    ; xx 22 xx 20
-    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
-    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20
-
-    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
-    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
-    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
-    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
-    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
-    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21
-
-    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
-    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3
-
-    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
-    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
-    MEND
-
-
-src         RN  r0
-pstep       RN  r1
-count       RN  r5
-
-;r0     unsigned char *src_ptr,
-;r1     int src_pixel_step,
-;r2     const char *blimit,
-;r3     const char *limit,
-;stack  const char *thresh,
-;stack  int  count
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp9_loop_filter_horizontal_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-    stmdb       sp!, {r4 - r11, lr}
-
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
-    ldr         count, [sp, #40]            ; count for 8-in-parallel
-    ldr         r6, [sp, #36]               ; load thresh address
-    sub         sp, sp, #16                 ; create temp buffer
-
-    ldr         r9, [src], pstep            ; p3
-    ldrb        r4, [r2]                    ; blimit
-    ldr         r10, [src], pstep           ; p2
-    ldrb        r2, [r3]                    ; limit
-    ldr         r11, [src], pstep           ; p1
-    orr         r4, r4, r4, lsl #8
-    ldrb        r3, [r6]                    ; thresh
-    orr         r2, r2, r2, lsl #8
-    mov         count, count, lsl #1        ; 4-in-parallel
-    orr         r4, r4, r4, lsl #16
-    orr         r3, r3, r3, lsl #8
-    orr         r2, r2, r2, lsl #16
-    orr         r3, r3, r3, lsl #16
-
-|Hnext8|
-    ; vp9_filter_mask() function
-    ; calculate breakout conditions
-    ldr         r12, [src], pstep           ; p0
-
-    uqsub8      r6, r9, r10                 ; p3 - p2
-    uqsub8      r7, r10, r9                 ; p2 - p3
-    uqsub8      r8, r10, r11                ; p2 - p1
-    uqsub8      r10, r11, r10               ; p1 - p2
-
-    orr         r6, r6, r7                  ; abs (p3-p2)
-    orr         r8, r8, r10                 ; abs (p2-p1)
-    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp9_filter_mask
-    uqsub8      r8, r8, r2                  ; compare to limit
-    uqsub8      r6, r11, r12                ; p1 - p0
-    orr         lr, lr, r8
-    uqsub8      r7, r12, r11                ; p0 - p1
-    ldr         r9, [src], pstep            ; q0
-    ldr         r10, [src], pstep           ; q1
-    orr         r6, r6, r7                  ; abs (p1-p0)
-    uqsub8      r7, r6, r2                  ; compare to limit
-    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
-    orr         lr, lr, r7
-
-    uqsub8      r6, r11, r10                ; p1 - q1
-    uqsub8      r7, r10, r11                ; q1 - p1
-    uqsub8      r11, r12, r9                ; p0 - q0
-    uqsub8      r12, r9, r12                ; q0 - p0
-    orr         r6, r6, r7                  ; abs (p1-q1)
-    ldr         r7, c0x7F7F7F7F
-    orr         r12, r11, r12               ; abs (p0-q0)
-    ldr         r11, [src], pstep           ; q2
-    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
-    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
-    uqsub8      r7, r9, r10                 ; q0 - q1
-    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
-    uqsub8      r6, r10, r9                 ; q1 - q0
-    uqsub8      r12, r12, r4                ; compare to blimit
-    uqsub8      r9, r11, r10                ; q2 - q1
-
-    orr         lr, lr, r12
-
-    ldr         r12, [src], pstep           ; q3
-    uqsub8      r10, r10, r11               ; q1 - q2
-    orr         r6, r7, r6                  ; abs (q1-q0)
-    orr         r10, r9, r10                ; abs (q2-q1)
-    uqsub8      r7, r6, r2                  ; compare to limit
-    uqsub8      r10, r10, r2                ; compare to limit
-    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
-    orr         lr, lr, r7
-    orr         lr, lr, r10
-
-    uqsub8      r10, r12, r11               ; q3 - q2
-    uqsub8      r9, r11, r12                ; q2 - q3
-
-    mvn         r11, #0                     ; r11 == -1
-
-    orr         r10, r10, r9                ; abs (q3-q2)
-    uqsub8      r10, r10, r2                ; compare to limit
-
-    mov         r12, #0
-    orr         lr, lr, r10
-    sub         src, src, pstep, lsl #2
-
-    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
-    sel         lr, r11, r12                ; filter mask: lr
-
-    cmp         lr, #0
-    beq         hskip_filter                 ; skip filtering
-
-    sub         src, src, pstep, lsl #1     ; move src pointer down by 6 lines
-
-    ;vp8_hevmask() function
-    ;calculate high edge variance
-    orr         r10, r6, r8                 ; calculate vp8_hevmask
-
-    ldr         r7, [src], pstep            ; p1
-
-    usub8       r10, r12, r10               ; use usub8 instead of ssub8
-    sel         r6, r12, r11                ; obtain vp8_hevmask: r6
-
-    ;vp9_filter() function
-    ldr         r8, [src], pstep            ; p0
-    ldr         r12, c0x80808080
-    ldr         r9, [src], pstep            ; q0
-    ldr         r10, [src], pstep           ; q1
-
-    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
-    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
-    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
-    eor         r10, r10, r12               ; q1 offset to convert to a signed value
-
-    str         r9, [sp]                    ; store qs0 temporarily
-    str         r8, [sp, #4]                ; store ps0 temporarily
-    str         r10, [sp, #8]               ; store qs1 temporarily
-    str         r7, [sp, #12]               ; store ps1 temporarily
-
-    qsub8       r7, r7, r10                 ; vp9_signed_char_clamp(ps1-qs1)
-    qsub8       r8, r9, r8                  ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
-
-    and         r7, r7, r6                  ; vp9_filter (r7) &= hev
-
-    qadd8       r7, r7, r8
-    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8
-
-    qadd8       r7, r7, r8
-    ldr         r10, c0x04040404
-
-    qadd8       r7, r7, r8
-    and         r7, r7, lr                  ; vp9_filter &= mask;
-
-    ;modify code for vp8 -- Filter1 = vp9_filter (r7)
-    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3)
-    qadd8       r7 , r7 , r10               ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4)
-
-    mov         r9, #0
-    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
-    shadd8      r7 , r7 , r9                ; vp9_filter >>= 3
-    shadd8      r8 , r8 , r9
-    shadd8      r7 , r7 , r9
-    shadd8      lr , r8 , r9                ; lr: Filter2
-    shadd8      r7 , r7 , r9                ; r7: filter
-
-    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
-    ;sel        lr, r11, r9
-    ;usub8      r8, r10, r8
-    ;sel        r8, r11, r9
-    ;and        r8, r8, lr                  ; -1 for each element that equals 4
-
-    ;calculate output
-    ;qadd8      lr, r8, r7                  ; u = vp9_signed_char_clamp(s + vp9_filter)
-
-    ldr         r8, [sp]                    ; load qs0
-    ldr         r9, [sp, #4]                ; load ps0
-
-    ldr         r10, c0x01010101
-
-    qsub8       r8 ,r8, r7                  ; u = vp9_signed_char_clamp(qs0 - vp9_filter)
-    qadd8       r9, r9, lr                  ; u = vp9_signed_char_clamp(ps0 + Filter2)
-
-    ;end of modification for vp8
-
-    mov         lr, #0
-    sadd8       r7, r7 , r10                ; vp9_filter += 1
-    shadd8      r7, r7, lr                  ; vp9_filter >>= 1
-
-    ldr         r11, [sp, #12]              ; load ps1
-    ldr         r10, [sp, #8]               ; load qs1
-
-    bic         r7, r7, r6                  ; vp9_filter &= ~hev
-    sub         src, src, pstep, lsl #2
-
-    qadd8       r11, r11, r7                ; u = vp9_signed_char_clamp(ps1 + vp9_filter)
-    qsub8       r10, r10,r7                 ; u = vp9_signed_char_clamp(qs1 - vp9_filter)
-
-    eor         r11, r11, r12               ; *op1 = u^0x80
-    str         r11, [src], pstep           ; store op1
-    eor         r9, r9, r12                 ; *op0 = u^0x80
-    str         r9, [src], pstep            ; store op0 result
-    eor         r8, r8, r12                 ; *oq0 = u^0x80
-    str         r8, [src], pstep            ; store oq0 result
-    eor         r10, r10, r12               ; *oq1 = u^0x80
-    str         r10, [src], pstep           ; store oq1
-
-    sub         src, src, pstep, lsl #1
-
-|hskip_filter|
-    add         src, src, #4
-    sub         src, src, pstep, lsl #2
-
-    subs        count, count, #1
-
-    ldrne       r9, [src], pstep            ; p3
-    ldrne       r10, [src], pstep           ; p2
-    ldrne       r11, [src], pstep           ; p1
-
-    bne         Hnext8
-
-    add         sp, sp, #16
-    ldmia       sp!, {r4 - r11, pc}
-    ENDP        ; |vp9_loop_filter_horizontal_edge_armv6|
-
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp9_mbloop_filter_horizontal_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-    stmdb       sp!, {r4 - r11, lr}
-
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
-    ldr         count, [sp, #40]            ; count for 8-in-parallel
-    ldr         r6, [sp, #36]               ; load thresh address
-    sub         sp, sp, #16                 ; create temp buffer
-
-    ldr         r9, [src], pstep            ; p3
-    ldrb        r4, [r2]                    ; blimit
-    ldr         r10, [src], pstep           ; p2
-    ldrb        r2, [r3]                    ; limit
-    ldr         r11, [src], pstep           ; p1
-    orr         r4, r4, r4, lsl #8
-    ldrb        r3, [r6]                    ; thresh
-    orr         r2, r2, r2, lsl #8
-    mov         count, count, lsl #1        ; 4-in-parallel
-    orr         r4, r4, r4, lsl #16
-    orr         r3, r3, r3, lsl #8
-    orr         r2, r2, r2, lsl #16
-    orr         r3, r3, r3, lsl #16
-
-|MBHnext8|
-
-    ; vp9_filter_mask() function
-    ; calculate breakout conditions
-    ldr         r12, [src], pstep           ; p0
-
-    uqsub8      r6, r9, r10                 ; p3 - p2
-    uqsub8      r7, r10, r9                 ; p2 - p3
-    uqsub8      r8, r10, r11                ; p2 - p1
-    uqsub8      r10, r11, r10               ; p1 - p2
-
-    orr         r6, r6, r7                  ; abs (p3-p2)
-    orr         r8, r8, r10                 ; abs (p2-p1)
-    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp9_filter_mask
-    uqsub8      r8, r8, r2                  ; compare to limit
-
-    uqsub8      r6, r11, r12                ; p1 - p0
-    orr         lr, lr, r8
-    uqsub8      r7, r12, r11                ; p0 - p1
-    ldr         r9, [src], pstep            ; q0
-    ldr         r10, [src], pstep           ; q1
-    orr         r6, r6, r7                  ; abs (p1-p0)
-    uqsub8      r7, r6, r2                  ; compare to limit
-    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
-    orr         lr, lr, r7
-
-    uqsub8      r6, r11, r10                ; p1 - q1
-    uqsub8      r7, r10, r11                ; q1 - p1
-    uqsub8      r11, r12, r9                ; p0 - q0
-    uqsub8      r12, r9, r12                ; q0 - p0
-    orr         r6, r6, r7                  ; abs (p1-q1)
-    ldr         r7, c0x7F7F7F7F
-    orr         r12, r11, r12               ; abs (p0-q0)
-    ldr         r11, [src], pstep           ; q2
-    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
-    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
-    uqsub8      r7, r9, r10                 ; q0 - q1
-    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
-    uqsub8      r6, r10, r9                 ; q1 - q0
-    uqsub8      r12, r12, r4                ; compare to blimit
-    uqsub8      r9, r11, r10                ; q2 - q1
-
-    orr         lr, lr, r12
-
-    ldr         r12, [src], pstep           ; q3
-
-    uqsub8      r10, r10, r11               ; q1 - q2
-    orr         r6, r7, r6                  ; abs (q1-q0)
-    orr         r10, r9, r10                ; abs (q2-q1)
-    uqsub8      r7, r6, r2                  ; compare to limit
-    uqsub8      r10, r10, r2                ; compare to limit
-    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
-    orr         lr, lr, r7
-    orr         lr, lr, r10
-
-    uqsub8      r10, r12, r11               ; q3 - q2
-    uqsub8      r9, r11, r12                ; q2 - q3
-
-    mvn         r11, #0                     ; r11 == -1
-
-    orr         r10, r10, r9                ; abs (q3-q2)
-    uqsub8      r10, r10, r2                ; compare to limit
-
-    mov         r12, #0
-
-    orr         lr, lr, r10
-
-    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
-    sel         lr, r11, r12                ; filter mask: lr
-
-    cmp         lr, #0
-    beq         mbhskip_filter               ; skip filtering
-
-    ;vp8_hevmask() function
-    ;calculate high edge variance
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 6 lines
-    sub         src, src, pstep, lsl #1
-
-    orr         r10, r6, r8
-    ldr         r7, [src], pstep            ; p1
-
-    usub8       r10, r12, r10
-    sel         r6, r12, r11                ; hev mask: r6
-
-    ;vp8_mbfilter() function
-    ;p2, q2 are only needed at the end. Don't need to load them in now.
-    ldr         r8, [src], pstep            ; p0
-    ldr         r12, c0x80808080
-    ldr         r9, [src], pstep            ; q0
-    ldr         r10, [src]                  ; q1
-
-    eor         r7, r7, r12                 ; ps1
-    eor         r8, r8, r12                 ; ps0
-    eor         r9, r9, r12                 ; qs0
-    eor         r10, r10, r12               ; qs1
-
-    qsub8       r12, r9, r8                 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
-    str         r7, [sp, #12]               ; store ps1 temporarily
-    qsub8       r7, r7, r10                 ; vp9_signed_char_clamp(ps1-qs1)
-    str         r10, [sp, #8]               ; store qs1 temporarily
-    qadd8       r7, r7, r12
-    str         r9, [sp]                    ; store qs0 temporarily
-    qadd8       r7, r7, r12
-    str         r8, [sp, #4]                ; store ps0 temporarily
-    qadd8       r7, r7, r12                 ; vp9_filter: r7
-
-    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
-    ldr         r9, c0x04040404
-
-    and         r7, r7, lr                  ; vp9_filter &= mask (lr is free)
-
-    mov         r12, r7                     ; Filter2: r12
-    and         r12, r12, r6                ; Filter2 &= hev
-
-    ;modify code for vp8
-    ;save bottom 3 bits so that we round one side +4 and the other +3
-    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4)
-    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3)
-
-    mov         r10, #0
-    shadd8      r8 , r8 , r10               ; Filter1 >>= 3
-    shadd8      r12 , r12 , r10             ; Filter2 >>= 3
-    shadd8      r8 , r8 , r10
-    shadd8      r12 , r12 , r10
-    shadd8      r8 , r8 , r10               ; r8: Filter1
-    shadd8      r12 , r12 , r10             ; r12: Filter2
-
-    ldr         r9, [sp]                    ; load qs0
-    ldr         r11, [sp, #4]               ; load ps0
-
-    qsub8       r9 , r9, r8                 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1)
-    qadd8       r11, r11, r12               ; ps0 = vp9_signed_char_clamp(ps0 + Filter2)
-
-    ;save bottom 3 bits so that we round one side +4 and the other +3
-    ;and            r8, r12, r10                ; s = Filter2 & 7 (s: r8)
-    ;qadd8      r12 , r12 , r9              ; Filter2 = vp9_signed_char_clamp(Filter2+4)
-    ;mov            r10, #0
-    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
-    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
-    ;sel            lr, r11, r10
-    ;shadd8     r12 , r12 , r10
-    ;usub8      r8, r9, r8
-    ;sel            r8, r11, r10
-    ;ldr            r9, [sp]                    ; load qs0
-    ;ldr            r11, [sp, #4]               ; load ps0
-    ;shadd8     r12 , r12 , r10
-    ;and            r8, r8, lr                  ; -1 for each element that equals 4
-    ;qadd8      r10, r8, r12                ; u = vp9_signed_char_clamp(s + Filter2)
-    ;qsub8      r9 , r9, r12                ; qs0 = vp9_signed_char_clamp(qs0 - Filter2)
-    ;qadd8      r11, r11, r10               ; ps0 = vp9_signed_char_clamp(ps0 + u)
-
-    ;end of modification for vp8
-
-    bic         r12, r7, r6                 ; vp9_filter &= ~hev    ( r6 is free)
-    ;mov        r12, r7
-
-    ;roughly 3/7th difference across boundary
-    mov         lr, #0x1b                   ; 27
-    mov         r7, #0x3f                   ; 63
-
-    sxtb16      r6, r12
-    sxtb16      r10, r12, ror #8
-    smlabb      r8, r6, lr, r7
-    smlatb      r6, r6, lr, r7
-    smlabb      r7, r10, lr, r7
-    smultb      r10, r10, lr
-    ssat        r8, #8, r8, asr #7
-    ssat        r6, #8, r6, asr #7
-    add         r10, r10, #63
-    ssat        r7, #8, r7, asr #7
-    ssat        r10, #8, r10, asr #7
-
-    ldr         lr, c0x80808080
-
-    pkhbt       r6, r8, r6, lsl #16
-    pkhbt       r10, r7, r10, lsl #16
-    uxtb16      r6, r6
-    uxtb16      r10, r10
-
-    sub         src, src, pstep
-
-    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7)
-
-    qsub8       r8, r9, r10                 ; s = vp9_signed_char_clamp(qs0 - u)
-    qadd8       r10, r11, r10               ; s = vp9_signed_char_clamp(ps0 + u)
-    eor         r8, r8, lr                  ; *oq0 = s^0x80
-    str         r8, [src]                   ; store *oq0
-    sub         src, src, pstep
-    eor         r10, r10, lr                ; *op0 = s^0x80
-    str         r10, [src]                  ; store *op0
-
-    ;roughly 2/7th difference across boundary
-    mov         lr, #0x12                   ; 18
-    mov         r7, #0x3f                   ; 63
-
-    sxtb16      r6, r12
-    sxtb16      r10, r12, ror #8
-    smlabb      r8, r6, lr, r7
-    smlatb      r6, r6, lr, r7
-    smlabb      r9, r10, lr, r7
-    smlatb      r10, r10, lr, r7
-    ssat        r8, #8, r8, asr #7
-    ssat        r6, #8, r6, asr #7
-    ssat        r9, #8, r9, asr #7
-    ssat        r10, #8, r10, asr #7
-
-    ldr         lr, c0x80808080
-
-    pkhbt       r6, r8, r6, lsl #16
-    pkhbt       r10, r9, r10, lsl #16
-
-    ldr         r9, [sp, #8]                ; load qs1
-    ldr         r11, [sp, #12]              ; load ps1
-
-    uxtb16      r6, r6
-    uxtb16      r10, r10
-
-    sub         src, src, pstep
-
-    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7)
-
-    qadd8       r11, r11, r10               ; s = vp9_signed_char_clamp(ps1 + u)
-    qsub8       r8, r9, r10                 ; s = vp9_signed_char_clamp(qs1 - u)
-    eor         r11, r11, lr                ; *op1 = s^0x80
-    str         r11, [src], pstep           ; store *op1
-    eor         r8, r8, lr                  ; *oq1 = s^0x80
-    add         src, src, pstep, lsl #1
-
-    mov         r7, #0x3f                   ; 63
-
-    str         r8, [src], pstep            ; store *oq1
-
-    ;roughly 1/7th difference across boundary
-    mov         lr, #0x9                    ; 9
-    ldr         r9, [src]                   ; load q2
-
-    sxtb16      r6, r12
-    sxtb16      r10, r12, ror #8
-    smlabb      r8, r6, lr, r7
-    smlatb      r6, r6, lr, r7
-    smlabb      r12, r10, lr, r7
-    smlatb      r10, r10, lr, r7
-    ssat        r8, #8, r8, asr #7
-    ssat        r6, #8, r6, asr #7
-    ssat        r12, #8, r12, asr #7
-    ssat        r10, #8, r10, asr #7
-
-    sub         src, src, pstep, lsl #2
-
-    pkhbt       r6, r8, r6, lsl #16
-    pkhbt       r10, r12, r10, lsl #16
-
-    sub         src, src, pstep
-    ldr         lr, c0x80808080
-
-    ldr         r11, [src]                  ; load p2
-
-    uxtb16      r6, r6
-    uxtb16      r10, r10
-
-    eor         r9, r9, lr
-    eor         r11, r11, lr
-
-    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7)
-
-    qadd8       r8, r11, r10                ; s = vp9_signed_char_clamp(ps2 + u)
-    qsub8       r10, r9, r10                ; s = vp9_signed_char_clamp(qs2 - u)
-    eor         r8, r8, lr                  ; *op2 = s^0x80
-    str         r8, [src], pstep, lsl #2    ; store *op2
-    add         src, src, pstep
-    eor         r10, r10, lr                ; *oq2 = s^0x80
-    str         r10, [src], pstep, lsl #1   ; store *oq2
-
-|mbhskip_filter|
-    add         src, src, #4
-    sub         src, src, pstep, lsl #3
-    subs        count, count, #1
-
-    ldrne       r9, [src], pstep            ; p3
-    ldrne       r10, [src], pstep           ; p2
-    ldrne       r11, [src], pstep           ; p1
-
-    bne         MBHnext8
-
-    add         sp, sp, #16
-    ldmia       sp!, {r4 - r11, pc}
-    ENDP        ; |vp8_mbloop_filter_horizontal_edge_armv6|
-
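For reference while reading the qadd8/shadd8 runs above: the filter value is rounded by adding 4 on one side of the edge and 3 on the other before an arithmetic shift right by 3, and the shift is performed as three shadd8 halving adds with zero so each byte lane shifts without borrowing from its neighbours. A scalar C sketch of one lane (clamp8 stands in for vp9_signed_char_clamp; names are illustrative, not project API):

#include <stdint.h>

static int8_t clamp8(int t) {
    return (int8_t)(t < -128 ? -128 : (t > 127 ? 127 : t));
}

/* One byte lane of the Filter1/Filter2 update applied to (ps0, qs0). */
static void filter_update(int8_t f, int8_t *ps0, int8_t *qs0) {
    int8_t Filter1 = clamp8(f + 4) >> 3;   /* one side rounds with +4 */
    int8_t Filter2 = clamp8(f + 3) >> 3;   /* the other with +3 */
    *qs0 = clamp8(*qs0 - Filter1);
    *ps0 = clamp8(*ps0 + Filter2);
}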
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp9_loop_filter_vertical_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-    stmdb       sp!, {r4 - r11, lr}
-
-    sub         src, src, #4                ; move src pointer down by 4
-    ldr         count, [sp, #40]            ; count for 8-in-parallel
-    ldr         r12, [sp, #36]              ; load thresh address
-    sub         sp, sp, #16                 ; create temp buffer
-
-    ldr         r6, [src], pstep            ; load source data
-    ldrb        r4, [r2]                    ; blimit
-    ldr         r7, [src], pstep
-    ldrb        r2, [r3]                    ; limit
-    ldr         r8, [src], pstep
-    orr         r4, r4, r4, lsl #8
-    ldrb        r3, [r12]                   ; thresh
-    orr         r2, r2, r2, lsl #8
-    ldr         lr, [src], pstep
-    mov         count, count, lsl #1        ; 4-in-parallel
-    orr         r4, r4, r4, lsl #16
-    orr         r3, r3, r3, lsl #8
-    orr         r2, r2, r2, lsl #16
-    orr         r3, r3, r3, lsl #16
-
-|Vnext8|
-
-    ; vp9_filter_mask() function
-    ; calculate breakout conditions
-    ; transpose the source data for 4-in-parallel operation
-    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
-
-    uqsub8      r7, r9, r10                 ; p3 - p2
-    uqsub8      r8, r10, r9                 ; p2 - p3
-    uqsub8      r9, r10, r11                ; p2 - p1
-    uqsub8      r10, r11, r10               ; p1 - p2
-    orr         r7, r7, r8                  ; abs (p3-p2)
-    orr         r10, r9, r10                ; abs (p2-p1)
-    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp9_filter_mask
-    uqsub8      r10, r10, r2                ; compare to limit
-
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
-
-    orr         lr, lr, r10
-
-    uqsub8      r6, r11, r12                ; p1 - p0
-    uqsub8      r7, r12, r11                ; p0 - p1
-    add         src, src, #4                ; move src pointer up by 4
-    orr         r6, r6, r7                  ; abs (p1-p0)
-    str         r11, [sp, #12]              ; save p1
-    uqsub8      r10, r6, r2                 ; compare to limit
-    uqsub8      r11, r6, r3                 ; compare to thresh
-    orr         lr, lr, r10
-
-    ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
-    ; transpose the source data for 4-in-parallel operation
-    ldr         r6, [src], pstep            ; load source data
-    str         r11, [sp]                   ; push r11 to stack
-    ldr         r7, [src], pstep
-    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
-    ldr         r8, [src], pstep
-    str         lr, [sp, #8]
-    ldr         lr, [src], pstep
-
-    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
-
-    ldr         lr, [sp, #8]                ; load back (f)limit accumulator
-
-    uqsub8      r6, r12, r11                ; q3 - q2
-    uqsub8      r7, r11, r12                ; q2 - q3
-    uqsub8      r12, r11, r10               ; q2 - q1
-    uqsub8      r11, r10, r11               ; q1 - q2
-    orr         r6, r6, r7                  ; abs (q3-q2)
-    orr         r7, r12, r11                ; abs (q2-q1)
-    uqsub8      r6, r6, r2                  ; compare to limit
-    uqsub8      r7, r7, r2                  ; compare to limit
-    ldr         r11, [sp, #4]               ; load back p0
-    ldr         r12, [sp, #12]              ; load back p1
-    orr         lr, lr, r6
-    orr         lr, lr, r7
-
-    uqsub8      r6, r11, r9                 ; p0 - q0
-    uqsub8      r7, r9, r11                 ; q0 - p0
-    uqsub8      r8, r12, r10                ; p1 - q1
-    uqsub8      r11, r10, r12               ; q1 - p1
-    orr         r6, r6, r7                  ; abs (p0-q0)
-    ldr         r7, c0x7F7F7F7F
-    orr         r8, r8, r11                 ; abs (p1-q1)
-    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
-    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
-    uqsub8      r11, r10, r9                ; q1 - q0
-    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
-    uqsub8      r12, r9, r10                ; q0 - q1
-    uqsub8      r6, r6, r4                  ; compare to flimit
-
-    orr         r9, r11, r12                ; abs (q1-q0)
-    uqsub8      r8, r9, r2                  ; compare to limit
-    uqsub8      r10, r9, r3                 ; compare to thresh
-    orr         lr, lr, r6
-    orr         lr, lr, r8
-
-    mvn         r11, #0                     ; r11 == -1
-    mov         r12, #0
-
-    usub8       lr, r12, lr
-    ldr         r9, [sp]                    ; load the compared result
-    sel         lr, r11, r12                ; filter mask: lr
-
-    cmp         lr, #0
-    beq         vskip_filter                 ; skip filtering
-
-    ;vp8_hevmask() function
-    ;calculate high edge variance
-
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
-
-    orr         r9, r9, r10
-
-    ldrh        r7, [src, #-2]
-    ldrh        r8, [src], pstep
-
-    usub8       r9, r12, r9
-    sel         r6, r12, r11                ; hev mask: r6
-
-    ;vp9_filter() function
-    ; load source data to r6, r11, r12, lr
-    ldrh        r9, [src, #-2]
-    ldrh        r10, [src], pstep
-
-    pkhbt       r12, r7, r8, lsl #16
-
-    ldrh        r7, [src, #-2]
-    ldrh        r8, [src], pstep
-
-    pkhbt       r11, r9, r10, lsl #16
-
-    ldrh        r9, [src, #-2]
-    ldrh        r10, [src], pstep
-
-    ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
-    str         r6, [sp]
-    str         lr, [sp, #4]
-
-    pkhbt       r6, r7, r8, lsl #16
-    pkhbt       lr, r9, r10, lsl #16
-
-    ;transpose r12, r11, r6, lr to r7, r8, r9, r10
-    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
-
-    ;load back hev_mask r6 and filter_mask lr
-    ldr         r12, c0x80808080
-    ldr         r6, [sp]
-    ldr         lr, [sp, #4]
-
-    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
-    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
-    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
-    eor         r10, r10, r12               ; q1 offset to convert to a signed value
-
-    str         r9, [sp]                    ; store qs0 temporarily
-    str         r8, [sp, #4]                ; store ps0 temporarily
-    str         r10, [sp, #8]               ; store qs1 temporarily
-    str         r7, [sp, #12]               ; store ps1 temporarily
-
-    qsub8       r7, r7, r10                 ; vp9_signed_char_clamp(ps1-qs1)
-    qsub8       r8, r9, r8                  ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
-
-    and         r7, r7, r6                  ;  vp9_filter (r7) &= hev (r7 : filter)
-
-    qadd8       r7, r7, r8
-    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8
-
-    qadd8       r7, r7, r8
-    ldr         r10, c0x04040404
-
-    qadd8       r7, r7, r8
-    ;mvn         r11, #0                     ; r11 == -1
-
-    and         r7, r7, lr                  ; vp9_filter &= mask
-
-    ;modify code for vp8 -- Filter1 = vp9_filter (r7)
-    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3)
-    qadd8       r7 , r7 , r10               ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4)
-
-    mov         r9, #0
-    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
-    shadd8      r7 , r7 , r9                ; vp9_filter >>= 3
-    shadd8      r8 , r8 , r9
-    shadd8      r7 , r7 , r9
-    shadd8      lr , r8 , r9                ; lr: filter2
-    shadd8      r7 , r7 , r9                ; r7: filter
-
-    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
-    ;sel            lr, r11, r9
-    ;usub8      r8, r10, r8
-    ;sel            r8, r11, r9
-    ;and            r8, r8, lr                  ; -1 for each element that equals 4 -- r8: s
-
-    ;calculate output
-    ;qadd8      lr, r8, r7                  ; u = vp9_signed_char_clamp(s + vp9_filter)
-
-    ldr         r8, [sp]                    ; load qs0
-    ldr         r9, [sp, #4]                ; load ps0
-
-    ldr         r10, c0x01010101
-
-    qsub8       r8, r8, r7                  ; u = vp9_signed_char_clamp(qs0 - vp9_filter)
-    qadd8       r9, r9, lr                  ; u = vp9_signed_char_clamp(ps0 + Filter2)
-    ;end of modification for vp8
-
-    eor         r8, r8, r12
-    eor         r9, r9, r12
-
-    mov         lr, #0
-
-    sadd8       r7, r7, r10
-    shadd8      r7, r7, lr
-
-    ldr         r10, [sp, #8]               ; load qs1
-    ldr         r11, [sp, #12]              ; load ps1
-
-    bic         r7, r7, r6                  ; r7: vp9_filter
-
-    qsub8       r10 , r10, r7               ; u = vp9_signed_char_clamp(qs1 - vp9_filter)
-    qadd8       r11, r11, r7                ; u = vp9_signed_char_clamp(ps1 + vp9_filter)
-    eor         r10, r10, r12
-    eor         r11, r11, r12
-
-    sub         src, src, pstep, lsl #2
-
-    ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
-    ;output is b0, b1, b2, b3
-    ;b0: 03 02 01 00
-    ;b1: 13 12 11 10
-    ;b2: 23 22 21 20
-    ;b3: 33 32 31 30
-    ;    p1 p0 q0 q1
-    ;   (a3 a2 a1 a0)
-    TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr
-
-    strh        r6, [src, #-2]              ; store the result
-    mov         r6, r6, lsr #16
-    strh        r6, [src], pstep
-
-    strh        r7, [src, #-2]
-    mov         r7, r7, lsr #16
-    strh        r7, [src], pstep
-
-    strh        r12, [src, #-2]
-    mov         r12, r12, lsr #16
-    strh        r12, [src], pstep
-
-    strh        lr, [src, #-2]
-    mov         lr, lr, lsr #16
-    strh        lr, [src], pstep
-
-|vskip_filter|
-    sub         src, src, #4
-    subs        count, count, #1
-
-    ldrne       r6, [src], pstep            ; load source data
-    ldrne       r7, [src], pstep
-    ldrne       r8, [src], pstep
-    ldrne       lr, [src], pstep
-
-    bne         Vnext8
-
-    add         sp, sp, #16
-
-    ldmia       sp!, {r4 - r11, pc}
-    ENDP        ; |vp9_loop_filter_vertical_edge_armv6|
-
-
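The uqsub8/orr chains above accumulate the breakout test spelled out in the comments. Per pixel column, the mask and high-edge-variance checks reduce to the scalar logic below (a sketch with illustrative names; the assembly evaluates four columns per word):

#include <stdlib.h>

/* All neighbouring differences must stay within `limit`, and the
 * combined edge difference within `blimit`, for filtering to run. */
static int filter_mask(int limit, int blimit,
                       int p3, int p2, int p1, int p0,
                       int q0, int q1, int q2, int q3) {
    return abs(p3 - p2) <= limit && abs(p2 - p1) <= limit &&
           abs(p1 - p0) <= limit && abs(q1 - q0) <= limit &&
           abs(q2 - q1) <= limit && abs(q3 - q2) <= limit &&
           abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
}

/* High edge variance: chooses between the sharp two-pixel update and
 * the wider adjustment in the filters above. */
static int hev_mask(int thresh, int p1, int p0, int q0, int q1) {
    return abs(p1 - p0) > thresh || abs(q1 - q0) > thresh;
}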
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp8_mbloop_filter_vertical_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-    stmdb       sp!, {r4 - r11, lr}
-
-    sub         src, src, #4                ; move src pointer down by 4
-    ldr         count, [sp, #40]            ; count for 8-in-parallel
-    ldr         r12, [sp, #36]              ; load thresh address
-    pld         [src, #23]                  ; preload for next block
-    sub         sp, sp, #16                 ; create temp buffer
-
-    ldr         r6, [src], pstep            ; load source data
-    ldrb        r4, [r2]                    ; blimit
-    pld         [src, #23]
-    ldr         r7, [src], pstep
-    ldrb        r2, [r3]                    ; limit
-    pld         [src, #23]
-    ldr         r8, [src], pstep
-    orr         r4, r4, r4, lsl #8
-    ldrb        r3, [r12]                   ; thresh
-    orr         r2, r2, r2, lsl #8
-    pld         [src, #23]
-    ldr         lr, [src], pstep
-    mov         count, count, lsl #1        ; 4-in-parallel
-    orr         r4, r4, r4, lsl #16
-    orr         r3, r3, r3, lsl #8
-    orr         r2, r2, r2, lsl #16
-    orr         r3, r3, r3, lsl #16
-
-|MBVnext8|
-    ; vp9_filter_mask() function
-    ; calculate breakout conditions
-    ; transpose the source data for 4-in-parallel operation
-    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
-
-    uqsub8      r7, r9, r10                 ; p3 - p2
-    uqsub8      r8, r10, r9                 ; p2 - p3
-    uqsub8      r9, r10, r11                ; p2 - p1
-    uqsub8      r10, r11, r10               ; p1 - p2
-    orr         r7, r7, r8                  ; abs (p3-p2)
-    orr         r10, r9, r10                ; abs (p2-p1)
-    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp9_filter_mask
-    uqsub8      r10, r10, r2                ; compare to limit
-
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
-
-    orr         lr, lr, r10
-
-    uqsub8      r6, r11, r12                ; p1 - p0
-    uqsub8      r7, r12, r11                ; p0 - p1
-    add         src, src, #4                ; move src pointer up by 4
-    orr         r6, r6, r7                  ; abs (p1-p0)
-    str         r11, [sp, #12]              ; save p1
-    uqsub8      r10, r6, r2                 ; compare to limit
-    uqsub8      r11, r6, r3                 ; compare to thresh
-    orr         lr, lr, r10
-
-    ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
-    ; transpose the source data for 4-in-parallel operation
-    ldr         r6, [src], pstep            ; load source data
-    str         r11, [sp]                   ; push r11 to stack
-    ldr         r7, [src], pstep
-    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
-    ldr         r8, [src], pstep
-    str         lr, [sp, #8]
-    ldr         lr, [src], pstep
-
-
-    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
-
-    ldr         lr, [sp, #8]                ; load back (f)limit accumulator
-
-    uqsub8      r6, r12, r11                ; q3 - q2
-    uqsub8      r7, r11, r12                ; q2 - q3
-    uqsub8      r12, r11, r10               ; q2 - q1
-    uqsub8      r11, r10, r11               ; q1 - q2
-    orr         r6, r6, r7                  ; abs (q3-q2)
-    orr         r7, r12, r11                ; abs (q2-q1)
-    uqsub8      r6, r6, r2                  ; compare to limit
-    uqsub8      r7, r7, r2                  ; compare to limit
-    ldr         r11, [sp, #4]               ; load back p0
-    ldr         r12, [sp, #12]              ; load back p1
-    orr         lr, lr, r6
-    orr         lr, lr, r7
-
-    uqsub8      r6, r11, r9                 ; p0 - q0
-    uqsub8      r7, r9, r11                 ; q0 - p0
-    uqsub8      r8, r12, r10                ; p1 - q1
-    uqsub8      r11, r10, r12               ; q1 - p1
-    orr         r6, r6, r7                  ; abs (p0-q0)
-    ldr         r7, c0x7F7F7F7F
-    orr         r8, r8, r11                 ; abs (p1-q1)
-    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
-    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
-    uqsub8      r11, r10, r9                ; q1 - q0
-    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
-    uqsub8      r12, r9, r10                ; q0 - q1
-    uqsub8      r6, r6, r4                  ; compare to flimit
-
-    orr         r9, r11, r12                ; abs (q1-q0)
-    uqsub8      r8, r9, r2                  ; compare to limit
-    uqsub8      r10, r9, r3                 ; compare to thresh
-    orr         lr, lr, r6
-    orr         lr, lr, r8
-
-    mvn         r11, #0                     ; r11 == -1
-    mov         r12, #0
-
-    usub8       lr, r12, lr
-    ldr         r9, [sp]                    ; load the compared result
-    sel         lr, r11, r12                ; filter mask: lr
-
-    cmp         lr, #0
-    beq         mbvskip_filter               ; skip filtering
-
-
-
-    ;vp8_hevmask() function
-    ;calculate high edge variance
-
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
-
-    orr         r9, r9, r10
-
-    ldrh        r7, [src, #-2]
-    ldrh        r8, [src], pstep
-
-    usub8       r9, r12, r9
-    sel         r6, r12, r11                ; hev mask: r6
-
-
-    ; vp8_mbfilter() function
-    ; p2, q2 are only needed at the end. Don't need to load them in now.
-    ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
-    ; load source data to r6, r11, r12, lr
-    ldrh        r9, [src, #-2]
-    ldrh        r10, [src], pstep
-
-    pkhbt       r12, r7, r8, lsl #16
-
-    ldrh        r7, [src, #-2]
-    ldrh        r8, [src], pstep
-
-    pkhbt       r11, r9, r10, lsl #16
-
-    ldrh        r9, [src, #-2]
-    ldrh        r10, [src], pstep
-
-    str         r6, [sp]                    ; save r6
-    str         lr, [sp, #4]                ; save lr
-
-    pkhbt       r6, r7, r8, lsl #16
-    pkhbt       lr, r9, r10, lsl #16
-
-    ;transpose r12, r11, r6, lr to p1, p0, q0, q1
-    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
-
-    ;load back hev_mask r6 and filter_mask lr
-    ldr         r12, c0x80808080
-    ldr         r6, [sp]
-    ldr         lr, [sp, #4]
-
-    eor         r7, r7, r12                 ; ps1
-    eor         r8, r8, r12                 ; ps0
-    eor         r9, r9, r12                 ; qs0
-    eor         r10, r10, r12               ; qs1
-
-    qsub8       r12, r9, r8                 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
-    str         r7, [sp, #12]               ; store ps1 temporarily
-    qsub8       r7, r7, r10                 ; vp9_signed_char_clamp(ps1-qs1)
-    str         r10, [sp, #8]               ; store qs1 temporarily
-    qadd8       r7, r7, r12
-    str         r9, [sp]                    ; store qs0 temporarily
-    qadd8       r7, r7, r12
-    str         r8, [sp, #4]                ; store ps0 temporarily
-    qadd8       r7, r7, r12                 ; vp9_filter: r7
-
-    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
-    ldr         r9, c0x04040404
-    ;mvn         r11, #0                     ; r11 == -1
-
-    and         r7, r7, lr                  ; vp9_filter &= mask (lr is free)
-
-    mov         r12, r7                     ; Filter2: r12
-    and         r12, r12, r6                ; Filter2 &= hev
-
-    ;modify code for vp8
-    ;save bottom 3 bits so that we round one side +4 and the other +3
-    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4)
-    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3)
-
-    mov         r10, #0
-    shadd8      r8 , r8 , r10               ; Filter1 >>= 3
-    shadd8      r12 , r12 , r10             ; Filter2 >>= 3
-    shadd8      r8 , r8 , r10
-    shadd8      r12 , r12 , r10
-    shadd8      r8 , r8 , r10               ; r8: Filter1
-    shadd8      r12 , r12 , r10             ; r12: Filter2
-
-    ldr         r9, [sp]                    ; load qs0
-    ldr         r11, [sp, #4]               ; load ps0
-
-    qsub8       r9 , r9, r8                 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1)
-    qadd8       r11, r11, r12               ; ps0 = vp9_signed_char_clamp(ps0 + Filter2)
-
-    ;save bottom 3 bits so that we round one side +4 and the other +3
-    ;and            r8, r12, r10                ; s = Filter2 & 7 (s: r8)
-    ;qadd8      r12 , r12 , r9              ; Filter2 = vp9_signed_char_clamp(Filter2+4)
-    ;mov            r10, #0
-    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
-    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
-    ;sel            lr, r11, r10
-    ;shadd8     r12 , r12 , r10
-    ;usub8      r8, r9, r8
-    ;sel            r8, r11, r10
-    ;ldr            r9, [sp]                    ; load qs0
-    ;ldr            r11, [sp, #4]               ; load ps0
-    ;shadd8     r12 , r12 , r10
-    ;and            r8, r8, lr                  ; -1 for each element that equals 4
-    ;qadd8      r10, r8, r12                ; u = vp9_signed_char_clamp(s + Filter2)
-    ;qsub8      r9 , r9, r12                ; qs0 = vp9_signed_char_clamp(qs0 - Filter2)
-    ;qadd8      r11, r11, r10               ; ps0 = vp9_signed_char_clamp(ps0 + u)
-
-    ;end of modification for vp8
-
-    bic         r12, r7, r6                 ;vp9_filter &= ~hev    ( r6 is free)
-    ;mov            r12, r7
-
-    ;roughly 3/7th difference across boundary
-    mov         lr, #0x1b                   ; 27
-    mov         r7, #0x3f                   ; 63
-
-    sxtb16      r6, r12
-    sxtb16      r10, r12, ror #8
-    smlabb      r8, r6, lr, r7
-    smlatb      r6, r6, lr, r7
-    smlabb      r7, r10, lr, r7
-    smultb      r10, r10, lr
-    ssat        r8, #8, r8, asr #7
-    ssat        r6, #8, r6, asr #7
-    add         r10, r10, #63
-    ssat        r7, #8, r7, asr #7
-    ssat        r10, #8, r10, asr #7
-
-    ldr         lr, c0x80808080
-
-    pkhbt       r6, r8, r6, lsl #16
-    pkhbt       r10, r7, r10, lsl #16
-    uxtb16      r6, r6
-    uxtb16      r10, r10
-
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
-
-    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7)
-
-    qsub8       r8, r9, r10                 ; s = vp9_signed_char_clamp(qs0 - u)
-    qadd8       r10, r11, r10               ; s = vp9_signed_char_clamp(ps0 + u)
-    eor         r8, r8, lr                  ; *oq0 = s^0x80
-    eor         r10, r10, lr                ; *op0 = s^0x80
-
-    strb        r10, [src, #-1]             ; store op0 result
-    strb        r8, [src], pstep            ; store oq0 result
-    mov         r10, r10, lsr #8
-    mov         r8, r8, lsr #8
-    strb        r10, [src, #-1]
-    strb        r8, [src], pstep
-    mov         r10, r10, lsr #8
-    mov         r8, r8, lsr #8
-    strb        r10, [src, #-1]
-    strb        r8, [src], pstep
-    mov         r10, r10, lsr #8
-    mov         r8, r8, lsr #8
-    strb        r10, [src, #-1]
-    strb        r8, [src], pstep
-
-    ;roughly 2/7th difference across boundary
-    mov         lr, #0x12                   ; 18
-    mov         r7, #0x3f                   ; 63
-
-    sxtb16      r6, r12
-    sxtb16      r10, r12, ror #8
-    smlabb      r8, r6, lr, r7
-    smlatb      r6, r6, lr, r7
-    smlabb      r9, r10, lr, r7
-
-    smlatb      r10, r10, lr, r7
-    ssat        r8, #8, r8, asr #7
-    ssat        r6, #8, r6, asr #7
-    ssat        r9, #8, r9, asr #7
-    ssat        r10, #8, r10, asr #7
-
-    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
-
-    pkhbt       r6, r8, r6, lsl #16
-    pkhbt       r10, r9, r10, lsl #16
-
-    ldr         r9, [sp, #8]                ; load qs1
-    ldr         r11, [sp, #12]              ; load ps1
-    ldr         lr, c0x80808080
-
-    uxtb16      r6, r6
-    uxtb16      r10, r10
-
-    add         src, src, #2
-
-    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7)
-
-    qsub8       r8, r9, r10                 ; s = vp9_signed_char_clamp(qs1 - u)
-    qadd8       r10, r11, r10               ; s = vp9_signed_char_clamp(ps1 + u)
-    eor         r8, r8, lr                  ; *oq1 = s^0x80
-    eor         r10, r10, lr                ; *op1 = s^0x80
-
-    ldrb        r11, [src, #-5]             ; load p2 for 1/7th difference across boundary
-    strb        r10, [src, #-4]             ; store op1
-    strb        r8, [src, #-1]              ; store oq1
-    ldrb        r9, [src], pstep            ; load q2 for 1/7th difference across boundary
-
-    mov         r10, r10, lsr #8
-    mov         r8, r8, lsr #8
-
-    ldrb        r6, [src, #-5]
-    strb        r10, [src, #-4]
-    strb        r8, [src, #-1]
-    ldrb        r7, [src], pstep
-
-    mov         r10, r10, lsr #8
-    mov         r8, r8, lsr #8
-    orr         r11, r11, r6, lsl #8
-    orr         r9, r9, r7, lsl #8
-
-    ldrb        r6, [src, #-5]
-    strb        r10, [src, #-4]
-    strb        r8, [src, #-1]
-    ldrb        r7, [src], pstep
-
-    mov         r10, r10, lsr #8
-    mov         r8, r8, lsr #8
-    orr         r11, r11, r6, lsl #16
-    orr         r9, r9, r7, lsl #16
-
-    ldrb        r6, [src, #-5]
-    strb        r10, [src, #-4]
-    strb        r8, [src, #-1]
-    ldrb        r7, [src], pstep
-    orr         r11, r11, r6, lsl #24
-    orr         r9, r9, r7, lsl #24
-
-    ;roughly 1/7th difference across boundary
-    eor         r9, r9, lr
-    eor         r11, r11, lr
-
-    mov         lr, #0x9                    ; 9
-    mov         r7, #0x3f                   ; 63
-
-    sxtb16      r6, r12
-    sxtb16      r10, r12, ror #8
-    smlabb      r8, r6, lr, r7
-    smlatb      r6, r6, lr, r7
-    smlabb      r12, r10, lr, r7
-    smlatb      r10, r10, lr, r7
-    ssat        r8, #8, r8, asr #7
-    ssat        r6, #8, r6, asr #7
-    ssat        r12, #8, r12, asr #7
-    ssat        r10, #8, r10, asr #7
-
-    sub         src, src, pstep, lsl #2
-
-    pkhbt       r6, r8, r6, lsl #16
-    pkhbt       r10, r12, r10, lsl #16
-
-    uxtb16      r6, r6
-    uxtb16      r10, r10
-
-    ldr         lr, c0x80808080
-
-    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7)
-
-    qadd8       r8, r11, r10                ; s = vp9_signed_char_clamp(ps2 + u)
-    qsub8       r10, r9, r10                ; s = vp9_signed_char_clamp(qs2 - u)
-    eor         r8, r8, lr                  ; *op2 = s^0x80
-    eor         r10, r10, lr                ; *oq2 = s^0x80
-
-    strb        r8, [src, #-5]              ; store *op2
-    strb        r10, [src], pstep           ; store *oq2
-    mov         r8, r8, lsr #8
-    mov         r10, r10, lsr #8
-    strb        r8, [src, #-5]
-    strb        r10, [src], pstep
-    mov         r8, r8, lsr #8
-    mov         r10, r10, lsr #8
-    strb        r8, [src, #-5]
-    strb        r10, [src], pstep
-    mov         r8, r8, lsr #8
-    mov         r10, r10, lsr #8
-    strb        r8, [src, #-5]
-    strb        r10, [src], pstep
-
-    ;adjust src pointer for next loop
-    sub         src, src, #2
-
-|mbvskip_filter|
-    sub         src, src, #4
-    subs        count, count, #1
-
-    pld         [src, #23]                  ; preload for next block
-    ldrne       r6, [src], pstep            ; load source data
-    pld         [src, #23]
-    ldrne       r7, [src], pstep
-    pld         [src, #23]
-    ldrne       r8, [src], pstep
-    pld         [src, #23]
-    ldrne       lr, [src], pstep
-
-    bne         MBVnext8
-
-    add         sp, sp, #16
-
-    ldmia       sp!, {r4 - r11, pc}
-    ENDP        ; |vp8_mbloop_filter_vertical_edge_armv6|
-
-; Constant Pool
-c0x80808080 DCD     0x80808080
-c0x03030303 DCD     0x03030303
-c0x04040404 DCD     0x04040404
-c0x01010101 DCD     0x01010101
-c0x7F7F7F7F DCD     0x7F7F7F7F
-
-    END
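The three weighted passes above implement the taper the comments describe: u = vp9_signed_char_clamp((63 + Filter2 * w) >> 7) for w = 27, 18 and 9 (roughly 3/7, 2/7 and 1/7 of the filter value), applied symmetrically outward from the edge. A scalar sketch of one byte lane (helper names illustrative):

#include <stdint.h>

static int8_t clamp8(int t) {
    return (int8_t)(t < -128 ? -128 : (t > 127 ? 127 : t));
}

/* ps[i]/qs[i] are the i-th pixels on either side of the edge; f is the
 * masked filter value (vp9_filter & ~hev above). */
static void mb_taper(int8_t f, int8_t ps[3], int8_t qs[3]) {
    static const int w[3] = { 27, 18, 9 };
    for (int i = 0; i < 3; i++) {
        int8_t u = clamp8((63 + f * w[i]) >> 7);
        ps[i] = clamp8(ps[i] + u);
        qs[i] = clamp8(qs[i] - u);
    }
}

The smlabb/smlatb plus ssat pairs above compute exactly this multiply, round and saturate, two 16-bit lanes at a time.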
--- a/vp8/common/arm/armv6/recon_v6.asm
+++ /dev/null
@@ -1,281 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_recon_b_armv6|
-    EXPORT  |vp8_recon2b_armv6|
-    EXPORT  |vp8_recon4b_armv6|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-prd     RN  r0
-dif     RN  r1
-dst     RN  r2
-stride      RN  r3
-
-;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride)
-; R0 char* pred_ptr
-; R1 short * dif_ptr
-; R2 char * dst_ptr
-; R3 int stride
-
-; Description:
-; Loop through the block adding the Pred and Diff together.  Clamp and then
-; store back into the Dst.
-
-; Restrictions :
-; all buffers are expected to be 4 byte aligned coming in and
-; going out.
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-;
-;
-;
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_recon_b_armv6| PROC
-    stmdb   sp!, {r4 - r9, lr}
-
-    ;0, 1, 2, 3
-    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
-    ldr     r6, [dif, #0]           ;     1 |     0
-    ldr     r7, [dif, #4]           ;     3 |     2
-
-    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
-    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
-
-    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
-    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
-
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    add     dif, dif, #32
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst], stride
-
-    ;0, 1, 2, 3
-    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
-;;  ldr     r6, [dif, #8]           ;     1 |     0
-;;  ldr     r7, [dif, #12]          ;     3 |     2
-    ldr     r6, [dif, #0]           ;     1 |     0
-    ldr     r7, [dif, #4]           ;     3 |     2
-
-    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
-    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
-
-    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
-    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
-
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    add     dif, dif, #32
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst], stride
-
-    ;0, 1, 2, 3
-    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
-;;  ldr     r6, [dif, #16]          ;     1 |     0
-;;  ldr     r7, [dif, #20]          ;     3 |     2
-    ldr     r6, [dif, #0]           ;     1 |     0
-    ldr     r7, [dif, #4]           ;     3 |     2
-
-    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
-    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
-
-    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
-    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
-
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    add     dif, dif, #32
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst], stride
-
-    ;0, 1, 2, 3
-    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
-;;  ldr     r6, [dif, #24]          ;     1 |     0
-;;  ldr     r7, [dif, #28]          ;     3 |     2
-    ldr     r6, [dif, #0]           ;     1 |     0
-    ldr     r7, [dif, #4]           ;     3 |     2
-
-    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
-    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
-
-    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
-    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
-
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst], stride
-
-    ldmia   sp!, {r4 - r9, pc}
-
-    ENDP    ; |recon_b|
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-;
-;
-;
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-; R0 char  *pred_ptr
-; R1 short *dif_ptr
-; R2 char  *dst_ptr
-; R3 int stride
-|vp8_recon4b_armv6| PROC
-    stmdb   sp!, {r4 - r9, lr}
-
-    mov     lr, #4
-
-recon4b_loop
-    ;0, 1, 2, 3
-    ldr     r4, [prd], #4           ; 3 | 2 | 1 | 0
-    ldr     r6, [dif, #0]           ;     1 |     0
-    ldr     r7, [dif, #4]           ;     3 |     2
-
-    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
-    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
-
-    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
-    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
-
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst]
-
-    ;4, 5, 6, 7
-    ldr     r4, [prd], #4
-;;  ldr     r6, [dif, #32]
-;;  ldr     r7, [dif, #36]
-    ldr     r6, [dif, #8]
-    ldr     r7, [dif, #12]
-
-    pkhbt   r8, r6, r7, lsl #16
-    pkhtb   r9, r7, r6, asr #16
-
-    uxtab16 r8, r8, r4
-    uxtab16 r9, r9, r4, ror #8
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst, #4]
-
-    ;8, 9, 10, 11
-    ldr     r4, [prd], #4
-;;  ldr     r6, [dif, #64]
-;;  ldr     r7, [dif, #68]
-    ldr     r6, [dif, #16]
-    ldr     r7, [dif, #20]
-
-    pkhbt   r8, r6, r7, lsl #16
-    pkhtb   r9, r7, r6, asr #16
-
-    uxtab16 r8, r8, r4
-    uxtab16 r9, r9, r4, ror #8
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst, #8]
-
-    ;12, 13, 14, 15
-    ldr     r4, [prd], #4
-;;  ldr     r6, [dif, #96]
-;;  ldr     r7, [dif, #100]
-    ldr     r6, [dif, #24]
-    ldr     r7, [dif, #28]
-
-    pkhbt   r8, r6, r7, lsl #16
-    pkhtb   r9, r7, r6, asr #16
-
-    uxtab16 r8, r8, r4
-    uxtab16 r9, r9, r4, ror #8
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst, #12]
-
-    add     dst, dst, stride
-;;  add     dif, dif, #8
-    add     dif, dif, #32
-
-    subs    lr, lr, #1
-    bne     recon4b_loop
-
-    ldmia   sp!, {r4 - r9, pc}
-
-    ENDP    ; |Recon4B|
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-;
-;
-;
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-; R0 char  *pred_ptr
-; R1 short *dif_ptr
-; R2 char  *dst_ptr
-; R3 int stride
-|vp8_recon2b_armv6| PROC
-    stmdb   sp!, {r4 - r9, lr}
-
-    mov     lr, #4
-
-recon2b_loop
-    ;0, 1, 2, 3
-    ldr     r4, [prd], #4
-    ldr     r6, [dif, #0]
-    ldr     r7, [dif, #4]
-
-    pkhbt   r8, r6, r7, lsl #16
-    pkhtb   r9, r7, r6, asr #16
-
-    uxtab16 r8, r8, r4
-    uxtab16 r9, r9, r4, ror #8
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst]
-
-    ;4, 5, 6, 7
-    ldr     r4, [prd], #4
-;;  ldr     r6, [dif, #32]
-;;  ldr     r7, [dif, #36]
-    ldr     r6, [dif, #8]
-    ldr     r7, [dif, #12]
-
-    pkhbt   r8, r6, r7, lsl #16
-    pkhtb   r9, r7, r6, asr #16
-
-    uxtab16 r8, r8, r4
-    uxtab16 r9, r9, r4, ror #8
-    usat16  r8, #8, r8
-    usat16  r9, #8, r9
-    orr     r8, r8, r9, lsl #8
-
-    str     r8, [dst, #4]
-
-    add     dst, dst, stride
-;;  add     dif, dif, #8
-    add     dif, dif, #16
-
-    subs    lr, lr, #1
-    bne     recon2b_loop
-
-    ldmia   sp!, {r4 - r9, pc}
-
-    ENDP    ; |Recon2B|
-
-    END
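As the header comment of the file above says, the recon kernels loop through the block adding Pred and Diff, clamping, and storing to Dst; the pkhbt/uxtab16/usat16 triplets do this four pixels per word. A scalar equivalent of the 4x4 case (a sketch; the pitches mirror the #16 and #32 post-increments in the assembly):

#include <stdint.h>

static void recon_b_ref(const uint8_t *pred, const int16_t *diff,
                        uint8_t *dst, int stride) {
    for (int r = 0; r < 4; r++) {
        for (int c = 0; c < 4; c++) {
            int v = pred[c] + diff[c];                         /* uxtab16 */
            dst[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); /* usat16 */
        }
        pred += 16;  /* predictor rows are 16 bytes apart */
        diff += 16;  /* 16 shorts, the "add dif, dif, #32" step */
        dst  += stride;
    }
}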
--- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm
+++ /dev/null
@@ -1,286 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp9_loop_filter_simple_horizontal_edge_armv6|
-    EXPORT |vp9_loop_filter_simple_vertical_edge_armv6|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-    MACRO
-    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
-    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
-    ; a0: 03 02 01 00
-    ; a1: 13 12 11 10
-    ; a2: 23 22 21 20
-    ; a3: 33 32 31 30
-    ;     b3 b2 b1 b0
-
-    uxtb16      $b1, $a1                    ; xx 12 xx 10
-    uxtb16      $b0, $a0                    ; xx 02 xx 00
-    uxtb16      $b3, $a3                    ; xx 32 xx 30
-    uxtb16      $b2, $a2                    ; xx 22 xx 20
-    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
-    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20
-
-    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
-    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
-    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
-    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
-    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
-    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21
-
-    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
-    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3
-
-    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
-    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
-    MEND
-
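TRANSPOSE_MATRIX turns four row words into four column words using only uxtb16/orr/pkhbt/pkhtb; functionally it is a 4x4 byte transpose, which lets the vertical-edge filter reuse the same four-pixels-per-word arithmetic as the horizontal one. A C sketch of the effect (not of the instruction sequence):

#include <stdint.h>

/* a[i] holds four pixels of row i (a0 = 03 02 01 00, low byte first);
 * b[j] receives column j across the four rows (b0 = 30 20 10 00). */
static void transpose4x4(const uint32_t a[4], uint32_t b[4]) {
    for (int col = 0; col < 4; col++) {
        b[col] = ((a[0] >> (8 * col)) & 0xff)
               | (((a[1] >> (8 * col)) & 0xff) << 8)
               | (((a[2] >> (8 * col)) & 0xff) << 16)
               | (((a[3] >> (8 * col)) & 0xff) << 24);
    }
}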
-
-
-src         RN  r0
-pstep       RN  r1
-
-;r0     unsigned char *src_ptr,
-;r1     int src_pixel_step,
-;r2     const char *blimit
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp9_loop_filter_simple_horizontal_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-    stmdb       sp!, {r4 - r11, lr}
-
-    ldrb        r12, [r2]                   ; blimit
-    ldr         r3, [src, -pstep, lsl #1]   ; p1
-    ldr         r4, [src, -pstep]           ; p0
-    ldr         r5, [src]                   ; q0
-    ldr         r6, [src, pstep]            ; q1
-    orr         r12, r12, r12, lsl #8       ; blimit
-    ldr         r2, c0x80808080
-    orr         r12, r12, r12, lsl #16      ; blimit
-    mov         r9, #4                      ; 4 passes, 4 pixels at a time
-    mov         lr, #0                      ; need 0 in a couple places
-
-|simple_hnext8|
-    ; vp8_simple_filter_mask()
-
-    uqsub8      r7, r3, r6                  ; p1 - q1
-    uqsub8      r8, r6, r3                  ; q1 - p1
-    uqsub8      r10, r4, r5                 ; p0 - q0
-    uqsub8      r11, r5, r4                 ; q0 - p0
-    orr         r8, r8, r7                  ; abs(p1 - q1)
-    orr         r10, r10, r11               ; abs(p0 - q0)
-    uqadd8      r10, r10, r10               ; abs(p0 - q0) * 2
-    uhadd8      r8, r8, lr                  ; abs(p1 - q1) >> 1
-    uqadd8      r10, r10, r8                ; abs(p0 - q0)*2 + abs(p1 - q1)/2
-    mvn         r8, #0
-    usub8       r10, r12, r10               ; compare to flimit. usub8 sets GE flags
-    sel         r10, r8, lr                 ; filter mask: F or 0
-    cmp         r10, #0
-    beq         simple_hskip_filter         ; skip filtering if all masks are 0x00
-
-    ;vp8_simple_filter()
-
-    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
-    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
-    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
-    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
-
-    qsub8       r3, r3, r6                  ; vp9_filter = p1 - q1
-    qsub8       r6, r5, r4                  ; q0 - p0
-    qadd8       r3, r3, r6                  ; += q0 - p0
-    ldr         r7, c0x04040404
-    qadd8       r3, r3, r6                  ; += q0 - p0
-    ldr         r8, c0x03030303
-    qadd8       r3, r3, r6                  ; vp9_filter = p1-q1 + 3*(q0-p0)
-    ;STALL
-    and         r3, r3, r10                 ; vp9_filter &= mask
-
-    qadd8       r7 , r3 , r7                ; Filter1 = vp9_filter + 4
-    qadd8       r8 , r3 , r8                ; Filter2 = vp9_filter + 3
-
-    shadd8      r7 , r7 , lr
-    shadd8      r8 , r8 , lr
-    shadd8      r7 , r7 , lr
-    shadd8      r8 , r8 , lr
-    shadd8      r7 , r7 , lr                ; Filter1 >>= 3
-    shadd8      r8 , r8 , lr                ; Filter2 >>= 3
-
-    qsub8       r5 ,r5, r7                  ; u = q0 - Filter1
-    qadd8       r4, r4, r8                  ; u = p0 + Filter2
-    eor         r5, r5, r2                  ; *oq0 = u^0x80
-    str         r5, [src]                   ; store oq0 result
-    eor         r4, r4, r2                  ; *op0 = u^0x80
-    str         r4, [src, -pstep]           ; store op0 result
-
-|simple_hskip_filter|
-    subs        r9, r9, #1
-    addne       src, src, #4                ; next row
-
-    ldrne       r3, [src, -pstep, lsl #1]   ; p1
-    ldrne       r4, [src, -pstep]           ; p0
-    ldrne       r5, [src]                   ; q0
-    ldrne       r6, [src, pstep]            ; q1
-
-    bne         simple_hnext8
-
-    ldmia       sp!, {r4 - r11, pc}
-    ENDP        ; |vp9_loop_filter_simple_horizontal_edge_armv6|
-
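The usub8/sel pair above is the branchless core of the breakout: usub8 sets a GE flag per byte from a byte-wise subtraction, and sel then materialises 0xFF or 0x00 per lane, so the filter value can be ANDed with the mask instead of branching per pixel. A C emulation of one common shape of this idiom in these files (illustrative only):

#include <stdint.h>

/* Given a word whose bytes are zero in lanes that passed the breakout
 * test (e.g. |p0-q0|*2 + |p1-q1|/2 within blimit), produce 0xFF in
 * passing lanes and 0x00 elsewhere. */
static uint32_t lane_mask(uint32_t exceeded) {
    uint32_t mask = 0;
    for (int i = 0; i < 4; i++)
        if (((exceeded >> (8 * i)) & 0xff) == 0)
            mask |= 0xffu << (8 * i);
    return mask;  /* "vp9_filter &= mask" then zeroes skipped lanes */
}

When the whole word comes back zero, every lane failed the test and the cmp/beq skips the filter body entirely.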
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-|vp9_loop_filter_simple_vertical_edge_armv6| PROC
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-    stmdb       sp!, {r4 - r11, lr}
-
-    ldrb        r12, [r2]                   ; r12: blimit
-    ldr         r2, c0x80808080
-    orr         r12, r12, r12, lsl #8
-
-    ; load source data to r7, r8, r9, r10
-    ldrh        r3, [src, #-2]
-    pld         [src, #23]                  ; preload for next block
-    ldrh        r4, [src], pstep
-    orr         r12, r12, r12, lsl #16
-
-    ldrh        r5, [src, #-2]
-    pld         [src, #23]
-    ldrh        r6, [src], pstep
-
-    pkhbt       r7, r3, r4, lsl #16
-
-    ldrh        r3, [src, #-2]
-    pld         [src, #23]
-    ldrh        r4, [src], pstep
-
-    pkhbt       r8, r5, r6, lsl #16
-
-    ldrh        r5, [src, #-2]
-    pld         [src, #23]
-    ldrh        r6, [src], pstep
-    mov         r11, #4                     ; 4 passes, 4 pixels at a time
-
-|simple_vnext8|
-    ; vp8_simple_filter_mask() function
-    pkhbt       r9, r3, r4, lsl #16
-    pkhbt       r10, r5, r6, lsl #16
-
-    ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
-    TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
-
-    uqsub8      r7, r3, r6                  ; p1 - q1
-    uqsub8      r8, r6, r3                  ; q1 - p1
-    uqsub8      r9, r4, r5                  ; p0 - q0
-    uqsub8      r10, r5, r4                 ; q0 - p0
-    orr         r7, r7, r8                  ; abs(p1 - q1)
-    orr         r9, r9, r10                 ; abs(p0 - q0)
-    mov         r8, #0
-    uqadd8      r9, r9, r9                  ; abs(p0 - q0) * 2
-    uhadd8      r7, r7, r8                  ; abs(p1 - q1) / 2
-    uqadd8      r7, r7, r9                  ; abs(p0 - q0)*2 + abs(p1 - q1)/2
-    mvn         r10, #0                     ; r10 == -1
-
-    usub8       r7, r12, r7                 ; compare to flimit
-    sel         lr, r10, r8                 ; filter mask
-
-    cmp         lr, #0
-    beq         simple_vskip_filter         ; skip filtering
-
-    ;vp8_simple_filter() function
-    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
-    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
-    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
-    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
-
-    qsub8       r3, r3, r6                  ; vp9_filter = p1 - q1
-    qsub8       r6, r5, r4                  ; q0 - p0
-
-    qadd8       r3, r3, r6                  ; vp9_filter += q0 - p0
-    ldr         r9, c0x03030303             ; r9 = 3
-
-    qadd8       r3, r3, r6                  ; vp9_filter += q0 - p0
-    ldr         r7, c0x04040404
-
-    qadd8       r3, r3, r6                  ; vp9_filter = p1-q1 + 3*(q0-p0)
-    ;STALL
-    and         r3, r3, lr                  ; vp9_filter &= mask
-
-    qadd8       r9 , r3 , r9                ; Filter2 = vp9_filter + 3
-    qadd8       r3 , r3 , r7                ; Filter1 = vp9_filter + 4
-
-    shadd8      r9 , r9 , r8
-    shadd8      r3 , r3 , r8
-    shadd8      r9 , r9 , r8
-    shadd8      r3 , r3 , r8
-    shadd8      r9 , r9 , r8                ; Filter2 >>= 3
-    shadd8      r3 , r3 , r8                ; Filter1 >>= 3
-
-    ;calculate output
-    sub         src, src, pstep, lsl #2
-
-    qadd8       r4, r4, r9                  ; u = p0 + Filter2
-    qsub8       r5, r5, r3                  ; u = q0 - Filter1
-    eor         r4, r4, r2                  ; *op0 = u^0x80
-    eor         r5, r5, r2                  ; *oq0 = u^0x80
-
-    strb        r4, [src, #-1]              ; store the result
-    mov         r4, r4, lsr #8
-    strb        r5, [src], pstep
-    mov         r5, r5, lsr #8
-
-    strb        r4, [src, #-1]
-    mov         r4, r4, lsr #8
-    strb        r5, [src], pstep
-    mov         r5, r5, lsr #8
-
-    strb        r4, [src, #-1]
-    mov         r4, r4, lsr #8
-    strb        r5, [src], pstep
-    mov         r5, r5, lsr #8
-
-    strb        r4, [src, #-1]
-    strb        r5, [src], pstep
-
-|simple_vskip_filter|
-    subs        r11, r11, #1
-
-    ; load source data to r7, r8, r9, r10
-    ldrneh      r3, [src, #-2]
-    pld         [src, #23]                  ; preload for next block
-    ldrneh      r4, [src], pstep
-
-    ldrneh      r5, [src, #-2]
-    pld         [src, #23]
-    ldrneh      r6, [src], pstep
-
-    pkhbt       r7, r3, r4, lsl #16
-
-    ldrneh      r3, [src, #-2]
-    pld         [src, #23]
-    ldrneh      r4, [src], pstep
-
-    pkhbt       r8, r5, r6, lsl #16
-
-    ldrneh      r5, [src, #-2]
-    pld         [src, #23]
-    ldrneh      r6, [src], pstep
-
-    bne         simple_vnext8
-
-    ldmia       sp!, {r4 - r11, pc}
-    ENDP        ; |vp9_loop_filter_simple_vertical_edge_armv6|
-
-; Constant Pool
-c0x80808080 DCD     0x80808080
-c0x03030303 DCD     0x03030303
-c0x04040404 DCD     0x04040404
-
-    END
--- a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
+++ /dev/null
@@ -1,273 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sixtap_predict8x4_armv6|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-;-------------------------------------
-; r0    unsigned char *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack unsigned char *dst_ptr,
-; stack int  dst_pitch
-;-------------------------------------
-;note: In the first pass, the result is stored transposed (8 lines x 9 columns)
-;on the stack; temporary stack size is 184. Line width is 20 bytes, i.e. 9
-;shorts plus 2 bytes of padding for 4-byte alignment. In the second pass, data
-;is loaded from the stack and the result is stored transposed back.
-|vp8_sixtap_predict8x4_armv6| PROC
-    stmdb       sp!, {r4 - r11, lr}
-    str         r3, [sp, #-184]!            ;reserve space on stack for temporary storage, store yoffset
-
-    cmp         r2, #0                      ;skip first_pass filter if xoffset=0
-    add         lr, sp, #4                  ;point to temporary buffer
-    beq         skip_firstpass_filter
-
-;first-pass filter
-    adr         r12, filter8_coeff
-    sub         r0, r0, r1, lsl #1
-
-    add         r3, r1, #10                 ; preload next row
-    pld         [r0, r3]
-
-    add         r2, r12, r2, lsl #4         ;calculate filter location
-    add         r0, r0, #3                  ;adjust src only for loading convenience
-
-    ldr         r3, [r2]                    ; load up packed filter coefficients
-    ldr         r4, [r2, #4]
-    ldr         r5, [r2, #8]
-
-    mov         r2, #0x90000                ; height=9 is top part of counter
-
-    sub         r1, r1, #8
-
-|first_pass_hloop_v6|
-    ldrb        r6, [r0, #-5]               ; load source data
-    ldrb        r7, [r0, #-4]
-    ldrb        r8, [r0, #-3]
-    ldrb        r9, [r0, #-2]
-    ldrb        r10, [r0, #-1]
-
-    orr         r2, r2, #0x4                ; construct loop counter. width=8=4x2
-
-    pkhbt       r6, r6, r7, lsl #16         ; r7 | r6
-    pkhbt       r7, r7, r8, lsl #16         ; r8 | r7
-
-    pkhbt       r8, r8, r9, lsl #16         ; r9 | r8
-    pkhbt       r9, r9, r10, lsl #16        ; r10 | r9
-
-|first_pass_wloop_v6|
-    smuad       r11, r6, r3                 ; vp9_filter[0], vp9_filter[1]
-    smuad       r12, r7, r3
-
-    ldrb        r6, [r0], #1
-
-    smlad       r11, r8, r4, r11            ; vp9_filter[2], vp9_filter[3]
-    ldrb        r7, [r0], #1
-    smlad       r12, r9, r4, r12
-
-    pkhbt       r10, r10, r6, lsl #16       ; r10 | r9
-    pkhbt       r6, r6, r7, lsl #16         ; r11 | r10
-    smlad       r11, r10, r5, r11           ; vp9_filter[4], vp9_filter[5]
-    smlad       r12, r6, r5, r12
-
-    sub         r2, r2, #1
-
-    add         r11, r11, #0x40             ; round_shift_and_clamp
-    tst         r2, #0xff                   ; test loop counter
-    usat        r11, #8, r11, asr #7
-    add         r12, r12, #0x40
-    strh        r11, [lr], #20              ; result is transposed and stored
-    usat        r12, #8, r12, asr #7
-
-    strh        r12, [lr], #20
-
-    movne       r11, r6
-    movne       r12, r7
-
-    movne       r6, r8
-    movne       r7, r9
-    movne       r8, r10
-    movne       r9, r11
-    movne       r10, r12
-
-    bne         first_pass_wloop_v6
-
-    ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines
-    ;;IF ARCHITECTURE=6
-    ;pld        [src, ppl]
-    ;;pld       [src, r9]
-    ;;ENDIF
-
-    subs        r2, r2, #0x10000
-
-    sub         lr, lr, #158
-
-    add         r0, r0, r1                  ; move to next input line
-
-    add         r11, r1, #18                ; preload next row, adding back the block width (=8) subtracted earlier
-    pld         [r0, r11]
-
-    bne         first_pass_hloop_v6
-
-;second pass filter
-secondpass_filter
-    ldr         r3, [sp], #4                ; load back yoffset
-    ldr         r0, [sp, #216]              ; load dst address from stack 180+36
-    ldr         r1, [sp, #220]              ; load dst stride from stack 180+40
-
-    cmp         r3, #0
-    beq         skip_secondpass_filter
-
-    adr         r12, filter8_coeff
-    add         lr, r12, r3, lsl #4         ;calculate filter location
-
-    mov         r2, #0x00080000
-
-    ldr         r3, [lr]                    ; load up packed filter coefficients
-    ldr         r4, [lr, #4]
-    ldr         r5, [lr, #8]
-
-    pkhbt       r12, r4, r3                 ; pack the filter differently
-    pkhbt       r11, r5, r4
-
-second_pass_hloop_v6
-    ldr         r6, [sp]                    ; load the data
-    ldr         r7, [sp, #4]
-
-    orr         r2, r2, #2                  ; loop counter
-
-second_pass_wloop_v6
-    smuad       lr, r3, r6                  ; apply filter
-    smulbt      r10, r3, r6
-
-    ldr         r8, [sp, #8]
-
-    smlad       lr, r4, r7, lr
-    smladx      r10, r12, r7, r10
-
-    ldrh        r9, [sp, #12]
-
-    smlad       lr, r5, r8, lr
-    smladx      r10, r11, r8, r10
-
-    add         sp, sp, #4
-    smlatb      r10, r5, r9, r10
-
-    sub         r2, r2, #1
-
-    add         lr, lr, #0x40               ; round_shift_and_clamp
-    tst         r2, #0xff
-    usat        lr, #8, lr, asr #7
-    add         r10, r10, #0x40
-    strb        lr, [r0], r1                ; the result is transposed back and stored
-    usat        r10, #8, r10, asr #7
-
-    strb        r10, [r0],r1
-
-    movne       r6, r7
-    movne       r7, r8
-
-    bne         second_pass_wloop_v6
-
-    subs        r2, r2, #0x10000
-    add         sp, sp, #12                 ; update src for next loop (20-8)
-    sub         r0, r0, r1, lsl #2
-    add         r0, r0, #1
-
-    bne         second_pass_hloop_v6
-
-    add         sp, sp, #20
-    ldmia       sp!, {r4 - r11, pc}
-
-;--------------------
-skip_firstpass_filter
-    sub         r0, r0, r1, lsl #1
-    sub         r1, r1, #8
-    mov         r2, #9
-
-skip_firstpass_hloop
-    ldrb        r4, [r0], #1                ; load data
-    subs        r2, r2, #1
-    ldrb        r5, [r0], #1
-    strh        r4, [lr], #20               ; store it to immediate buffer
-    ldrb        r6, [r0], #1                ; load data
-    strh        r5, [lr], #20
-    ldrb        r7, [r0], #1
-    strh        r6, [lr], #20
-    ldrb        r8, [r0], #1
-    strh        r7, [lr], #20
-    ldrb        r9, [r0], #1
-    strh        r8, [lr], #20
-    ldrb        r10, [r0], #1
-    strh        r9, [lr], #20
-    ldrb        r11, [r0], #1
-    strh        r10, [lr], #20
-    add         r0, r0, r1                  ; move to next input line
-    strh        r11, [lr], #20
-
-    sub         lr, lr, #158                ; move over to next column
-    bne         skip_firstpass_hloop
-
-    b           secondpass_filter
-
-;--------------------
-skip_secondpass_filter
-    mov         r2, #8
-    add         sp, sp, #4                  ;start from src[0] instead of src[-2]
-
-skip_secondpass_hloop
-    ldr         r6, [sp], #4
-    subs        r2, r2, #1
-    ldr         r8, [sp], #4
-
-    mov         r7, r6, lsr #16             ; unpack
-    strb        r6, [r0], r1
-    mov         r9, r8, lsr #16
-    strb        r7, [r0], r1
-    add         sp, sp, #12                 ; 20-8
-    strb        r8, [r0], r1
-    strb        r9, [r0], r1
-
-    sub         r0, r0, r1, lsl #2
-    add         r0, r0, #1
-
-    bne         skip_secondpass_hloop
-
-    add         sp, sp, #16                 ; 180 - (160 +4)
-
-    ldmia       sp!, {r4 - r11, pc}
-
-    ENDP
-
-;-----------------
-;One word each is reserved. Label filter8_coeff can be used to access the data.
-;Data address: filter8_coeff, filter8_coeff+4, filter8_coeff+8 ...
-filter8_coeff
-    DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000
-    DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000
-    DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000
-    DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000
-    DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000
-    DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000
-    DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000
-    DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000
-
-    ;DCD        0,  0,  128,    0,   0,  0
-    ;DCD        0, -6,  123,   12,  -1,  0
-    ;DCD        2, -11, 108,   36,  -8,  1
-    ;DCD        0, -9,   93,   50,  -6,  0
-    ;DCD        3, -16,  77,   77, -16,  3
-    ;DCD        0, -6,   50,   93,  -9,  0
-    ;DCD        1, -8,   36,  108, -11,  2
-    ;DCD        0, -1,   12,  123,  -6,  0
-
-    END
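
The packed words in filter8_coeff encode the commented-out tap rows as signed
16-bit halfword pairs so that SMUAD/SMLAD/SMLADX can perform two
multiply-accumulates per instruction. A sketch of the packing for the row
{0, -6, 123, 12, -1, 0}, using a hypothetical helper:

    #include <stdint.h>

    /* Pack two signed 16-bit taps into one word, low halfword first.
     * Illustrative only; the table above is the authoritative data. */
    static uint32_t pack_taps(int16_t lo, int16_t hi) {
      return (uint16_t)lo | ((uint32_t)(uint16_t)hi << 16);
    }

    /* pack_taps(0, -6)   == 0xfffa0000
     * pack_taps(123, 12) == 0x000c007b
     * pack_taps(-1, 0)   == 0x0000ffff */
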
--- a/vp8/common/arm/bilinearfilter_arm.c
+++ /dev/null
@@ -1,108 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <math.h>
-#include "vp8/common/filter.h"
-#include "vp8/common/subpixel.h"
-#include "bilinearfilter_arm.h"
-
-void vp9_filter_block2d_bil_armv6
-(
-  unsigned char *src_ptr,
-  unsigned char *dst_ptr,
-  unsigned int   src_pitch,
-  unsigned int   dst_pitch,
-  const short   *HFilter,
-  const short   *VFilter,
-  int            Width,
-  int            Height
-) {
-  unsigned short FData[36 * 16]; /* Temp data buffer used in filtering */
-
-  /* First filter 1-D horizontally... */
-  vp9_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
-
-  /* then 1-D vertically... */
-  vp9_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
-}
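
The wrapper runs the horizontal pass over Height + 1 rows because the 2-tap
vertical pass reads one extra intermediate row per block. A scalar model of
the first pass follows (the second pass repeats the same two-tap combine
vertically on the 16-bit buffer); this is a sketch of the generic behavior,
not of the assembly internals:

    /* Horizontal bilinear pass, scalar sketch. f[0] + f[1] == 128, as in
     * vp8_bilinear_filters, so adding 64 and shifting by 7 restores unit
     * gain. */
    static void bil_first_pass_sketch(const unsigned char *src,
                                      unsigned short *dst,
                                      unsigned int src_pitch,
                                      unsigned int height,
                                      unsigned int width, const short *f) {
      unsigned int i, j;
      for (i = 0; i < height; i++)
        for (j = 0; j < width; j++)
          dst[i * width + j] = (unsigned short)
              ((src[i * src_pitch + j] * f[0] +
                src[i * src_pitch + j + 1] * f[1] + 64) >> 7);
    }
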
-
-
-void vp9_bilinear_predict4x4_armv6
-(
-  unsigned char  *src_ptr,
-  int   src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-
-  HFilter = vp8_bilinear_filters[xoffset];
-  VFilter = vp8_bilinear_filters[yoffset];
-
-  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
-}
-
-void vp9_bilinear_predict8x8_armv6
-(
-  unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int  dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-
-  HFilter = vp8_bilinear_filters[xoffset];
-  VFilter = vp8_bilinear_filters[yoffset];
-
-  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
-}
-
-void vp9_bilinear_predict8x4_armv6
-(
-  unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int  dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-
-  HFilter = vp8_bilinear_filters[xoffset];
-  VFilter = vp8_bilinear_filters[yoffset];
-
-  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
-}
-
-void vp9_bilinear_predict16x16_armv6
-(
-  unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int  dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-
-  HFilter = vp8_bilinear_filters[xoffset];
-  VFilter = vp8_bilinear_filters[yoffset];
-
-  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
-}
--- a/vp8/common/arm/bilinearfilter_arm.h
+++ /dev/null
@@ -1,35 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef BILINEARFILTER_ARM_H
-#define BILINEARFILTER_ARM_H
-
-extern void vp9_filter_block2d_bil_first_pass_armv6
-(
-  const unsigned char  *src_ptr,
-  unsigned short       *dst_ptr,
-  unsigned int          src_pitch,
-  unsigned int          height,
-  unsigned int          width,
-  const short          *vp9_filter
-);
-
-extern void vp9_filter_block2d_bil_second_pass_armv6
-(
-  const unsigned short *src_ptr,
-  unsigned char        *dst_ptr,
-  int                   dst_pitch,
-  unsigned int          height,
-  unsigned int          width,
-  const short         *vp9_filter
-);
-
-#endif /* BILINEARFILTER_ARM_H */
--- a/vp8/common/arm/filter_arm.c
+++ /dev/null
@@ -1,198 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include <math.h>
-#include "vp8/common/filter.h"
-#include "vp8/common/subpixel.h"
-#include "vpx_ports/mem.h"
-
-extern void vp9_filter_block2d_first_pass_armv6
-(
-  unsigned char *src_ptr,
-  short         *output_ptr,
-  unsigned int src_pixels_per_line,
-  unsigned int output_width,
-  unsigned int output_height,
-  const short *vp9_filter
-);
-
-// 8x8
-extern void vp9_filter_block2d_first_pass_8x8_armv6
-(
-  unsigned char *src_ptr,
-  short         *output_ptr,
-  unsigned int src_pixels_per_line,
-  unsigned int output_width,
-  unsigned int output_height,
-  const short *vp9_filter
-);
-
-// 16x16
-extern void vp9_filter_block2d_first_pass_16x16_armv6
-(
-  unsigned char *src_ptr,
-  short         *output_ptr,
-  unsigned int src_pixels_per_line,
-  unsigned int output_width,
-  unsigned int output_height,
-  const short *vp9_filter
-);
-
-extern void vp9_filter_block2d_second_pass_armv6
-(
-  short         *src_ptr,
-  unsigned char *output_ptr,
-  unsigned int output_pitch,
-  unsigned int cnt,
-  const short *vp9_filter
-);
-
-extern void vp9_filter4_block2d_second_pass_armv6
-(
-  short         *src_ptr,
-  unsigned char *output_ptr,
-  unsigned int output_pitch,
-  unsigned int cnt,
-  const short *vp9_filter
-);
-
-extern void vp9_filter_block2d_first_pass_only_armv6
-(
-  unsigned char *src_ptr,
-  unsigned char *output_ptr,
-  unsigned int src_pixels_per_line,
-  unsigned int cnt,
-  unsigned int output_pitch,
-  const short *vp9_filter
-);
-
-
-extern void vp9_filter_block2d_second_pass_only_armv6
-(
-  unsigned char *src_ptr,
-  unsigned char *output_ptr,
-  unsigned int src_pixels_per_line,
-  unsigned int cnt,
-  unsigned int output_pitch,
-  const short *vp9_filter
-);
-
-#if HAVE_ARMV6
-void vp9_sixtap_predict_armv6
-(
-  unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int  dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-  DECLARE_ALIGNED_ARRAY(4, short, FData, 12 * 4); /* Temp data buffer used in filtering */
-
-
-  HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
-  VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
-
-  /* Vfilter is null. First pass only */
-  if (xoffset && !yoffset) {
-    /*vp9_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
-    vp9_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/
-
-    vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
-  }
-  /* Hfilter is null. Second pass only */
-  else if (!xoffset && yoffset) {
-    vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
-  } else {
-    /* Vfilter is a 4 tap filter */
-    if (yoffset & 0x1) {
-      vp9_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
-      vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
-    }
-    /* Vfilter is 6 tap filter */
-    else {
-      vp9_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
-      vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
-    }
-  }
-}
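
The first-pass heights used above (9 for the 6-tap path, 7 for the 4-tap
path) follow from the separable-filter identity rows = out_height + vtaps - 1;
the source pointer correspondingly starts 2 (or 1) rows above the block.
A one-line sketch of the arithmetic:

    /* Intermediate rows the horizontal pass must produce (sketch).
     * 4x4: 4+6-1 = 9 or 4+4-1 = 7; 8x8: 13 or 11; 16x16: 21 or 19. */
    static int first_pass_rows(int out_height, int vtaps) {
      return out_height + vtaps - 1;
    }
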
-
-void vp9_sixtap_predict8x8_armv6
-(
-  unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int  dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-  DECLARE_ALIGNED_ARRAY(4, short, FData, 16 * 8); /* Temp data buffer used in filtering */
-
-  HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
-  VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
-
-  if (xoffset && !yoffset) {
-    vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
-  }
-  /* Hfilter is null. Second pass only */
-  else if (!xoffset && yoffset) {
-    vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
-  } else {
-    if (yoffset & 0x1) {
-      vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
-      vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
-    } else {
-      vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
-      vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
-    }
-  }
-}
-
-
-void vp9_sixtap_predict16x16_armv6
-(
-  unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int  dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-  DECLARE_ALIGNED_ARRAY(4, short, FData, 24 * 16);  /* Temp data buffer used in filtering */
-
-  HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
-  VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
-
-  if (xoffset && !yoffset) {
-    vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
-  }
-  /* Hfilter is null. Second pass only */
-  else if (!xoffset && yoffset) {
-    vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
-  } else {
-    if (yoffset & 0x1) {
-      vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
-      vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
-    } else {
-      vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
-      vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
-    }
-  }
-
-}
-#endif
--- a/vp8/common/arm/idct_arm.h
+++ /dev/null
@@ -1,65 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef IDCT_ARM_H
-#define IDCT_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_idct(vp9_short_idct4x4llm_1_v6);
-extern prototype_idct(vp9_short_idct4x4llm_v6_dual);
-extern prototype_idct_scalar_add(vp9_dc_only_idct_add_v6);
-extern prototype_second_order(vp9_short_inv_walsh4x4_1_v6);
-extern prototype_second_order(vp9_short_inv_walsh4x4_v6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_idct_idct1
-#define vp9_idct_idct1 vp9_short_idct4x4llm_1_v6
-
-#undef  vp9_idct_idct16
-#define vp9_idct_idct16 vp9_short_idct4x4llm_v6_dual
-
-#undef  vp9_idct_idct1_scalar_add
-#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_v6
-
-#undef  vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_v6
-
-#undef  vp8_idct_iwalsh16
-#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_v6
-#endif
-#endif
-
-#if HAVE_ARMV7
-extern prototype_idct(vp9_short_idct4x4llm_1_neon);
-extern prototype_idct(vp9_short_idct4x4llm_neon);
-extern prototype_idct_scalar_add(vp9_dc_only_idct_add_neon);
-extern prototype_second_order(vp9_short_inv_walsh4x4_1_neon);
-extern prototype_second_order(vp9_short_inv_walsh4x4_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_idct_idct1
-#define vp9_idct_idct1 vp9_short_idct4x4llm_1_neon
-
-#undef  vp9_idct_idct16
-#define vp9_idct_idct16 vp9_short_idct4x4llm_neon
-
-#undef  vp9_idct_idct1_scalar_add
-#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_neon
-
-#undef  vp8_idct_iwalsh1
-#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_neon
-
-#undef  vp8_idct_iwalsh16
-#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_neon
-#endif
-#endif
-
-#endif
--- a/vp8/common/arm/loopfilter_arm.c
+++ /dev/null
@@ -1,166 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/onyxc_int.h"
-
-#if HAVE_ARMV6
-extern prototype_loopfilter(vp9_loop_filter_horizontal_edge_armv6);
-extern prototype_loopfilter(vp9_loop_filter_vertical_edge_armv6);
-extern prototype_loopfilter(vp9_mbloop_filter_horizontal_edge_armv6);
-extern prototype_loopfilter(vp9_mbloop_filter_vertical_edge_armv6);
-#endif
-
-#if HAVE_ARMV7
-typedef void loopfilter_y_neon(unsigned char *src, int pitch,
-                               unsigned char blimit, unsigned char limit, unsigned char thresh);
-typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
-                                unsigned char blimit, unsigned char limit, unsigned char thresh,
-                                unsigned char *v);
-
-extern loopfilter_y_neon vp9_loop_filter_horizontal_edge_y_neon;
-extern loopfilter_y_neon vp9_loop_filter_vertical_edge_y_neon;
-extern loopfilter_y_neon vp9_mbloop_filter_horizontal_edge_y_neon;
-extern loopfilter_y_neon vp9_mbloop_filter_vertical_edge_y_neon;
-
-extern loopfilter_uv_neon vp9_loop_filter_horizontal_edge_uv_neon;
-extern loopfilter_uv_neon vp9_loop_filter_vertical_edge_uv_neon;
-extern loopfilter_uv_neon vp9_mbloop_filter_horizontal_edge_uv_neon;
-extern loopfilter_uv_neon vp9_mbloop_filter_vertical_edge_uv_neon;
-#endif
-
-#if HAVE_ARMV6
-/*ARMV6 loopfilter functions*/
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi) {
-  vp9_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi) {
-  vp9_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi) {
-  vp9_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
-                               const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi) {
-  vp9_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
-                               const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
-}
-#endif
-
-#if HAVE_ARMV7
-/* NEON loopfilter functions */
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi) {
-  unsigned char mblim = *lfi->mblim;
-  unsigned char lim = *lfi->lim;
-  unsigned char hev_thr = *lfi->hev_thr;
-  vp9_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
-
-  if (u_ptr)
-    vp9_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi) {
-  unsigned char mblim = *lfi->mblim;
-  unsigned char lim = *lfi->lim;
-  unsigned char hev_thr = *lfi->hev_thr;
-
-  vp9_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
-
-  if (u_ptr)
-    vp9_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi) {
-  unsigned char blim = *lfi->blim;
-  unsigned char lim = *lfi->lim;
-  unsigned char hev_thr = *lfi->hev_thr;
-
-  vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
-  vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
-  vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
-
-  if (u_ptr)
-    vp9_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi) {
-  unsigned char blim = *lfi->blim;
-  unsigned char lim = *lfi->lim;
-  unsigned char hev_thr = *lfi->hev_thr;
-
-  vp9_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
-  vp9_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
-  vp9_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
-
-  if (u_ptr)
-    vp9_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
-}
-#endif
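
Two patterns in these wrappers are worth noting: the NEON variants
dereference lfi->mblim/blim/lim/hev_thr into plain bytes because the NEON
kernels take thresholds by value and broadcast them across vector lanes, and
the B (block) filters visit the three interior edges of the 16x16
macroblock. A sketch of those edge offsets, illustrative only:

    /* Interior edge positions filtered by the B variants: the edge between
     * 4x4 sub-blocks n and n+1 sits at pixel offset 4 * (n + 1), i.e.
     * 4, 8 and 12 (scaled by the stride for horizontal edges). */
    static void b_edge_offsets(int offsets[3]) {
      int n;
      for (n = 0; n < 3; n++)
        offsets[n] = 4 * (n + 1);
    }
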
--- a/vp8/common/arm/loopfilter_arm.h
+++ /dev/null
@@ -1,41 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef LOOPFILTER_ARM_H
-#define LOOPFILTER_ARM_H
-
-#include "vpx_config.h"
-
-#if HAVE_ARMV6
-extern prototype_loopfilter_block(vp9_loop_filter_mbv_armv6);
-extern prototype_loopfilter_block(vp9_loop_filter_bv_armv6);
-extern prototype_loopfilter_block(vp9_loop_filter_mbh_armv6);
-extern prototype_loopfilter_block(vp9_loop_filter_bh_armv6);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_armv6);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_armv6);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_armv6);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_armv6);
-
-#endif /* HAVE_ARMV6 */
-
-#if HAVE_ARMV7
-extern prototype_loopfilter_block(vp9_loop_filter_mbv_neon);
-extern prototype_loopfilter_block(vp9_loop_filter_bv_neon);
-extern prototype_loopfilter_block(vp9_loop_filter_mbh_neon);
-extern prototype_loopfilter_block(vp9_loop_filter_bh_neon);
-extern prototype_simple_loopfilter(vp9_loop_filter_mbvs_neon);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_neon);
-extern prototype_simple_loopfilter(vp9_loop_filter_mbhs_neon);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_neon);
-
-#endif /* HAVE_ARMV7 */
-
-#endif /* LOOPFILTER_ARM_H */
--- a/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
+++ /dev/null
@@ -1,357 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_bilinear_predict16x16_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; r4    unsigned char *dst_ptr,
-; stack(r5) int  dst_pitch
-
-|vp8_bilinear_predict16x16_neon| PROC
-    push            {r4-r5, lr}
-
-    adr             r12, bifilter16_coeff
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_bfilter16x16_only
-
-    add             r2, r12, r2, lsl #3     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-
-    vld1.s32        {d31}, [r2]             ;load first_pass filter
-
-    beq             firstpass_bfilter16x16_only
-
-    sub             sp, sp, #272            ;reserve space on stack for temporary storage
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    mov             lr, sp
-    vld1.u8         {d5, d6, d7}, [r0], r1
-
-    mov             r2, #3                  ;loop counter
-    vld1.u8         {d8, d9, d10}, [r0], r1
-
-    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    vdup.8          d1, d31[4]
-
-;First Pass: output_height lines x output_width columns (17x16)
-filt_blk2d_fp16x16_loop_neon
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d2, d0              ;(src_ptr[0] * vp9_filter[0])
-    vmull.u8        q8, d3, d0
-    vmull.u8        q9, d5, d0
-    vmull.u8        q10, d6, d0
-    vmull.u8        q11, d8, d0
-    vmull.u8        q12, d9, d0
-    vmull.u8        q13, d11, d0
-    vmull.u8        q14, d12, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-    vext.8          d11, d11, d12, #1
-
-    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q9, d5, d1
-    vmlal.u8        q11, d8, d1
-    vmlal.u8        q13, d11, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-    vext.8          d12, d12, d13, #1
-
-    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q10, d6, d1
-    vmlal.u8        q12, d9, d1
-    vmlal.u8        q14, d12, d1
-
-    subs            r2, r2, #1
-
-    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d15, q8, #7
-    vqrshrn.u16    d16, q9, #7
-    vqrshrn.u16    d17, q10, #7
-    vqrshrn.u16    d18, q11, #7
-    vqrshrn.u16    d19, q12, #7
-    vqrshrn.u16    d20, q13, #7
-
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    vqrshrn.u16    d21, q14, #7
-    vld1.u8         {d5, d6, d7}, [r0], r1
-
-    vst1.u8         {d14, d15, d16, d17}, [lr]!     ;store result
-    vld1.u8         {d8, d9, d10}, [r0], r1
-    vst1.u8         {d18, d19, d20, d21}, [lr]!
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    bne             filt_blk2d_fp16x16_loop_neon
-
-;First-pass filtering for the remaining 5 lines
-    vld1.u8         {d14, d15, d16}, [r0], r1
-
-    vmull.u8        q9, d2, d0              ;(src_ptr[0] * vp9_filter[0])
-    vmull.u8        q10, d3, d0
-    vmull.u8        q11, d5, d0
-    vmull.u8        q12, d6, d0
-    vmull.u8        q13, d8, d0
-    vmull.u8        q14, d9, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-
-    vmlal.u8        q9, d2, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q11, d5, d1
-    vmlal.u8        q13, d8, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-
-    vmlal.u8        q10, d3, d1             ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q12, d6, d1
-    vmlal.u8        q14, d9, d1
-
-    vmull.u8        q1, d11, d0
-    vmull.u8        q2, d12, d0
-    vmull.u8        q3, d14, d0
-    vmull.u8        q4, d15, d0
-
-    vext.8          d11, d11, d12, #1       ;construct src_ptr[1]
-    vext.8          d14, d14, d15, #1
-
-    vmlal.u8        q1, d11, d1             ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q3, d14, d1
-
-    vext.8          d12, d12, d13, #1
-    vext.8          d15, d15, d16, #1
-
-    vmlal.u8        q2, d12, d1             ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q4, d15, d1
-
-    vqrshrn.u16    d10, q9, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d11, q10, #7
-    vqrshrn.u16    d12, q11, #7
-    vqrshrn.u16    d13, q12, #7
-    vqrshrn.u16    d14, q13, #7
-    vqrshrn.u16    d15, q14, #7
-    vqrshrn.u16    d16, q1, #7
-    vqrshrn.u16    d17, q2, #7
-    vqrshrn.u16    d18, q3, #7
-    vqrshrn.u16    d19, q4, #7
-
-    vst1.u8         {d10, d11, d12, d13}, [lr]!         ;store result
-    vst1.u8         {d14, d15, d16, d17}, [lr]!
-    vst1.u8         {d18, d19}, [lr]!
-
-;Second pass: 16x16
-;secondpass_filter
-    add             r3, r12, r3, lsl #3
-    sub             lr, lr, #272
-
-    vld1.u32        {d31}, [r3]             ;load second_pass filter
-
-    vld1.u8         {d22, d23}, [lr]!       ;load src data
-
-    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-    mov             r12, #4                 ;loop counter
-
-filt_blk2d_sp16x16_loop_neon
-    vld1.u8         {d24, d25}, [lr]!
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp9_filter[0])
-    vld1.u8         {d26, d27}, [lr]!
-    vmull.u8        q2, d23, d0
-    vld1.u8         {d28, d29}, [lr]!
-    vmull.u8        q3, d24, d0
-    vld1.u8         {d30, d31}, [lr]!
-
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * vp9_filter[1])
-    vmlal.u8        q2, d25, d1
-    vmlal.u8        q3, d26, d1
-    vmlal.u8        q4, d27, d1
-    vmlal.u8        q5, d28, d1
-    vmlal.u8        q6, d29, d1
-    vmlal.u8        q7, d30, d1
-    vmlal.u8        q8, d31, d1
-
-    subs            r12, r12, #1
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-    vqrshrn.u16    d4, q3, #7
-    vqrshrn.u16    d5, q4, #7
-    vqrshrn.u16    d6, q5, #7
-    vqrshrn.u16    d7, q6, #7
-    vqrshrn.u16    d8, q7, #7
-    vqrshrn.u16    d9, q8, #7
-
-    vst1.u8         {d2, d3}, [r4], r5      ;store result
-    vst1.u8         {d4, d5}, [r4], r5
-    vst1.u8         {d6, d7}, [r4], r5
-    vmov            q11, q15
-    vst1.u8         {d8, d9}, [r4], r5
-
-    bne             filt_blk2d_sp16x16_loop_neon
-
-    add             sp, sp, #272
-
-    pop             {r4-r5,pc}
-
-;--------------------
-firstpass_bfilter16x16_only
-    mov             r2, #4                      ;loop counter
-    vdup.8          d0, d31[0]                  ;first_pass filter (d0 d1)
-    vdup.8          d1, d31[4]
-
-;First Pass: output_height lines x output_width columns (16x16)
-filt_blk2d_fpo16x16_loop_neon
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    vld1.u8         {d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10}, [r0], r1
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d2, d0              ;(src_ptr[0] * vp9_filter[0])
-    vmull.u8        q8, d3, d0
-    vmull.u8        q9, d5, d0
-    vmull.u8        q10, d6, d0
-    vmull.u8        q11, d8, d0
-    vmull.u8        q12, d9, d0
-    vmull.u8        q13, d11, d0
-    vmull.u8        q14, d12, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-    vext.8          d11, d11, d12, #1
-
-    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q9, d5, d1
-    vmlal.u8        q11, d8, d1
-    vmlal.u8        q13, d11, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-    vext.8          d12, d12, d13, #1
-
-    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q10, d6, d1
-    vmlal.u8        q12, d9, d1
-    vmlal.u8        q14, d12, d1
-
-    subs            r2, r2, #1
-
-    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d15, q8, #7
-    vqrshrn.u16    d16, q9, #7
-    vqrshrn.u16    d17, q10, #7
-    vqrshrn.u16    d18, q11, #7
-    vqrshrn.u16    d19, q12, #7
-    vqrshrn.u16    d20, q13, #7
-    vst1.u8         {d14, d15}, [r4], r5        ;store result
-    vqrshrn.u16    d21, q14, #7
-
-    vst1.u8         {d16, d17}, [r4], r5
-    vst1.u8         {d18, d19}, [r4], r5
-    vst1.u8         {d20, d21}, [r4], r5
-
-    bne             filt_blk2d_fpo16x16_loop_neon
-    pop             {r4-r5,pc}
-
-;---------------------
-secondpass_bfilter16x16_only
-;Second pass: 16x16
-;secondpass_filter
-    add             r3, r12, r3, lsl #3
-    mov             r12, #4                     ;loop counter
-    vld1.u32        {d31}, [r3]                 ;load second_pass filter
-    vld1.u8         {d22, d23}, [r0], r1        ;load src data
-
-    vdup.8          d0, d31[0]                  ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-
-filt_blk2d_spo16x16_loop_neon
-    vld1.u8         {d24, d25}, [r0], r1
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp9_filter[0])
-    vld1.u8         {d26, d27}, [r0], r1
-    vmull.u8        q2, d23, d0
-    vld1.u8         {d28, d29}, [r0], r1
-    vmull.u8        q3, d24, d0
-    vld1.u8         {d30, d31}, [r0], r1
-
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * vp9_filter[1])
-    vmlal.u8        q2, d25, d1
-    vmlal.u8        q3, d26, d1
-    vmlal.u8        q4, d27, d1
-    vmlal.u8        q5, d28, d1
-    vmlal.u8        q6, d29, d1
-    vmlal.u8        q7, d30, d1
-    vmlal.u8        q8, d31, d1
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-    vqrshrn.u16    d4, q3, #7
-    vqrshrn.u16    d5, q4, #7
-    vqrshrn.u16    d6, q5, #7
-    vqrshrn.u16    d7, q6, #7
-    vqrshrn.u16    d8, q7, #7
-    vqrshrn.u16    d9, q8, #7
-
-    vst1.u8         {d2, d3}, [r4], r5      ;store result
-    subs            r12, r12, #1
-    vst1.u8         {d4, d5}, [r4], r5
-    vmov            q11, q15
-    vst1.u8         {d6, d7}, [r4], r5
-    vst1.u8         {d8, d9}, [r4], r5
-
-    bne             filt_blk2d_spo16x16_loop_neon
-    pop             {r4-r5,pc}
-
-    ENDP
-
-;-----------------
-
-bifilter16_coeff
-    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-    END
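
bifilter16_coeff is indexed with "add r2, r12, r2, lsl #3": each 1/8-pel
offset selects an 8-byte {first, second} tap pair, and every pair sums to
128. A sketch of that relationship:

    /* Tap pair for a 1/8-pel offset (sketch): offset 0 -> {128, 0},
     * offset 5 -> {48, 80}, matching the bifilter16_coeff table above. */
    static void bilinear_taps(int offset, int *first, int *second) {
      *second = offset * 16;
      *first  = 128 - *second;
    }
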
--- a/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
+++ /dev/null
@@ -1,130 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_bilinear_predict4x4_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; r4    unsigned char *dst_ptr,
-; stack(lr) int  dst_pitch
-
-|vp8_bilinear_predict4x4_neon| PROC
-    push            {r4, lr}
-
-    adr             r12, bifilter4_coeff
-    ldr             r4, [sp, #8]            ;load parameters from stack
-    ldr             lr, [sp, #12]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (5x4)
-    vld1.u8         {d2}, [r0], r1          ;load src data
-    add             r2, r12, r2, lsl #3     ;calculate Hfilter location (2 coeffs x 4 bytes = 8 bytes)
-
-    vld1.u8         {d3}, [r0], r1
-    vld1.u32        {d31}, [r2]             ;first_pass filter
-
-    vld1.u8         {d4}, [r0], r1
-    vdup.8          d0, d31[0]              ;first_pass filter (d0-d1)
-    vld1.u8         {d5}, [r0], r1
-    vdup.8          d1, d31[4]
-    vld1.u8         {d6}, [r0], r1
-
-    vshr.u64        q4, q1, #8              ;construct src_ptr[1]
-    vshr.u64        q5, q2, #8
-    vshr.u64        d12, d6, #8
-
-    vzip.32         d2, d3                  ;put 2-line data in 1 register (src_ptr[0])
-    vzip.32         d4, d5
-    vzip.32         d8, d9                  ;put 2-line data in 1 register (src_ptr[1])
-    vzip.32         d10, d11
-
-    vmull.u8        q7, d2, d0              ;(src_ptr[0] * vp9_filter[0])
-    vmull.u8        q8, d4, d0
-    vmull.u8        q9, d6, d0
-
-    vmlal.u8        q7, d8, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q8, d10, d1
-    vmlal.u8        q9, d12, d1
-
-    vqrshrn.u16    d28, q7, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d29, q8, #7
-    vqrshrn.u16    d30, q9, #7
-
-;Second pass: 4x4
-secondpass_filter
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-    beq             skip_secondpass_filter
-
-    add             r3, r12, r3, lsl #3 ;calculate Vfilter location
-    vld1.u32        {d31}, [r3]         ;load second_pass filter
-
-    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0-d5)
-    vdup.8          d1, d31[4]
-
-    vmull.u8        q1, d28, d0
-    vmull.u8        q2, d29, d0
-
-    vext.8          d26, d28, d29, #4       ;construct src_ptr[pixel_step]
-    vext.8          d27, d29, d30, #4
-
-    vmlal.u8        q1, d26, d1
-    vmlal.u8        q2, d27, d1
-
-    add             r0, r4, lr
-    add             r1, r0, lr
-    add             r2, r1, lr
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-
-    vst1.32         {d2[0]}, [r4]           ;store result
-    vst1.32         {d2[1]}, [r0]
-    vst1.32         {d3[0]}, [r1]
-    vst1.32         {d3[1]}, [r2]
-
-    pop             {r4, pc}
-
-;--------------------
-skip_firstpass_filter
-
-    vld1.32         {d28[0]}, [r0], r1      ;load src data
-    vld1.32         {d28[1]}, [r0], r1
-    vld1.32         {d29[0]}, [r0], r1
-    vld1.32         {d29[1]}, [r0], r1
-    vld1.32         {d30[0]}, [r0], r1
-
-    b               secondpass_filter
-
-;---------------------
-skip_secondpass_filter
-    vst1.32         {d28[0]}, [r4], lr      ;store result
-    vst1.32         {d28[1]}, [r4], lr
-    vst1.32         {d29[0]}, [r4], lr
-    vst1.32         {d29[1]}, [r4], lr
-
-    pop             {r4, pc}
-
-    ENDP
-
-;-----------------
-
-bifilter4_coeff
-    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-    END
--- a/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
+++ /dev/null
@@ -1,135 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_bilinear_predict8x4_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; r4    unsigned char *dst_ptr,
-; stack(lr) int  dst_pitch
-
-|vp8_bilinear_predict8x4_neon| PROC
-    push            {r4, lr}
-
-    adr             r12, bifilter8x4_coeff
-    ldr             r4, [sp, #8]            ;load parameters from stack
-    ldr             lr, [sp, #12]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (5x8)
-    add             r2, r12, r2, lsl #3     ;calculate filter location
-
-    vld1.u8         {q1}, [r0], r1          ;load src data
-    vld1.u32        {d31}, [r2]             ;load first_pass filter
-    vld1.u8         {q2}, [r0], r1
-    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
-    vld1.u8         {q3}, [r0], r1
-    vdup.8          d1, d31[4]
-    vld1.u8         {q4}, [r0], r1
-
-    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp9_filter[0])
-    vld1.u8         {q5}, [r0], r1
-    vmull.u8        q7, d4, d0
-    vmull.u8        q8, d6, d0
-    vmull.u8        q9, d8, d0
-    vmull.u8        q10, d10, d0
-
-    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d4, d5, #1
-    vext.8          d7, d6, d7, #1
-    vext.8          d9, d8, d9, #1
-    vext.8          d11, d10, d11, #1
-
-    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q7, d5, d1
-    vmlal.u8        q8, d7, d1
-    vmlal.u8        q9, d9, d1
-    vmlal.u8        q10, d11, d1
-
-    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d23, q7, #7
-    vqrshrn.u16    d24, q8, #7
-    vqrshrn.u16    d25, q9, #7
-    vqrshrn.u16    d26, q10, #7
-
-;Second pass: 4x8
-secondpass_filter
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-    beq             skip_secondpass_filter
-
-    add             r3, r12, r3, lsl #3
-    add             r0, r4, lr
-
-    vld1.u32        {d31}, [r3]             ;load second_pass filter
-    add             r1, r0, lr
-
-    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp9_filter[0])
-    vmull.u8        q2, d23, d0
-    vmull.u8        q3, d24, d0
-    vmull.u8        q4, d25, d0
-
-    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * vp9_filter[1])
-    vmlal.u8        q2, d24, d1
-    vmlal.u8        q3, d25, d1
-    vmlal.u8        q4, d26, d1
-
-    add             r2, r1, lr
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-    vqrshrn.u16    d4, q3, #7
-    vqrshrn.u16    d5, q4, #7
-
-    vst1.u8         {d2}, [r4]              ;store result
-    vst1.u8         {d3}, [r0]
-    vst1.u8         {d4}, [r1]
-    vst1.u8         {d5}, [r2]
-
-    pop             {r4, pc}
-
-;--------------------
-skip_firstpass_filter
-    vld1.u8         {d22}, [r0], r1         ;load src data
-    vld1.u8         {d23}, [r0], r1
-    vld1.u8         {d24}, [r0], r1
-    vld1.u8         {d25}, [r0], r1
-    vld1.u8         {d26}, [r0], r1
-
-    b               secondpass_filter
-
-;---------------------
-skip_secondpass_filter
-    vst1.u8         {d22}, [r4], lr         ;store result
-    vst1.u8         {d23}, [r4], lr
-    vst1.u8         {d24}, [r4], lr
-    vst1.u8         {d25}, [r4], lr
-
-    pop             {r4, pc}
-
-    ENDP
-
-;-----------------
-
-bifilter8x4_coeff
-    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-    END
--- a/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
+++ /dev/null
@@ -1,183 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_bilinear_predict8x8_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; r4    unsigned char *dst_ptr,
-; stack(lr) int  dst_pitch
-
-|vp8_bilinear_predict8x8_neon| PROC
-    push            {r4, lr}
-
-    adr             r12, bifilter8_coeff
-    ldr             r4, [sp, #8]            ;load parameters from stack
-    ldr             lr, [sp, #12]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (9x8)
-    add             r2, r12, r2, lsl #3     ;calculate filter location
-
-    vld1.u8         {q1}, [r0], r1          ;load src data
-    vld1.u32        {d31}, [r2]             ;load first_pass filter
-    vld1.u8         {q2}, [r0], r1
-    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
-    vld1.u8         {q3}, [r0], r1
-    vdup.8          d1, d31[4]
-    vld1.u8         {q4}, [r0], r1
-
-    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp9_filter[0])
-    vmull.u8        q7, d4, d0
-    vmull.u8        q8, d6, d0
-    vmull.u8        q9, d8, d0
-
-    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d4, d5, #1
-    vext.8          d7, d6, d7, #1
-    vext.8          d9, d8, d9, #1
-
-    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q7, d5, d1
-    vmlal.u8        q8, d7, d1
-    vmlal.u8        q9, d9, d1
-
-    vld1.u8         {q1}, [r0], r1          ;load src data
-    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
-    vld1.u8         {q2}, [r0], r1
-    vqrshrn.u16    d23, q7, #7
-    vld1.u8         {q3}, [r0], r1
-    vqrshrn.u16    d24, q8, #7
-    vld1.u8         {q4}, [r0], r1
-    vqrshrn.u16    d25, q9, #7
-
-    ;first_pass filtering on the remaining 5 lines of data
-    vld1.u8         {q5}, [r0], r1
-
-    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp9_filter[0])
-    vmull.u8        q7, d4, d0
-    vmull.u8        q8, d6, d0
-    vmull.u8        q9, d8, d0
-    vmull.u8        q10, d10, d0
-
-    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d4, d5, #1
-    vext.8          d7, d6, d7, #1
-    vext.8          d9, d8, d9, #1
-    vext.8          d11, d10, d11, #1
-
-    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp9_filter[1])
-    vmlal.u8        q7, d5, d1
-    vmlal.u8        q8, d7, d1
-    vmlal.u8        q9, d9, d1
-    vmlal.u8        q10, d11, d1
-
-    vqrshrn.u16    d26, q6, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d27, q7, #7
-    vqrshrn.u16    d28, q8, #7
-    vqrshrn.u16    d29, q9, #7
-    vqrshrn.u16    d30, q10, #7
-
-;Second pass: 8x8
-secondpass_filter
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-    beq             skip_secondpass_filter
-
-    add             r3, r12, r3, lsl #3
-    add             r0, r4, lr
-
-    vld1.u32        {d31}, [r3]             ;load second_pass filter
-    add             r1, r0, lr
-
-    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp9_filter[0])
-    vmull.u8        q2, d23, d0
-    vmull.u8        q3, d24, d0
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * vp9_filter[1])
-    vmlal.u8        q2, d24, d1
-    vmlal.u8        q3, d25, d1
-    vmlal.u8        q4, d26, d1
-    vmlal.u8        q5, d27, d1
-    vmlal.u8        q6, d28, d1
-    vmlal.u8        q7, d29, d1
-    vmlal.u8        q8, d30, d1
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-    vqrshrn.u16    d4, q3, #7
-    vqrshrn.u16    d5, q4, #7
-    vqrshrn.u16    d6, q5, #7
-    vqrshrn.u16    d7, q6, #7
-    vqrshrn.u16    d8, q7, #7
-    vqrshrn.u16    d9, q8, #7
-
-    vst1.u8         {d2}, [r4]              ;store result
-    vst1.u8         {d3}, [r0]
-    vst1.u8         {d4}, [r1], lr
-    vst1.u8         {d5}, [r1], lr
-    vst1.u8         {d6}, [r1], lr
-    vst1.u8         {d7}, [r1], lr
-    vst1.u8         {d8}, [r1], lr
-    vst1.u8         {d9}, [r1], lr
-
-    pop             {r4, pc}
-
-;--------------------
-skip_firstpass_filter
-    vld1.u8         {d22}, [r0], r1         ;load src data
-    vld1.u8         {d23}, [r0], r1
-    vld1.u8         {d24}, [r0], r1
-    vld1.u8         {d25}, [r0], r1
-    vld1.u8         {d26}, [r0], r1
-    vld1.u8         {d27}, [r0], r1
-    vld1.u8         {d28}, [r0], r1
-    vld1.u8         {d29}, [r0], r1
-    vld1.u8         {d30}, [r0], r1
-
-    b               secondpass_filter
-
-;---------------------
-skip_secondpass_filter
-    vst1.u8         {d22}, [r4], lr         ;store result
-    vst1.u8         {d23}, [r4], lr
-    vst1.u8         {d24}, [r4], lr
-    vst1.u8         {d25}, [r4], lr
-    vst1.u8         {d26}, [r4], lr
-    vst1.u8         {d27}, [r4], lr
-    vst1.u8         {d28}, [r4], lr
-    vst1.u8         {d29}, [r4], lr
-
-    pop             {r4, pc}
-
-    ENDP
-
-;-----------------
-
-bifilter8_coeff
-    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-    END
--- a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
+++ /dev/null
@@ -1,584 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_build_intra_predictors_mby_neon_func|
-    EXPORT  |vp8_build_intra_predictors_mby_s_neon_func|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char *y_buffer
-; r1    unsigned char *ypred_ptr
-; r2    int y_stride
-; r3    int mode
-; stack int Up
-; stack int Left
-
-|vp8_build_intra_predictors_mby_neon_func| PROC
-    push            {r4-r8, lr}
-
-    cmp             r3, #0
-    beq             case_dc_pred
-    cmp             r3, #1
-    beq             case_v_pred
-    cmp             r3, #2
-    beq             case_h_pred
-    cmp             r3, #3
-    beq             case_tm_pred
-
-case_dc_pred
-    ldr             r4, [sp, #24]       ; Up
-    ldr             r5, [sp, #28]       ; Left
-
-    ; Default the DC average to 128
-    mov             r12, #128
-    vdup.u8         q0, r12
-
-    ; Zero out running sum
-    mov             r12, #0
-
-    ; compute shift and jump
-    adds            r7, r4, r5
-    beq             skip_dc_pred_up_left
-
-    ; Load above row, if it exists
-    cmp             r4, #0
-    beq             skip_dc_pred_up
-
-    sub             r6, r0, r2
-    vld1.8          {q1}, [r6]
-    vpaddl.u8       q2, q1
-    vpaddl.u16      q3, q2
-    vpaddl.u32      q4, q3
-
-    vmov.32         r4, d8[0]
-    vmov.32         r6, d9[0]
-
-    add             r12, r4, r6
-
-    ; Move back to integer registers
-
-skip_dc_pred_up
-
-    cmp             r5, #0
-    beq             skip_dc_pred_left
-
-    sub             r0, r0, #1
-
-    ; Load left row, if it exists
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0]
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-skip_dc_pred_left
-    add             r7, r7, #3          ; Shift
-    sub             r4, r7, #1
-    mov             r5, #1
-    add             r12, r12, r5, lsl r4
-    mov             r5, r12, lsr r7     ; expected_dc
-
-    vdup.u8         q0, r5
-
-skip_dc_pred_up_left
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-
-    pop             {r4-r8,pc}
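
The expected_dc arithmetic just above rounds the running sum to the average
of the available border pixels: the shift is 3 plus one per available edge
(Up and Left appear to be 0/1 flags, judging by the shift computation),
giving divisors of 16 or 32. A sketch under that assumption:

    /* Rounded DC average (sketch; assumes up/left are 0/1 availability
     * flags, as case_dc_pred's shift computation suggests). */
    static int expected_dc(int sum, int up, int left) {
      int shift = 3 + up + left;  /* 16 border samples per available edge */
      return (sum + (1 << (shift - 1))) >> shift;
    }
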
-case_v_pred
-    ; Copy down above row
-    sub             r6, r0, r2
-    vld1.8          {q0}, [r6]
-
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q0}, [r1]!
-    pop             {r4-r8,pc}
-
-case_h_pred
-    ; Load 4x yleft_col
-    sub             r0, r0, #1
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-    pop             {r4-r8,pc}
-
-case_tm_pred
-    ; Load yabove_row
-    sub             r3, r0, r2
-    vld1.8          {q8}, [r3]
-
-    ; Load ytop_left
-    sub             r3, r3, #1
-    ldrb            r7, [r3]
-
-    vdup.u16        q7, r7
-
-    ; Compute yabove_row - ytop_left
-    mov             r3, #1
-    vdup.u8         q0, r3
-
-    vmull.u8        q4, d16, d0
-    vmull.u8        q5, d17, d0
-
-    vsub.s16        q4, q4, q7
-    vsub.s16        q5, q5, q7
-
-    ; Load 4x yleft_col
-    sub             r0, r0, #1
-    mov             r12, #4
-
-case_tm_pred_loop
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u16        q0, r3
-    vdup.u16        q1, r4
-    vdup.u16        q2, r5
-    vdup.u16        q3, r6
-
-    vqadd.s16       q8, q0, q4
-    vqadd.s16       q9, q0, q5
-
-    vqadd.s16       q10, q1, q4
-    vqadd.s16       q11, q1, q5
-
-    vqadd.s16       q12, q2, q4
-    vqadd.s16       q13, q2, q5
-
-    vqadd.s16       q14, q3, q4
-    vqadd.s16       q15, q3, q5
-
-    vqshrun.s16     d0, q8, #0
-    vqshrun.s16     d1, q9, #0
-
-    vqshrun.s16     d2, q10, #0
-    vqshrun.s16     d3, q11, #0
-
-    vqshrun.s16     d4, q12, #0
-    vqshrun.s16     d5, q13, #0
-
-    vqshrun.s16     d6, q14, #0
-    vqshrun.s16     d7, q15, #0
-
-    vst1.u8         {q0}, [r1]!
-    vst1.u8         {q1}, [r1]!
-    vst1.u8         {q2}, [r1]!
-    vst1.u8         {q3}, [r1]!
-
-    subs            r12, r12, #1
-    bne             case_tm_pred_loop
-
-    pop             {r4-r8,pc}
-
-    ENDP
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; r0    unsigned char *y_buffer
-; r1    unsigned char *ypred_ptr
-; r2    int y_stride
-; r3    int mode
-; stack int Up
-; stack int Left
-
-|vp8_build_intra_predictors_mby_s_neon_func| PROC
-    push            {r4-r8, lr}
-
-    mov             r1, r0      ;   unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
-
-    cmp             r3, #0
-    beq             case_dc_pred_s
-    cmp             r3, #1
-    beq             case_v_pred_s
-    cmp             r3, #2
-    beq             case_h_pred_s
-    cmp             r3, #3
-    beq             case_tm_pred_s
-
-case_dc_pred_s
-    ldr             r4, [sp, #24]       ; Up
-    ldr             r5, [sp, #28]       ; Left
-
-    ; Default the DC average to 128
-    mov             r12, #128
-    vdup.u8         q0, r12
-
-    ; Zero out running sum
-    mov             r12, #0
-
-    ; compute shift and jump
-    adds            r7, r4, r5
-    beq             skip_dc_pred_up_left_s
-
-    ; Load above row, if it exists
-    cmp             r4, #0
-    beq             skip_dc_pred_up_s
-
-    sub             r6, r0, r2
-    vld1.8          {q1}, [r6]
-    vpaddl.u8       q2, q1
-    vpaddl.u16      q3, q2
-    vpaddl.u32      q4, q3
-
-    vmov.32         r4, d8[0]
-    vmov.32         r6, d9[0]
-
-    add             r12, r4, r6
-
-    ; Move back to integer registers
-
-skip_dc_pred_up_s
-
-    cmp             r5, #0
-    beq             skip_dc_pred_left_s
-
-    sub             r0, r0, #1
-
-    ; Load left column, if it exists
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0]
-
-    add             r12, r12, r3
-    add             r12, r12, r4
-    add             r12, r12, r5
-    add             r12, r12, r6
-
-skip_dc_pred_left_s
-    add             r7, r7, #3          ; Shift
-    sub             r4, r7, #1
-    mov             r5, #1
-    add             r12, r12, r5, lsl r4
-    mov             r5, r12, lsr r7     ; expected_dc
-
-    vdup.u8         q0, r5
-
-skip_dc_pred_up_left_s
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-
-    pop             {r4-r8,pc}
-case_v_pred_s
-    ; Copy down above row
-    sub             r6, r0, r2
-    vld1.8          {q0}, [r6]
-
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q0}, [r1], r2
-    pop             {r4-r8,pc}
-
-case_h_pred_s
-    ; Load 4x yleft_col
-    sub             r0, r0, #1
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u8         q0, r3
-    vdup.u8         q1, r4
-    vdup.u8         q2, r5
-    vdup.u8         q3, r6
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-    pop             {r4-r8,pc}
-
-case_tm_pred_s
-    ; Load yabove_row
-    sub             r3, r0, r2
-    vld1.8          {q8}, [r3]
-
-    ; Load ytop_left
-    sub             r3, r3, #1
-    ldrb            r7, [r3]
-
-    vdup.u16        q7, r7
-
-    ; Compute yabove_row - ytop_left
-    mov             r3, #1
-    vdup.u8         q0, r3
-
-    vmull.u8        q4, d16, d0
-    vmull.u8        q5, d17, d0
-
-    vsub.s16        q4, q4, q7
-    vsub.s16        q5, q5, q7
-
-    ; Load 4x yleft_col
-    sub             r0, r0, #1
-    mov             r12, #4
-
-case_tm_pred_loop_s
-    ldrb            r3, [r0], r2
-    ldrb            r4, [r0], r2
-    ldrb            r5, [r0], r2
-    ldrb            r6, [r0], r2
-    vdup.u16        q0, r3
-    vdup.u16        q1, r4
-    vdup.u16        q2, r5
-    vdup.u16        q3, r6
-
-    vqadd.s16       q8, q0, q4
-    vqadd.s16       q9, q0, q5
-
-    vqadd.s16       q10, q1, q4
-    vqadd.s16       q11, q1, q5
-
-    vqadd.s16       q12, q2, q4
-    vqadd.s16       q13, q2, q5
-
-    vqadd.s16       q14, q3, q4
-    vqadd.s16       q15, q3, q5
-
-    vqshrun.s16     d0, q8, #0
-    vqshrun.s16     d1, q9, #0
-
-    vqshrun.s16     d2, q10, #0
-    vqshrun.s16     d3, q11, #0
-
-    vqshrun.s16     d4, q12, #0
-    vqshrun.s16     d5, q13, #0
-
-    vqshrun.s16     d6, q14, #0
-    vqshrun.s16     d7, q15, #0
-
-    vst1.u8         {q0}, [r1], r2
-    vst1.u8         {q1}, [r1], r2
-    vst1.u8         {q2}, [r1], r2
-    vst1.u8         {q3}, [r1], r2
-
-    subs            r12, r12, #1
-    bne             case_tm_pred_loop_s
-
-    pop             {r4-r8,pc}
-
-    ENDP
-
-
-    END
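
Before moving on: the DC case in the predictors above reduces to a rounded average of whichever neighbors exist, splatted with vdup. A minimal C sketch, assuming the Up/Left flags are 0 or 1 (the function name and signature are illustrative, not from the source):

    /* DC predictor for a 16x16 block: sum the available neighbors and
     * round-to-nearest with a shift of 3 + up + left, i.e. divide by
     * 16 or 32 depending on how many edges contribute. */
    static int expected_dc_16x16(const unsigned char *above,
                                 const unsigned char *left, int stride,
                                 int up, int left_avail)
    {
        int i, sum = 0;
        int shift = 3 + up + left_avail;

        if (!up && !left_avail)
            return 128;                 /* default when no neighbors exist */
        if (up)
            for (i = 0; i < 16; i++)
                sum += above[i];
        if (left_avail)
            for (i = 0; i < 16; i++)
                sum += left[i * stride];
        return (sum + (1 << (shift - 1))) >> shift;
    }
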
--- a/vp8/common/arm/neon/copymem16x16_neon.asm
+++ /dev/null
@@ -1,59 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_copy_mem16x16_neon|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem16x16_neon| PROC
-
-    vld1.u8     {q0}, [r0], r1
-    vld1.u8     {q1}, [r0], r1
-    vld1.u8     {q2}, [r0], r1
-    vst1.u8     {q0}, [r2], r3
-    vld1.u8     {q3}, [r0], r1
-    vst1.u8     {q1}, [r2], r3
-    vld1.u8     {q4}, [r0], r1
-    vst1.u8     {q2}, [r2], r3
-    vld1.u8     {q5}, [r0], r1
-    vst1.u8     {q3}, [r2], r3
-    vld1.u8     {q6}, [r0], r1
-    vst1.u8     {q4}, [r2], r3
-    vld1.u8     {q7}, [r0], r1
-    vst1.u8     {q5}, [r2], r3
-    vld1.u8     {q8}, [r0], r1
-    vst1.u8     {q6}, [r2], r3
-    vld1.u8     {q9}, [r0], r1
-    vst1.u8     {q7}, [r2], r3
-    vld1.u8     {q10}, [r0], r1
-    vst1.u8     {q8}, [r2], r3
-    vld1.u8     {q11}, [r0], r1
-    vst1.u8     {q9}, [r2], r3
-    vld1.u8     {q12}, [r0], r1
-    vst1.u8     {q10}, [r2], r3
-    vld1.u8     {q13}, [r0], r1
-    vst1.u8     {q11}, [r2], r3
-    vld1.u8     {q14}, [r0], r1
-    vst1.u8     {q12}, [r2], r3
-    vld1.u8     {q15}, [r0], r1
-    vst1.u8     {q13}, [r2], r3
-    vst1.u8     {q14}, [r2], r3
-    vst1.u8     {q15}, [r2], r3
-
-    mov     pc, lr
-
-    ENDP  ; |vp9_copy_mem16x16_neon|
-
-    END
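
The routine above is a straight 16x16 block move; the NEON version interleaves its loads and stores across sixteen q registers to hide memory latency. A scalar equivalent (the name is illustrative) is just a strided row copy; the 8x4 and 8x8 variants that follow are the same pattern with 8-byte rows:

    #include <string.h>

    /* Row-by-row equivalent of vp9_copy_mem16x16_neon. */
    static void copy_mem16x16_c(const unsigned char *src, int src_stride,
                                unsigned char *dst, int dst_stride)
    {
        int r;
        for (r = 0; r < 16; r++) {
            memcpy(dst, src, 16);
            src += src_stride;
            dst += dst_stride;
        }
    }
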
--- a/vp8/common/arm/neon/copymem8x4_neon.asm
+++ /dev/null
@@ -1,34 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_copy_mem8x4_neon|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem8x4_neon| PROC
-    vld1.u8     {d0}, [r0], r1
-    vld1.u8     {d1}, [r0], r1
-    vst1.u8     {d0}, [r2], r3
-    vld1.u8     {d2}, [r0], r1
-    vst1.u8     {d1}, [r2], r3
-    vld1.u8     {d3}, [r0], r1
-    vst1.u8     {d2}, [r2], r3
-    vst1.u8     {d3}, [r2], r3
-
-    mov     pc, lr
-
-    ENDP  ; |vp9_copy_mem8x4_neon|
-
-    END
--- a/vp8/common/arm/neon/copymem8x8_neon.asm
+++ /dev/null
@@ -1,43 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_copy_mem8x8_neon|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp9_copy_mem8x8_neon| PROC
-
-    vld1.u8     {d0}, [r0], r1
-    vld1.u8     {d1}, [r0], r1
-    vst1.u8     {d0}, [r2], r3
-    vld1.u8     {d2}, [r0], r1
-    vst1.u8     {d1}, [r2], r3
-    vld1.u8     {d3}, [r0], r1
-    vst1.u8     {d2}, [r2], r3
-    vld1.u8     {d4}, [r0], r1
-    vst1.u8     {d3}, [r2], r3
-    vld1.u8     {d5}, [r0], r1
-    vst1.u8     {d4}, [r2], r3
-    vld1.u8     {d6}, [r0], r1
-    vst1.u8     {d5}, [r2], r3
-    vld1.u8     {d7}, [r0], r1
-    vst1.u8     {d6}, [r2], r3
-    vst1.u8     {d7}, [r2], r3
-
-    mov     pc, lr
-
-    ENDP  ; |vp9_copy_mem8x8_neon|
-
-    END
--- a/vp8/common/arm/neon/dc_only_idct_add_neon.asm
+++ /dev/null
@@ -1,49 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dc_only_idct_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
-;                               unsigned char *dst_ptr, int pitch, int stride)
-; r0  input_dc
-; r1  pred_ptr
-; r2  dst_ptr
-; r3  pitch
-; sp  stride
-|vp8_dc_only_idct_add_neon| PROC
-    add             r0, r0, #4
-    asr             r0, r0, #3
-    ldr             r12, [sp]
-    vdup.16         q0, r0
-
-    vld1.32         {d2[0]}, [r1], r3
-    vld1.32         {d2[1]}, [r1], r3
-    vld1.32         {d4[0]}, [r1], r3
-    vld1.32         {d4[1]}, [r1]
-
-    vaddw.u8        q1, q0, d2
-    vaddw.u8        q2, q0, d4
-
-    vqmovun.s16     d2, q1
-    vqmovun.s16     d4, q2
-
-    vst1.32         {d2[0]}, [r2], r12
-    vst1.32         {d2[1]}, [r2], r12
-    vst1.32         {d4[0]}, [r2], r12
-    vst1.32         {d4[1]}, [r2]
-
-    bx             lr
-
-    ENDP
-    END
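
For orientation, the routine above biases and shifts the DC coefficient, adds it to the 4x4 prediction, and saturates back to 8 bits (the vqmovun.s16 step). A scalar sketch with an illustrative name:

    /* dc = (input_dc + 4) >> 3, added to each predicted pixel and
     * clamped to [0, 255], mirroring vaddw.u8 + vqmovun.s16 above. */
    static void dc_only_idct_add_c(short input_dc, const unsigned char *pred,
                                   unsigned char *dst, int pitch, int stride)
    {
        int dc = (input_dc + 4) >> 3;
        int r, c;

        for (r = 0; r < 4; r++) {
            for (c = 0; c < 4; c++) {
                int v = pred[c] + dc;
                dst[c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }
            pred += pitch;
            dst += stride;
        }
    }
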
--- a/vp8/common/arm/neon/iwalsh_neon.asm
+++ /dev/null
@@ -1,80 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-    EXPORT  |vp8_short_inv_walsh4x4_neon|
-    EXPORT  |vp8_short_inv_walsh4x4_1_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
-|vp8_short_inv_walsh4x4_neon| PROC
-
-    ; read in all four lines of values: d0->d3
-    vld1.i16 {q0-q1}, [r0@128]
-
-    ; first for loop
-    vadd.s16 d4, d0, d3 ;a = [0] + [12]
-    vadd.s16 d6, d1, d2 ;b = [4] + [8]
-    vsub.s16 d5, d0, d3 ;d = [0] - [12]
-    vsub.s16 d7, d1, d2 ;c = [4] - [8]
-
-    vadd.s16 q0, q2, q3 ; a+b d+c
-    vsub.s16 q1, q2, q3 ; a-b d-c
-
-    vtrn.32 d0, d2 ;d0:  0  1  8  9
-                   ;d2:  2  3 10 11
-    vtrn.32 d1, d3 ;d1:  4  5 12 13
-                   ;d3:  6  7 14 15
-
-    vtrn.16 d0, d1 ;d0:  0  4  8 12
-                   ;d1:  1  5  9 13
-    vtrn.16 d2, d3 ;d2:  2  6 10 14
-                   ;d3:  3  7 11 15
-
-    ; second for loop
-
-    vadd.s16 d4, d0, d3 ;a = [0] + [3]
-    vadd.s16 d6, d1, d2 ;b = [1] + [2]
-    vsub.s16 d5, d0, d3 ;d = [0] - [3]
-    vsub.s16 d7, d1, d2 ;c = [1] - [2]
-
-    vmov.i16 q8, #3
-
-    vadd.s16 q0, q2, q3 ; a+b d+c
-    vsub.s16 q1, q2, q3 ; a-b d-c
-
-    vadd.i16 q0, q0, q8 ;e/f += 3
-    vadd.i16 q1, q1, q8 ;g/h += 3
-
-    vshr.s16 q0, q0, #3 ;e/f >> 3
-    vshr.s16 q1, q1, #3 ;g/h >> 3
-
-    vst4.i16 {d0,d1,d2,d3}, [r1@128]
-
-    bx lr
-    ENDP    ; |vp8_short_inv_walsh4x4_neon|
-
-
-;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
-|vp8_short_inv_walsh4x4_1_neon| PROC
-    ldrsh r2, [r0]          ; load input[0]
-    add r3, r2, #3          ; add 3
-    add r2, r1, #16         ; base for last 8 output
-    asr r0, r3, #3          ; right shift 3
-    vdup.16 q0, r0          ; load and duplicate
-    vst1.16 {q0}, [r1@128]  ; write back 8
-    vst1.16 {q0}, [r2@128]  ; write back last 8
-    bx lr
-    ENDP    ; |vp8_short_inv_walsh4x4_1_neon|
-
-    END
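
The two passes above (down the columns, then across the rows, with the +3 bias and >>3 on output) correspond to the following scalar form, reconstructed from the inline comments; treat it as a sketch rather than the canonical reference:

    static void short_inv_walsh4x4_c(short *input, short *output)
    {
        int i, a1, b1, c1, d1, a2, b2, c2, d2;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++) {       /* first pass: down the columns */
            a1 = ip[0] + ip[12];
            b1 = ip[4] + ip[8];
            c1 = ip[4] - ip[8];
            d1 = ip[0] - ip[12];

            op[0]  = a1 + b1;
            op[4]  = c1 + d1;
            op[8]  = a1 - b1;
            op[12] = d1 - c1;
            ip++;
            op++;
        }

        ip = output;
        op = output;
        for (i = 0; i < 4; i++) {       /* second pass: across the rows */
            a1 = ip[0] + ip[3];
            b1 = ip[1] + ip[2];
            c1 = ip[1] - ip[2];
            d1 = ip[0] - ip[3];

            a2 = a1 + b1;
            b2 = c1 + d1;
            c2 = a1 - b1;
            d2 = d1 - c1;

            op[0] = (a2 + 3) >> 3;      /* e/f/g/h += 3, then >> 3 */
            op[1] = (b2 + 3) >> 3;
            op[2] = (c2 + 3) >> 3;
            op[3] = (d2 + 3) >> 3;
            ip += 4;
            op += 4;
        }
    }
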
--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ /dev/null
@@ -1,397 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_loop_filter_horizontal_edge_y_neon|
-    EXPORT  |vp9_loop_filter_horizontal_edge_uv_neon|
-    EXPORT  |vp9_loop_filter_vertical_edge_y_neon|
-    EXPORT  |vp9_loop_filter_vertical_edge_uv_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src
-; r1    int pitch
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-|vp9_loop_filter_horizontal_edge_y_neon| PROC
-    push        {lr}
-    vdup.u8     q0, r2                     ; duplicate blimit
-    vdup.u8     q1, r3                     ; duplicate limit
-    sub         r2, r0, r1, lsl #2         ; move src pointer down by 4 lines
-    ldr         r3, [sp, #4]               ; load thresh
-    add         r12, r2, r1
-    add         r1, r1, r1
-
-    vdup.u8     q2, r3                     ; duplicate thresh
-
-    vld1.u8     {q3}, [r2@128], r1              ; p3
-    vld1.u8     {q4}, [r12@128], r1             ; p2
-    vld1.u8     {q5}, [r2@128], r1              ; p1
-    vld1.u8     {q6}, [r12@128], r1             ; p0
-    vld1.u8     {q7}, [r2@128], r1              ; q0
-    vld1.u8     {q8}, [r12@128], r1             ; q1
-    vld1.u8     {q9}, [r2@128]                  ; q2
-    vld1.u8     {q10}, [r12@128]                ; q3
-
-    sub         r2, r2, r1, lsl #1
-    sub         r12, r12, r1, lsl #1
-
-    bl          vp9_loop_filter_neon
-
-    vst1.u8     {q5}, [r2@128], r1              ; store op1
-    vst1.u8     {q6}, [r12@128], r1             ; store op0
-    vst1.u8     {q7}, [r2@128], r1              ; store oq0
-    vst1.u8     {q8}, [r12@128], r1             ; store oq1
-
-    pop         {pc}
-    ENDP        ; |vp9_loop_filter_horizontal_edge_y_neon|
-
-
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-; sp+4  unsigned char *v
-|vp9_loop_filter_horizontal_edge_uv_neon| PROC
-    push        {lr}
-    vdup.u8     q0, r2                      ; duplicate blimit
-    vdup.u8     q1, r3                      ; duplicate limit
-    ldr         r12, [sp, #4]               ; load thresh
-    ldr         r2, [sp, #8]                ; load v ptr
-    vdup.u8     q2, r12                     ; duplicate thresh
-
-    sub         r3, r0, r1, lsl #2          ; move u pointer down by 4 lines
-    sub         r12, r2, r1, lsl #2         ; move v pointer down by 4 lines
-
-    vld1.u8     {d6}, [r3@64], r1              ; p3
-    vld1.u8     {d7}, [r12@64], r1             ; p3
-    vld1.u8     {d8}, [r3@64], r1              ; p2
-    vld1.u8     {d9}, [r12@64], r1             ; p2
-    vld1.u8     {d10}, [r3@64], r1             ; p1
-    vld1.u8     {d11}, [r12@64], r1            ; p1
-    vld1.u8     {d12}, [r3@64], r1             ; p0
-    vld1.u8     {d13}, [r12@64], r1            ; p0
-    vld1.u8     {d14}, [r3@64], r1             ; q0
-    vld1.u8     {d15}, [r12@64], r1            ; q0
-    vld1.u8     {d16}, [r3@64], r1             ; q1
-    vld1.u8     {d17}, [r12@64], r1            ; q1
-    vld1.u8     {d18}, [r3@64], r1             ; q2
-    vld1.u8     {d19}, [r12@64], r1            ; q2
-    vld1.u8     {d20}, [r3@64]                 ; q3
-    vld1.u8     {d21}, [r12@64]                ; q3
-
-    bl          vp9_loop_filter_neon
-
-    sub         r0, r0, r1, lsl #1
-    sub         r2, r2, r1, lsl #1
-
-    vst1.u8     {d10}, [r0@64], r1             ; store u op1
-    vst1.u8     {d11}, [r2@64], r1             ; store v op1
-    vst1.u8     {d12}, [r0@64], r1             ; store u op0
-    vst1.u8     {d13}, [r2@64], r1             ; store v op0
-    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
-    vst1.u8     {d15}, [r2@64], r1             ; store v oq0
-    vst1.u8     {d16}, [r0@64]                 ; store u oq1
-    vst1.u8     {d17}, [r2@64]                 ; store v oq1
-
-    pop         {pc}
-    ENDP        ; |vp9_loop_filter_horizontal_edge_uv_neon|
-
-; void vp9_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-;                                           unsigned char blimit,
-;                                           unsigned char limit,
-;                                           unsigned char thresh)
-; r0    unsigned char *src
-; r1    int pitch
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-
-|vp9_loop_filter_vertical_edge_y_neon| PROC
-    push        {lr}
-    vdup.u8     q0, r2                     ; duplicate blimit
-    vdup.u8     q1, r3                     ; duplicate limit
-    sub         r2, r0, #4                 ; src ptr down by 4 columns
-    add         r1, r1, r1
-    ldr         r3, [sp, #4]               ; load thresh
-    add         r12, r2, r1, asr #1
-
-    vld1.u8     {d6}, [r2], r1
-    vld1.u8     {d8}, [r12], r1
-    vld1.u8     {d10}, [r2], r1
-    vld1.u8     {d12}, [r12], r1
-    vld1.u8     {d14}, [r2], r1
-    vld1.u8     {d16}, [r12], r1
-    vld1.u8     {d18}, [r2], r1
-    vld1.u8     {d20}, [r12], r1
-
-    vld1.u8     {d7}, [r2], r1              ; load second 8-line src data
-    vld1.u8     {d9}, [r12], r1
-    vld1.u8     {d11}, [r2], r1
-    vld1.u8     {d13}, [r12], r1
-    vld1.u8     {d15}, [r2], r1
-    vld1.u8     {d17}, [r12], r1
-    vld1.u8     {d19}, [r2]
-    vld1.u8     {d21}, [r12]
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vdup.u8     q2, r3                     ; duplicate thresh
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    bl          vp9_loop_filter_neon
-
-    vswp        d12, d11
-    vswp        d16, d13
-
-    sub         r0, r0, #2                 ; dst ptr
-
-    vswp        d14, d12
-    vswp        d16, d15
-
-    add         r12, r0, r1, asr #1
-
-    ;store op1, op0, oq0, oq1
-    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
-    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
-    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
-    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
-    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
-    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
-    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
-    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
-
-    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
-    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
-    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
-    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
-    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
-    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
-    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r0]
-    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r12]
-
-    pop         {pc}
-    ENDP        ; |vp9_loop_filter_vertical_edge_y_neon|
-
-; void vp9_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
-;                                            unsigned char blimit,
-;                                            unsigned char limit,
-;                                            unsigned char thresh,
-;                                            unsigned char *v)
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-; sp+4  unsigned char *v
-|vp9_loop_filter_vertical_edge_uv_neon| PROC
-    push        {lr}
-    vdup.u8     q0, r2                      ; duplicate blimit
-    sub         r12, r0, #4                 ; move u pointer down by 4 columns
-    ldr         r2, [sp, #8]                ; load v ptr
-    vdup.u8     q1, r3                      ; duplicate limit
-    sub         r3, r2, #4                  ; move v pointer down by 4 columns
-
-    vld1.u8     {d6}, [r12], r1             ;load u data
-    vld1.u8     {d7}, [r3], r1              ;load v data
-    vld1.u8     {d8}, [r12], r1
-    vld1.u8     {d9}, [r3], r1
-    vld1.u8     {d10}, [r12], r1
-    vld1.u8     {d11}, [r3], r1
-    vld1.u8     {d12}, [r12], r1
-    vld1.u8     {d13}, [r3], r1
-    vld1.u8     {d14}, [r12], r1
-    vld1.u8     {d15}, [r3], r1
-    vld1.u8     {d16}, [r12], r1
-    vld1.u8     {d17}, [r3], r1
-    vld1.u8     {d18}, [r12], r1
-    vld1.u8     {d19}, [r3], r1
-    vld1.u8     {d20}, [r12]
-    vld1.u8     {d21}, [r3]
-
-    ldr        r12, [sp, #4]               ; load thresh
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vdup.u8     q2, r12                     ; duplicate thresh
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    bl          vp9_loop_filter_neon
-
-    vswp        d12, d11
-    vswp        d16, d13
-    vswp        d14, d12
-    vswp        d16, d15
-
-    sub         r0, r0, #2
-    sub         r2, r2, #2
-
-    ;store op1, op0, oq0, oq1
-    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
-    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
-    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
-    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
-    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
-    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
-    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
-    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
-    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
-    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
-    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
-    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
-    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
-    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
-    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0]
-    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r2]
-
-    pop         {pc}
-    ENDP        ; |vp9_loop_filter_vertical_edge_uv_neon|
-
-; void vp9_loop_filter_neon();
-; This is a helper function for the loopfilters. The individual functions do the
-; necessary load, transpose (if necessary) and store.
-
-; r0-r3 PRESERVE
-; q0    flimit
-; q1    limit
-; q2    thresh
-; q3    p3
-; q4    p2
-; q5    p1
-; q6    p0
-; q7    q0
-; q8    q1
-; q9    q2
-; q10   q3
-|vp9_loop_filter_neon| PROC
-
-    ; vp9_filter_mask
-    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
-    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
-    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
-    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
-    vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
-    vabd.u8     q4, q10, q9                 ; abs(q3 - q2)
-
-    vmax.u8     q11, q11, q12
-    vmax.u8     q12, q13, q14
-    vmax.u8     q3, q3, q4
-    vmax.u8     q15, q11, q12
-
-    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
-
-    ; vp8_hevmask
-    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
-    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
-    vmax.u8     q15, q15, q3
-
-    vmov.u8     q10, #0x80                   ; 0x80
-
-    vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
-    vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
-
-    vcge.u8     q15, q1, q15
-
-    ; vp9_filter() function
-    ; convert to signed
-    veor        q7, q7, q10                 ; qs0
-    vshr.u8     q2, q2, #1                  ; a = a / 2
-    veor        q6, q6, q10                 ; ps0
-
-    veor        q5, q5, q10                 ; ps1
-    vqadd.u8    q9, q9, q2                  ; a = b + a
-
-    veor        q8, q8, q10                 ; qs1
-
-    vmov.u8     q10, #3                     ; #3
-
-    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
-    vsubl.s8    q11, d15, d13
-
-    vcge.u8     q9, q0, q9                  ; (a > flimit * 2 + limit) * -1
-
-    vmovl.u8    q4, d20
-
-    vqsub.s8    q1, q5, q8                  ; vp9_filter = clamp(ps1-qs1)
-    vorr        q14, q13, q14               ; vp8_hevmask
-
-    vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
-    vmul.i16    q11, q11, q4
-
-    vand        q1, q1, q14                 ; vp9_filter &= hev
-    vand        q15, q15, q9                ; vp9_filter_mask
-
-    vaddw.s8    q2, q2, d2
-    vaddw.s8    q11, q11, d3
-
-    vmov.u8     q9, #4                      ; #4
-
-    ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d2, q2
-    vqmovn.s16  d3, q11
-    vand        q1, q1, q15                 ; vp9_filter &= mask
-
-    vqadd.s8    q2, q1, q10                 ; Filter2 = clamp(vp9_filter+3)
-    vqadd.s8    q1, q1, q9                  ; Filter1 = clamp(vp9_filter+4)
-    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
-    vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
-
-
-    vqadd.s8    q11, q6, q2                 ; u = clamp(ps0 + Filter2)
-    vqsub.s8    q10, q7, q1                 ; u = clamp(qs0 - Filter1)
-
-    ; outer tap adjustments: ++vp9_filter >> 1
-    vrshr.s8    q1, q1, #1
-    vbic        q1, q1, q14                 ; vp9_filter &= ~hev
-    vmov.u8     q0, #0x80                   ; 0x80
-    vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + vp9_filter)
-    vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - vp9_filter)
-
-    veor        q6, q11, q0                 ; *op0 = u^0x80
-    veor        q7, q10, q0                 ; *oq0 = u^0x80
-    veor        q5, q13, q0                 ; *op1 = u^0x80
-    veor        q8, q12, q0                 ; *oq1 = u^0x80
-
-    bx          lr
-    ENDP        ; |vp9_loop_filter_neon|
-
-;-----------------
-
-    END
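
To make the register-level comments in vp9_loop_filter_neon easier to follow, here is a scalar sketch of one filtered column, assuming a clamp8s() helper that saturates to signed 8 bits (both names are illustrative, not from the source):

    #include <stdlib.h>     /* abs() */

    static signed char clamp8s(int v)   /* saturate like vqadd/vqsub.s8 */
    {
        return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
    }

    /* One column of the normal loop filter, following the comments above:
     * build the flatness mask and hev mask, then apply Filter1/Filter2 to
     * q0/p0 and a rounded half-strength tap to q1/p1 when hev is clear. */
    static void filter4(int blimit, int limit, int thresh,
                        int p3, int p2, int p1, int p0,
                        int q0, int q1, int q2, int q3,
                        unsigned char *op1, unsigned char *op0,
                        unsigned char *oq0, unsigned char *oq1)
    {
        int mask = abs(p3 - p2) <= limit && abs(p2 - p1) <= limit &&
                   abs(p1 - p0) <= limit && abs(q1 - q0) <= limit &&
                   abs(q2 - q1) <= limit && abs(q3 - q2) <= limit &&
                   abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
        int hev = abs(p1 - p0) > thresh || abs(q1 - q0) > thresh;
        int ps1 = p1 - 0x80, ps0 = p0 - 0x80;   /* bias to signed */
        int qs0 = q0 - 0x80, qs1 = q1 - 0x80;
        int f, f1, f2;

        *op1 = p1; *op0 = p0; *oq0 = q0; *oq1 = q1;
        if (!mask)
            return;
        f  = hev ? clamp8s(ps1 - qs1) : 0;      /* vp9_filter &= hev */
        f  = clamp8s(f + 3 * (qs0 - ps0));
        f1 = clamp8s(f + 4) >> 3;               /* Filter1 */
        f2 = clamp8s(f + 3) >> 3;               /* Filter2 */
        *oq0 = (unsigned char)(clamp8s(qs0 - f1) + 0x80);
        *op0 = (unsigned char)(clamp8s(ps0 + f2) + 0x80);
        if (!hev) {
            f = (f1 + 1) >> 1;                  /* rounded outer tap, vrshr */
            *oq1 = (unsigned char)(clamp8s(qs1 - f) + 0x80);
            *op1 = (unsigned char)(clamp8s(ps1 + f) + 0x80);
        }
    }
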
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
+++ /dev/null
@@ -1,117 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    ;EXPORT  |vp9_loop_filter_simple_horizontal_edge_neon|
-    EXPORT  |vp9_loop_filter_bhs_neon|
-    EXPORT  |vp9_loop_filter_mbhs_neon|
-    ARM
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *s, PRESERVE
-; r1    int p, PRESERVE
-; q1    limit, PRESERVE
-
-|vp9_loop_filter_simple_horizontal_edge_neon| PROC
-
-    sub         r3, r0, r1, lsl #1          ; move src pointer down by 2 lines
-
-    vld1.u8     {q7}, [r0@128], r1          ; q0
-    vld1.u8     {q5}, [r3@128], r1          ; p0
-    vld1.u8     {q8}, [r0@128]              ; q1
-    vld1.u8     {q6}, [r3@128]              ; p1
-
-    vabd.u8     q15, q6, q7                 ; abs(p0 - q0)
-    vabd.u8     q14, q5, q8                 ; abs(p1 - q1)
-
-    vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
-    vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
-    vmov.u8     q0, #0x80                   ; 0x80
-    vmov.s16    q13, #3
-    vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
-
-    veor        q7, q7, q0                  ; qs0: q0 offset to convert to a signed value
-    veor        q6, q6, q0                  ; ps0: p0 offset to convert to a signed value
-    veor        q5, q5, q0                  ; ps1: p1 offset to convert to a signed value
-    veor        q8, q8, q0                  ; qs1: q1 offset to convert to a signed value
-
-    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
-
-    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
-    vsubl.s8    q3, d15, d13
-
-    vqsub.s8    q4, q5, q8                  ; q4: vp9_filter = vp9_signed_char_clamp(ps1-qs1)
-
-    vmul.s16    q2, q2, q13                 ;  3 * ( qs0 - ps0)
-    vmul.s16    q3, q3, q13
-
-    vmov.u8     q10, #0x03                  ; 0x03
-    vmov.u8     q9, #0x04                   ; 0x04
-
-    vaddw.s8    q2, q2, d8                  ; vp9_filter + 3 * ( qs0 - ps0)
-    vaddw.s8    q3, q3, d9
-
-    vqmovn.s16  d8, q2                      ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d9, q3
-
-    vand        q14, q4, q15                ; vp9_filter &= mask
-
-    vqadd.s8    q2, q14, q10                ; Filter2 = vp9_signed_char_clamp(vp9_filter+3)
-    vqadd.s8    q3, q14, q9                 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4)
-    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
-    vshr.s8     q4, q3, #3                  ; Filter1 >>= 3
-
-    sub         r0, r0, r1
-
-    ;calculate output
-    vqadd.s8    q11, q6, q2                 ; u = vp9_signed_char_clamp(ps0 + Filter2)
-    vqsub.s8    q10, q7, q4                 ; u = vp9_signed_char_clamp(qs0 - Filter1)
-
-    veor        q6, q11, q0                 ; *op0 = u^0x80
-    veor        q7, q10, q0                 ; *oq0 = u^0x80
-
-    vst1.u8     {q6}, [r3@128]              ; store op0
-    vst1.u8     {q7}, [r0@128]              ; store oq0
-
-    bx          lr
-    ENDP        ; |vp9_loop_filter_simple_horizontal_edge_neon|
-
-; r0    unsigned char *y
-; r1    int ystride
-; r2    const unsigned char *blimit
-
-|vp9_loop_filter_bhs_neon| PROC
-    push        {r4, lr}
-    ldrb        r3, [r2]                    ; load blim from mem
-    vdup.s8     q1, r3                      ; duplicate blim
-
-    add         r0, r0, r1, lsl #2          ; src = y_ptr + 4 * y_stride
-    bl          vp9_loop_filter_simple_horizontal_edge_neon
-    ; vp9_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
-    add         r0, r0, r1, lsl #2          ; src = y_ptr + 8 * y_stride
-    bl          vp9_loop_filter_simple_horizontal_edge_neon
-    add         r0, r0, r1, lsl #2          ; src = y_ptr + 12 * y_stride
-    pop         {r4, lr}
-    b           vp9_loop_filter_simple_horizontal_edge_neon
-    ENDP        ;|vp9_loop_filter_bhs_neon|
-
-; r0    unsigned char *y
-; r1    int ystride
-; r2    const unsigned char *blimit
-
-|vp9_loop_filter_mbhs_neon| PROC
-    ldrb        r3, [r2]                   ; load mblim from mem
-    vdup.s8     q1, r3                     ; duplicate mblim
-    b           vp9_loop_filter_simple_horizontal_edge_neon
-    ENDP        ;|vp9_loop_filter_mbhs_neon|
-
-    END
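
The simple filter above reads only the two pixels on each side of the edge and rewrites just p0/q0. A scalar sketch (illustrative names; clamp8s() saturates to signed 8 bits); the vertical variant in the next file applies the same arithmetic to transposed columns:

    #include <stdlib.h>     /* abs() */

    static signed char clamp8s(int v)
    {
        return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
    }

    /* One column of the simple loop filter shown above. */
    static void simple_filter(int blimit, int p1, unsigned char *p0,
                              unsigned char *q0, int q1)
    {
        int ps1 = p1 - 0x80, ps0 = *p0 - 0x80;
        int qs0 = *q0 - 0x80, qs1 = q1 - 0x80;
        int f, f1, f2;

        if (abs(*p0 - *q0) * 2 + abs(p1 - q1) / 2 > blimit)
            return;                             /* mask fails: no change */
        f  = clamp8s(clamp8s(ps1 - qs1) + 3 * (qs0 - ps0));
        f1 = clamp8s(f + 4) >> 3;               /* Filter1, applied to q0 */
        f2 = clamp8s(f + 3) >> 3;               /* Filter2, applied to p0 */
        *q0 = (unsigned char)(clamp8s(qs0 - f1) + 0x80);
        *p0 = (unsigned char)(clamp8s(ps0 + f2) + 0x80);
    }
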
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ /dev/null
@@ -1,154 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    ;EXPORT  |vp9_loop_filter_simple_vertical_edge_neon|
-    EXPORT |vp9_loop_filter_bvs_neon|
-    EXPORT |vp9_loop_filter_mbvs_neon|
-    ARM
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *s, PRESERVE
-; r1    int p, PRESERVE
-; q1    limit, PRESERVE
-
-|vp9_loop_filter_simple_vertical_edge_neon| PROC
-    sub         r0, r0, #2                  ; move src pointer down by 2 columns
-    add         r12, r1, r1
-    add         r3, r0, r1
-
-    vld4.8      {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
-    vld4.8      {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
-    vld4.8      {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
-    vld4.8      {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
-    vld4.8      {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
-    vld4.8      {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
-    vld4.8      {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
-    vld4.8      {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
-
-    vld4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
-    vld4.8      {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
-    vld4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
-    vld4.8      {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
-    vld4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
-    vld4.8      {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
-    vld4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
-    vld4.8      {d10[7], d11[7], d12[7], d13[7]}, [r3]
-
-    vswp        d7, d10
-    vswp        d12, d9
-
-    ;vp9_filter_mask() function
-    ;vp8_hevmask() function
-    sub         r0, r0, r1, lsl #4
-    vabd.u8     q15, q5, q4                 ; abs(p0 - q0)
-    vabd.u8     q14, q3, q6                 ; abs(p1 - q1)
-
-    vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
-    vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
-    vmov.u8     q0, #0x80                   ; 0x80
-    vmov.s16    q11, #3
-    vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
-
-    veor        q4, q4, q0                  ; qs0: q0 offset to convert to a signed value
-    veor        q5, q5, q0                  ; ps0: p0 offset to convert to a signed value
-    veor        q3, q3, q0                  ; ps1: p1 offset to convert to a signed value
-    veor        q6, q6, q0                  ; qs1: q1 offset to convert to a signed value
-
-    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
-
-    vsubl.s8    q2, d8, d10                 ; ( qs0 - ps0)
-    vsubl.s8    q13, d9, d11
-
-    vqsub.s8    q14, q3, q6                  ; vp9_filter = vp9_signed_char_clamp(ps1-qs1)
-
-    vmul.s16    q2, q2, q11                 ;  3 * ( qs0 - ps0)
-    vmul.s16    q13, q13, q11
-
-    vmov.u8     q11, #0x03                  ; 0x03
-    vmov.u8     q12, #0x04                  ; 0x04
-
-    vaddw.s8    q2, q2, d28                  ; vp9_filter + 3 * ( qs0 - ps0)
-    vaddw.s8    q13, q13, d29
-
-    vqmovn.s16  d28, q2                      ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d29, q13
-
-    add         r0, r0, #1
-    add         r3, r0, r1
-
-    vand        q14, q14, q15                 ; vp9_filter &= mask
-
-    vqadd.s8    q2, q14, q11                 ; Filter2 = vp9_signed_char_clamp(vp9_filter+3)
-    vqadd.s8    q3, q14, q12                 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4)
-    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
-    vshr.s8     q14, q3, #3                  ; Filter1 >>= 3
-
-    ;calculate output
-    vqadd.s8    q11, q5, q2                 ; u = vp9_signed_char_clamp(ps0 + Filter2)
-    vqsub.s8    q10, q4, q14                 ; u = vp9_signed_char_clamp(qs0 - Filter1)
-
-    veor        q6, q11, q0                 ; *op0 = u^0x80
-    veor        q7, q10, q0                 ; *oq0 = u^0x80
-    add         r12, r1, r1
-    vswp        d13, d14
-
-    ;store op1, op0, oq0, oq1
-    vst2.8      {d12[0], d13[0]}, [r0], r12
-    vst2.8      {d12[1], d13[1]}, [r3], r12
-    vst2.8      {d12[2], d13[2]}, [r0], r12
-    vst2.8      {d12[3], d13[3]}, [r3], r12
-    vst2.8      {d12[4], d13[4]}, [r0], r12
-    vst2.8      {d12[5], d13[5]}, [r3], r12
-    vst2.8      {d12[6], d13[6]}, [r0], r12
-    vst2.8      {d12[7], d13[7]}, [r3], r12
-    vst2.8      {d14[0], d15[0]}, [r0], r12
-    vst2.8      {d14[1], d15[1]}, [r3], r12
-    vst2.8      {d14[2], d15[2]}, [r0], r12
-    vst2.8      {d14[3], d15[3]}, [r3], r12
-    vst2.8      {d14[4], d15[4]}, [r0], r12
-    vst2.8      {d14[5], d15[5]}, [r3], r12
-    vst2.8      {d14[6], d15[6]}, [r0], r12
-    vst2.8      {d14[7], d15[7]}, [r3]
-
-    bx          lr
-    ENDP        ; |vp9_loop_filter_simple_vertical_edge_neon|
-
-; r0    unsigned char *y
-; r1    int ystride
-; r2    const unsigned char *blimit
-
-|vp9_loop_filter_bvs_neon| PROC
-    push        {r4, lr}
-    ldrb        r3, [r2]                   ; load blim from mem
-    mov         r4, r0
-    add         r0, r0, #4
-    vdup.s8     q1, r3                     ; duplicate blim
-    bl          vp9_loop_filter_simple_vertical_edge_neon
-    ; vp9_loop_filter_simple_vertical_edge_neon preserves  r1 and q1
-    add         r0, r4, #8
-    bl          vp9_loop_filter_simple_vertical_edge_neon
-    add         r0, r4, #12
-    pop         {r4, lr}
-    b           vp9_loop_filter_simple_vertical_edge_neon
-    ENDP        ;|vp9_loop_filter_bvs_neon|
-
-; r0    unsigned char *y
-; r1    int ystride
-; r2    const unsigned char *blimit
-
-|vp9_loop_filter_mbvs_neon| PROC
-    ldrb        r3, [r2]                   ; load mblim from mem
-    vdup.s8     q1, r3                     ; duplicate mblim
-    b           vp9_loop_filter_simple_vertical_edge_neon
-    ENDP        ;|vp9_loop_filter_mbvs_neon|
-    END
--- a/vp8/common/arm/neon/mbloopfilter_neon.asm
+++ /dev/null
@@ -1,469 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_mbloop_filter_horizontal_edge_y_neon|
-    EXPORT  |vp8_mbloop_filter_horizontal_edge_uv_neon|
-    EXPORT  |vp8_mbloop_filter_vertical_edge_y_neon|
-    EXPORT  |vp8_mbloop_filter_vertical_edge_uv_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
-;                                               const unsigned char *blimit,
-;                                               const unsigned char *limit,
-;                                               const unsigned char *thresh)
-; r0    unsigned char *src,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
-    push        {lr}
-    add         r1, r1, r1                  ; double stride
-    ldr         r12, [sp, #4]               ; load thresh
-    sub         r0, r0, r1, lsl #1          ; move src pointer down by 4 lines
-    vdup.u8     q2, r12                     ; thresh
-    add         r12, r0, r1,  lsr #1        ; move src pointer up by 1 line
-
-    vld1.u8     {q3}, [r0@128], r1              ; p3
-    vld1.u8     {q4}, [r12@128], r1             ; p2
-    vld1.u8     {q5}, [r0@128], r1              ; p1
-    vld1.u8     {q6}, [r12@128], r1             ; p0
-    vld1.u8     {q7}, [r0@128], r1              ; q0
-    vld1.u8     {q8}, [r12@128], r1             ; q1
-    vld1.u8     {q9}, [r0@128], r1              ; q2
-    vld1.u8     {q10}, [r12@128], r1            ; q3
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r12, r12, r1, lsl #2
-    add         r0, r12, r1, lsr #1
-
-    vst1.u8     {q4}, [r12@128],r1         ; store op2
-    vst1.u8     {q5}, [r0@128],r1          ; store op1
-    vst1.u8     {q6}, [r12@128], r1        ; store op0
-    vst1.u8     {q7}, [r0@128],r1          ; store oq0
-    vst1.u8     {q8}, [r12@128]            ; store oq1
-    vst1.u8     {q9}, [r0@128]             ; store oq2
-
-    pop         {pc}
-    ENDP        ; |vp8_mbloop_filter_horizontal_edge_y_neon|
-
-; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
-;                                                const unsigned char *blimit,
-;                                                const unsigned char *limit,
-;                                                const unsigned char *thresh,
-;                                                unsigned char *v)
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-; sp+4  unsigned char *v
-
-|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
-    push        {lr}
-    ldr         r12, [sp, #4]                 ; load thresh
-    sub         r0, r0, r1, lsl #2            ; move u pointer down by 4 lines
-    vdup.u8     q2, r12                       ; thresh
-    ldr         r12, [sp, #8]                 ; load v ptr
-    sub         r12, r12, r1, lsl #2          ; move v pointer down by 4 lines
-
-    vld1.u8     {d6}, [r0@64], r1              ; p3
-    vld1.u8     {d7}, [r12@64], r1              ; p3
-    vld1.u8     {d8}, [r0@64], r1              ; p2
-    vld1.u8     {d9}, [r12@64], r1              ; p2
-    vld1.u8     {d10}, [r0@64], r1             ; p1
-    vld1.u8     {d11}, [r12@64], r1             ; p1
-    vld1.u8     {d12}, [r0@64], r1             ; p0
-    vld1.u8     {d13}, [r12@64], r1             ; p0
-    vld1.u8     {d14}, [r0@64], r1             ; q0
-    vld1.u8     {d15}, [r12@64], r1             ; q0
-    vld1.u8     {d16}, [r0@64], r1             ; q1
-    vld1.u8     {d17}, [r12@64], r1             ; q1
-    vld1.u8     {d18}, [r0@64], r1             ; q2
-    vld1.u8     {d19}, [r12@64], r1             ; q2
-    vld1.u8     {d20}, [r0@64], r1             ; q3
-    vld1.u8     {d21}, [r12@64], r1             ; q3
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r0, r0, r1, lsl #3
-    sub         r12, r12, r1, lsl #3
-
-    add         r0, r0, r1
-    add         r12, r12, r1
-
-    vst1.u8     {d8}, [r0@64], r1              ; store u op2
-    vst1.u8     {d9}, [r12@64], r1              ; store v op2
-    vst1.u8     {d10}, [r0@64], r1             ; store u op1
-    vst1.u8     {d11}, [r12@64], r1             ; store v op1
-    vst1.u8     {d12}, [r0@64], r1             ; store u op0
-    vst1.u8     {d13}, [r12@64], r1             ; store v op0
-    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
-    vst1.u8     {d15}, [r12@64], r1             ; store v oq0
-    vst1.u8     {d16}, [r0@64], r1             ; store u oq1
-    vst1.u8     {d17}, [r12@64], r1             ; store v oq1
-    vst1.u8     {d18}, [r0@64], r1             ; store u oq2
-    vst1.u8     {d19}, [r12@64], r1             ; store v oq2
-
-    pop         {pc}
-    ENDP        ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
-
-; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-;                                             const unsigned char *blimit,
-;                                             const unsigned char *limit,
-;                                             const unsigned char *thresh)
-; r0    unsigned char *src,
-; r1    int pitch,
-; r2    unsigned char blimit
-; r3    unsigned char limit
-; sp    unsigned char thresh,
-|vp8_mbloop_filter_vertical_edge_y_neon| PROC
-    push        {lr}
-    ldr         r12, [sp, #4]               ; load thresh
-    sub         r0, r0, #4                  ; move src pointer down by 4 columns
-    vdup.s8     q2, r12                     ; thresh
-    add         r12, r0, r1, lsl #3         ; move src pointer down by 8 lines
-
-    vld1.u8     {d6}, [r0], r1              ; load first 8-line src data
-    vld1.u8     {d7}, [r12], r1             ; load second 8-line src data
-    vld1.u8     {d8}, [r0], r1
-    vld1.u8     {d9}, [r12], r1
-    vld1.u8     {d10}, [r0], r1
-    vld1.u8     {d11}, [r12], r1
-    vld1.u8     {d12}, [r0], r1
-    vld1.u8     {d13}, [r12], r1
-    vld1.u8     {d14}, [r0], r1
-    vld1.u8     {d15}, [r12], r1
-    vld1.u8     {d16}, [r0], r1
-    vld1.u8     {d17}, [r12], r1
-    vld1.u8     {d18}, [r0], r1
-    vld1.u8     {d19}, [r12], r1
-    vld1.u8     {d20}, [r0], r1
-    vld1.u8     {d21}, [r12], r1
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    sub         r0, r0, r1, lsl #3
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r12, r12, r1, lsl #3
-
-    ;transpose to 16x8 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    ;store op2, op1, op0, oq0, oq1, oq2
-    vst1.8      {d6}, [r0], r1
-    vst1.8      {d7}, [r12], r1
-    vst1.8      {d8}, [r0], r1
-    vst1.8      {d9}, [r12], r1
-    vst1.8      {d10}, [r0], r1
-    vst1.8      {d11}, [r12], r1
-    vst1.8      {d12}, [r0], r1
-    vst1.8      {d13}, [r12], r1
-    vst1.8      {d14}, [r0], r1
-    vst1.8      {d15}, [r12], r1
-    vst1.8      {d16}, [r0], r1
-    vst1.8      {d17}, [r12], r1
-    vst1.8      {d18}, [r0], r1
-    vst1.8      {d19}, [r12], r1
-    vst1.8      {d20}, [r0]
-    vst1.8      {d21}, [r12]
-
-    pop         {pc}
-    ENDP        ; |vp8_mbloop_filter_vertical_edge_y_neon|
-
-; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
-;                                              const unsigned char *blimit,
-;                                              const unsigned char *limit,
-;                                              const unsigned char *thresh,
-;                                              unsigned char *v)
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    const unsigned char *blimit,
-; r3    const unsigned char *limit,
-; sp    const unsigned char *thresh,
-; sp+4  unsigned char *v
-|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
-    push        {lr}
-    ldr         r12, [sp, #4]               ; load thresh
-    sub         r0, r0, #4                  ; move u pointer down by 4 columns
-    vdup.u8     q2, r12                     ; thresh
-    ldr         r12, [sp, #8]               ; load v ptr
-    sub         r12, r12, #4                ; move v pointer down by 4 columns
-
-    vld1.u8     {d6}, [r0], r1              ;load u data
-    vld1.u8     {d7}, [r12], r1             ;load v data
-    vld1.u8     {d8}, [r0], r1
-    vld1.u8     {d9}, [r12], r1
-    vld1.u8     {d10}, [r0], r1
-    vld1.u8     {d11}, [r12], r1
-    vld1.u8     {d12}, [r0], r1
-    vld1.u8     {d13}, [r12], r1
-    vld1.u8     {d14}, [r0], r1
-    vld1.u8     {d15}, [r12], r1
-    vld1.u8     {d16}, [r0], r1
-    vld1.u8     {d17}, [r12], r1
-    vld1.u8     {d18}, [r0], r1
-    vld1.u8     {d19}, [r12], r1
-    vld1.u8     {d20}, [r0], r1
-    vld1.u8     {d21}, [r12], r1
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    sub         r0, r0, r1, lsl #3
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r12, r12, r1, lsl #3
-
-    ;transpose to 16x8 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    ;store op2, op1, op0, oq0, oq1, oq2
-    vst1.8      {d6}, [r0], r1
-    vst1.8      {d7}, [r12], r1
-    vst1.8      {d8}, [r0], r1
-    vst1.8      {d9}, [r12], r1
-    vst1.8      {d10}, [r0], r1
-    vst1.8      {d11}, [r12], r1
-    vst1.8      {d12}, [r0], r1
-    vst1.8      {d13}, [r12], r1
-    vst1.8      {d14}, [r0], r1
-    vst1.8      {d15}, [r12], r1
-    vst1.8      {d16}, [r0], r1
-    vst1.8      {d17}, [r12], r1
-    vst1.8      {d18}, [r0], r1
-    vst1.8      {d19}, [r12], r1
-    vst1.8      {d20}, [r0]
-    vst1.8      {d21}, [r12]
-
-    pop         {pc}
-    ENDP        ; |vp8_mbloop_filter_vertical_edge_uv_neon|
-
-; void vp8_mbloop_filter_neon()
-; This is a helper function for the macroblock loopfilters. The individual
-; functions do the necessary load, transpose (if necessary), preserve (if
-; necessary) and store.
-
-; r0,r1 PRESERVE
-; r2    mblimit
-; r3    limit
-
-; q2    thresh
-; q3    p3 PRESERVE
-; q4    p2
-; q5    p1
-; q6    p0
-; q7    q0
-; q8    q1
-; q9    q2
-; q10   q3 PRESERVE
-
-|vp8_mbloop_filter_neon| PROC
-
-    ; vp9_filter_mask
-    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
-    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
-    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
-    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
-    vabd.u8     q1, q9, q8                  ; abs(q2 - q1)
-    vabd.u8     q0, q10, q9                 ; abs(q3 - q2)
-
-    vmax.u8     q11, q11, q12
-    vmax.u8     q12, q13, q14
-    vmax.u8     q1, q1, q0
-    vmax.u8     q15, q11, q12
-
-    vabd.u8     q12, q6, q7                 ; abs(p0 - q0)
-
-    ; vp8_hevmask
-    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh) * -1
-    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh) * -1
-    vmax.u8     q15, q15, q1
-
-    vdup.u8     q1, r3                      ; limit
-    vdup.u8     q2, r2                      ; mblimit
-
-    vmov.u8     q0, #0x80                   ; 0x80
-
-    vcge.u8     q15, q1, q15
-
-    vabd.u8     q1, q5, q8                  ; a = abs(p1 - q1)
-    vqadd.u8    q12, q12, q12               ; b = abs(p0 - q0) * 2
-    vmov.u16    q11, #3                     ; #3
-
-    ; vp9_filter
-    ; convert to signed
-    veor        q7, q7, q0                  ; qs0
-    vshr.u8     q1, q1, #1                  ; a = a / 2
-    veor        q6, q6, q0                  ; ps0
-    veor        q5, q5, q0                  ; ps1
-
-    vqadd.u8    q12, q12, q1                ; a = b + a
-
-    veor        q8, q8, q0                  ; qs1
-    veor        q4, q4, q0                  ; ps2
-    veor        q9, q9, q0                  ; qs2
-
-    vorr        q14, q13, q14               ; vp8_hevmask
-
-    vcge.u8     q12, q2, q12                ; (a > flimit * 2 + limit) * -1
-
-    vsubl.s8    q2, d14, d12                ; qs0 - ps0
-    vsubl.s8    q13, d15, d13
-
-    vqsub.s8    q1, q5, q8                  ; vp9_filter = clamp(ps1-qs1)
-
-    vmul.i16    q2, q2, q11                 ; 3 * ( qs0 - ps0)
-
-    vand        q15, q15, q12               ; vp9_filter_mask
-
-    vmul.i16    q13, q13, q11
-
-    vmov.u8     q12, #3                     ; #3
-
-    vaddw.s8    q2, q2, d2                  ; vp9_filter + 3 * ( qs0 - ps0)
-    vaddw.s8    q13, q13, d3
-
-    vmov.u8     q11, #4                     ; #4
-
-    ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d2, q2
-    vqmovn.s16  d3, q13
-
-    vand        q1, q1, q15                 ; vp9_filter &= mask
-
-    vmov.u16    q15, #63                    ; #63
-
-    vand        q13, q1, q14                ; Filter2 &= hev
-
-    vqadd.s8    q2, q13, q11                ; Filter1 = clamp(Filter2+4)
-    vqadd.s8    q13, q13, q12               ; Filter2 = clamp(Filter2+3)
-
-    vmov        q0, q15
-
-    vshr.s8     q2, q2, #3                  ; Filter1 >>= 3
-    vshr.s8     q13, q13, #3                ; Filter2 >>= 3
-
-    vmov        q11, q15
-    vmov        q12, q15
-
-    vqsub.s8    q7, q7, q2                  ; qs0 = clamp(qs0 - Filter1)
-
-    vqadd.s8    q6, q6, q13                 ; ps0 = clamp(ps0 + Filter2)
-
-    vbic        q1, q1, q14                 ; vp9_filter &= ~hev
-
-    ; roughly 1/7th difference across boundary
-    ; roughly 2/7th difference across boundary
-    ; roughly 3/7th difference across boundary
-
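For reference, the three update magnitudes computed below are, per pixel (f is the filter value after the masks above are applied):

    int u1 = (63 +  9 * f) >> 7;   /* applied as p2 + u1, q2 - u1 */
    int u2 = (63 + 18 * f) >> 7;   /* applied as p1 + u2, q1 - u2 */
    int u3 = (63 + 27 * f) >> 7;   /* applied as p0 + u3, q0 - u3 */

Each side of the edge moves by u, so the step across the boundary shrinks by about 2u: 2*9/128 ≈ 1/7 of f for the outer pair, 2*18/128 ≈ 2/7 for the middle pair, and 2*27/128 ≈ 3/7 for the innermost pair.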
-    vmov.u8     d5, #9                      ; #9
-    vmov.u8     d4, #18                     ; #18
-
-    vmov        q13, q15
-    vmov        q14, q15
-
-    vmlal.s8    q0, d2, d5                  ; 63 + Filter2 * 9
-    vmlal.s8    q11, d3, d5
-    vmov.u8     d5, #27                     ; #27
-    vmlal.s8    q12, d2, d4                 ; 63 + Filter2 * 18
-    vmlal.s8    q13, d3, d4
-    vmlal.s8    q14, d2, d5                 ; 63 + Filter2 * 27
-    vmlal.s8    q15, d3, d5
-
-    vqshrn.s16  d0, q0, #7                  ; u = clamp((63 + Filter2 * 9)>>7)
-    vqshrn.s16  d1, q11, #7
-    vqshrn.s16  d24, q12, #7                ; u = clamp((63 + Filter2 * 18)>>7)
-    vqshrn.s16  d25, q13, #7
-    vqshrn.s16  d28, q14, #7                ; u = clamp((63 + Filter2 * 27)>>7)
-    vqshrn.s16  d29, q15, #7
-
-    vmov.u8     q1, #0x80                   ; 0x80
-
-    vqsub.s8    q11, q9, q0                 ; s = clamp(qs2 - u)
-    vqadd.s8    q0, q4, q0                  ; s = clamp(ps2 + u)
-    vqsub.s8    q13, q8, q12                ; s = clamp(qs1 - u)
-    vqadd.s8    q12, q5, q12                ; s = clamp(ps1 + u)
-    vqsub.s8    q15, q7, q14                ; s = clamp(qs0 - u)
-    vqadd.s8    q14, q6, q14                ; s = clamp(ps0 + u)
-
-    veor        q9, q11, q1                 ; *oq2 = s^0x80
-    veor        q4, q0, q1                  ; *op2 = s^0x80
-    veor        q8, q13, q1                 ; *oq1 = s^0x80
-    veor        q5, q12, q1                 ; *op1 = s^0x80
-    veor        q7, q15, q1                 ; *oq0 = s^0x80
-    veor        q6, q14, q1                 ; *op0 = s^0x80
-
-    bx          lr
-    ENDP        ; |vp8_mbloop_filter_neon|
-
-;-----------------
-
-    END
--- a/vp8/common/arm/neon/recon16x16mb_neon.asm
+++ /dev/null
@@ -1,131 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_recon16x16mb_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char  *pred_ptr,
-; r1    short *diff_ptr,
-; r2    unsigned char *dst_ptr,
-; r3    int ystride,
-; stack unsigned char *udst_ptr,
-; stack unsigned char *vdst_ptr
-
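In scalar terms the routine computes dst = CLAMP(pred + diff) over the 16x16 Y plane and then the two 8x8 chroma planes. A hedged C sketch of one plane (helper name hypothetical; pred and diff are assumed to stream contiguously, as in the predictor/diff buffers):

    static void recon_plane(const unsigned char *pred, const short *diff,
                            unsigned char *dst, int stride, int w, int h) {
      int r, c;
      for (r = 0; r < h; r++) {
        for (c = 0; c < w; c++) {
          int v = pred[c] + diff[c];                 /* vadd.s16 */
          dst[c] = v < 0 ? 0 : v > 255 ? 255 : v;    /* vqmovun.s16 */
        }
        pred += w; diff += w; dst += stride;
      }
    }

The loops below vectorize exactly this: w = h = 16 with ystride for Y, then w = h = 8 with ystride >> 1 for each of U and V.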
-|vp8_recon16x16mb_neon| PROC
-    mov             r12, #4             ;loop counter for Y loop
-
-recon16x16mb_loop_y
-    vld1.u8         {q12, q13}, [r0]!   ;load data from pred_ptr
-    vld1.16         {q8, q9}, [r1]!     ;load data from diff_ptr
-    vld1.u8         {q14, q15}, [r0]!
-    vld1.16         {q10, q11}, [r1]!
-
-    vmovl.u8        q0, d24             ;widen pred data from 8 bits to 16 bits
-    vmovl.u8        q1, d25
-    vmovl.u8        q2, d26
-    vmovl.u8        q3, d27
-    vmovl.u8        q4, d28
-    vmovl.u8        q5, d29
-    vmovl.u8        q6, d30
-    vld1.16         {q12, q13}, [r1]!
-    vmovl.u8        q7, d31
-    vld1.16         {q14, q15}, [r1]!
-
-    pld             [r0]
-    pld             [r1]
-    pld             [r1, #64]
-
-    vadd.s16        q0, q0, q8          ;add Diff data and Pred data together
-    vadd.s16        q1, q1, q9
-    vadd.s16        q2, q2, q10
-    vadd.s16        q3, q3, q11
-    vadd.s16        q4, q4, q12
-    vadd.s16        q5, q5, q13
-    vadd.s16        q6, q6, q14
-    vadd.s16        q7, q7, q15
-
-    vqmovun.s16     d0, q0              ;CLAMP() saturation
-    vqmovun.s16     d1, q1
-    vqmovun.s16     d2, q2
-    vqmovun.s16     d3, q3
-    vqmovun.s16     d4, q4
-    vqmovun.s16     d5, q5
-    vst1.u8         {q0}, [r2], r3      ;store result
-    vqmovun.s16     d6, q6
-    vst1.u8         {q1}, [r2], r3
-    vqmovun.s16     d7, q7
-    vst1.u8         {q2}, [r2], r3
-    subs            r12, r12, #1
-
-    moveq           r12, #2             ;loop counter for UV loop
-
-    vst1.u8         {q3}, [r2], r3
-    bne             recon16x16mb_loop_y
-
-    mov             r3, r3, lsr #1      ;uv_stride = ystride>>1
-    ldr             r2, [sp]            ;load upred_ptr
-
-recon16x16mb_loop_uv
-    vld1.u8         {q12, q13}, [r0]!   ;load data from pred_ptr
-    vld1.16         {q8, q9}, [r1]!     ;load data from diff_ptr
-    vld1.u8         {q14, q15}, [r0]!
-    vld1.16         {q10, q11}, [r1]!
-
-    vmovl.u8        q0, d24             ;widen pred data from 8 bits to 16 bits
-    vmovl.u8        q1, d25
-    vmovl.u8        q2, d26
-    vmovl.u8        q3, d27
-    vmovl.u8        q4, d28
-    vmovl.u8        q5, d29
-    vmovl.u8        q6, d30
-    vld1.16         {q12, q13}, [r1]!
-    vmovl.u8        q7, d31
-    vld1.16         {q14, q15}, [r1]!
-
-    vadd.s16        q0, q0, q8          ;add Diff data and Pred data together
-    vadd.s16        q1, q1, q9
-    vadd.s16        q2, q2, q10
-    vadd.s16        q3, q3, q11
-    vadd.s16        q4, q4, q12
-    vadd.s16        q5, q5, q13
-    vadd.s16        q6, q6, q14
-
-    vqmovun.s16     d0, q0              ;CLAMP() saturation
-    vadd.s16        q7, q7, q15
-    vqmovun.s16     d1, q1
-    vqmovun.s16     d2, q2
-    vqmovun.s16     d3, q3
-    vst1.u8         {d0}, [r2], r3      ;store result
-    vqmovun.s16     d4, q4
-    vst1.u8         {d1}, [r2], r3
-    vqmovun.s16     d5, q5
-    vst1.u8         {d2}, [r2], r3
-    vqmovun.s16     d6, q6
-    vst1.u8         {d3}, [r2], r3
-    vqmovun.s16     d7, q7
-    vst1.u8         {d4}, [r2], r3
-    subs            r12, r12, #1
-
-    vst1.u8         {d5}, [r2], r3
-    vst1.u8         {d6}, [r2], r3
-    vst1.u8         {d7}, [r2], r3
-
-    ldrne           r2, [sp, #4]        ;load vpred_ptr
-    bne             recon16x16mb_loop_uv
-
-    bx             lr
-
-    ENDP
-    END
--- a/vp8/common/arm/neon/recon2b_neon.asm
+++ /dev/null
@@ -1,54 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_recon2b_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char  *pred_ptr,
-; r1    short *diff_ptr,
-; r2    unsigned char *dst_ptr,
-; r3    int stride
-
-|vp8_recon2b_neon| PROC
-    vld1.u8         {q8, q9}, [r0]      ;load data from pred_ptr
-    vld1.16         {q4, q5}, [r1]!     ;load data from diff_ptr
-
-    vmovl.u8        q0, d16             ;widen pred data from 8 bits to 16 bits
-    vld1.16         {q6, q7}, [r1]!
-    vmovl.u8        q1, d17
-    vmovl.u8        q2, d18
-    vmovl.u8        q3, d19
-
-    vadd.s16        q0, q0, q4          ;add Diff data and Pred data together
-    vadd.s16        q1, q1, q5
-    vadd.s16        q2, q2, q6
-    vadd.s16        q3, q3, q7
-
-    vqmovun.s16     d0, q0              ;CLAMP() saturation
-    vqmovun.s16     d1, q1
-    vqmovun.s16     d2, q2
-    vqmovun.s16     d3, q3
-    add             r0, r2, r3
-
-    vst1.u8         {d0}, [r2]          ;store result
-    vst1.u8         {d1}, [r0], r3
-    add             r2, r0, r3
-    vst1.u8         {d2}, [r0]
-    vst1.u8         {d3}, [r2], r3
-
-    bx             lr
-
-    ENDP
-    END
--- a/vp8/common/arm/neon/recon4b_neon.asm
+++ /dev/null
@@ -1,69 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_recon4b_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char  *pred_ptr,
-; r1    short *diff_ptr,
-; r2    unsigned char *dst_ptr,
-; r3    int stride
-
-|vp8_recon4b_neon| PROC
-    vld1.u8         {q12, q13}, [r0]!   ;load data from pred_ptr
-    vld1.16         {q8, q9}, [r1]!     ;load data from diff_ptr
-    vld1.u8         {q14, q15}, [r0]
-    vld1.16         {q10, q11}, [r1]!
-
-    vmovl.u8        q0, d24             ;widen pred data from 8 bits to 16 bits
-    vmovl.u8        q1, d25
-    vmovl.u8        q2, d26
-    vmovl.u8        q3, d27
-    vmovl.u8        q4, d28
-    vmovl.u8        q5, d29
-    vmovl.u8        q6, d30
-    vld1.16         {q12, q13}, [r1]!
-    vmovl.u8        q7, d31
-    vld1.16         {q14, q15}, [r1]
-
-    vadd.s16        q0, q0, q8          ;add Diff data and Pred data together
-    vadd.s16        q1, q1, q9
-    vadd.s16        q2, q2, q10
-    vadd.s16        q3, q3, q11
-    vadd.s16        q4, q4, q12
-    vadd.s16        q5, q5, q13
-    vadd.s16        q6, q6, q14
-    vadd.s16        q7, q7, q15
-
-    vqmovun.s16     d0, q0              ;CLAMP() saturation
-    vqmovun.s16     d1, q1
-    vqmovun.s16     d2, q2
-    vqmovun.s16     d3, q3
-    vqmovun.s16     d4, q4
-    vqmovun.s16     d5, q5
-    vqmovun.s16     d6, q6
-    vqmovun.s16     d7, q7
-    add             r0, r2, r3
-
-    vst1.u8         {q0}, [r2]          ;store result
-    vst1.u8         {q1}, [r0], r3
-    add             r2, r0, r3
-    vst1.u8         {q2}, [r0]
-    vst1.u8         {q3}, [r2], r3
-
-    bx             lr
-
-    ENDP
-    END
--- a/vp8/common/arm/neon/recon_neon.c
+++ /dev/null
@@ -1,29 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vp8/common/recon.h"
-#include "vp8/common/blockd.h"
-
-extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);
-
-void vp8_recon_mb_neon(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd) {
-  unsigned char *pred_ptr = &xd->predictor[0];
-  short *diff_ptr = &xd->diff[0];
-  unsigned char *dst_ptr = xd->dst.y_buffer;
-  unsigned char *udst_ptr = xd->dst.u_buffer;
-  unsigned char *vdst_ptr = xd->dst.v_buffer;
-  int ystride = xd->dst.y_stride;
-  /*int uv_stride = xd->dst.uv_stride;*/
-
-  vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride,
-                        udst_ptr, vdst_ptr);
-}
--- a/vp8/common/arm/neon/reconb_neon.asm
+++ /dev/null
@@ -1,61 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_recon_b_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char  *pred_ptr,
-; r1    short *diff_ptr,
-; r2    unsigned char *dst_ptr,
-; r3    int stride
-
-|vp8_recon_b_neon| PROC
-    mov             r12, #16
-
-    vld1.u8         {d28}, [r0], r12    ;load 4 data/line from pred_ptr
-    vld1.16         {q10, q11}, [r1]!   ;load data from diff_ptr
-    vld1.u8         {d29}, [r0], r12
-    vld1.16         {q11, q12}, [r1]!
-    vld1.u8         {d30}, [r0], r12
-    vld1.16         {q12, q13}, [r1]!
-    vld1.u8         {d31}, [r0], r12
-    vld1.16         {q13}, [r1]
-
-    vmovl.u8        q0, d28             ;widen pred data from 8 bits to 16 bits
-    vmovl.u8        q1, d29             ;Pred data in d0, d2, d4, d6
-    vmovl.u8        q2, d30
-    vmovl.u8        q3, d31
-
-    vadd.s16        d0, d0, d20         ;add Diff data and Pred data together
-    vadd.s16        d2, d2, d22
-    vadd.s16        d4, d4, d24
-    vadd.s16        d6, d6, d26
-
-    vqmovun.s16     d0, q0              ;CLAMP() saturation
-    vqmovun.s16     d1, q1
-    vqmovun.s16     d2, q2
-    vqmovun.s16     d3, q3
-    add             r1, r2, r3
-
-    vst1.32         {d0[0]}, [r2]       ;store result
-    vst1.32         {d1[0]}, [r1], r3
-    add             r2, r1, r3
-    vst1.32         {d2[0]}, [r1]
-    vst1.32         {d3[0]}, [r2], r3
-
-    bx             lr
-
-    ENDP
-    END
--- a/vp8/common/arm/neon/save_neon_reg.asm
+++ /dev/null
@@ -1,36 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_push_neon|
-    EXPORT  |vp9_pop_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-|vp9_push_neon| PROC
-    vst1.i64            {d8, d9, d10, d11}, [r0]!
-    vst1.i64            {d12, d13, d14, d15}, [r0]!
-    bx              lr
-
-    ENDP
-
-|vp9_pop_neon| PROC
-    vld1.i64            {d8, d9, d10, d11}, [r0]!
-    vld1.i64            {d12, d13, d14, d15}, [r0]!
-    bx              lr
-
-    ENDP
-
-    END
-
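A note on the pair above, hedged: under the ARM AAPCS, d8-d15 (q4-q7) are the callee-saved NEON registers, which is why exactly that 64-byte block is spilled through the caller-supplied buffer. A minimal C-side usage sketch (wrapper name hypothetical):

    #include <stdint.h>

    extern void vp9_push_neon(int64_t *store);
    extern void vp9_pop_neon(int64_t *store);

    static void with_neon_saved(void (*fn)(void)) {
      int64_t store[8];        /* room for d8..d15 */
      vp9_push_neon(store);
      fn();                    /* may clobber q4-q7 */
      vp9_pop_neon(store);
    }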
--- a/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm
+++ /dev/null
@@ -1,67 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_short_idct4x4llm_1_neon|
-    EXPORT  |vp8_dc_only_idct_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-; r0    short *input;
-; r1    short *output;
-; r2    int pitch;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
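Both entry points in this file reduce to the same hedged scalar sketch: with only a DC value present, the 4x4 inverse transform degenerates to a rounded shift replicated to all 16 outputs (pitch appears to be a byte pitch, since the assembly adds it directly to the output address):

    void idct4x4_dc_sketch(short dc, short *output, int pitch) {
      short a = (short)((dc + 4) >> 3);      /* vrshr.s16 #3 (rounding) */
      int row = pitch / (int)sizeof(short);  /* shorts per output row */
      int r, c;
      for (r = 0; r < 4; r++)
        for (c = 0; c < 4; c++)
          output[r * row + c] = a;
    }

vp8_short_idct4x4llm_1_neon takes dc from input[0]; vp8_dc_only_idct_neon takes it from its first argument.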
-|vp8_short_idct4x4llm_1_neon| PROC
-    vld1.16         {d0[]}, [r0]            ;load input[0]
-
-    add             r3, r1, r2
-    add             r12, r3, r2
-
-    vrshr.s16       d0, d0, #3
-
-    add             r0, r12, r2
-
-    vst1.16         {d0}, [r1]
-    vst1.16         {d0}, [r3]
-    vst1.16         {d0}, [r12]
-    vst1.16         {d0}, [r0]
-
-    bx             lr
-    ENDP
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
-; r0    short input_dc;
-; r1    short *output;
-; r2    int pitch;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-|vp8_dc_only_idct_neon| PROC
-    vdup.16         d0, r0
-
-    add             r3, r1, r2
-    add             r12, r3, r2
-
-    vrshr.s16       d0, d0, #3
-
-    add             r0, r12, r2
-
-    vst1.16         {d0}, [r1]
-    vst1.16         {d0}, [r3]
-    vst1.16         {d0}, [r12]
-    vst1.16         {d0}, [r0]
-
-    bx             lr
-
-    ENDP
-    END
--- a/vp8/common/arm/neon/shortidct4x4llm_neon.asm
+++ /dev/null
@@ -1,122 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_short_idct4x4llm_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;*************************************************************
-;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
-;r0 short * input
-;r1 short * output
-;r2 int pitch
-;*************************************************************
-;static const int cospi8sqrt2minus1=20091;
-;static const int sinpi8sqrt2      =35468;
-;static const int rounding = 0;
-;Optimization note: the data resulting from dequantization are signed 13-bit
-;values in the range [-4096, 4095]. This allows the NEON "vqdmulh" instruction
-;to be used, since the product cannot overflow (13+16+1 = 30 bits < 32 bits).
-;vqdmulh returns the high half of the multiplication, which is what the IDCT needs.
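A hedged scalar model of the vqdmulh + vshr + vqadd sequence on the sinpi8sqrt2 path below (the same multiply pattern is used for both constants): vqdmulh.s16 returns the saturated high half of 2*a*b, and 35468 does not fit in a signed 16-bit lane, so the constant is read as 35468 - 65536 = -30068; the trailing vqadd of the unscaled input restores the missing 65536/65536:

    /* valid over the dequantized input range [-4096, 4095] */
    int mul_sinpi8sqrt2(int x) {
      int t = (2 * x * -30068) >> 16;  /* vqdmulh.s16 with 0x8a8c */
      t >>= 1;                         /* vshr.s16 #1 */
      return t + x;                    /* vqadd.s16: equals (x * 35468) >> 16 */
    }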
-
-|vp8_short_idct4x4llm_neon| PROC
-    adr             r12, idct_coeff
-    vld1.16         {q1, q2}, [r0]
-    vld1.16         {d0}, [r12]
-
-    vswp            d3, d4                  ;q2(vp[4] vp[12])
-
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2              ;correct since sinpi8sqrt2 > 65536/2 (read as a negative s16)
-    vqadd.s16       q4, q4, q2
-
-    ;d6 - c1:temp1
-    ;d7 - d1:temp2
-    ;d8 - d1:temp1
-    ;d9 - c1:temp2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-    vswp            d3, d4
-
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2              ;correct since sinpi8sqrt2 > 65536/2 (read as a negative s16)
-    vqadd.s16       q4, q4, q2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vrshr.s16       d2, d2, #3
-    vrshr.s16       d3, d3, #3
-    vrshr.s16       d4, d4, #3
-    vrshr.s16       d5, d5, #3
-
-    add             r3, r1, r2
-    add             r12, r3, r2
-    add             r0, r12, r2
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-    vst1.16         {d2}, [r1]
-    vst1.16         {d3}, [r3]
-    vst1.16         {d4}, [r12]
-    vst1.16         {d5}, [r0]
-
-    bx             lr
-
-    ENDP
-
-;-----------------
-
-idct_coeff
-    DCD     0x4e7b4e7b, 0x8a8c8a8c
-
-;20091, 20091, 35468, 35468
-
-    END
--- a/vp8/common/arm/neon/sixtappredict16x16_neon.asm
+++ /dev/null
@@ -1,490 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sixtap_predict16x16_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter16_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,  0,   0,  0
-
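(Each row of this table sums to 128, so the vqrshrun.s16 #7 at the end of each filter pass, i.e. (sum + 64) >> 7 with unsigned saturation, restores unity gain.)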
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; r4    unsigned char *dst_ptr,
-; stack(r5) int  dst_pitch
-
-;Note: to take advantage of the 8-bit multiplication instructions in NEON, first
-; apply abs() to the filter coeffs to make them u8, then use vmlsl for the
-; negative coeffs. After multiplication the result can be negative and is treated
-; as s16. But the result can also be a large positive number (> 2^15-1), which
-; would be misread as a negative number. To avoid that, apply the filter coeffs
-; in the order 0, 1, 4, 5, 2, which keeps the running sum in s16 range. Finally,
-; add the tap-3 product with a saturating add. The same applies to the other filter functions.
-
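The ordering rule from the note above, as a hedged scalar sketch (helper names hypothetical; k points at one row of the coefficient table, s at src_ptr[0], with taps spanning s[-2]..s[3]):

    #include <stdlib.h>  /* abs() */

    static int sat_s16(int v) { return v < -32768 ? -32768 : v > 32767 ? 32767 : v; }

    static int sixtap_pixel(const unsigned char *s, const int *k) {
      int sum;                                  /* lives in s16 in the NEON code,
                                                   hence the accumulation order */
      sum  = s[-2] * abs(k[0]);                 /* vmull.u8, tap 0 */
      sum -= s[-1] * abs(k[1]);                 /* vmlsl.u8, tap 1 */
      sum -= s[ 2] * abs(k[4]);                 /* vmlsl.u8, tap 4 */
      sum += s[ 3] * abs(k[5]);                 /* vmlal.u8, tap 5 */
      sum += s[ 0] * abs(k[2]);                 /* vmlal.u8, tap 2 */
      sum  = sat_s16(sum + s[1] * abs(k[3]));   /* separate vmull + vqadd.s16 */
      sum  = (sum + 64) >> 7;                   /* vqrshrun.s16 #7 */
      return sum < 0 ? 0 : sum > 255 ? 255 : sum;
    }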
-|vp8_sixtap_predict16x16_neon| PROC
-    push            {r4-r5, lr}
-
-    adr             r12, filter16_coeff
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_filter16x16_only
-
-    add             r2, r12, r2, lsl #5     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-
-    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
-
-    beq             firstpass_filter16x16_only
-
-    sub             sp, sp, #336            ;reserve space on stack for temporary storage
-    mov             lr, sp
-
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    mov             r2, #7                  ;loop counter
-    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
-    sub             r0, r0, r1, lsl #1
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vdup.8          d1, d24[4]
-    vdup.8          d2, d25[0]
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-;First Pass: output_height lines x output_width columns (21x16); 16 output rows + 5 extra for the 6-tap filter
-filt_blk2d_fp16x16_loop_neon
-    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data
-    vld1.u8         {d9, d10, d11}, [r0], r1
-    vld1.u8         {d12, d13, d14}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q9, d7, d0
-    vmull.u8        q10, d9, d0
-    vmull.u8        q11, d10, d0
-    vmull.u8        q12, d12, d0
-    vmull.u8        q13, d13, d0
-
-    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d29, d9, d10, #1
-    vext.8          d30, d12, d13, #1
-
-    vmlsl.u8        q8, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q10, d29, d1
-    vmlsl.u8        q12, d30, d1
-
-    vext.8          d28, d7, d8, #1
-    vext.8          d29, d10, d11, #1
-    vext.8          d30, d13, d14, #1
-
-    vmlsl.u8        q9, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q11, d29, d1
-    vmlsl.u8        q13, d30, d1
-
-    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d29, d9, d10, #4
-    vext.8          d30, d12, d13, #4
-
-    vmlsl.u8        q8, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q10, d29, d4
-    vmlsl.u8        q12, d30, d4
-
-    vext.8          d28, d7, d8, #4
-    vext.8          d29, d10, d11, #4
-    vext.8          d30, d13, d14, #4
-
-    vmlsl.u8        q9, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q11, d29, d4
-    vmlsl.u8        q13, d30, d4
-
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d9, d10, #5
-    vext.8          d30, d12, d13, #5
-
-    vmlal.u8        q8, d28, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q10, d29, d5
-    vmlal.u8        q12, d30, d5
-
-    vext.8          d28, d7, d8, #5
-    vext.8          d29, d10, d11, #5
-    vext.8          d30, d13, d14, #5
-
-    vmlal.u8        q9, d28, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q11, d29, d5
-    vmlal.u8        q13, d30, d5
-
-    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d29, d9, d10, #2
-    vext.8          d30, d12, d13, #2
-
-    vmlal.u8        q8, d28, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q10, d29, d2
-    vmlal.u8        q12, d30, d2
-
-    vext.8          d28, d7, d8, #2
-    vext.8          d29, d10, d11, #2
-    vext.8          d30, d13, d14, #2
-
-    vmlal.u8        q9, d28, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q11, d29, d2
-    vmlal.u8        q13, d30, d2
-
-    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d29, d9, d10, #3
-    vext.8          d30, d12, d13, #3
-
-    vext.8          d15, d7, d8, #3
-    vext.8          d31, d10, d11, #3
-    vext.8          d6, d13, d14, #3
-
-    vmull.u8        q4, d28, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q5, d29, d3
-    vmull.u8        q6, d30, d3
-
-    vqadd.s16       q8, q4                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q10, q5
-    vqadd.s16       q12, q6
-
-    vmull.u8        q6, d15, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q7, d31, d3
-    vmull.u8        q3, d6, d3
-
-    subs            r2, r2, #1
-
-    vqadd.s16       q9, q6
-    vqadd.s16       q11, q7
-    vqadd.s16       q13, q3
-
-    vqrshrun.s16    d6, q8, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q9, #7
-    vqrshrun.s16    d8, q10, #7
-    vqrshrun.s16    d9, q11, #7
-    vqrshrun.s16    d10, q12, #7
-    vqrshrun.s16    d11, q13, #7
-
-    vst1.u8         {d6, d7, d8}, [lr]!     ;store result
-    vst1.u8         {d9, d10, d11}, [lr]!
-
-    bne             filt_blk2d_fp16x16_loop_neon
-
-;Second pass: 16x16
-;secondpass_filter - filter the first 8 columns, then the second 8
-    add             r3, r12, r3, lsl #5
-    sub             lr, lr, #336
-
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    mov             r3, #2                  ;loop counter
-
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    mov             r2, #16
-
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vdup.8          d1, d14[4]
-    vdup.8          d2, d15[0]
-    vdup.8          d3, d15[4]
-    vdup.8          d4, d16[0]
-    vdup.8          d5, d16[4]
-
-filt_blk2d_sp16x16_outloop_neon
-    vld1.u8         {d18}, [lr], r2         ;load src data
-    vld1.u8         {d19}, [lr], r2
-    vld1.u8         {d20}, [lr], r2
-    vld1.u8         {d21}, [lr], r2
-    mov             r12, #4                 ;loop counter
-    vld1.u8         {d22}, [lr], r2
-
-secondpass_inner_loop_neon
-    vld1.u8         {d23}, [lr], r2         ;load src data
-    vld1.u8         {d24}, [lr], r2
-    vld1.u8         {d25}, [lr], r2
-    vld1.u8         {d26}, [lr], r2
-
-    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q4, d19, d0
-    vmull.u8        q5, d20, d0
-    vmull.u8        q6, d21, d0
-
-    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q4, d20, d1
-    vmlsl.u8        q5, d21, d1
-    vmlsl.u8        q6, d22, d1
-
-    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q4, d23, d4
-    vmlsl.u8        q5, d24, d4
-    vmlsl.u8        q6, d25, d4
-
-    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q4, d21, d2
-    vmlal.u8        q5, d22, d2
-    vmlal.u8        q6, d23, d2
-
-    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q4, d24, d5
-    vmlal.u8        q5, d25, d5
-    vmlal.u8        q6, d26, d5
-
-    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q8, d22, d3
-    vmull.u8        q9, d23, d3
-    vmull.u8        q10, d24, d3
-
-    subs            r12, r12, #1
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vmov            q9, q11
-    vst1.u8         {d7}, [r4], r5
-    vmov            q10, q12
-    vst1.u8         {d8}, [r4], r5
-    vmov            d22, d26
-    vst1.u8         {d9}, [r4], r5
-
-    bne             secondpass_inner_loop_neon
-
-    subs            r3, r3, #1
-    sub             lr, lr, #336
-    add             lr, lr, #8
-
-    sub             r4, r4, r5, lsl #4
-    add             r4, r4, #8
-
-    bne filt_blk2d_sp16x16_outloop_neon
-
-    add             sp, sp, #336
-    pop             {r4-r5,pc}
-
-;--------------------
-firstpass_filter16x16_only
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    mov             r2, #8                  ;loop counter
-    sub             r0, r0, #2              ;move srcptr back to (column-2)
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vdup.8          d1, d24[4]
-    vdup.8          d2, d25[0]
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-;First Pass: output_height lines x output_width columns (16x16)
-filt_blk2d_fpo16x16_loop_neon
-    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data
-    vld1.u8         {d9, d10, d11}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-
-    vmull.u8        q6, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q7, d7, d0
-    vmull.u8        q8, d9, d0
-    vmull.u8        q9, d10, d0
-
-    vext.8          d20, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d21, d9, d10, #1
-    vext.8          d22, d7, d8, #1
-    vext.8          d23, d10, d11, #1
-    vext.8          d24, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d25, d9, d10, #4
-    vext.8          d26, d7, d8, #4
-    vext.8          d27, d10, d11, #4
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d9, d10, #5
-
-    vmlsl.u8        q6, d20, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q8, d21, d1
-    vmlsl.u8        q7, d22, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q9, d23, d1
-    vmlsl.u8        q6, d24, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q8, d25, d4
-    vmlsl.u8        q7, d26, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q9, d27, d4
-    vmlal.u8        q6, d28, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q8, d29, d5
-
-    vext.8          d20, d7, d8, #5
-    vext.8          d21, d10, d11, #5
-    vext.8          d22, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d23, d9, d10, #2
-    vext.8          d24, d7, d8, #2
-    vext.8          d25, d10, d11, #2
-
-    vext.8          d26, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d27, d9, d10, #3
-    vext.8          d28, d7, d8, #3
-    vext.8          d29, d10, d11, #3
-
-    vmlal.u8        q7, d20, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q9, d21, d5
-    vmlal.u8        q6, d22, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q8, d23, d2
-    vmlal.u8        q7, d24, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q9, d25, d2
-
-    vmull.u8        q10, d26, d3            ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q11, d27, d3
-    vmull.u8        q12, d28, d3            ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q15, d29, d3
-
-    vqadd.s16       q6, q10                 ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q11
-    vqadd.s16       q7, q12
-    vqadd.s16       q9, q15
-
-    subs            r2, r2, #1
-
-    vqrshrun.s16    d6, q6, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q7, #7
-    vqrshrun.s16    d8, q8, #7
-    vqrshrun.s16    d9, q9, #7
-
-    vst1.u8         {q3}, [r4], r5              ;store result
-    vst1.u8         {q4}, [r4], r5
-
-    bne             filt_blk2d_fpo16x16_loop_neon
-
-    pop             {r4-r5,pc}
-
-;--------------------
-secondpass_filter16x16_only
-;Second pass: 16x16
-    add             r3, r12, r3, lsl #5
-    sub             r0, r0, r1, lsl #1
-
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    mov             r3, #2                  ;loop counter
-
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vdup.8          d1, d14[4]
-    vdup.8          d2, d15[0]
-    vdup.8          d3, d15[4]
-    vdup.8          d4, d16[0]
-    vdup.8          d5, d16[4]
-
-filt_blk2d_spo16x16_outloop_neon
-    vld1.u8         {d18}, [r0], r1         ;load src data
-    vld1.u8         {d19}, [r0], r1
-    vld1.u8         {d20}, [r0], r1
-    vld1.u8         {d21}, [r0], r1
-    mov             r12, #4                 ;loop counter
-    vld1.u8         {d22}, [r0], r1
-
-secondpass_only_inner_loop_neon
-    vld1.u8         {d23}, [r0], r1         ;load src data
-    vld1.u8         {d24}, [r0], r1
-    vld1.u8         {d25}, [r0], r1
-    vld1.u8         {d26}, [r0], r1
-
-    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q4, d19, d0
-    vmull.u8        q5, d20, d0
-    vmull.u8        q6, d21, d0
-
-    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q4, d20, d1
-    vmlsl.u8        q5, d21, d1
-    vmlsl.u8        q6, d22, d1
-
-    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q4, d23, d4
-    vmlsl.u8        q5, d24, d4
-    vmlsl.u8        q6, d25, d4
-
-    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q4, d21, d2
-    vmlal.u8        q5, d22, d2
-    vmlal.u8        q6, d23, d2
-
-    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q4, d24, d5
-    vmlal.u8        q5, d25, d5
-    vmlal.u8        q6, d26, d5
-
-    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q8, d22, d3
-    vmull.u8        q9, d23, d3
-    vmull.u8        q10, d24, d3
-
-    subs            r12, r12, #1
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vmov            q9, q11
-    vst1.u8         {d7}, [r4], r5
-    vmov            q10, q12
-    vst1.u8         {d8}, [r4], r5
-    vmov            d22, d26
-    vst1.u8         {d9}, [r4], r5
-
-    bne             secondpass_only_inner_loop_neon
-
-    subs            r3, r3, #1
-    sub             r0, r0, r1, lsl #4
-    sub             r0, r0, r1, lsl #2
-    sub             r0, r0, r1
-    add             r0, r0, #8
-
-    sub             r4, r4, r5, lsl #4
-    add             r4, r4, #8
-
-    bne filt_blk2d_spo16x16_outloop_neon
-
-    pop             {r4-r5,pc}
-
-    ENDP
-
-;-----------------
-    END
--- a/vp8/common/arm/neon/sixtappredict4x4_neon.asm
+++ /dev/null
@@ -1,422 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sixtap_predict_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter4_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,  0,   0,  0
-
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(lr) int  dst_pitch
-
-|vp8_sixtap_predict_neon| PROC
-    push            {r4, lr}
-
-    adr             r12, filter4_coeff
-    ldr             r4, [sp, #8]            ;load parameters from stack
-    ldr             lr, [sp, #12]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_filter4x4_only
-
-    add             r2, r12, r2, lsl #5     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
-
-    beq             firstpass_filter4x4_only
-
-    vabs.s32        q12, q14                ;get abs(filter_parameters)
-    vabs.s32        q13, q15
-
-    sub             r0, r0, #2              ;go back 2 columns of src data
-    sub             r0, r0, r1, lsl #1      ;go back 2 lines of src data
-
-;First pass: output_height lines x output_width columns (9x4); 4 output rows + 5 extra for the 6-tap filter
-    vld1.u8         {q3}, [r0], r1          ;load first 4-line src data
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vld1.u8         {q4}, [r0], r1
-    vdup.8          d1, d24[4]
-    vld1.u8         {q5}, [r0], r1
-    vdup.8          d2, d25[0]
-    vld1.u8         {q6}, [r0], r1
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d19, d8, d9, #5
-    vext.8          d20, d10, d11, #5
-    vext.8          d21, d12, d13, #5
-
-    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
-    vswp            d11, d12
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
-    vzip.32         d20, d21
-    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmull.u8        q8, d20, d5
-
-    vmov            q4, q3                  ;keep original src data in q4 q6
-    vmov            q6, q5
-
-    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
-    vshr.u64        q10, q6, #8
-    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp9_filter[0])
-    vmlal.u8        q8, d10, d0
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
-    vshr.u64        q5, q6, #32
-    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q8, d20, d1
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
-    vshr.u64        q10, q6, #16
-    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q8, d10, d4
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
-    vshr.u64        q5, q6, #24
-    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q8, d20, d2
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
-    vzip.32         d10, d11
-    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q10, d10, d3
-
-    vld1.u8         {q3}, [r0], r1          ;load the remaining 5 lines of src data
-    vld1.u8         {q4}, [r0], r1
-
-    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q10
-
-    vld1.u8         {q5}, [r0], r1
-    vld1.u8         {q6}, [r0], r1
-
-    vqrshrun.s16    d27, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d28, q8, #7
-
-    ;First Pass on the remaining 5 lines of data
-    vld1.u8         {q11}, [r0], r1
-
-    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d19, d8, d9, #5
-    vext.8          d20, d10, d11, #5
-    vext.8          d21, d12, d13, #5
-
-    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
-    vswp            d11, d12
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
-    vzip.32         d20, d21
-    vext.8          d31, d22, d23, #5       ;construct src_ptr[3]
-    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmull.u8        q8, d20, d5
-    vmull.u8        q12, d31, d5            ;(src_ptr[3] * vp9_filter[5])
-
-    vmov            q4, q3                  ;keep original src data in q4 q6
-    vmov            q6, q5
-
-    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
-    vshr.u64        q10, q6, #8
-
-    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp9_filter[0])
-    vmlal.u8        q8, d10, d0
-    vmlal.u8        q12, d22, d0            ;(src_ptr[-2] * vp9_filter[0])
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
-    vshr.u64        q5, q6, #32
-    vext.8          d31, d22, d23, #1       ;construct src_ptr[-1]
-
-    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q8, d20, d1
-    vmlsl.u8        q12, d31, d1            ;-(src_ptr[-1] * vp9_filter[1])
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
-    vshr.u64        q10, q6, #16
-    vext.8          d31, d22, d23, #4       ;construct src_ptr[2]
-
-    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q8, d10, d4
-    vmlsl.u8        q12, d31, d4            ;-(src_ptr[2] * vp9_filter[4])
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
-    vshr.u64        q5, q6, #24
-    vext.8          d31, d22, d23, #2       ;construct src_ptr[0]
-
-    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q8, d20, d2
-    vmlal.u8        q12, d31, d2            ;(src_ptr[0] * vp9_filter[2])
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
-    vzip.32         d10, d11
-    vext.8          d31, d22, d23, #3       ;construct src_ptr[1]
-    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q10, d10, d3
-    vmull.u8        q11, d31, d3            ;(src_ptr[1] * vp9_filter[3])
-
-    add             r3, r12, r3, lsl #5
-
-    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q10
-    vqadd.s16       q12, q11
-
-    vext.8          d23, d27, d28, #4
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-
-    vqrshrun.s16    d29, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d30, q8, #7
-    vqrshrun.s16    d31, q12, #7
-
-;Second pass: 4x4
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    vext.8          d24, d28, d29, #4
-    vext.8          d25, d29, d30, #4
-    vext.8          d26, d30, d31, #4
-
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vdup.8          d1, d14[4]
-    vdup.8          d2, d15[0]
-    vdup.8          d3, d15[4]
-    vdup.8          d4, d16[0]
-    vdup.8          d5, d16[4]
-
-    vmull.u8        q3, d27, d0             ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q4, d28, d0
-
-    vmull.u8        q5, d25, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmull.u8        q6, d26, d5
-
-    vmlsl.u8        q3, d29, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q4, d30, d4
-
-    vmlsl.u8        q5, d23, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q6, d24, d1
-
-    vmlal.u8        q3, d28, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q4, d29, d2
-
-    vmlal.u8        q5, d24, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmlal.u8        q6, d25, d3
-
-    add             r0, r4, lr
-    add             r1, r0, lr
-    add             r2, r1, lr
-
-    vqadd.s16       q5, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q6, q4
-
-    vqrshrun.s16    d3, q5, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d4, q6, #7
-
-    vst1.32         {d3[0]}, [r4]           ;store result
-    vst1.32         {d3[1]}, [r0]
-    vst1.32         {d4[0]}, [r1]
-    vst1.32         {d4[1]}, [r2]
-
-    pop             {r4, pc}
-
-
-;---------------------
-firstpass_filter4x4_only
-    vabs.s32        q12, q14                ;get abs(filter_parameters)
-    vabs.s32        q13, q15
-
-    sub             r0, r0, #2              ;go back 2 columns of src data
-
-;First pass: output_height lines x output_width columns (4x4)
-    vld1.u8         {q3}, [r0], r1          ;load first 4-line src data
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vld1.u8         {q4}, [r0], r1
-    vdup.8          d1, d24[4]
-    vld1.u8         {q5}, [r0], r1
-    vdup.8          d2, d25[0]
-    vld1.u8         {q6}, [r0], r1
-
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d19, d8, d9, #5
-    vext.8          d20, d10, d11, #5
-    vext.8          d21, d12, d13, #5
-
-    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
-    vswp            d11, d12
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
-    vzip.32         d20, d21
-    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmull.u8        q8, d20, d5
-
-    vmov            q4, q3                  ;keep original src data in q4 q6
-    vmov            q6, q5
-
-    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
-    vshr.u64        q10, q6, #8
-    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp9_filter[0])
-    vmlal.u8        q8, d10, d0
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
-    vshr.u64        q5, q6, #32
-    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q8, d20, d1
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
-    vzip.32         d10, d11
-    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
-    vshr.u64        q10, q6, #16
-    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q8, d10, d4
-
-    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
-    vzip.32         d20, d21
-    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
-    vshr.u64        q5, q6, #24
-    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q8, d20, d2
-
-    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
-    vzip.32         d10, d11
-    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q10, d10, d3
-
-    add             r0, r4, lr
-    add             r1, r0, lr
-    add             r2, r1, lr
-
-    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q10
-
-    vqrshrun.s16    d27, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d28, q8, #7
-
-    vst1.32         {d27[0]}, [r4]          ;store result
-    vst1.32         {d27[1]}, [r0]
-    vst1.32         {d28[0]}, [r1]
-    vst1.32         {d28[1]}, [r2]
-
-    pop             {r4, pc}
-
-
-;---------------------
-secondpass_filter4x4_only
-    sub             r0, r0, r1, lsl #1
-    add             r3, r12, r3, lsl #5
-
-    vld1.32         {d27[0]}, [r0], r1      ;load src data
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    vld1.32         {d27[1]}, [r0], r1
-    vabs.s32        q7, q5
-    vld1.32         {d28[0]}, [r0], r1
-    vabs.s32        q8, q6
-    vld1.32         {d28[1]}, [r0], r1
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vld1.32         {d29[0]}, [r0], r1
-    vdup.8          d1, d14[4]
-    vld1.32         {d29[1]}, [r0], r1
-    vdup.8          d2, d15[0]
-    vld1.32         {d30[0]}, [r0], r1
-    vdup.8          d3, d15[4]
-    vld1.32         {d30[1]}, [r0], r1
-    vdup.8          d4, d16[0]
-    vld1.32         {d31[0]}, [r0], r1
-    vdup.8          d5, d16[4]
-
-    vext.8          d23, d27, d28, #4
-    vext.8          d24, d28, d29, #4
-    vext.8          d25, d29, d30, #4
-    vext.8          d26, d30, d31, #4
-
-    vmull.u8        q3, d27, d0             ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q4, d28, d0
-
-    vmull.u8        q5, d25, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmull.u8        q6, d26, d5
-
-    vmlsl.u8        q3, d29, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q4, d30, d4
-
-    vmlsl.u8        q5, d23, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q6, d24, d1
-
-    vmlal.u8        q3, d28, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q4, d29, d2
-
-    vmlal.u8        q5, d24, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmlal.u8        q6, d25, d3
-
-    add             r0, r4, lr
-    add             r1, r0, lr
-    add             r2, r1, lr
-
-    vqadd.s16       q5, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q6, q4
-
-    vqrshrun.s16    d3, q5, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d4, q6, #7
-
-    vst1.32         {d3[0]}, [r4]           ;store result
-    vst1.32         {d3[1]}, [r0]
-    vst1.32         {d4[0]}, [r1]
-    vst1.32         {d4[1]}, [r2]
-
-    pop             {r4, pc}
-
-    ENDP
-
-;-----------------
-
-    END
--- a/vp8/common/arm/neon/sixtappredict8x4_neon.asm
+++ /dev/null
@@ -1,473 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sixtap_predict8x4_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter8_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,  0,   0,  0
-
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; r4    unsigned char *dst_ptr,
-; stack(r5) int  dst_pitch
-
-|vp8_sixtap_predict8x4_neon| PROC
-    push            {r4-r5, lr}
-
-    adr             r12, filter8_coeff
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_filter8x4_only
-
-    add             r2, r12, r2, lsl #5     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-
-    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
-
-    beq             firstpass_filter8x4_only
-
-    sub             sp, sp, #32             ;reserve space on stack for temporary storage
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
-    mov             lr, sp
-    sub             r0, r0, r1, lsl #1
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vdup.8          d1, d24[4]
-    vdup.8          d2, d25[0]
-
-;First pass: output_height lines x output_width columns (9x8); 4 output rows + 5 extra for the 6-tap filter
-    vld1.u8         {q3}, [r0], r1          ;load src data
-    vdup.8          d3, d25[4]
-    vld1.u8         {q4}, [r0], r1
-    vdup.8          d4, d26[0]
-    vld1.u8         {q5}, [r0], r1
-    vdup.8          d5, d26[4]
-    vld1.u8         {q6}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q8, d8, d0
-    vmull.u8        q9, d10, d0
-    vmull.u8        q10, d12, d0
-
-    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d29, d8, d9, #1
-    vext.8          d30, d10, d11, #1
-    vext.8          d31, d12, d13, #1
-
-    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q8, d29, d1
-    vmlsl.u8        q9, d30, d1
-    vmlsl.u8        q10, d31, d1
-
-    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d29, d8, d9, #4
-    vext.8          d30, d10, d11, #4
-    vext.8          d31, d12, d13, #4
-
-    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q8, d29, d4
-    vmlsl.u8        q9, d30, d4
-    vmlsl.u8        q10, d31, d4
-
-    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d29, d8, d9, #2
-    vext.8          d30, d10, d11, #2
-    vext.8          d31, d12, d13, #2
-
-    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q8, d29, d2
-    vmlal.u8        q9, d30, d2
-    vmlal.u8        q10, d31, d2
-
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d8, d9, #5
-    vext.8          d30, d10, d11, #5
-    vext.8          d31, d12, d13, #5
-
-    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q8, d29, d5
-    vmlal.u8        q9, d30, d5
-    vmlal.u8        q10, d31, d5
-
-    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d29, d8, d9, #3
-    vext.8          d30, d10, d11, #3
-    vext.8          d31, d12, d13, #3
-
-    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q4, d29, d3
-    vmull.u8        q5, d30, d3
-    vmull.u8        q6, d31, d3
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vld1.u8         {q3}, [r0], r1          ;load src data
-
-    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d23, q8, #7
-    vqrshrun.s16    d24, q9, #7
-    vqrshrun.s16    d25, q10, #7
-
-    vld1.u8         {q4}, [r0], r1
-    vst1.u8         {d22}, [lr]!            ;store result
-    vld1.u8         {q5}, [r0], r1
-    vst1.u8         {d23}, [lr]!
-    vld1.u8         {q6}, [r0], r1
-    vst1.u8         {d24}, [lr]!
-    vld1.u8         {q7}, [r0], r1
-    vst1.u8         {d25}, [lr]!
-
-    ;first_pass filtering on the remaining 5 lines of data
-    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q9, d8, d0
-    vmull.u8        q10, d10, d0
-    vmull.u8        q11, d12, d0
-    vmull.u8        q12, d14, d0
-
-    vext.8          d27, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d28, d8, d9, #1
-    vext.8          d29, d10, d11, #1
-    vext.8          d30, d12, d13, #1
-    vext.8          d31, d14, d15, #1
-
-    vmlsl.u8        q8, d27, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q9, d28, d1
-    vmlsl.u8        q10, d29, d1
-    vmlsl.u8        q11, d30, d1
-    vmlsl.u8        q12, d31, d1
-
-    vext.8          d27, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d28, d8, d9, #4
-    vext.8          d29, d10, d11, #4
-    vext.8          d30, d12, d13, #4
-    vext.8          d31, d14, d15, #4
-
-    vmlsl.u8        q8, d27, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q9, d28, d4
-    vmlsl.u8        q10, d29, d4
-    vmlsl.u8        q11, d30, d4
-    vmlsl.u8        q12, d31, d4
-
-    vext.8          d27, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d28, d8, d9, #2
-    vext.8          d29, d10, d11, #2
-    vext.8          d30, d12, d13, #2
-    vext.8          d31, d14, d15, #2
-
-    vmlal.u8        q8, d27, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q9, d28, d2
-    vmlal.u8        q10, d29, d2
-    vmlal.u8        q11, d30, d2
-    vmlal.u8        q12, d31, d2
-
-    vext.8          d27, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d28, d8, d9, #5
-    vext.8          d29, d10, d11, #5
-    vext.8          d30, d12, d13, #5
-    vext.8          d31, d14, d15, #5
-
-    vmlal.u8        q8, d27, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q9, d28, d5
-    vmlal.u8        q10, d29, d5
-    vmlal.u8        q11, d30, d5
-    vmlal.u8        q12, d31, d5
-
-    vext.8          d27, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d28, d8, d9, #3
-    vext.8          d29, d10, d11, #3
-    vext.8          d30, d12, d13, #3
-    vext.8          d31, d14, d15, #3
-
-    vmull.u8        q3, d27, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q4, d28, d3
-    vmull.u8        q5, d29, d3
-    vmull.u8        q6, d30, d3
-    vmull.u8        q7, d31, d3
-
-    vqadd.s16       q8, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q9, q4
-    vqadd.s16       q10, q5
-    vqadd.s16       q11, q6
-    vqadd.s16       q12, q7
-
-    vqrshrun.s16    d26, q8, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d27, q9, #7
-    vqrshrun.s16    d28, q10, #7
-    vqrshrun.s16    d29, q11, #7
-    vqrshrun.s16    d30, q12, #7
-
-;Second pass: 8x4
-;secondpass_filter
-    add             r3, r12, r3, lsl #5
-    sub             lr, lr, #32
-
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    vld1.u8         {q11}, [lr]!
-
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    vld1.u8         {q12}, [lr]!
-
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vdup.8          d1, d14[4]
-    vdup.8          d2, d15[0]
-    vdup.8          d3, d15[4]
-    vdup.8          d4, d16[0]
-    vdup.8          d5, d16[4]
-
-    vmull.u8        q3, d22, d0             ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q4, d23, d0
-    vmull.u8        q5, d24, d0
-    vmull.u8        q6, d25, d0
-
-    vmlsl.u8        q3, d23, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q4, d24, d1
-    vmlsl.u8        q5, d25, d1
-    vmlsl.u8        q6, d26, d1
-
-    vmlsl.u8        q3, d26, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q4, d27, d4
-    vmlsl.u8        q5, d28, d4
-    vmlsl.u8        q6, d29, d4
-
-    vmlal.u8        q3, d24, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q4, d25, d2
-    vmlal.u8        q5, d26, d2
-    vmlal.u8        q6, d27, d2
-
-    vmlal.u8        q3, d27, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q4, d28, d5
-    vmlal.u8        q5, d29, d5
-    vmlal.u8        q6, d30, d5
-
-    vmull.u8        q7, d25, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q8, d26, d3
-    vmull.u8        q9, d27, d3
-    vmull.u8        q10, d28, d3
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vst1.u8         {d7}, [r4], r5
-    vst1.u8         {d8}, [r4], r5
-    vst1.u8         {d9}, [r4], r5
-
-    add             sp, sp, #32
-    pop             {r4-r5,pc}
-
-;--------------------
-firstpass_filter8x4_only
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
-    vld1.u8         {q3}, [r0], r1          ;load src data
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vld1.u8         {q4}, [r0], r1
-    vdup.8          d1, d24[4]
-    vld1.u8         {q5}, [r0], r1
-    vdup.8          d2, d25[0]
-    vld1.u8         {q6}, [r0], r1
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-;First pass: output_height lines x output_width columns (4x8)
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q8, d8, d0
-    vmull.u8        q9, d10, d0
-    vmull.u8        q10, d12, d0
-
-    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d29, d8, d9, #1
-    vext.8          d30, d10, d11, #1
-    vext.8          d31, d12, d13, #1
-
-    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q8, d29, d1
-    vmlsl.u8        q9, d30, d1
-    vmlsl.u8        q10, d31, d1
-
-    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d29, d8, d9, #4
-    vext.8          d30, d10, d11, #4
-    vext.8          d31, d12, d13, #4
-
-    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q8, d29, d4
-    vmlsl.u8        q9, d30, d4
-    vmlsl.u8        q10, d31, d4
-
-    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d29, d8, d9, #2
-    vext.8          d30, d10, d11, #2
-    vext.8          d31, d12, d13, #2
-
-    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q8, d29, d2
-    vmlal.u8        q9, d30, d2
-    vmlal.u8        q10, d31, d2
-
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d8, d9, #5
-    vext.8          d30, d10, d11, #5
-    vext.8          d31, d12, d13, #5
-
-    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q8, d29, d5
-    vmlal.u8        q9, d30, d5
-    vmlal.u8        q10, d31, d5
-
-    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d29, d8, d9, #3
-    vext.8          d30, d10, d11, #3
-    vext.8          d31, d12, d13, #3
-
-    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q4, d29, d3
-    vmull.u8        q5, d30, d3
-    vmull.u8        q6, d31, d3
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d23, q8, #7
-    vqrshrun.s16    d24, q9, #7
-    vqrshrun.s16    d25, q10, #7
-
-    vst1.u8         {d22}, [r4], r5         ;store result
-    vst1.u8         {d23}, [r4], r5
-    vst1.u8         {d24}, [r4], r5
-    vst1.u8         {d25}, [r4], r5
-
-    pop             {r4-r5,pc}
-
-;---------------------
-secondpass_filter8x4_only
-;Second pass: 8x4
-    add             r3, r12, r3, lsl #5
-    sub             r0, r0, r1, lsl #1
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    vld1.u8         {d22}, [r0], r1
-    vld1.u8         {d23}, [r0], r1
-    vld1.u8         {d24}, [r0], r1
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vld1.u8         {d25}, [r0], r1
-    vdup.8          d1, d14[4]
-    vld1.u8         {d26}, [r0], r1
-    vdup.8          d2, d15[0]
-    vld1.u8         {d27}, [r0], r1
-    vdup.8          d3, d15[4]
-    vld1.u8         {d28}, [r0], r1
-    vdup.8          d4, d16[0]
-    vld1.u8         {d29}, [r0], r1
-    vdup.8          d5, d16[4]
-    vld1.u8         {d30}, [r0], r1
-
-    vmull.u8        q3, d22, d0             ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q4, d23, d0
-    vmull.u8        q5, d24, d0
-    vmull.u8        q6, d25, d0
-
-    vmlsl.u8        q3, d23, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q4, d24, d1
-    vmlsl.u8        q5, d25, d1
-    vmlsl.u8        q6, d26, d1
-
-    vmlsl.u8        q3, d26, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q4, d27, d4
-    vmlsl.u8        q5, d28, d4
-    vmlsl.u8        q6, d29, d4
-
-    vmlal.u8        q3, d24, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q4, d25, d2
-    vmlal.u8        q5, d26, d2
-    vmlal.u8        q6, d27, d2
-
-    vmlal.u8        q3, d27, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q4, d28, d5
-    vmlal.u8        q5, d29, d5
-    vmlal.u8        q6, d30, d5
-
-    vmull.u8        q7, d25, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q8, d26, d3
-    vmull.u8        q9, d27, d3
-    vmull.u8        q10, d28, d3
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vst1.u8         {d7}, [r4], r5
-    vst1.u8         {d8}, [r4], r5
-    vst1.u8         {d9}, [r4], r5
-
-    pop             {r4-r5,pc}
-
-    ENDP
-
-;-----------------
-
-    END
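Both six-tap predictors in this file implement the same separable scheme: a horizontal pass over (output_height + 5) source rows into a temporary buffer, then a vertical pass over that buffer, with each pass rounded back to 8 bits by vqrshrun.s16 #7 (add 64, shift right by 7, saturate to [0, 255]). A minimal C sketch of that computation, assuming signed taps taken from one filter8_coeff row (function and variable names here are illustrative, not from the tree):

static unsigned char clamp8(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* src points at the top-left pixel of the block; each tap row sums to 128. */
static void sixtap_8xH_c(const unsigned char *src, int src_stride,
                         unsigned char *dst, int dst_pitch,
                         const int hf[6], const int vf[6], int h) {
  unsigned char tmp[(8 + 5) * 8];   /* (h + 5) rows of 8, h <= 8 */
  int r, c, k;

  /* First pass: horizontal filter, starting at src_ptr[-2] of row -2. */
  for (r = 0; r < h + 5; r++)
    for (c = 0; c < 8; c++) {
      int sum = 0;
      for (k = 0; k < 6; k++)
        sum += hf[k] * src[(r - 2) * src_stride + (c + k - 2)];
      tmp[r * 8 + c] = clamp8((sum + 64) >> 7);   /* vqrshrun.s16 #7 */
    }

  /* Second pass: vertical filter over the intermediate rows. */
  for (r = 0; r < h; r++)
    for (c = 0; c < 8; c++) {
      int sum = 0;
      for (k = 0; k < 6; k++)
        sum += vf[k] * tmp[(r + k) * 8 + c];
      dst[r * dst_pitch + c] = clamp8((sum + 64) >> 7);
    }
}

The assembly reaches the same signs by taking vabs of the coefficients and using vmlsl (multiply-subtract) for the negative taps 1 and 4.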
--- a/vp8/common/arm/neon/sixtappredict8x8_neon.asm
+++ /dev/null
@@ -1,524 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sixtap_predict8x8_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-filter8_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,  0,   0,  0
-
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int  dst_pitch
-
-|vp8_sixtap_predict8x8_neon| PROC
-    push            {r4-r5, lr}
-
-    adr             r12, filter8_coeff
-
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_filter8x8_only
-
-    add             r2, r12, r2, lsl #5     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-
-    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
-
-    beq             firstpass_filter8x8_only
-
-    sub             sp, sp, #64             ;reserve space on stack for temporary storage
-    mov             lr, sp
-
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    mov             r2, #2                  ;loop counter
-    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
-    sub             r0, r0, r1, lsl #1
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vdup.8          d1, d24[4]
-    vdup.8          d2, d25[0]
-
-;First pass: output_height lines x output_width columns (13x8)
-    vld1.u8         {q3}, [r0], r1          ;load src data
-    vdup.8          d3, d25[4]
-    vld1.u8         {q4}, [r0], r1
-    vdup.8          d4, d26[0]
-    vld1.u8         {q5}, [r0], r1
-    vdup.8          d5, d26[4]
-    vld1.u8         {q6}, [r0], r1
-
-filt_blk2d_fp8x8_loop_neon
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q8, d8, d0
-    vmull.u8        q9, d10, d0
-    vmull.u8        q10, d12, d0
-
-    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d29, d8, d9, #1
-    vext.8          d30, d10, d11, #1
-    vext.8          d31, d12, d13, #1
-
-    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q8, d29, d1
-    vmlsl.u8        q9, d30, d1
-    vmlsl.u8        q10, d31, d1
-
-    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d29, d8, d9, #4
-    vext.8          d30, d10, d11, #4
-    vext.8          d31, d12, d13, #4
-
-    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q8, d29, d4
-    vmlsl.u8        q9, d30, d4
-    vmlsl.u8        q10, d31, d4
-
-    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d29, d8, d9, #2
-    vext.8          d30, d10, d11, #2
-    vext.8          d31, d12, d13, #2
-
-    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q8, d29, d2
-    vmlal.u8        q9, d30, d2
-    vmlal.u8        q10, d31, d2
-
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d8, d9, #5
-    vext.8          d30, d10, d11, #5
-    vext.8          d31, d12, d13, #5
-
-    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q8, d29, d5
-    vmlal.u8        q9, d30, d5
-    vmlal.u8        q10, d31, d5
-
-    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d29, d8, d9, #3
-    vext.8          d30, d10, d11, #3
-    vext.8          d31, d12, d13, #3
-
-    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q4, d29, d3
-    vmull.u8        q5, d30, d3
-    vmull.u8        q6, d31, d3
-
-    subs            r2, r2, #1
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vld1.u8         {q3}, [r0], r1          ;load src data
-
-    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d23, q8, #7
-    vqrshrun.s16    d24, q9, #7
-    vqrshrun.s16    d25, q10, #7
-
-    vst1.u8         {d22}, [lr]!            ;store result
-    vld1.u8         {q4}, [r0], r1
-    vst1.u8         {d23}, [lr]!
-    vld1.u8         {q5}, [r0], r1
-    vst1.u8         {d24}, [lr]!
-    vld1.u8         {q6}, [r0], r1
-    vst1.u8         {d25}, [lr]!
-
-    bne             filt_blk2d_fp8x8_loop_neon
-
-    ;first_pass filtering on the remaining 5 lines of data
-    ;vld1.u8            {q3}, [r0], r1          ;load src data
-    ;vld1.u8            {q4}, [r0], r1
-    ;vld1.u8            {q5}, [r0], r1
-    ;vld1.u8            {q6}, [r0], r1
-    vld1.u8         {q7}, [r0], r1
-
-    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q9, d8, d0
-    vmull.u8        q10, d10, d0
-    vmull.u8        q11, d12, d0
-    vmull.u8        q12, d14, d0
-
-    vext.8          d27, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d28, d8, d9, #1
-    vext.8          d29, d10, d11, #1
-    vext.8          d30, d12, d13, #1
-    vext.8          d31, d14, d15, #1
-
-    vmlsl.u8        q8, d27, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q9, d28, d1
-    vmlsl.u8        q10, d29, d1
-    vmlsl.u8        q11, d30, d1
-    vmlsl.u8        q12, d31, d1
-
-    vext.8          d27, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d28, d8, d9, #4
-    vext.8          d29, d10, d11, #4
-    vext.8          d30, d12, d13, #4
-    vext.8          d31, d14, d15, #4
-
-    vmlsl.u8        q8, d27, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q9, d28, d4
-    vmlsl.u8        q10, d29, d4
-    vmlsl.u8        q11, d30, d4
-    vmlsl.u8        q12, d31, d4
-
-    vext.8          d27, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d28, d8, d9, #2
-    vext.8          d29, d10, d11, #2
-    vext.8          d30, d12, d13, #2
-    vext.8          d31, d14, d15, #2
-
-    vmlal.u8        q8, d27, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q9, d28, d2
-    vmlal.u8        q10, d29, d2
-    vmlal.u8        q11, d30, d2
-    vmlal.u8        q12, d31, d2
-
-    vext.8          d27, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d28, d8, d9, #5
-    vext.8          d29, d10, d11, #5
-    vext.8          d30, d12, d13, #5
-    vext.8          d31, d14, d15, #5
-
-    vmlal.u8        q8, d27, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q9, d28, d5
-    vmlal.u8        q10, d29, d5
-    vmlal.u8        q11, d30, d5
-    vmlal.u8        q12, d31, d5
-
-    vext.8          d27, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d28, d8, d9, #3
-    vext.8          d29, d10, d11, #3
-    vext.8          d30, d12, d13, #3
-    vext.8          d31, d14, d15, #3
-
-    vmull.u8        q3, d27, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q4, d28, d3
-    vmull.u8        q5, d29, d3
-    vmull.u8        q6, d30, d3
-    vmull.u8        q7, d31, d3
-
-    vqadd.s16       q8, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q9, q4
-    vqadd.s16       q10, q5
-    vqadd.s16       q11, q6
-    vqadd.s16       q12, q7
-
-    add             r3, r12, r3, lsl #5
-
-    vqrshrun.s16    d26, q8, #7             ;shift/round/saturate to u8
-    sub             lr, lr, #64
-    vqrshrun.s16    d27, q9, #7
-    vld1.u8         {q9}, [lr]!             ;load intermediate data from stack
-    vqrshrun.s16    d28, q10, #7
-    vld1.u8         {q10}, [lr]!
-
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-
-    vqrshrun.s16    d29, q11, #7
-    vld1.u8         {q11}, [lr]!
-
-    vabs.s32        q7, q5
-    vabs.s32        q8, q6
-
-    vqrshrun.s16    d30, q12, #7
-    vld1.u8         {q12}, [lr]!
-
-;Second pass: 8x8
-    mov             r3, #2                  ;loop counter
-
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vdup.8          d1, d14[4]
-    vdup.8          d2, d15[0]
-    vdup.8          d3, d15[4]
-    vdup.8          d4, d16[0]
-    vdup.8          d5, d16[4]
-
-filt_blk2d_sp8x8_loop_neon
-    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q4, d19, d0
-    vmull.u8        q5, d20, d0
-    vmull.u8        q6, d21, d0
-
-    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q4, d20, d1
-    vmlsl.u8        q5, d21, d1
-    vmlsl.u8        q6, d22, d1
-
-    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q4, d23, d4
-    vmlsl.u8        q5, d24, d4
-    vmlsl.u8        q6, d25, d4
-
-    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q4, d21, d2
-    vmlal.u8        q5, d22, d2
-    vmlal.u8        q6, d23, d2
-
-    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q4, d24, d5
-    vmlal.u8        q5, d25, d5
-    vmlal.u8        q6, d26, d5
-
-    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q8, d22, d3
-    vmull.u8        q9, d23, d3
-    vmull.u8        q10, d24, d3
-
-    subs            r3, r3, #1
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vmov            q9, q11
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vmov            q10, q12
-    vst1.u8         {d7}, [r4], r5
-    vmov            q11, q13
-    vst1.u8         {d8}, [r4], r5
-    vmov            q12, q14
-    vst1.u8         {d9}, [r4], r5
-    vmov            d26, d30
-
-    bne filt_blk2d_sp8x8_loop_neon
-
-    add             sp, sp, #64
-    pop             {r4-r5,pc}
-
-;---------------------
-firstpass_filter8x8_only
-    ;add                r2, r12, r2, lsl #5     ;calculate filter location
-    ;vld1.s32       {q14, q15}, [r2]        ;load first_pass filter
-    vabs.s32        q12, q14
-    vabs.s32        q13, q15
-
-    mov             r2, #2                  ;loop counter
-    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
-
-    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
-    vdup.8          d1, d24[4]
-    vdup.8          d2, d25[0]
-    vdup.8          d3, d25[4]
-    vdup.8          d4, d26[0]
-    vdup.8          d5, d26[4]
-
-;First pass: output_height lines x output_width columns (8x8)
-filt_blk2d_fpo8x8_loop_neon
-    vld1.u8         {q3}, [r0], r1          ;load src data
-    vld1.u8         {q4}, [r0], r1
-    vld1.u8         {q5}, [r0], r1
-    vld1.u8         {q6}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q8, d8, d0
-    vmull.u8        q9, d10, d0
-    vmull.u8        q10, d12, d0
-
-    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
-    vext.8          d29, d8, d9, #1
-    vext.8          d30, d10, d11, #1
-    vext.8          d31, d12, d13, #1
-
-    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q8, d29, d1
-    vmlsl.u8        q9, d30, d1
-    vmlsl.u8        q10, d31, d1
-
-    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
-    vext.8          d29, d8, d9, #4
-    vext.8          d30, d10, d11, #4
-    vext.8          d31, d12, d13, #4
-
-    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q8, d29, d4
-    vmlsl.u8        q9, d30, d4
-    vmlsl.u8        q10, d31, d4
-
-    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
-    vext.8          d29, d8, d9, #2
-    vext.8          d30, d10, d11, #2
-    vext.8          d31, d12, d13, #2
-
-    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q8, d29, d2
-    vmlal.u8        q9, d30, d2
-    vmlal.u8        q10, d31, d2
-
-    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
-    vext.8          d29, d8, d9, #5
-    vext.8          d30, d10, d11, #5
-    vext.8          d31, d12, d13, #5
-
-    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q8, d29, d5
-    vmlal.u8        q9, d30, d5
-    vmlal.u8        q10, d31, d5
-
-    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
-    vext.8          d29, d8, d9, #3
-    vext.8          d30, d10, d11, #3
-    vext.8          d31, d12, d13, #3
-
-    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q4, d29, d3
-    vmull.u8        q5, d30, d3
-    vmull.u8        q6, d31, d3
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    subs            r2, r2, #1
-
-    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
-    vqrshrun.s16    d23, q8, #7
-    vqrshrun.s16    d24, q9, #7
-    vqrshrun.s16    d25, q10, #7
-
-    vst1.u8         {d22}, [r4], r5         ;store result
-    vst1.u8         {d23}, [r4], r5
-    vst1.u8         {d24}, [r4], r5
-    vst1.u8         {d25}, [r4], r5
-
-    bne             filt_blk2d_fpo8x8_loop_neon
-
-    pop             {r4-r5,pc}
-
-;---------------------
-secondpass_filter8x8_only
-    sub             r0, r0, r1, lsl #1
-    add             r3, r12, r3, lsl #5
-
-    vld1.u8         {d18}, [r0], r1         ;load src data
-    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
-    vld1.u8         {d19}, [r0], r1
-    vabs.s32        q7, q5
-    vld1.u8         {d20}, [r0], r1
-    vabs.s32        q8, q6
-    vld1.u8         {d21}, [r0], r1
-    mov             r3, #2                  ;loop counter
-    vld1.u8         {d22}, [r0], r1
-    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
-    vld1.u8         {d23}, [r0], r1
-    vdup.8          d1, d14[4]
-    vld1.u8         {d24}, [r0], r1
-    vdup.8          d2, d15[0]
-    vld1.u8         {d25}, [r0], r1
-    vdup.8          d3, d15[4]
-    vld1.u8         {d26}, [r0], r1
-    vdup.8          d4, d16[0]
-    vld1.u8         {d27}, [r0], r1
-    vdup.8          d5, d16[4]
-    vld1.u8         {d28}, [r0], r1
-    vld1.u8         {d29}, [r0], r1
-    vld1.u8         {d30}, [r0], r1
-
-;Second pass: 8x8
-filt_blk2d_spo8x8_loop_neon
-    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp9_filter[0])
-    vmull.u8        q4, d19, d0
-    vmull.u8        q5, d20, d0
-    vmull.u8        q6, d21, d0
-
-    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp9_filter[1])
-    vmlsl.u8        q4, d20, d1
-    vmlsl.u8        q5, d21, d1
-    vmlsl.u8        q6, d22, d1
-
-    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp9_filter[4])
-    vmlsl.u8        q4, d23, d4
-    vmlsl.u8        q5, d24, d4
-    vmlsl.u8        q6, d25, d4
-
-    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp9_filter[2])
-    vmlal.u8        q4, d21, d2
-    vmlal.u8        q5, d22, d2
-    vmlal.u8        q6, d23, d2
-
-    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp9_filter[5])
-    vmlal.u8        q4, d24, d5
-    vmlal.u8        q5, d25, d5
-    vmlal.u8        q6, d26, d5
-
-    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp9_filter[3])
-    vmull.u8        q8, d22, d3
-    vmull.u8        q9, d23, d3
-    vmull.u8        q10, d24, d3
-
-    subs            r3, r3, #1
-
-    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
-    vqadd.s16       q8, q4
-    vqadd.s16       q9, q5
-    vqadd.s16       q10, q6
-
-    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
-    vqrshrun.s16    d7, q8, #7
-    vqrshrun.s16    d8, q9, #7
-    vqrshrun.s16    d9, q10, #7
-
-    vmov            q9, q11
-    vst1.u8         {d6}, [r4], r5          ;store result
-    vmov            q10, q12
-    vst1.u8         {d7}, [r4], r5
-    vmov            q11, q13
-    vst1.u8         {d8}, [r4], r5
-    vmov            q12, q14
-    vst1.u8         {d9}, [r4], r5
-    vmov            d26, d30
-
-    bne filt_blk2d_spo8x8_loop_neon
-
-    pop             {r4-r5,pc}
-
-    ENDP
-
-;-----------------
-
-    END
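The first-pass dimensions noted in the comments (9x8 for the 8x4 predictor, 13x8 here) are just the vertical filter's footprint; as a one-line sketch:

/* Source rows needed when an N-tap vertical filter produces out_h rows: */
static int first_pass_rows(int out_h, int taps) { return out_h + taps - 1; }
/* 8x8: 8 + 6 - 1 = 13;  8x4: 4 + 6 - 1 = 9. */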
--- a/vp8/common/arm/recon_arm.h
+++ /dev/null
@@ -1,90 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef RECON_ARM_H
-#define RECON_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_recon_block(vp9_recon_b_armv6);
-extern prototype_recon_block(vp9_recon2b_armv6);
-extern prototype_recon_block(vp9_recon4b_armv6);
-
-extern prototype_copy_block(vp9_copy_mem8x8_v6);
-extern prototype_copy_block(vp9_copy_mem8x4_v6);
-extern prototype_copy_block(vp9_copy_mem16x16_v6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_recon_recon
-#define vp8_recon_recon vp9_recon_b_armv6
-
-#undef  vp8_recon_recon2
-#define vp8_recon_recon2 vp9_recon2b_armv6
-
-#undef  vp8_recon_recon4
-#define vp8_recon_recon4 vp9_recon4b_armv6
-
-#undef  vp8_recon_copy8x8
-#define vp8_recon_copy8x8 vp9_copy_mem8x8_v6
-
-#undef  vp8_recon_copy8x4
-#define vp8_recon_copy8x4 vp9_copy_mem8x4_v6
-
-#undef  vp8_recon_copy16x16
-#define vp8_recon_copy16x16 vp9_copy_mem16x16_v6
-#endif
-#endif
-
-#if HAVE_ARMV7
-extern prototype_recon_block(vp9_recon_b_neon);
-extern prototype_recon_block(vp9_recon2b_neon);
-extern prototype_recon_block(vp9_recon4b_neon);
-
-extern prototype_copy_block(vp9_copy_mem8x8_neon);
-extern prototype_copy_block(vp9_copy_mem8x4_neon);
-extern prototype_copy_block(vp9_copy_mem16x16_neon);
-
-extern prototype_recon_macroblock(vp9_recon_mb_neon);
-
-extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_neon);
-extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_s_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_recon_recon
-#define vp8_recon_recon vp9_recon_b_neon
-
-#undef  vp8_recon_recon2
-#define vp8_recon_recon2 vp9_recon2b_neon
-
-#undef  vp8_recon_recon4
-#define vp8_recon_recon4 vp9_recon4b_neon
-
-#undef  vp8_recon_copy8x8
-#define vp8_recon_copy8x8 vp9_copy_mem8x8_neon
-
-#undef  vp8_recon_copy8x4
-#define vp8_recon_copy8x4 vp9_copy_mem8x4_neon
-
-#undef  vp8_recon_copy16x16
-#define vp8_recon_copy16x16 vp9_copy_mem16x16_neon
-
-#undef  vp8_recon_recon_mb
-#define vp8_recon_recon_mb vp9_recon_mb_neon
-
-#undef  vp9_recon_build_intra_predictors_mby
-#define vp9_recon_build_intra_predictors_mby vp9_build_intra_predictors_mby_neon
-
-#undef  vp9_recon_build_intra_predictors_mby_s
-#define vp9_recon_build_intra_predictors_mby_s vp9_build_intra_predictors_mby_s_neon
-
-#endif
-#endif
-
-#endif
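The #undef/#define pairs above are the compile-time side of the dispatch scheme: a generic header binds each vp8_recon_* name to a default, and a platform header like this one rebinds it when runtime CPU detection is off. A hedged sketch of the shape of that mechanism (the fallback name, exact signature, and vtable layout below are assumptions, not copied from the tree):

/* Generic-header side (sketch): a default binding each platform header
   may override, exactly as recon_arm.h does above. */
#ifndef vp8_recon_recon
#define vp8_recon_recon vp9_recon_b_c            /* plain C fallback */
#endif

/* With CONFIG_RUNTIME_CPU_DETECT, the same names are assumed to resolve
   through a table of function pointers filled in after probing the CPU: */
typedef struct {
  void (*recon)(unsigned char *pred, short *diff,
                unsigned char *dst, int pitch);  /* shape assumed */
} recon_rtcd_vtable_t;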
--- a/vp8/common/arm/reconintra_arm.c
+++ /dev/null
@@ -1,62 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vp8/common/blockd.h"
-#include "vp8/common/reconintra.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/recon.h"
-
-#if HAVE_ARMV7
-extern void vp9_build_intra_predictors_mby_neon_func(
-  unsigned char *y_buffer,
-  unsigned char *ypred_ptr,
-  int y_stride,
-  int mode,
-  int Up,
-  int Left);
-
-void vp9_build_intra_predictors_mby_neon(MACROBLOCKD *xd) {
-  unsigned char *y_buffer = xd->dst.y_buffer;
-  unsigned char *ypred_ptr = xd->predictor;
-  int y_stride = xd->dst.y_stride;
-  int mode = xd->mode_info_context->mbmi.mode;
-  int Up = xd->up_available;
-  int Left = xd->left_available;
-
-  vp9_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr,
-                                           y_stride, mode, Up, Left);
-}
-#endif
-
-
-#if HAVE_ARMV7
-extern void vp9_build_intra_predictors_mby_s_neon_func(
-  unsigned char *y_buffer,
-  unsigned char *ypred_ptr,
-  int y_stride,
-  int mode,
-  int Up,
-  int Left);
-
-void vp9_build_intra_predictors_mby_s_neon(MACROBLOCKD *xd) {
-  unsigned char *y_buffer = xd->dst.y_buffer;
-  unsigned char *ypred_ptr = xd->predictor;
-  int y_stride = xd->dst.y_stride;
-  int mode = xd->mode_info_context->mbmi.mode;
-  int Up = xd->up_available;
-  int Left = xd->left_available;
-
-  vp9_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr,
-                                             y_stride, mode, Up, Left);
-}
-
-#endif
--- a/vp8/common/arm/subpixel_arm.h
+++ /dev/null
@@ -1,89 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef SUBPIXEL_ARM_H
-#define SUBPIXEL_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_armv6);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_armv6);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_armv6);
-extern prototype_subpixel_predict(vp9_sixtap_predict_armv6);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_armv6);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_armv6);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x4_armv6);
-extern prototype_subpixel_predict(vp9_bilinear_predict4x4_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_armv6
-
-#undef  vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_armv6
-
-#undef  vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_armv6
-
-#undef  vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_armv6
-
-#undef  vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_armv6
-
-#undef  vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_armv6
-
-#undef  vp9_subpix_bilinear8x4
-#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_armv6
-
-#undef  vp9_subpix_bilinear4x4
-#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_armv6
-#endif
-#endif
-
-#if HAVE_ARMV7
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_neon);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_neon);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_neon);
-extern prototype_subpixel_predict(vp9_sixtap_predict_neon);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_neon);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_neon);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x4_neon);
-extern prototype_subpixel_predict(vp9_bilinear_predict4x4_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_neon
-
-#undef  vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_neon
-
-#undef  vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_neon
-
-#undef  vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_neon
-
-#undef  vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_neon
-
-#undef  vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_neon
-
-#undef  vp9_subpix_bilinear8x4
-#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_neon
-
-#undef  vp9_subpix_bilinear4x4
-#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_neon
-#endif
-#endif
-
-#endif
--- a/vp8/common/asm_com_offsets.c
+++ /dev/null
@@ -1,40 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vpx/vpx_codec.h"
-#include "vpx_ports/asm_offsets.h"
-#include "vpx_scale/yv12config.h"
-
-BEGIN
-
-/* vpx_scale */
-DEFINE(yv12_buffer_config_y_width,              offsetof(YV12_BUFFER_CONFIG, y_width));
-DEFINE(yv12_buffer_config_y_height,             offsetof(YV12_BUFFER_CONFIG, y_height));
-DEFINE(yv12_buffer_config_y_stride,             offsetof(YV12_BUFFER_CONFIG, y_stride));
-DEFINE(yv12_buffer_config_uv_width,             offsetof(YV12_BUFFER_CONFIG, uv_width));
-DEFINE(yv12_buffer_config_uv_height,            offsetof(YV12_BUFFER_CONFIG, uv_height));
-DEFINE(yv12_buffer_config_uv_stride,            offsetof(YV12_BUFFER_CONFIG, uv_stride));
-DEFINE(yv12_buffer_config_y_buffer,             offsetof(YV12_BUFFER_CONFIG, y_buffer));
-DEFINE(yv12_buffer_config_u_buffer,             offsetof(YV12_BUFFER_CONFIG, u_buffer));
-DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_buffer));
-DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));
-DEFINE(VP8BORDERINPIXELS_VAL,                   VP8BORDERINPIXELS);
-
-END
-
-/* add asserts for any offset that is not supported by assembly code */
-/* add asserts for any size that is not supported by assembly code */
-
-#if HAVE_ARMV7
-/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
-ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
-#endif
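This file exists so assembly can share struct layouts with C: each DEFINE() is compiled, and the build scrapes the resulting constants into an include file for the assembler. A sketch of the equivalence being relied on (the EQU spelling of the generated include is an assumption):

#include <stddef.h>
#include "vpx_scale/yv12config.h"

/* The generated assembler include is assumed to contain lines equivalent to
 *     yv12_buffer_config_y_stride EQU <offsetof(YV12_BUFFER_CONFIG, y_stride)>
 * letting ARM code address fields symbolically:
 *     ldr r2, [r0, #yv12_buffer_config_y_stride]  ; r0 = YV12_BUFFER_CONFIG *
 * which matches the address the C compiler computes for: */
static int y_stride_of(const YV12_BUFFER_CONFIG *cfg) {
  return cfg->y_stride;
}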
--- a/vp8/common/blockd.c
+++ /dev/null
@@ -1,29 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "blockd.h"
-#include "vpx_mem/vpx_mem.h"
-
-
-const unsigned char vp9_block2left[25] = {
-  0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-const unsigned char vp9_block2above[25] = {
-  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
-};
-
-const unsigned char vp9_block2left_8x8[25] = {
-  0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8
-};
-const unsigned char vp9_block2above_8x8[25] = {
-  0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8
-};
-
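These tables map a block index (16 Y + 4 U + 4 V + 1 Y2 = 25) to a slot in the per-macroblock entropy context. The assumed consumer pattern, using the ENTROPY_CONTEXT_PLANES type declared in blockd.h below, flattens the struct (y1[0..3], u[0..1], v[0..1], y2 = 9 contexts) and indexes it directly; a sketch:

#include "vp8/common/blockd.h"  /* pre-rename path; MACROBLOCKD, ENTROPY_CONTEXT */

static void block_contexts(MACROBLOCKD *xd, int b,
                           ENTROPY_CONTEXT **a, ENTROPY_CONTEXT **l) {
  *a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above[b];
  *l = (ENTROPY_CONTEXT *)xd->left_context  + vp9_block2left[b];
}
/* e.g. b == 18 (a U block): vp9_block2left[18] == 5, i.e. the u[1] context. */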
--- a/vp8/common/blockd.h
+++ /dev/null
@@ -1,518 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_BLOCKD_H
-#define __INC_BLOCKD_H
-
-void vpx_log(const char *format, ...);
-
-#include "vpx_ports/config.h"
-#include "vpx_scale/yv12config.h"
-#include "mv.h"
-#include "treecoder.h"
-#include "subpixel.h"
-#include "vpx_ports/mem.h"
-#include "common.h"
-
-#define TRUE    1
-#define FALSE   0
-
-// #define MODE_STATS
-
-/*#define DCPRED 1*/
-#define DCPREDSIMTHRESH 0
-#define DCPREDCNTTHRESH 3
-
-#define MB_FEATURE_TREE_PROBS   3
-#define PREDICTION_PROBS 3
-
-#define MBSKIP_CONTEXTS 3
-
-#define MAX_MB_SEGMENTS         4
-
-#define MAX_REF_LF_DELTAS       4
-#define MAX_MODE_LF_DELTAS      4
-
-/* Segment Feature Masks */
-#define SEGMENT_DELTADATA   0
-#define SEGMENT_ABSDATA     1
-#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
-#define MAX_MV_REFS 19
-#endif
-
-typedef struct {
-  int r, c;
-} POS;
-
-typedef enum PlaneType {
-  PLANE_TYPE_Y_NO_DC = 0,
-  PLANE_TYPE_Y2,
-  PLANE_TYPE_UV,
-  PLANE_TYPE_Y_WITH_DC,
-} PLANE_TYPE;
-
-typedef char ENTROPY_CONTEXT;
-typedef struct {
-  ENTROPY_CONTEXT y1[4];
-  ENTROPY_CONTEXT u[2];
-  ENTROPY_CONTEXT v[2];
-  ENTROPY_CONTEXT y2;
-} ENTROPY_CONTEXT_PLANES;
-
-extern const unsigned char vp9_block2left[25];
-extern const unsigned char vp9_block2above[25];
-extern const unsigned char vp9_block2left_8x8[25];
-extern const unsigned char vp9_block2above_8x8[25];
-
-#define VP9_COMBINEENTROPYCONTEXTS( Dest, A, B) \
-  Dest = ((A)!=0) + ((B)!=0);
-
-typedef enum {
-  KEY_FRAME = 0,
-  INTER_FRAME = 1
-} FRAME_TYPE;
-
-typedef enum
-{
-  SIXTAP   = 0,
-  BILINEAR = 1,
-  EIGHTTAP = 2,
-  EIGHTTAP_SHARP = 3,
-  SWITCHABLE  /* should be the last one */
-} INTERPOLATIONFILTERTYPE;
-
-typedef enum
-{
-  DC_PRED,            /* average of above and left pixels */
-  V_PRED,             /* vertical prediction */
-  H_PRED,             /* horizontal prediction */
-  D45_PRED,           /* Directional 45 deg prediction  [anti-clockwise from 0 deg hor] */
-  D135_PRED,          /* Directional 135 deg prediction [anti-clockwise from 0 deg hor] */
-  D117_PRED,          /* Directional 117 deg prediction [anti-clockwise from 0 deg hor] */
-  D153_PRED,          /* Directional 153 deg prediction [anti-clockwise from 0 deg hor] */
-  D27_PRED,           /* Directional 27 deg prediction  [anti-clockwise from 0 deg hor] */
-  D63_PRED,           /* Directional 63 deg prediction  [anti-clockwise from 0 deg hor] */
-  TM_PRED,            /* Truemotion prediction */
-  I8X8_PRED,          /* 8x8 based prediction, each 8x8 has its own prediction mode */
-  B_PRED,             /* block based prediction, each block has its own prediction mode */
-
-  NEARESTMV,
-  NEARMV,
-  ZEROMV,
-  NEWMV,
-  SPLITMV,
-
-  MB_MODE_COUNT
-} MB_PREDICTION_MODE;
-
-// Segment level features.
-typedef enum {
-  SEG_LVL_ALT_Q = 0,               // Use alternate quantizer
-  SEG_LVL_ALT_LF = 1,              // Use alternate loop filter value
-  SEG_LVL_REF_FRAME = 2,           // Optional Segment reference frame
-  SEG_LVL_MODE = 3,                // Optional Segment mode
-  SEG_LVL_EOB = 4,                 // EOB end stop marker.
-  SEG_LVL_TRANSFORM = 5,           // Block transform size.
-  SEG_LVL_MAX = 6                  // Number of MB level features supported
-
-} SEG_LVL_FEATURES;
-
-// Supported transform sizes.
-typedef enum {
-  TX_4X4,                      // 4x4 dct transform
-  TX_8X8,                      // 8x8 dct transform
-  TX_16X16,                    // 16x16 dct transform
-  TX_SIZE_MAX                  // Number of different transforms available
-} TX_SIZE;
-
-typedef enum {
-  DCT_DCT   = 0,                      // DCT  in both horizontal and vertical
-  ADST_DCT  = 1,                      // ADST in vertical, DCT in horizontal
-  DCT_ADST  = 2,                      // DCT  in vertical, ADST in horizontal
-  ADST_ADST = 3                       // ADST in both directions
-} TX_TYPE;
-
-#define VP9_YMODES  (B_PRED + 1)
-#define VP9_UV_MODES (TM_PRED + 1)
-#define VP9_I8X8_MODES (TM_PRED + 1)
-#define VP9_I32X32_MODES (TM_PRED + 1)
-
-#define VP9_MVREFS (1 + SPLITMV - NEARESTMV)
-
-typedef enum {
-  B_DC_PRED,          /* average of above and left pixels */
-  B_TM_PRED,
-
-  B_VE_PRED,           /* vertical prediction */
-  B_HE_PRED,           /* horizontal prediction */
-
-  B_LD_PRED,
-  B_RD_PRED,
-
-  B_VR_PRED,
-  B_VL_PRED,
-  B_HD_PRED,
-  B_HU_PRED,
-
-  LEFT4X4,
-  ABOVE4X4,
-  ZERO4X4,
-  NEW4X4,
-
-  B_MODE_COUNT
-} B_PREDICTION_MODE;
-
-#define VP9_BINTRAMODES (B_HU_PRED + 1)  /* 10 */
-#define VP9_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
-
-typedef enum {
-  PARTITIONING_16X8 = 0,
-  PARTITIONING_8X16,
-  PARTITIONING_8X8,
-  PARTITIONING_4X4,
-  NB_PARTITIONINGS,
-} SPLITMV_PARTITIONING_TYPE;
-
-/* For keyframes, intra block modes are predicted by the (already decoded)
-   modes for the Y blocks to the left and above us; for interframes, there
-   is a single probability table. */
-
-union b_mode_info {
-  struct {
-    B_PREDICTION_MODE first;
-    TX_TYPE           tx_type;
-
-#if CONFIG_COMP_INTRA_PRED
-    B_PREDICTION_MODE second;
-#endif
-  } as_mode;
-  struct {
-    int_mv first;
-    int_mv second;
-  } as_mv;
-};
-
-typedef enum {
-  INTRA_FRAME = 0,
-  LAST_FRAME = 1,
-  GOLDEN_FRAME = 2,
-  ALTREF_FRAME = 3,
-  MAX_REF_FRAMES = 4
-} MV_REFERENCE_FRAME;
-
-typedef struct {
-  MB_PREDICTION_MODE mode, uv_mode;
-#if CONFIG_COMP_INTRA_PRED
-  MB_PREDICTION_MODE second_mode, second_uv_mode;
-#endif
-  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
-  TX_SIZE txfm_size;
-  int_mv mv[2]; // for each reference frame used
-#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
-  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
-#endif
-
-  SPLITMV_PARTITIONING_TYPE partitioning;
-  unsigned char mb_skip_coeff;                                /* does this MB have any coefficients? 1 = no coefficients, 0 = tokens must be decoded */
-  unsigned char need_to_clamp_mvs;
-  unsigned char need_to_clamp_secondmv;
-  unsigned char segment_id;                  /* Which set of segmentation parameters should be used for this MB */
-
-  // Flags used for the prediction status of various bitstream signals
-  unsigned char seg_id_predicted;
-  unsigned char ref_predicted;
-
-  // Indicates if the mb is part of the image (1) vs border (0)
-  // This can be useful in determining whether the MB provides
-  // a valid predictor
-  unsigned char mb_in_image;
-
-#if CONFIG_PRED_FILTER
-  // Flag to turn the prediction signal filter on (1) / off (0) at the MB level
-  unsigned int pred_filter_enabled;
-#endif
-  INTERPOLATIONFILTERTYPE interp_filter;
-
-#if CONFIG_SUPERBLOCKS
-  // FIXME need a SB array of 4 MB_MODE_INFOs that
-  // only needs one encoded_as_sb.
-  unsigned char encoded_as_sb;
-#endif
-} MB_MODE_INFO;
-
-typedef struct {
-  MB_MODE_INFO mbmi;
-  union b_mode_info bmi[16];
-} MODE_INFO;
-
-typedef struct blockd {
-  short *qcoeff;
-  short *dqcoeff;
-  unsigned char  *predictor;
-  short *diff;
-  short *dequant;
-
-  /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
-  unsigned char **base_pre;
-  unsigned char **base_second_pre;
-  int pre;
-  int pre_stride;
-
-  unsigned char **base_dst;
-  int dst;
-  int dst_stride;
-
-  int eob;
-
-  union b_mode_info bmi;
-} BLOCKD;
-
-typedef struct macroblockd {
-  DECLARE_ALIGNED(16, short, diff[400]);      /* from idct diff */
-  DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
-  DECLARE_ALIGNED(16, short, qcoeff[400]);
-  DECLARE_ALIGNED(16, short, dqcoeff[400]);
-  DECLARE_ALIGNED(16, char,  eobs[25]);
-
-  /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
-  BLOCKD block[25];
-  int fullpixel_mask;
-
-  YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
-  struct {
-    uint8_t *y_buffer, *u_buffer, *v_buffer;
-  } second_pre;
-  YV12_BUFFER_CONFIG dst;
-
-  MODE_INFO *prev_mode_info_context;
-  MODE_INFO *mode_info_context;
-  int mode_info_stride;
-
-  FRAME_TYPE frame_type;
-
-  int up_available;
-  int left_available;
-
-  /* Y,U,V,Y2 */
-  ENTROPY_CONTEXT_PLANES *above_context;
-  ENTROPY_CONTEXT_PLANES *left_context;
-
-  /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
-  unsigned char segmentation_enabled;
-
-  /* 0 (do not update) 1 (update) the macroblock segmentation map. */
-  unsigned char update_mb_segmentation_map;
-
-  /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
-  unsigned char update_mb_segmentation_data;
-
-  /* 0 (SEGMENT_DELTADATA) feature data is coded as deltas, 1 (SEGMENT_ABSDATA) as absolute values. */
-  unsigned char mb_segment_abs_delta;
-
-  /* Per-frame flags that define which MB-level features (such as quantizer or loop filter level) */
-  /* are enabled and, when enabled, the probabilities used to decode the per-MB flags in MB_MODE_INFO */
-
-  // Probability Tree used to code Segment number
-  vp9_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];
-
-#if CONFIG_NEW_MVREF
-  vp9_prob mb_mv_ref_id_probs[MAX_REF_FRAMES][3];
-#endif
-
-  // Segment features
-  signed char segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX];
-  unsigned int segment_feature_mask[MAX_MB_SEGMENTS];
-
-  /* mode_based Loop filter adjustment */
-  unsigned char mode_ref_lf_delta_enabled;
-  unsigned char mode_ref_lf_delta_update;
-
-  /* Delta values have the range +/- MAX_LOOP_FILTER */
-  signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];                /* 0 = Intra, Last, GF, ARF */
-  signed char ref_lf_deltas[MAX_REF_LF_DELTAS];                     /* 0 = Intra, Last, GF, ARF */
-  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];              /* 0 = BPRED, ZERO_MV, MV, SPLIT */
-  signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];                   /* 0 = BPRED, ZERO_MV, MV, SPLIT */
-
-  /* Distance of MB away from frame edges */
-  int mb_to_left_edge;
-  int mb_to_right_edge;
-  int mb_to_top_edge;
-  int mb_to_bottom_edge;
-
-  unsigned int frames_since_golden;
-  unsigned int frames_till_alt_ref_frame;
-  vp9_subpix_fn_t  subpixel_predict;
-  vp9_subpix_fn_t  subpixel_predict8x4;
-  vp9_subpix_fn_t  subpixel_predict8x8;
-  vp9_subpix_fn_t  subpixel_predict16x16;
-  vp9_subpix_fn_t  subpixel_predict_avg;
-  vp9_subpix_fn_t  subpixel_predict_avg8x4;
-  vp9_subpix_fn_t  subpixel_predict_avg8x8;
-  vp9_subpix_fn_t  subpixel_predict_avg16x16;
-  int allow_high_precision_mv;
-
-  int corrupted;
-
-#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
-  /* This is an intermediate buffer currently used in sub-pixel motion search
-   * to keep a copy of the reference area. This buffer can be used for other
-   * purposes.
-   */
-  DECLARE_ALIGNED(32, unsigned char, y_buf[22 * 32]);
-#endif
-
-#if CONFIG_RUNTIME_CPU_DETECT
-  struct VP9_COMMON_RTCD  *rtcd;
-#endif
-
-  int mb_index;   // Index of the MB in the SB (0..3)
-  int q_index;
-
-} MACROBLOCKD;
-
-#define ACTIVE_HT 110                // quantization stepsize threshold
-
-#define ACTIVE_HT8 300
-
-#define ACTIVE_HT16 300
-
-// convert MB_PREDICTION_MODE to B_PREDICTION_MODE
-static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {
-  B_PREDICTION_MODE b_mode;
-  switch (mode) {
-    case DC_PRED:
-      b_mode = B_DC_PRED;
-      break;
-    case V_PRED:
-      b_mode = B_VE_PRED;
-      break;
-    case H_PRED:
-      b_mode = B_HE_PRED;
-      break;
-    case TM_PRED:
-      b_mode = B_TM_PRED;
-      break;
-    case D45_PRED:
-      b_mode = B_LD_PRED;
-      break;
-    case D135_PRED:
-      b_mode = B_RD_PRED;
-      break;
-    case D117_PRED:
-      b_mode = B_VR_PRED;
-      break;
-    case D153_PRED:
-      b_mode = B_HD_PRED;
-      break;
-    case D27_PRED:
-      b_mode = B_HU_PRED;
-      break;
-    case D63_PRED:
-      b_mode = B_VL_PRED;
-      break;
-    default :
-      // for debugging purposes; to be removed after full testing
-      assert(0);
-      break;
-  }
-  return b_mode;
-}
-
-// transform mapping
-static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) {
-  // map transform type
-  TX_TYPE tx_type;
-  switch (bmode) {
-    case B_TM_PRED :
-    case B_RD_PRED :
-      tx_type = ADST_ADST;
-      break;
-
-    case B_VE_PRED :
-    case B_VR_PRED :
-      tx_type = ADST_DCT;
-      break;
-
-    case B_HE_PRED :
-    case B_HD_PRED :
-    case B_HU_PRED :
-      tx_type = DCT_ADST;
-      break;
-
-    default :
-      tx_type = DCT_DCT;
-      break;
-  }
-  return tx_type;
-}
-
-static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) {
-  TX_TYPE tx_type = DCT_DCT;
-  if (xd->mode_info_context->mbmi.mode == B_PRED &&
-      xd->q_index < ACTIVE_HT) {
-    tx_type = txfm_map(b->bmi.as_mode.first);
-  }
-  return tx_type;
-}
-
-static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) {
-  TX_TYPE tx_type = DCT_DCT;
-  if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
-      xd->q_index < ACTIVE_HT8) {
-    tx_type = txfm_map(pred_mode_conv(b->bmi.as_mode.first));
-  }
-  return tx_type;
-}
-
-static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) {
-  TX_TYPE tx_type = DCT_DCT;
-  if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
-      xd->q_index < ACTIVE_HT16) {
-    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
-  }
-  return tx_type;
-}
-
-static TX_TYPE get_tx_type(const MACROBLOCKD *xd, const BLOCKD *b) {
-  TX_TYPE tx_type = DCT_DCT;
-  int ib = (b - xd->block);
-  if (ib >= 16)
-    return tx_type;
-  if (xd->mode_info_context->mbmi.txfm_size == TX_16X16) {
-    tx_type = get_tx_type_16x16(xd, b);
-  }
-  if (xd->mode_info_context->mbmi.txfm_size  == TX_8X8) {
-    ib = (ib & 8) + ((ib & 4) >> 1);
-    tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
-  }
-  if (xd->mode_info_context->mbmi.txfm_size  == TX_4X4) {
-    tx_type = get_tx_type_4x4(xd, b);
-  }
-  return tx_type;
-}
-
-extern void vp9_build_block_doffsets(MACROBLOCKD *xd);
-extern void vp9_setup_block_dptrs(MACROBLOCKD *xd);
-
-static void update_blockd_bmi(MACROBLOCKD *xd) {
-  int i;
-  int is_4x4;
-  is_4x4 = (xd->mode_info_context->mbmi.mode == SPLITMV) ||
-           (xd->mode_info_context->mbmi.mode == I8X8_PRED) ||
-           (xd->mode_info_context->mbmi.mode == B_PRED);
-
-  if (is_4x4) {
-    for (i = 0; i < 16; i++) {
-      xd->block[i].bmi = xd->mode_info_context->bmi[i];
-    }
-  }
-}
-#endif  /* __INC_BLOCKD_H */
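One subtlety above is the remap in get_tx_type() for TX_8X8: ib = (ib & 8) + ((ib & 4) >> 1). Assuming the usual layout in this codebase, where each 8x8 transform's 64 coefficients occupy four consecutive 16-entry blocks (0-3, 4-7, 8-11, 12-15) while per-8x8 mode info lives at bmi slots 0, 2, 8 and 10, the expression converts a coefficient-block index into the matching bmi slot; a worked check:

static int tx8x8_bmi_slot(int ib) {
  return (ib & 8) + ((ib & 4) >> 1);
}
/* tx8x8_bmi_slot(0..3)  == 0,  tx8x8_bmi_slot(4..7)   == 2,
   tx8x8_bmi_slot(8..11) == 8,  tx8x8_bmi_slot(12..15) == 10 */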
--- a/vp8/common/coefupdateprobs.h
+++ /dev/null
@@ -1,16 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* Update probabilities for the nodes in the token entropy tree.
-   Generated file included by entropy.c */
-#define COEF_UPDATE_PROB 252
-#define COEF_UPDATE_PROB_8X8 252
-#define COEF_UPDATE_PROB_16X16 252
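
These constants set how cheaply the frame header can say "no change": each tree node's update flag is coded with probability 252/256, so leaving a probability untouched costs a small fraction of a bit, while an actual update pays for the flag plus an 8-bit literal. A decoder-side sketch of that pattern; vp9_read() and vp9_read_literal() are assumed bool-decoder primitives here, not confirmed API:

/* Sketch: conditionally refresh one node probability (assumed API). */
static void sketch_maybe_update_node(BOOL_DECODER *bd, vp9_prob *p) {
  if (vp9_read(bd, COEF_UPDATE_PROB))        /* rare: "update" flag set */
    *p = (vp9_prob)vp9_read_literal(bd, 8);  /* read replacement prob  */
}
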
--- a/vp8/common/common.h
+++ /dev/null
@@ -1,41 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef common_h
-#define common_h 1
-
-#include <assert.h>
-#include "vpx_config.h"
-/* Interface header for common constant data structures and lookup tables */
-
-#include "vpx_mem/vpx_mem.h"
-
-#include "common_types.h"
-
-/* Only need this for fixed-size arrays, for structs just assign. */
-
-#define vp9_copy( Dest, Src) { \
-    assert( sizeof( Dest) == sizeof( Src)); \
-    vpx_memcpy( Dest, Src, sizeof( Src)); \
-  }
-
-/* Use this for variably-sized arrays. */
-
-#define vp9_copy_array( Dest, Src, N) { \
-    assert( sizeof( *Dest) == sizeof( *Src)); \
-    vpx_memcpy( Dest, Src, N * sizeof( *Src)); \
-  }
-
-#define vp9_zero( Dest)  vpx_memset( &Dest, 0, sizeof( Dest));
-
-#define vp9_zero_array( Dest, N)  vpx_memset( Dest, 0, N * sizeof( *Dest));
-
-#endif  /* common_h */
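
Note that vp9_copy() relies on sizeof() of its arguments, so it only guards genuine arrays (or same-typed objects); with a pointer argument sizeof collapses to the pointer size, which is why vp9_copy_array() and vp9_zero_array() take an explicit element count for heap buffers. A usage sketch; the sketch_stats type is illustrative, not from the tree:

/* Usage sketch for the copy/zero helpers above. */
typedef struct {
  unsigned int counts[32];  /* fixed-size array: vp9_copy()/vp9_zero() */
  unsigned int *per_mb;     /* heap buffer: use the _array variants    */
} sketch_stats;

static void sketch_clone(sketch_stats *dst, const sketch_stats *src,
                         int num_mbs) {
  vp9_copy(dst->counts, src->counts);  /* sizes checked by the assert */
  vp9_copy_array(dst->per_mb, src->per_mb, num_mbs);
}

static void sketch_reset(sketch_stats *s, int num_mbs) {
  vp9_zero(s->counts);
  vp9_zero_array(s->per_mb, num_mbs);
}
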
--- a/vp8/common/common_types.h
+++ /dev/null
@@ -1,18 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_COMMON_TYPES
-#define __INC_COMMON_TYPES
-
-#define TRUE    1
-#define FALSE   0
-
-#endif
--- a/vp8/common/context.c
+++ /dev/null
@@ -1,397 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "entropy.h"
-
-/* *** GENERATED FILE: DO NOT EDIT *** */
-
-#if 0
-int Contexts[vp8_coef_counter_dimen];
-
-const int default_contexts[vp8_coef_counter_dimen] = {
-  {
-    // Block Type ( 0 )
-    {
-      // Coeff Band ( 0 )
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-    },
-    {
-      // Coeff Band ( 1 )
-      {30190, 26544, 225,  24,   4,   0,   0,   0,   0,   0,   0, 4171593},
-      {26846, 25157, 1241, 130,  26,   6,   1,   0,   0,   0,   0, 149987},
-      {10484, 9538, 1006, 160,  36,  18,   0,   0,   0,   0,   0, 15104},
-    },
-    {
-      // Coeff Band ( 2 )
-      {25842, 40456, 1126,  83,  11,   2,   0,   0,   0,   0,   0,   0},
-      {9338, 8010, 512,  73,   7,   3,   2,   0,   0,   0,   0, 43294},
-      {1047, 751, 149,  31,  13,   6,   1,   0,   0,   0,   0, 879},
-    },
-    {
-      // Coeff Band ( 3 )
-      {26136, 9826, 252,  13,   0,   0,   0,   0,   0,   0,   0,   0},
-      {8134, 5574, 191,  14,   2,   0,   0,   0,   0,   0,   0, 35302},
-      { 605, 677, 116,   9,   1,   0,   0,   0,   0,   0,   0, 611},
-    },
-    {
-      // Coeff Band ( 4 )
-      {10263, 15463, 283,  17,   0,   0,   0,   0,   0,   0,   0,   0},
-      {2773, 2191, 128,   9,   2,   2,   0,   0,   0,   0,   0, 10073},
-      { 134, 125,  32,   4,   0,   2,   0,   0,   0,   0,   0,  50},
-    },
-    {
-      // Coeff Band ( 5 )
-      {10483, 2663,  23,   1,   0,   0,   0,   0,   0,   0,   0,   0},
-      {2137, 1251,  27,   1,   1,   0,   0,   0,   0,   0,   0, 14362},
-      { 116, 156,  14,   2,   1,   0,   0,   0,   0,   0,   0, 190},
-    },
-    {
-      // Coeff Band ( 6 )
-      {40977, 27614, 412,  28,   0,   0,   0,   0,   0,   0,   0,   0},
-      {6113, 5213, 261,  22,   3,   0,   0,   0,   0,   0,   0, 26164},
-      { 382, 312,  50,  14,   2,   0,   0,   0,   0,   0,   0, 345},
-    },
-    {
-      // Coeff Band ( 7 )
-      {   0,  26,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,  13,   0,   0,   0,   0,   0,   0,   0,   0,   0, 319},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   8},
-    },
-  },
-  {
-    // Block Type ( 1 )
-    {
-      // Coeff Band ( 0 )
-      {3268, 19382, 1043, 250,  93,  82,  49,  26,  17,   8,  25, 82289},
-      {8758, 32110, 5436, 1832, 827, 668, 420, 153,  24,   0,   3, 52914},
-      {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399,  59,   0,   0, 18620},
-    },
-    {
-      // Coeff Band ( 1 )
-      {12419, 8420, 452,  62,   9,   1,   0,   0,   0,   0,   0,   0},
-      {11715, 8705, 693,  92,  15,   7,   2,   0,   0,   0,   0, 53988},
-      {7603, 8585, 2306, 778, 270, 145,  39,   5,   0,   0,   0, 9136},
-    },
-    {
-      // Coeff Band ( 2 )
-      {15938, 14335, 1207, 184,  55,  13,   4,   1,   0,   0,   0,   0},
-      {7415, 6829, 1138, 244,  71,  26,   7,   0,   0,   0,   0, 9980},
-      {1580, 1824, 655, 241,  89,  46,  10,   2,   0,   0,   0, 429},
-    },
-    {
-      // Coeff Band ( 3 )
-      {19453, 5260, 201,  19,   0,   0,   0,   0,   0,   0,   0,   0},
-      {9173, 3758, 213,  22,   1,   1,   0,   0,   0,   0,   0, 9820},
-      {1689, 1277, 276,  51,  17,   4,   0,   0,   0,   0,   0, 679},
-    },
-    {
-      // Coeff Band ( 4 )
-      {12076, 10667, 620,  85,  19,   9,   5,   0,   0,   0,   0,   0},
-      {4665, 3625, 423,  55,  19,   9,   0,   0,   0,   0,   0, 5127},
-      { 415, 440, 143,  34,  20,   7,   2,   0,   0,   0,   0, 101},
-    },
-    {
-      // Coeff Band ( 5 )
-      {12183, 4846, 115,  11,   1,   0,   0,   0,   0,   0,   0,   0},
-      {4226, 3149, 177,  21,   2,   0,   0,   0,   0,   0,   0, 7157},
-      { 375, 621, 189,  51,  11,   4,   1,   0,   0,   0,   0, 198},
-    },
-    {
-      // Coeff Band ( 6 )
-      {61658, 37743, 1203,  94,  10,   3,   0,   0,   0,   0,   0,   0},
-      {15514, 11563, 903, 111,  14,   5,   0,   0,   0,   0,   0, 25195},
-      { 929, 1077, 291,  78,  14,   7,   1,   0,   0,   0,   0, 507},
-    },
-    {
-      // Coeff Band ( 7 )
-      {   0, 990,  15,   3,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0, 412,  13,   0,   0,   0,   0,   0,   0,   0,   0, 1641},
-      {   0,  18,   7,   1,   0,   0,   0,   0,   0,   0,   0,  30},
-    },
-  },
-  {
-    // Block Type ( 2 )
-    {
-      // Coeff Band ( 0 )
-      { 953, 24519, 628, 120,  28,  12,   4,   0,   0,   0,   0, 2248798},
-      {1525, 25654, 2647, 617, 239, 143,  42,   5,   0,   0,   0, 66837},
-      {1180, 11011, 3001, 1237, 532, 448, 239,  54,   5,   0,   0, 7122},
-    },
-    {
-      // Coeff Band ( 1 )
-      {1356, 2220,  67,  10,   4,   1,   0,   0,   0,   0,   0,   0},
-      {1450, 2544, 102,  18,   4,   3,   0,   0,   0,   0,   0, 57063},
-      {1182, 2110, 470, 130,  41,  21,   0,   0,   0,   0,   0, 6047},
-    },
-    {
-      // Coeff Band ( 2 )
-      { 370, 3378, 200,  30,   5,   4,   1,   0,   0,   0,   0,   0},
-      { 293, 1006, 131,  29,  11,   0,   0,   0,   0,   0,   0, 5404},
-      { 114, 387,  98,  23,   4,   8,   1,   0,   0,   0,   0, 236},
-    },
-    {
-      // Coeff Band ( 3 )
-      { 579, 194,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      { 395, 213,   5,   1,   0,   0,   0,   0,   0,   0,   0, 4157},
-      { 119, 122,   4,   0,   0,   0,   0,   0,   0,   0,   0, 300},
-    },
-    {
-      // Coeff Band ( 4 )
-      {  38, 557,  19,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {  21, 114,  12,   1,   0,   0,   0,   0,   0,   0,   0, 427},
-      {   0,   5,   0,   0,   0,   0,   0,   0,   0,   0,   0,   7},
-    },
-    {
-      // Coeff Band ( 5 )
-      {  52,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {  18,   6,   0,   0,   0,   0,   0,   0,   0,   0,   0, 652},
-      {   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  30},
-    },
-    {
-      // Coeff Band ( 6 )
-      { 640, 569,  10,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {  25,  77,   2,   0,   0,   0,   0,   0,   0,   0,   0, 517},
-      {   4,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3},
-    },
-    {
-      // Coeff Band ( 7 )
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-    },
-  },
-  {
-    // Block Type ( 3 )
-    {
-      // Coeff Band ( 0 )
-      {2506, 20161, 2707, 767, 261, 178, 107,  30,  14,   3,   0, 100694},
-      {8806, 36478, 8817, 3268, 1280, 850, 401, 114,  42,   0,   0, 58572},
-      {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175,  32,   0,   0, 19284},
-    },
-    {
-      // Coeff Band ( 1 )
-      {9738, 11313, 959, 205,  70,  18,  11,   1,   0,   0,   0,   0},
-      {12628, 15085, 1507, 273,  52,  19,   9,   0,   0,   0,   0, 54280},
-      {10701, 15846, 5561, 1926, 813, 570, 249,  36,   0,   0,   0, 6460},
-    },
-    {
-      // Coeff Band ( 2 )
-      {6781, 22539, 2784, 634, 182, 123,  20,   4,   0,   0,   0,   0},
-      {6263, 11544, 2649, 790, 259, 168,  27,   5,   0,   0,   0, 20539},
-      {3109, 4075, 2031, 896, 457, 386, 158,  29,   0,   0,   0, 1138},
-    },
-    {
-      // Coeff Band ( 3 )
-      {11515, 4079, 465,  73,   5,  14,   2,   0,   0,   0,   0,   0},
-      {9361, 5834, 650,  96,  24,   8,   4,   0,   0,   0,   0, 22181},
-      {4343, 3974, 1360, 415, 132,  96,  14,   1,   0,   0,   0, 1267},
-    },
-    {
-      // Coeff Band ( 4 )
-      {4787, 9297, 823, 168,  44,  12,   4,   0,   0,   0,   0,   0},
-      {3619, 4472, 719, 198,  60,  31,   3,   0,   0,   0,   0, 8401},
-      {1157, 1175, 483, 182,  88,  31,   8,   0,   0,   0,   0, 268},
-    },
-    {
-      // Coeff Band ( 5 )
-      {8299, 1226,  32,   5,   1,   0,   0,   0,   0,   0,   0,   0},
-      {3502, 1568,  57,   4,   1,   1,   0,   0,   0,   0,   0, 9811},
-      {1055, 1070, 166,  29,   6,   1,   0,   0,   0,   0,   0, 527},
-    },
-    {
-      // Coeff Band ( 6 )
-      {27414, 27927, 1989, 347,  69,  26,   0,   0,   0,   0,   0,   0},
-      {5876, 10074, 1574, 341,  91,  24,   4,   0,   0,   0,   0, 21954},
-      {1571, 2171, 778, 324, 124,  65,  16,   0,   0,   0,   0, 979},
-    },
-    {
-      // Coeff Band ( 7 )
-      {   0,  29,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,  23,   0,   0,   0,   0,   0,   0,   0,   0,   0, 459},
-      {   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  13},
-    },
-  },
-};
-
-// Update probabilities for the nodes in the token entropy tree.
-const vp9_prob tree_update_probs[vp9_coef_tree_dimen] = {
-  {
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
-      {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-  },
-  {
-    {
-      {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
-      {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
-    },
-    {
-      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-  },
-  {
-    {
-      {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
-      {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-  },
-  {
-    {
-      {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
-      {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-  },
-};
-#endif
--- a/vp8/common/debugmodes.c
+++ /dev/null
@@ -1,146 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdio.h>
-#include "blockd.h"
-
-void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
-                                        int frame) {
-  int mb_row;
-  int mb_col;
-  int mb_index = 0;
-  FILE *mvs = fopen("mvs.stt", "a");
-
-  if (!mvs)
-    return;
-
-  /* print out the macroblock Y modes */
-  mb_index = 0;
-  fprintf(mvs, "Mb Modes for Frame %d\n", frame);
-
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
-
-      fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
-
-      mb_index++;
-    }
-
-    fprintf(mvs, "\n");
-    mb_index++;
-  }
-
-  fprintf(mvs, "\n");
-
-  mb_index = 0;
-  fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
-
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
-
-      fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
-
-      mb_index++;
-    }
-
-    fprintf(mvs, "\n");
-    mb_index++;
-  }
-
-  fprintf(mvs, "\n");
-
-  /* print out the macroblock UV modes */
-  mb_index = 0;
-  fprintf(mvs, "UV Modes for Frame %d\n", frame);
-
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
-
-      fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
-
-      mb_index++;
-    }
-
-    mb_index++;
-    fprintf(mvs, "\n");
-  }
-
-  fprintf(mvs, "\n");
-
-  /* print out the block modes */
-  mb_index = 0;
-  fprintf(mvs, "Mbs for Frame %d\n", frame);
-  {
-    int b_row;
-
-    for (b_row = 0; b_row < 4 * rows; b_row++) {
-      int b_col;
-      int bindex;
-
-      for (b_col = 0; b_col < 4 * cols; b_col++) {
-        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
-        bindex = (b_row & 3) * 4 + (b_col & 3);
-
-        if (mi[mb_index].mbmi.mode == B_PRED) {
-          fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.first);
-#if CONFIG_COMP_INTRA_PRED
-          fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.second);
-#endif
-        } else
-          fprintf(mvs, "xx ");
-
-      }
-
-      fprintf(mvs, "\n");
-    }
-  }
-  fprintf(mvs, "\n");
-
-  /* print out the macroblock mvs */
-  mb_index = 0;
-  fprintf(mvs, "MVs for Frame %d\n", frame);
-
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
-      fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv[0].as_mv.row / 2,
-          mi[mb_index].mbmi.mv[0].as_mv.col / 2);
-
-      mb_index++;
-    }
-
-    mb_index++;
-    fprintf(mvs, "\n");
-  }
-
-  fprintf(mvs, "\n");
-
-  /* print out the block motion vectors */
-  mb_index = 0;
-  fprintf(mvs, "Block MVs for Frame %d\n", frame);
-  {
-    int b_row;
-
-    for (b_row = 0; b_row < 4 * rows; b_row++) {
-      int b_col;
-      int bindex;
-
-      for (b_col = 0; b_col < 4 * cols; b_col++) {
-        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
-        bindex = (b_row & 3) * 4 + (b_col & 3);
-        fprintf(mvs, "%3d:%-3d ",
-                mi[mb_index].bmi[bindex].as_mv.first.as_mv.row,
-                mi[mb_index].bmi[bindex].as_mv.first.as_mv.col);
-
-      }
-
-      fprintf(mvs, "\n");
-    }
-  }
-  fprintf(mvs, "\n");
-
-  fclose(mvs);
-}
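
Two details of this dump are easy to miss: the extra mb_index++ after each row, and the (cols + 1) stride in the per-block loops, both reflect that the MODE_INFO array carries one border entry per macroblock row. A hypothetical call site; the VP9_COMMON name and its mi/mb_rows/mb_cols fields follow this tree's conventions but are assumptions here:

/* Sketch: append this frame's modes and MVs to mvs.stt. */
static void sketch_debug_dump(VP9_COMMON *cm, int frame_number) {
  vp9_print_modes_and_motion_vectors(cm->mi, cm->mb_rows, cm->mb_cols,
                                     frame_number);
}
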
--- a/vp8/common/default_coef_probs.h
+++ /dev/null
@@ -1,1377 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
-*/
-
-
-/* Generated file, included by entropy.c */
-
-
-static const vp9_prob default_coef_probs [BLOCK_TYPES]
-                                         [COEF_BANDS]
-                                         [PREV_COEF_CONTEXTS]
-                                         [ENTROPY_NODES] = {
-  {
-    /* Block Type ( 0 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
-      { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
-      { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
-      { 90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
-      { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
-      {  78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
-      {  64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
-      { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
-      {  77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
-      {  64, 100, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
-      { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
-      {  37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
-      {  28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
-      { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
-      { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
-      { 90, 90, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
-      { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
-      {  80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
-      {  64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 246,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-    }
-  },
-  {
-    /* Block Type ( 1 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 198,  35, 237, 223, 193, 187, 162, 160, 145, 155,  62 },
-      { 131,  45, 198, 221, 172, 176, 220, 157, 252, 221,   1 },
-      {  68,  47, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
-      {  48,  32, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
-      { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
-      {  81,  99, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
-      {  66,  90, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
-      {  99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
-      {  23,  91, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
-      {  18,  80, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
-      { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
-      {  44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
-      {  36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
-      {  94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
-      {  22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
-      {  18, 90, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
-      { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
-      {  35,  77, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
-      {  28,  70, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
-      { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
-      {  45,  99, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
-      {  40,  90, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
-      { 203,   1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
-      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
-    }
-  },
-  {
-    /* Block Type ( 2 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 253,   9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
-      { 175,  13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
-      {  73,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
-      {  64,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1,  95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
-      { 239,  90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
-      { 155,  77, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
-      { 140,  70, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
-      { 201,  51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
-      {  69,  46, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
-      {  60,  40, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
-      { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1,  16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 190,  36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
-      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 213,  62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
-      {  55,  93, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      {  48,  85, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-    }
-  },
-  {
-    /* Block Type ( 3 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 202,  24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
-      { 126,  38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
-      {  63,  48, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
-      {  54,  40, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
-      { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
-      {  44,  84, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
-      {  32,  70, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
-      { 124,  74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
-      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
-      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
-      { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
-      {  28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
-      {  26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1,  81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
-      { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
-      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
-      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
-      { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
-      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
-      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
-      { 141,  84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
-      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
-      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 244,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    }
-  }
-};
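
Each entry above is an 8-bit bool-coder probability: a node value p means the 0-branch is taken with probability p/256, so 128 is the uninformative 50/50 filler and values near 1 or 255 mark branches that are almost always resolved one way. A small sketch of the coding cost this implies:

#include <math.h>

/* Sketch: bits spent at one tree node with probability p (1..255). */
static void sketch_node_cost(int p, double *bits_if_0, double *bits_if_1) {
  *bits_if_0 = -log2(p / 256.0);        /* p = 128 -> exactly 1.0 bit   */
  *bits_if_1 = -log2(1.0 - p / 256.0);  /* p = 246 -> ~4.7 bits for a 1 */
}
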
-
-static const vp9_prob default_hybrid_coef_probs [BLOCK_TYPES]
-                                                [COEF_BANDS]
-                                                [PREV_COEF_CONTEXTS]
-                                                [ENTROPY_NODES] = {
-  {
-    /* Block Type ( 0 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
-      { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
-      { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
-      { 90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
-      { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
-      {  78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
-      {  64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
-      { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
-      {  77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
-      {  64, 100, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
-      { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
-      {  37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
-      {  28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
-      { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
-      { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
-      { 90, 90, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
-      { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
-      {  80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
-      {  64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 246,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-    }
-  },
-  {
-    /* Block Type ( 1 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 198,  35, 237, 223, 193, 187, 162, 160, 145, 155,  62 },
-      { 131,  45, 198, 221, 172, 176, 220, 157, 252, 221,   1 },
-      {  68,  47, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
-      {  48,  32, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
-      { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
-      {  81,  99, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
-      {  66,  90, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
-      {  99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
-      {  23,  91, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
-      {  18,  80, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
-      { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
-      {  44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
-      {  36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
-      {  94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
-      {  22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
-      {  18, 90, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
-      { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
-      {  35,  77, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
-      {  28,  70, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
-      { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
-      {  45,  99, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
-      {  40,  90, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
-      { 203,   1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
-      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
-    }
-  },
-  {
-    /* Block Type ( 2 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 253,   9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
-      { 175,  13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
-      {  73,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
-      {  64,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1,  95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
-      { 239,  90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
-      { 155,  77, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
-      { 140,  70, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
-      { 201,  51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
-      {  69,  46, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
-      {  60,  40, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
-      { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1,  16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 190,  36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
-      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 213,  62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
-      {  55,  93, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      {  48,  85, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-    }
-  },
-  {
-    /* Block Type ( 3 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 202,  24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
-      { 126,  38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
-      {  63,  48, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
-      {  54,  40, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
-      { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
-      {  44,  84, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
-      {  32,  70, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
-      { 124,  74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
-      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
-      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
-      { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
-      {  28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
-      {  26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1,  81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
-      { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
-      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
-      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
-      { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
-      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
-      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
-      { 141,  84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
-      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
-      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 244,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    }
-  }
-};
-
-static const vp9_prob
-default_coef_probs_8x8[BLOCK_TYPES_8X8]
-                      [COEF_BANDS]
-                      [PREV_COEF_CONTEXTS]
-                      [ENTROPY_NODES] = {
-  {
-    /* block Type 0 */
-    {
-      /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
-      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
-      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
-      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
-      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
-      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
-      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
-      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
-    }
-  },
-  {
-    /* block Type 1 */
-    {
-      /* Coeff Band 0 */
-      /* Note: only three of the four PREV_COEF_CONTEXTS rows are listed;
-         the missing fourth row is implicitly zero-initialized. */
-      { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128},
-      { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128},
-      { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128},
-      { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128},
-      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128},
-      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128},
-      { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128},
-      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128},
-      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128},
-      { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128},
-      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128},
-      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    }
-  },
-  {
-    /* block Type 2 */
-    {
-      /* Coeff Band 0 */
-      { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128},
-      { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128},
-      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128},
-      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128},
-      { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128},
-      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128},
-      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128},
-      { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128},
-      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128},
-      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128},
-      { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128},
-      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128},
-      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128},
-      { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128},
-      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128},
-      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128},
-      { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128},
-      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128},
-      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128},
-      { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128},
-      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128},
-      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128},
-      { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128},
-      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128},
-      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}
-    }
-  },
-  { /* block Type 3 */
-    { /* Coeff Band 0 */
-      { 192, 18, 155, 172, 145, 164, 192, 135, 246, 223, 255},
-      { 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255},
-      { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128},
-      { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128},
-      { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128},
-      { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128}
-    },
-    { /* Coeff Band 2 */
-      { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128},
-      { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128},
-      { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128},
-      { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128}
-    },
-    { /* Coeff Band 3 */
-      { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128},
-      { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128},
-      { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128},
-      { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128}
-    },
-    { /* Coeff Band 4 */
-      { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128},
-      { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128},
-      { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128},
-      { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128},
-      { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128},
-      { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128},
-      { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128},
-      { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128},
-      { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128},
-      { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 7 */
-      { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128},
-      { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128},
-      { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    }
-  }
-};
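
The four dimensions select, in order, the block type, the coefficient band of the scan position, the context derived from previously coded coefficients, and the probability for each node of the token tree. A lookup sketch; the band/context derivations are not shown and the parameter names are illustrative:

/* Sketch: fetch the ENTROPY_NODES probabilities for one coefficient. */
static const vp9_prob *sketch_probs_8x8(int block_type, int band, int ctx) {
  return default_coef_probs_8x8[block_type][band][ctx];
}
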
-
-static const vp9_prob
-default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]
-                             [COEF_BANDS]
-                             [PREV_COEF_CONTEXTS]
-                             [ENTROPY_NODES] = {
-  {
-    /* block Type 0 */
-    {
-      /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
-      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
-      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
-      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
-      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
-      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
-      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
-      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
-    }
-  },
-  {
-    /* block Type 1 */
-    {
-      /* Coeff Band 0 */
-      /* Note: only three of the four PREV_COEF_CONTEXTS rows are listed;
-         the missing fourth row is implicitly zero-initialized. */
-      { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128},
-      { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128},
-      { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128},
-      { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128},
-      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128},
-      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128},
-      { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128},
-      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128},
-      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128},
-      { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128},
-      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128},
-      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    }
-  },
-  {
-    /* block Type 2 */
-    {
-      /* Coeff Band 0 */
-      { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128},
-      { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128},
-      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128},
-      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128},
-      { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128},
-      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128},
-      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128},
-      { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128},
-      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128},
-      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128},
-      { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128},
-      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128},
-      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128},
-      { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128},
-      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128},
-      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128},
-      { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128},
-      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128},
-      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128},
-      { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128},
-      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128},
-      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128},
-      { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128},
-      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128},
-      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}
-    }
-  },
-  { /* block Type 3 */
-    { /* Coeff Band 0 */
-      { 192, 18, 155, 172, 145, 164, 192, 135, 246, 223, 255},
-      { 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255},
-      { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128},
-      { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128},
-      { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128},
-      { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128}
-    },
-    { /* Coeff Band 2 */
-      { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128},
-      { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128},
-      { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128},
-      { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128}
-    },
-    { /* Coeff Band 3 */
-      { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128},
-      { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128},
-      { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128},
-      { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128}
-    },
-    { /* Coeff Band 4 */
-      { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128},
-      { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128},
-      { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128},
-      { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128},
-      { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128},
-      { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128},
-      { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128},
-      { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128},
-      { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128},
-      { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 7 */
-      { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128},
-      { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128},
-      { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    }
-  }
-};
-
-static const vp9_prob
-  default_coef_probs_16x16[BLOCK_TYPES_16X16]
-                          [COEF_BANDS]
-                          [PREV_COEF_CONTEXTS]
-                          [ENTROPY_NODES] = {
-  { /* block Type 0 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
-      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
-    },
-    { /* Coeff Band 2 */
-      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
-      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 3 */
-      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
-      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 4 */
-      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
-      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
-      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
-      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    { /* Coeff Band 7 */
-      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
-      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
-    }
-  },
-  { /* block Type 1 */
-      { /* Coeff Band 0 */
-        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
-        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
-        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
-        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-      },
-      { /* Coeff Band 1 */
-        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
-        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
-        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
-        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
-      },
-      { /* Coeff Band 2 */
-        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
-        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
-        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
-        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
-      },
-      { /* Coeff Band 3 */
-        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
-        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
-        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
-        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
-      },
-      { /* Coeff Band 4 */
-        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
-        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
-        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
-        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
-      },
-      { /* Coeff Band 5 */
-        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
-        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
-        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
-        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
-      },
-      { /* Coeff Band 6 */
-        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
-        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
-        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
-        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
-      },
-      { /* Coeff Band 7 */
-        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
-        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
-        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
-        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
-      }
-  },
-  { /* block Type 2 */
-      { /* Coeff Band 0 */
-        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
-        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
-        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
-        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-      },
-      { /* Coeff Band 1 */
-        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
-        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
-        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
-        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
-      },
-      { /* Coeff Band 2 */
-        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
-        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
-        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
-        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
-      },
-      { /* Coeff Band 3 */
-        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
-        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
-        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
-        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
-      },
-      { /* Coeff Band 4 */
-        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
-        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
-        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
-        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
-      },
-      { /* Coeff Band 5 */
-        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
-        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
-        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
-        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
-      },
-      { /* Coeff Band 6 */
-        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
-        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
-        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
-        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
-      },
-      { /* Coeff Band 7 */
-        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
-        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
-        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
-        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
-      }
-  },
-  { /* block Type 3 */
-    { /* Coeff Band 0 */
-      { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184},
-      { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200},
-      { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128},
-      { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128},
-      { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255},
-      { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255}
-    },
-    { /* Coeff Band 2 */
-      { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128},
-      { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255},
-      { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255},
-      { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255}
-    },
-    { /* Coeff Band 3 */
-      { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128},
-      { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128},
-      { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128},
-      { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255}
-    },
-    { /* Coeff Band 4 */
-      { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128},
-      { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128},
-      { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128},
-      { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128},
-      { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128},
-      { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128},
-      { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128},
-      { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128},
-      { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128},
-      { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255}
-    },
-    { /* Coeff Band 7 */
-      { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128},
-      { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128},
-      { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128},
-      { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128}
-    }
-  }
-};
-
-static const vp9_prob
-  default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]
-                                 [COEF_BANDS]
-                                 [PREV_COEF_CONTEXTS]
-                                 [ENTROPY_NODES] = {
-  { /* block Type 0 */
-    { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
-      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
-    },
-    { /* Coeff Band 2 */
-      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
-      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 3 */
-      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
-      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 4 */
-      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
-      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
-      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
-      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    { /* Coeff Band 7 */
-      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
-      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
-    }
-  },
-  { /* block Type 1 */
-      { /* Coeff Band 0 */
-        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
-        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
-        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
-        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-      },
-      { /* Coeff Band 1 */
-        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
-        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
-        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
-        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
-      },
-      { /* Coeff Band 2 */
-        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
-        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
-        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
-        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
-      },
-      { /* Coeff Band 3 */
-        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
-        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
-        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
-        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
-      },
-      { /* Coeff Band 4 */
-        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
-        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
-        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
-        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
-      },
-      { /* Coeff Band 5 */
-        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
-        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
-        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
-        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
-      },
-      { /* Coeff Band 6 */
-        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
-        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
-        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
-        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
-      },
-      { /* Coeff Band 7 */
-        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
-        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
-        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
-        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
-      }
-  },
-  { /* block Type 2 */
-      { /* Coeff Band 0 */
-        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
-        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
-        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
-        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-      },
-      { /* Coeff Band 1 */
-        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
-        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
-        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
-        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
-      },
-      { /* Coeff Band 2 */
-        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
-        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
-        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
-        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
-      },
-      { /* Coeff Band 3 */
-        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
-        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
-        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
-        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
-      },
-      { /* Coeff Band 4 */
-        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
-        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
-        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
-        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
-      },
-      { /* Coeff Band 5 */
-        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
-        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
-        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
-        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
-      },
-      { /* Coeff Band 6 */
-        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
-        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
-        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
-        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
-      },
-      { /* Coeff Band 7 */
-        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
-        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
-        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
-        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
-      }
-  },
-  { /* block Type 3 */
-    { /* Coeff Band 0 */
-      { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184},
-      { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200},
-      { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128},
-      { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128},
-      { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255},
-      { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255}
-    },
-    { /* Coeff Band 2 */
-      { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128},
-      { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255},
-      { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255},
-      { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255}
-    },
-    { /* Coeff Band 3 */
-      { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128},
-      { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128},
-      { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128},
-      { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255}
-    },
-    { /* Coeff Band 4 */
-      { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128},
-      { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128},
-      { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128},
-      { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128},
-      { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128},
-      { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128},
-      { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128},
-      { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128},
-      { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128},
-      { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255}
-    },
-    { /* Coeff Band 7 */
-      { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128},
-      { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128},
-      { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128},
-      { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128}
-    }
-  }
-};
--- a/vp8/common/entropy.c
+++ /dev/null
@@ -1,447 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdio.h>
-
-#include "entropy.h"
-#include "string.h"
-#include "blockd.h"
-#include "onyxc_int.h"
-#include "entropymode.h"
-#include "vpx_mem/vpx_mem.h"
-
-#define uchar unsigned char     /* typedefs can clash */
-#define uint  unsigned int
-
-typedef const uchar cuchar;
-typedef const uint cuint;
-
-typedef vp9_prob Prob;
-
-#include "coefupdateprobs.h"
-
-const int vp9_i8x8_block[4] = {0, 2, 8, 10};
-
-DECLARE_ALIGNED(16, const unsigned char, vp9_norm[256]) = {
-  0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
-  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]) = {
-  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7
-};
-
-DECLARE_ALIGNED(16, cuchar, vp9_prev_token_class[MAX_ENTROPY_TOKENS]) = {
-  0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0
-};
-
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]) = {
-  0,  1,  4,  8,
-  5,  2,  3,  6,
-  9, 12, 13, 10,
-  7, 11, 14, 15,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_col_scan[16]) = {
-  0, 4,  8, 12,
-  1, 5,  9, 13,
-  2, 6, 10, 14,
-  3, 7, 11, 15
-};
-DECLARE_ALIGNED(16, const int, vp9_row_scan[16]) = {
-  0,   1,  2,  3,
-  4,   5,  6,  7,
-  8,   9, 10, 11,
-  12, 13, 14, 15
-};
-
-
-DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]) = {
-  0, 1, 2, 3, 5, 4, 4, 5,
-  5, 3, 6, 3, 5, 4, 6, 6,
-  6, 5, 5, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7
-};
-DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = {
-  0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
-  12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
-  35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
-  58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
-};
-
-// Table can be optimized.
-DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]) = {
-    0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
-    6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-};
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {
-      0,   1,  16,  32,  17,   2,   3,  18,  33,  48,  64,  49,  34,  19,   4,   5,
-     20,  35,  50,  65,  80,  96,  81,  66,  51,  36,  21,   6,   7,  22,  37,  52,
-     67,  82,  97, 112, 128, 113,  98,  83,  68,  53,  38,  23,   8,   9,  24,  39,
-     54,  69,  84,  99, 114, 129, 144, 160, 145, 130, 115, 100,  85,  70,  55,  40,
-     25,  10,  11,  26,  41,  56,  71,  86, 101, 116, 131, 146, 161, 176, 192, 177,
-    162, 147, 132, 117, 102,  87,  72,  57,  42,  27,  12,  13,  28,  43,  58,  73,
-     88, 103, 118, 133, 148, 163, 178, 193, 208, 224, 209, 194, 179, 164, 149, 134,
-    119, 104,  89,  74,  59,  44,  29,  14,  15,  30,  45,  60,  75,  90, 105, 120,
-    135, 150, 165, 180, 195, 210, 225, 240, 241, 226, 211, 196, 181, 166, 151, 136,
-    121, 106,  91,  76,  61,  46,  31,  47,  62,  77,  92, 107, 122, 137, 152, 167,
-    182, 197, 212, 227, 242, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108,  93,
-     78,  63,  79,  94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230,
-    215, 200, 185, 170, 155, 140, 125, 110,  95, 111, 126, 141, 156, 171, 186, 201,
-    216, 231, 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188,
-    203, 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
-    250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255,
-};
-
-
-/* Array indices are identical to previously-existing CONTEXT_NODE indices */
-
-const vp9_tree_index vp9_coef_tree[ 22] =     /* corresponding _CONTEXT_NODEs */
-{
-  -DCT_EOB_TOKEN, 2,                             /* 0 = EOB */
-  -ZERO_TOKEN, 4,                               /* 1 = ZERO */
-  -ONE_TOKEN, 6,                               /* 2 = ONE */
-  8, 12,                                      /* 3 = LOW_VAL */
-  -TWO_TOKEN, 10,                            /* 4 = TWO */
-  -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE */
-  14, 16,                                    /* 6 = HIGH_LOW */
-  -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE */
-  18, 20,                                   /* 8 = CAT_THREEFOUR */
-  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,  /* 9 = CAT_THREE */
-  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6   /* 10 = CAT_FIVE */
-};
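The tree is stored as a flat array: a non-negative entry is the index of the next node pair, and a negative entry is a leaf holding the negated token. A minimal decode walk, assuming a hypothetical bool_decoder type and read_bool(r, prob) primitive (both stand-ins for the real boolean decoder):

    static int decode_token(bool_decoder *r, const vp9_tree_index *tree,
                            const vp9_prob *probs) {
      vp9_tree_index i = 0;
      /* probs[i >> 1] is the probability of taking the 0-branch at node i;
         descend until a non-positive (leaf) entry is reached. */
      while ((i = tree[i + read_bool(r, probs[i >> 1])]) > 0)
        ;
      return -i;  /* leaves store the negated token value */
    }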
-
-struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS];
-
-/* Trees for extra bits.  Probabilities are constant and
-   do not depend on previously encoded bits */
-
-static const Prob Pcat1[] = { 159};
-static const Prob Pcat2[] = { 165, 145};
-static const Prob Pcat3[] = { 173, 148, 140};
-static const Prob Pcat4[] = { 176, 155, 140, 135};
-static const Prob Pcat5[] = { 180, 157, 141, 134, 130};
-static const Prob Pcat6[] =
-{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129};
-
-static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[26];
-
-static void init_bit_tree(vp9_tree_index *p, int n) {
-  int i = 0;
-
-  while (++i < n) {
-    p[0] = p[1] = i << 1;
-    p += 2;
-  }
-
-  p[0] = p[1] = 0;
-}
-
-static void init_bit_trees() {
-  init_bit_tree(cat1, 1);
-  init_bit_tree(cat2, 2);
-  init_bit_tree(cat3, 3);
-  init_bit_tree(cat4, 4);
-  init_bit_tree(cat5, 5);
-  init_bit_tree(cat6, 13);
-}
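init_bit_tree() builds a degenerate tree that always descends, so walking catN reads exactly n bits whatever their values; the final entry of 0 terminates the walk sketched above. For example, after init_bit_tree(cat3, 3):

    /* cat3 == { 2, 2,  4, 4,  0, 0 }:
       node 0 -> node 1 -> node 2 -> leaf, one Pcat3 probability per level. */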
-
-vp9_extra_bit_struct vp9_extra_bits[12] = {
-  { 0, 0, 0, 0},
-  { 0, 0, 0, 1},
-  { 0, 0, 0, 2},
-  { 0, 0, 0, 3},
-  { 0, 0, 0, 4},
-  { cat1, Pcat1, 1, 5},
-  { cat2, Pcat2, 2, 7},
-  { cat3, Pcat3, 3, 11},
-  { cat4, Pcat4, 4, 19},
-  { cat5, Pcat5, 5, 35},
-  { cat6, Pcat6, 13, 67},
-  { 0, 0, 0, 0}
-};
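Each DCT_VAL_CATEGORYn token covers a range of magnitudes starting at base_val; its Len extra bits, coded with the fixed PcatN probabilities, select the offset within that range. A sketch of the reconstruction, equivalent to walking the catN trees and reusing the hypothetical read_bool() from above:

    static int decode_extra(bool_decoder *r, int token) {
      const vp9_extra_bit_struct *e = &vp9_extra_bits[token];
      int v = e->base_val, i;
      /* Len fixed-probability bits, most significant first. */
      for (i = 0; i < e->Len; i++)
        v += read_bool(r, e->prob[i]) << (e->Len - 1 - i);
      return v;  /* e.g. DCT_VAL_CATEGORY3: 11 + 3 bits -> 11..18 */
    }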
-
-#include "default_coef_probs.h"
-
-void vp9_default_coef_probs(VP9_COMMON *pc) {
-  vpx_memcpy(pc->fc.coef_probs, default_coef_probs,
-             sizeof(pc->fc.coef_probs));
-  vpx_memcpy(pc->fc.hybrid_coef_probs, default_hybrid_coef_probs,
-             sizeof(pc->fc.hybrid_coef_probs));
-
-  vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8,
-             sizeof(pc->fc.coef_probs_8x8));
-  vpx_memcpy(pc->fc.hybrid_coef_probs_8x8, default_hybrid_coef_probs_8x8,
-             sizeof(pc->fc.hybrid_coef_probs_8x8));
-
-  vpx_memcpy(pc->fc.coef_probs_16x16, default_coef_probs_16x16,
-             sizeof(pc->fc.coef_probs_16x16));
-  vpx_memcpy(pc->fc.hybrid_coef_probs_16x16,
-             default_hybrid_coef_probs_16x16,
-             sizeof(pc->fc.hybrid_coef_probs_16x16));
-}
-
-void vp9_coef_tree_initialize() {
-  init_bit_trees();
-  vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
-}
-
-// #define COEF_COUNT_TESTING
-
-#define COEF_COUNT_SAT 24
-#define COEF_MAX_UPDATE_FACTOR 112
-#define COEF_COUNT_SAT_KEY 24
-#define COEF_MAX_UPDATE_FACTOR_KEY 112
-#define COEF_COUNT_SAT_AFTER_KEY 24
-#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
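These constants drive the per-frame adaptation in vp9_adapt_coef_probs() below: each node's branch count is clamped to count_sat, scaled into a factor out of 256, and used to blend the newly measured probability with the previous frame's. A standalone sketch of the blend that every loop below repeats:

    static vp9_prob blend_prob(vp9_prob pre, vp9_prob measured,
                               int count, int count_sat, int update_factor) {
      int factor, prob;
      if (count > count_sat)
        count = count_sat;
      factor = update_factor * count / count_sat;  /* 0..update_factor */
      prob = ((int)pre * (256 - factor) + (int)measured * factor + 128) >> 8;
      /* probabilities must stay in [1, 255] for the boolean coder */
      return prob < 1 ? 1 : prob > 255 ? 255 : prob;
    }

With count_sat 24 and update_factor 112, a fully saturated count moves the old probability at most 112/256 of the way toward the new estimate; the 128 factor used after a key frame allows a faster move of up to half.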
-
-void vp9_adapt_coef_probs(VP9_COMMON *cm) {
-  int t, i, j, k, count;
-  unsigned int branch_ct[ENTROPY_NODES][2];
-  vp9_prob coef_probs[ENTROPY_NODES];
-  int update_factor; /* denominator 256 */
-  int factor;
-  int count_sat;
-
-  // printf("Frame type: %d\n", cm->frame_type);
-  if (cm->frame_type == KEY_FRAME) {
-    update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
-    count_sat = COEF_COUNT_SAT_KEY;
-  } else if (cm->last_frame_type == KEY_FRAME) {
-    update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY;  /* adapt quickly */
-    count_sat = COEF_COUNT_SAT_AFTER_KEY;
-  } else {
-    update_factor = COEF_MAX_UPDATE_FACTOR;
-    count_sat = COEF_COUNT_SAT;
-  }
-
-#ifdef COEF_COUNT_TESTING
-  {
-    printf("static const unsigned int\ncoef_counts"
-           "[BLOCK_TYPES] [COEF_BANDS]"
-           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
-    for (i = 0; i < BLOCK_TYPES; ++i) {
-      printf("  {\n");
-      for (j = 0; j < COEF_BANDS; ++j) {
-        printf("    {\n");
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          printf("      {");
-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            printf("%d, ", cm->fc.coef_counts[i][j][k][t]);
-          printf("},\n");
-        }
-        printf("    },\n");
-      }
-      printf("  },\n");
-    }
-    printf("};\n");
-    printf("static const unsigned int\ncoef_counts_8x8"
-           "[BLOCK_TYPES_8X8] [COEF_BANDS]"
-           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
-    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
-      printf("  {\n");
-      for (j = 0; j < COEF_BANDS; ++j) {
-        printf("    {\n");
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          printf("      {");
-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            printf("%d, ", cm->fc.coef_counts_8x8[i][j][k][t]);
-          printf("},\n");
-        }
-        printf("    },\n");
-      }
-      printf("  },\n");
-    }
-    printf("};\n");
-    printf("static const unsigned int\nhybrid_coef_counts"
-           "[BLOCK_TYPES] [COEF_BANDS]"
-           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
-    for (i = 0; i < BLOCK_TYPES; ++i) {
-      printf("  {\n");
-      for (j = 0; j < COEF_BANDS; ++j) {
-        printf("    {\n");
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          printf("      {");
-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            printf("%d, ", cm->fc.hybrid_coef_counts[i][j][k][t]);
-          printf("},\n");
-        }
-        printf("    },\n");
-      }
-      printf("  },\n");
-    }
-    printf("};\n");
-  }
-#endif
-
-  for (i = 0; i < BLOCK_TYPES; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.coef_counts [i][j][k],
-          256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_coef_probs[i][j][k][t] * (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.coef_probs[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.coef_probs[i][j][k][t] = 255;
-          else cm->fc.coef_probs[i][j][k][t] = prob;
-        }
-      }
-
-  for (i = 0; i < BLOCK_TYPES; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.hybrid_coef_counts [i][j][k],
-          256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_hybrid_coef_probs[i][j][k][t] * (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.hybrid_coef_probs[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.hybrid_coef_probs[i][j][k][t] = 255;
-          else cm->fc.hybrid_coef_probs[i][j][k][t] = prob;
-        }
-      }
-
-  for (i = 0; i < BLOCK_TYPES_8X8; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.coef_counts_8x8 [i][j][k],
-          256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_coef_probs_8x8[i][j][k][t] * (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.coef_probs_8x8[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.coef_probs_8x8[i][j][k][t] = 255;
-          else cm->fc.coef_probs_8x8[i][j][k][t] = prob;
-        }
-      }
-
-  for (i = 0; i < BLOCK_TYPES_8X8; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.hybrid_coef_counts_8x8 [i][j][k],
-          256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_hybrid_coef_probs_8x8[i][j][k][t] *
-                  (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 255;
-          else cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = prob;
-        }
-      }
-
-  for (i = 0; i < BLOCK_TYPES_16X16; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.coef_counts_16x16[i][j][k], 256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_coef_probs_16x16[i][j][k][t] *
-                  (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.coef_probs_16x16[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.coef_probs_16x16[i][j][k][t] = 255;
-          else cm->fc.coef_probs_16x16[i][j][k][t] = prob;
-        }
-      }
-
-  for (i = 0; i < BLOCK_TYPES_16X16; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.hybrid_coef_counts_16x16[i][j][k], 256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_hybrid_coef_probs_16x16[i][j][k][t] * (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 255;
-          else cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = prob;
-        }
-      }
-}
--- a/vp8/common/entropy.h
+++ /dev/null
@@ -1,112 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ENTROPY_H
-#define __INC_ENTROPY_H
-
-#include "treecoder.h"
-#include "blockd.h"
-#include "common.h"
-#include "coefupdateprobs.h"
-
-extern const int vp9_i8x8_block[4];
-
-/* Coefficient token alphabet */
-
-#define ZERO_TOKEN              0       /* 0         Extra Bits 0+0 */
-#define ONE_TOKEN               1       /* 1         Extra Bits 0+1 */
-#define TWO_TOKEN               2       /* 2         Extra Bits 0+1 */
-#define THREE_TOKEN             3       /* 3         Extra Bits 0+1 */
-#define FOUR_TOKEN              4       /* 4         Extra Bits 0+1 */
-#define DCT_VAL_CATEGORY1       5       /* 5-6       Extra Bits 1+1 */
-#define DCT_VAL_CATEGORY2       6       /* 7-10      Extra Bits 2+1 */
-#define DCT_VAL_CATEGORY3       7       /* 11-18     Extra Bits 3+1 */
-#define DCT_VAL_CATEGORY4       8       /* 19-34     Extra Bits 4+1 */
-#define DCT_VAL_CATEGORY5       9       /* 35-66     Extra Bits 5+1 */
-#define DCT_VAL_CATEGORY6       10      /* 67+       Extra Bits 13+1 */
-#define DCT_EOB_TOKEN           11      /* EOB       Extra Bits 0+0 */
-#define MAX_ENTROPY_TOKENS 12
-#define ENTROPY_NODES 11
-#define EOSB_TOKEN              127     /* Not signalled, encoder only */
-
-extern const vp9_tree_index vp9_coef_tree[];
-
-extern struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS];
-
-typedef struct {
-  vp9_tree_p tree;
-  const vp9_prob *prob;
-  int Len;
-  int base_val;
-} vp9_extra_bit_struct;
-
-extern vp9_extra_bit_struct vp9_extra_bits[12];    /* indexed by token value */
-
-#define PROB_UPDATE_BASELINE_COST   7
-
-#define MAX_PROB                255
-#define DCT_MAX_VALUE           8192
-
-/* Coefficients are predicted via a 3-dimensional probability table. */
-
-/* Outside dimension.  0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
-#define BLOCK_TYPES 4
-
-#define BLOCK_TYPES_8X8 4
-
-#define BLOCK_TYPES_16X16 4
-
-/* Middle dimension is a coarsening of the coefficient's
-   position within the 4x4 DCT. */
-
-#define COEF_BANDS 8
-extern DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]);
-extern DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]);
-extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]);
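Together the first two dimensions plus the context described below select the node probabilities for one coefficient. A lookup sketch, assuming c is the coefficient's index in scan order and pt the left/above context:

    const int band = vp9_coef_bands[c];  /* 4x4; _8x8/_16x16 for larger transforms */
    const vp9_prob *probs = cm->fc.coef_probs[block_type][band][pt];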
-
-/* Inside dimension is 3-valued measure of nearby complexity, that is,
-   the extent to which nearby coefficients are nonzero.  For the first
-   coefficient (DC, unless block type is 0), we look at the (already encoded)
-   blocks above and to the left of the current block.  The context index is
-   then the number (0,1,or 2) of these blocks having nonzero coefficients.
-   After decoding a coefficient, the measure is roughly the size of the
-   most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1).
-   Note that the intuitive meaning of this measure changes as coefficients
-   are decoded, e.g., prior to the first token, a zero means that my neighbors
-   are empty while, after the first token, because of the use of end-of-block,
-   a zero means we just decoded a zero and hence guarantees that a non-zero
-   coefficient will appear later in this block.  However, this shift
-   in meaning is perfectly OK because our context depends also on the
-   coefficient band (and since zigzag positions 0, 1, and 2 are in
-   distinct bands). */
-
-/*# define DC_TOKEN_CONTEXTS        3*/ /* 00, 0!0, !0!0 */
-#define PREV_COEF_CONTEXTS       4
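In code terms, the measure described above starts as a neighbour count and is then refreshed from vp9_prev_token_class after each token; note the tables reserve a fourth slot (PREV_COEF_CONTEXTS is 4) beyond the three values sketched here:

    /* sketch: context for the next coefficient token */
    static int next_context(int first, int left_nz, int above_nz, int last_token) {
      if (first)
        return (left_nz != 0) + (above_nz != 0);  /* 0, 1 or 2 */
      return vp9_prev_token_class[last_token];    /* 0 for 0, 1 for 1, 2 for >1 */
    }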
-
-#define SUBEXP_PARAM                4   /* Subexponential code parameter */
-#define MODULUS_PARAM               13  /* Modulus parameter */
-
-extern DECLARE_ALIGNED(16, const unsigned char, vp9_prev_token_class[MAX_ENTROPY_TOKENS]);
-
-struct VP9Common;
-void vp9_default_coef_probs(struct VP9Common *);
-extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]);
-
-extern DECLARE_ALIGNED(16, const int, vp9_col_scan[16]);
-extern DECLARE_ALIGNED(16, const int, vp9_row_scan[16]);
-
-extern DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]);
-void vp9_coef_tree_initialize(void);
-
-extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]);
-void vp9_adapt_coef_probs(struct VP9Common *);
-
-#endif
--- a/vp8/common/entropymode.c
+++ /dev/null
@@ -1,614 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "onyxc_int.h"
-#include "modecont.h"
-#include "vpx_mem/vpx_mem.h"
-
-
-static const unsigned int kf_y_mode_cts[8][VP9_YMODES] = {
-  /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 BPRED */
-  {12,  6,  5,  5,  5,  5,  5,  5,  5,  2, 22, 200},
-  {25, 13, 13,  7,  7,  7,  7,  7,  7,  6, 27, 160},
-  {31, 17, 18,  8,  8,  8,  8,  8,  8,  9, 26, 139},
-  {40, 22, 23,  8,  8,  8,  8,  8,  8, 12, 27, 116},
-  {53, 26, 28,  8,  8,  8,  8,  8,  8, 13, 26,  94},
-  {68, 33, 35,  8,  8,  8,  8,  8,  8, 17, 20,  68},
-  {78, 38, 38,  8,  8,  8,  8,  8,  8, 19, 16,  52},
-  {89, 42, 42,  8,  8,  8,  8,  8,  8, 21, 12,  34},
-};
-
-static const unsigned int y_mode_cts  [VP9_YMODES] = {
-  /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 BPRED */
-  98, 19, 15, 14, 14, 14, 14, 12, 12, 13, 16, 70
-};
-
-static const unsigned int uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
-  /* DC   V   H  D45 135 117 153 D27 D63 TM */
-  { 200, 15, 15, 10, 10, 10, 10, 10, 10,  6}, /* DC */
-  { 130, 75, 10, 10, 10, 10, 10, 10, 10,  6}, /* V */
-  { 130, 10, 75, 10, 10, 10, 10, 10, 10,  6}, /* H */
-  { 130, 15, 10, 75, 10, 10, 10, 10, 10,  6}, /* D45 */
-  { 150, 15, 10, 10, 75, 10, 10, 10, 10,  6}, /* D135 */
-  { 150, 15, 10, 10, 10, 75, 10, 10, 10,  6}, /* D117 */
-  { 150, 15, 10, 10, 10, 10, 75, 10, 10,  6}, /* D153 */
-  { 150, 15, 10, 10, 10, 10, 10, 75, 10,  6}, /* D27 */
-  { 150, 15, 10, 10, 10, 10, 10, 10, 75,  6}, /* D63 */
-  { 160, 30, 30, 10, 10, 10, 10, 10, 10, 16}, /* TM */
-  { 132, 46, 40, 10, 10, 10, 10, 10, 10, 18}, /* i8x8 - never used */
-  { 150, 35, 41, 10, 10, 10, 10, 10, 10, 10}, /* BPRED */
-};
-
-static const unsigned int i8x8_mode_cts  [VP9_I8X8_MODES] = {
-  /* DC V   H D45 135 117 153 D27 D63  TM */
-  73, 49, 61, 30, 30, 30, 30, 30, 30, 13
-};
-
-static const unsigned int kf_uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
-  // DC   V   H  D45 135 117 153 D27 D63 TM
-  { 160, 24, 24, 20, 20, 20, 20, 20, 20,  8}, /* DC */
-  { 102, 64, 30, 20, 20, 20, 20, 20, 20, 10}, /* V */
-  { 102, 30, 64, 20, 20, 20, 20, 20, 20, 10}, /* H */
-  { 102, 33, 20, 64, 20, 20, 20, 20, 20, 14}, /* D45 */
-  { 102, 33, 20, 20, 64, 20, 20, 20, 20, 14}, /* D135 */
-  { 122, 33, 20, 20, 20, 64, 20, 20, 20, 14}, /* D117 */
-  { 102, 33, 20, 20, 20, 20, 64, 20, 20, 14}, /* D153 */
-  { 102, 33, 20, 20, 20, 20, 20, 64, 20, 14}, /* D27 */
-  { 102, 33, 20, 20, 20, 20, 20, 20, 64, 14}, /* D63 */
-  { 132, 36, 30, 20, 20, 20, 20, 20, 20, 18}, /* TM */
-  { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* i8x8 - never used */
-  { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* BPRED */
-};
-
-static const unsigned int bmode_cts[VP9_BINTRAMODES] = {
-  /* DC    TM     VE     HE   LD    RD    VR    VL    HD    HU */
-  43891, 17694, 10036, 3920, 3363, 2546, 5119, 3221, 2471, 1723
-};
-
-typedef enum {
-  SUBMVREF_NORMAL,
-  SUBMVREF_LEFT_ZED,
-  SUBMVREF_ABOVE_ZED,
-  SUBMVREF_LEFT_ABOVE_SAME,
-  SUBMVREF_LEFT_ABOVE_ZED
-} submvref_t;
-
-int vp9_mv_cont(const int_mv *l, const int_mv *a) {
-  int lez = (l->as_int == 0);
-  int aez = (a->as_int == 0);
-  int lea = (l->as_int == a->as_int);
-
-  if (lea && lez)
-    return SUBMVREF_LEFT_ABOVE_ZED;
-
-  if (lea)
-    return SUBMVREF_LEFT_ABOVE_SAME;
-
-  if (aez)
-    return SUBMVREF_ABOVE_ZED;
-
-  if (lez)
-    return SUBMVREF_LEFT_ZED;
-
-  return SUBMVREF_NORMAL;
-}
-
-const vp9_prob vp9_sub_mv_ref_prob [VP9_SUBMVREFS - 1] = { 180, 162, 25};
-
-const vp9_prob vp9_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP9_SUBMVREFS - 1] = {
-  { 147, 136, 18 },
-  { 106, 145, 1  },
-  { 179, 121, 1  },
-  { 223, 1, 34 },
-  { 208, 1, 1  }
-};
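A usage sketch tying the table to vp9_mv_cont() above: the returned context indexes the probability rows directly.

    const vp9_prob *p = vp9_sub_mv_ref_prob2[vp9_mv_cont(&left_mv, &above_mv)];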
-
-vp9_mbsplit vp9_mbsplits [VP9_NUMMBSPLITS] = {
-  {
-    0,  0,  0,  0,
-    0,  0,  0,  0,
-    1,  1,  1,  1,
-    1,  1,  1,  1,
-  }, {
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-  }, {
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    2,  2,  3,  3,
-    2,  2,  3,  3,
-  }, {
-    0,  1,  2,  3,
-    4,  5,  6,  7,
-    8,  9,  10, 11,
-    12, 13, 14, 15,
-  },
-};
-
-const int vp9_mbsplit_count [VP9_NUMMBSPLITS] = { 2, 2, 4, 16};
-
-const vp9_prob vp9_mbsplit_probs [VP9_NUMMBSPLITS - 1] = { 110, 111, 150};
-
-/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
-
-const vp9_tree_index vp9_bmode_tree[VP9_BINTRAMODES * 2 - 2] = /* INTRAMODECONTEXTNODE value */
-{
-  -B_DC_PRED, 2,                             /* 0 = DC_NODE */
-  -B_TM_PRED, 4,                            /* 1 = TM_NODE */
-  -B_VE_PRED, 6,                           /* 2 = VE_NODE */
-  8, 12,                                  /* 3 = COM_NODE */
-  -B_HE_PRED, 10,                        /* 4 = HE_NODE */
-  -B_RD_PRED, -B_VR_PRED,               /* 5 = RD_NODE */
-  -B_LD_PRED, 14,                        /* 6 = LD_NODE */
-  -B_VL_PRED, 16,                      /* 7 = VL_NODE */
-  -B_HD_PRED, -B_HU_PRED             /* 8 = HD_NODE */
-};
-
-/* Again, these trees use the same probability indices as their
-   explicitly-programmed predecessors. */
-const vp9_tree_index vp9_ymode_tree[VP9_YMODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  16, 18,
-  -V_PRED, -H_PRED,
-  -TM_PRED, 20,
-  -B_PRED, -I8X8_PRED
-};
-
-const vp9_tree_index vp9_kf_ymode_tree[VP9_YMODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  16, 18,
-  -V_PRED, -H_PRED,
-  -TM_PRED, 20,
-  -B_PRED, -I8X8_PRED
-};
-
-const vp9_tree_index vp9_i8x8_mode_tree[VP9_I8X8_MODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  -V_PRED, 16,
-  -H_PRED, -TM_PRED
-};
-
-const vp9_tree_index vp9_uv_mode_tree[VP9_UV_MODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  -V_PRED, 16,
-  -H_PRED, -TM_PRED
-};
-
-const vp9_tree_index vp9_mbsplit_tree[6] = {
-  -PARTITIONING_4X4,   2,
-  -PARTITIONING_8X8,   4,
-  -PARTITIONING_16X8, -PARTITIONING_8X16,
-};
-
-const vp9_tree_index vp9_mv_ref_tree[8] = {
-  -ZEROMV, 2,
-  -NEARESTMV, 4,
-  -NEARMV, 6,
-  -NEWMV, -SPLITMV
-};
-
-#if CONFIG_SUPERBLOCKS
-const vp9_tree_index vp9_sb_mv_ref_tree[6] = {
-  -ZEROMV, 2,
-  -NEARESTMV, 4,
-  -NEARMV, -NEWMV
-};
-#endif
-
-const vp9_tree_index vp9_sub_mv_ref_tree[6] = {
-  -LEFT4X4, 2,
-  -ABOVE4X4, 4,
-  -ZERO4X4, -NEW4X4
-};
-
-struct vp9_token_struct vp9_bmode_encodings   [VP9_BINTRAMODES];
-struct vp9_token_struct vp9_ymode_encodings   [VP9_YMODES];
-#if CONFIG_SUPERBLOCKS
-struct vp9_token_struct vp9_sb_kf_ymode_encodings [VP9_I32X32_MODES];
-#endif
-struct vp9_token_struct vp9_kf_ymode_encodings [VP9_YMODES];
-struct vp9_token_struct vp9_uv_mode_encodings  [VP9_UV_MODES];
-struct vp9_token_struct vp9_i8x8_mode_encodings  [VP9_I8X8_MODES];
-struct vp9_token_struct vp9_mbsplit_encodings [VP9_NUMMBSPLITS];
-
-struct vp9_token_struct vp9_mv_ref_encoding_array    [VP9_MVREFS];
-#if CONFIG_SUPERBLOCKS
-struct vp9_token_struct vp9_sb_mv_ref_encoding_array  [VP9_MVREFS];
-#endif
-struct vp9_token_struct vp9_sub_mv_ref_encoding_array [VP9_SUBMVREFS];
-
-void vp9_init_mbmode_probs(VP9_COMMON *x) {
-  unsigned int bct [VP9_YMODES] [2];      /* num Ymodes > num UV modes */
-
-  vp9_tree_probs_from_distribution(VP9_YMODES, vp9_ymode_encodings,
-                                   vp9_ymode_tree, x->fc.ymode_prob,
-                                   bct, y_mode_cts, 256, 1);
-  {
-    int i;
-    for (i = 0; i < 8; i++) {
-      vp9_tree_probs_from_distribution(VP9_YMODES, vp9_kf_ymode_encodings,
-                                       vp9_kf_ymode_tree, x->kf_ymode_prob[i],
-                                       bct, kf_y_mode_cts[i], 256, 1);
-#if CONFIG_SUPERBLOCKS
-      vp9_tree_probs_from_distribution(VP9_I32X32_MODES,
-                                       vp9_sb_kf_ymode_encodings,
-                                       vp9_sb_ymode_tree,
-                                       x->sb_kf_ymode_prob[i], bct,
-                                       kf_y_mode_cts[i], 256, 1);
-#endif
-    }
-  }
-  {
-    int i;
-    for (i = 0; i < VP9_YMODES; i++) {
-      vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
-                                       vp9_uv_mode_tree, x->kf_uv_mode_prob[i],
-                                       bct, kf_uv_mode_cts[i], 256, 1);
-      vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
-                                       vp9_uv_mode_tree, x->fc.uv_mode_prob[i],
-                                       bct, uv_mode_cts[i], 256, 1);
-    }
-  }
-
-  vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
-                                   vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob,
-                                   bct, i8x8_mode_cts, 256, 1);
-
-  vpx_memcpy(x->fc.sub_mv_ref_prob, vp9_sub_mv_ref_prob2,
-             sizeof(vp9_sub_mv_ref_prob2));
-  vpx_memcpy(x->fc.mbsplit_prob, vp9_mbsplit_probs, sizeof(vp9_mbsplit_probs));
-  vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob,
-             sizeof(vp9_switchable_interp_prob));
-}
-
-
-static void intra_bmode_probs_from_distribution(
-  vp9_prob p [VP9_BINTRAMODES - 1],
-  unsigned int branch_ct [VP9_BINTRAMODES - 1] [2],
-  const unsigned int events [VP9_BINTRAMODES]) {
-  vp9_tree_probs_from_distribution(VP9_BINTRAMODES, vp9_bmode_encodings,
-                                   vp9_bmode_tree, p, branch_ct,
-                                   events, 256, 1);
-}
-
-void vp9_default_bmode_probs(vp9_prob p [VP9_BINTRAMODES - 1]) {
-  unsigned int branch_ct [VP9_BINTRAMODES - 1] [2];
-  intra_bmode_probs_from_distribution(p, branch_ct, bmode_cts);
-}
-
-void vp9_kf_default_bmode_probs(vp9_prob p[VP9_BINTRAMODES][VP9_BINTRAMODES]
-                                          [VP9_BINTRAMODES - 1]) {
-  unsigned int branch_ct[VP9_BINTRAMODES - 1][2];
-  int i, j;
-
-  for (i = 0; i < VP9_BINTRAMODES; i++) {
-    for (j = 0; j < VP9_BINTRAMODES; j++) {
-      intra_bmode_probs_from_distribution(
-        p[i][j], branch_ct, vp9_kf_default_bmode_counts[i][j]);
-    }
-  }
-}
-
-#if VP9_SWITCHABLE_FILTERS == 3
-const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
-  -0, 2,
-  -1, -2
-};
-struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
-const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
-  EIGHTTAP, SIXTAP, EIGHTTAP_SHARP};
-const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, -1, 0, 2, -1};
-const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
-                                          [VP9_SWITCHABLE_FILTERS-1] = {
-  {248, 192}, { 32, 248}, { 32,  32}, {192, 160}
-};
-#elif VP9_SWITCHABLE_FILTERS == 2
-const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
-  -0, -1,
-};
-struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
-const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
-                                          [VP9_SWITCHABLE_FILTERS-1] = {
-  {248},
-  { 64},
-  {192},
-};
-const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
-  EIGHTTAP, EIGHTTAP_SHARP};
-const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, -1, 0, 1, -1}; // 8-tap, 8-tap sharp
-#endif
-
-void vp9_entropy_mode_init() {
-  vp9_tokens_from_tree(vp9_bmode_encodings,   vp9_bmode_tree);
-  vp9_tokens_from_tree(vp9_ymode_encodings,   vp9_ymode_tree);
-  vp9_tokens_from_tree(vp9_kf_ymode_encodings, vp9_kf_ymode_tree);
-#if CONFIG_SUPERBLOCKS
-  vp9_tokens_from_tree(vp9_sb_kf_ymode_encodings, vp9_sb_ymode_tree);
-#endif
-  vp9_tokens_from_tree(vp9_uv_mode_encodings,  vp9_uv_mode_tree);
-  vp9_tokens_from_tree(vp9_i8x8_mode_encodings,  vp9_i8x8_mode_tree);
-  vp9_tokens_from_tree(vp9_mbsplit_encodings, vp9_mbsplit_tree);
-  vp9_tokens_from_tree(vp9_switchable_interp_encodings,
-                       vp9_switchable_interp_tree);
-
-  vp9_tokens_from_tree_offset(vp9_mv_ref_encoding_array,
-                              vp9_mv_ref_tree, NEARESTMV);
-#if CONFIG_SUPERBLOCKS
-  vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array,
-                              vp9_sb_mv_ref_tree, NEARESTMV);
-#endif
-  vp9_tokens_from_tree_offset(vp9_sub_mv_ref_encoding_array,
-                              vp9_sub_mv_ref_tree, LEFT4X4);
-}
-
-void vp9_init_mode_contexts(VP9_COMMON *pc) {
-  vpx_memset(pc->fc.mv_ref_ct, 0, sizeof(pc->fc.mv_ref_ct));
-  vpx_memset(pc->fc.mv_ref_ct_a, 0, sizeof(pc->fc.mv_ref_ct_a));
-
-  vpx_memcpy(pc->fc.mode_context,
-             vp9_default_mode_contexts,
-             sizeof(pc->fc.mode_context));
-  vpx_memcpy(pc->fc.mode_context_a,
-             vp9_default_mode_contexts_a,
-             sizeof(pc->fc.mode_context_a));
-
-}
-
-void vp9_accum_mv_refs(VP9_COMMON *pc,
-                       MB_PREDICTION_MODE m,
-                       const int ct[4]) {
-  int (*mv_ref_ct)[4][2];
-
-  if (pc->refresh_alt_ref_frame)
-    mv_ref_ct = pc->fc.mv_ref_ct_a;
-  else
-    mv_ref_ct = pc->fc.mv_ref_ct;
-
-  if (m == ZEROMV) {
-    ++mv_ref_ct [ct[0]] [0] [0];
-  } else {
-    ++mv_ref_ct [ct[0]] [0] [1];
-    if (m == NEARESTMV) {
-      ++mv_ref_ct [ct[1]] [1] [0];
-    } else {
-      ++mv_ref_ct [ct[1]] [1] [1];
-      if (m == NEARMV) {
-        ++mv_ref_ct [ct[2]] [2] [0];
-      } else {
-        ++mv_ref_ct [ct[2]] [2] [1];
-        if (m == NEWMV) {
-          ++mv_ref_ct [ct[3]] [3] [0];
-        } else {
-          ++mv_ref_ct [ct[3]] [3] [1];
-        }
-      }
-    }
-  }
-}
-
-#define MVREF_COUNT_SAT 20
-#define MVREF_MAX_UPDATE_FACTOR 144
-void vp9_update_mode_context(VP9_COMMON *pc) {
-  int i, j;
-  int (*mv_ref_ct)[4][2];
-  int (*mode_context)[4];
-
-  if (pc->refresh_alt_ref_frame) {
-    mv_ref_ct = pc->fc.mv_ref_ct_a;
-    mode_context = pc->fc.mode_context_a;
-  } else {
-    mv_ref_ct = pc->fc.mv_ref_ct;
-    mode_context = pc->fc.mode_context;
-  }
-
-  for (j = 0; j < 6; j++) {
-    for (i = 0; i < 4; i++) {
-      int this_prob;
-      int count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
-      int factor;
-      {
-        this_prob = count > 0 ? 256 * mv_ref_ct[j][i][0] / count : 128;
-        count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count;
-        factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT);
-        this_prob = (pc->fc.vp8_mode_contexts[j][i] * (256 - factor) +
-                     this_prob * factor + 128) >> 8;
-        this_prob = this_prob ? (this_prob < 255 ? this_prob : 255) : 1;
-        mode_context[j][i] = this_prob;
-      }
-    }
-  }
-}
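For clarity, the saturating blend performed in the loop above, pulled out as a standalone sketch (helper name hypothetical; constants as #defined above):

static int blend_prob(int pre_prob, int ct0, int ct1) {
  int count = ct0 + ct1;
  /* empirical probability of the 0-branch; neutral 128 if unseen */
  int new_prob = count > 0 ? 256 * ct0 / count : 128;
  int factor;
  count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count;
  factor = MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT;
  /* rounded weighted average of prior and empirical probability */
  new_prob = (pre_prob * (256 - factor) + new_prob * factor + 128) >> 8;
  /* clamp into the legal vp9_prob range [1, 255] */
  return new_prob < 1 ? 1 : (new_prob > 255 ? 255 : new_prob);
}

For example, blend_prob(128, 15, 3) == 171: eighteen observations give factor 129/256, pulling the prior 128 roughly halfway toward the empirical 213.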
-
-#ifdef MODE_STATS
-#include "vp8/common/modecont.h"
-void print_mode_contexts(VP9_COMMON *pc) {
-  int j, i;
-  printf("\n====================\n");
-  for (j = 0; j < 6; j++) {
-    for (i = 0; i < 4; i++) {
-      printf("%4d ", pc->fc.mode_context[j][i]);
-    }
-    printf("\n");
-  }
-  printf("====================\n");
-  for (j = 0; j < 6; j++) {
-    for (i = 0; i < 4; i++) {
-      printf("%4d ", pc->fc.mode_context_a[j][i]);
-    }
-    printf("\n");
-  }
-}
-#endif
-
-// #define MODE_COUNT_TESTING
-#define MODE_COUNT_SAT 20
-#define MODE_MAX_UPDATE_FACTOR 144
-void vp9_adapt_mode_probs(VP9_COMMON *cm) {
-  int i, t, count, factor;
-  unsigned int branch_ct[32][2];
-  vp9_prob ymode_probs[VP9_YMODES - 1];
-  vp9_prob uvmode_probs[VP9_UV_MODES - 1];
-  vp9_prob bmode_probs[VP9_BINTRAMODES - 1];
-  vp9_prob i8x8_mode_probs[VP9_I8X8_MODES - 1];
-  vp9_prob sub_mv_ref_probs[VP9_SUBMVREFS - 1];
-  vp9_prob mbsplit_probs[VP9_NUMMBSPLITS - 1];
-#ifdef MODE_COUNT_TESTING
-  printf("static const unsigned int\nymode_counts"
-         "[VP9_YMODES] = {\n");
-  for (t = 0; t < VP9_YMODES; ++t) printf("%d, ", cm->fc.ymode_counts[t]);
-  printf("};\n");
-  printf("static const unsigned int\nuv_mode_counts"
-         "[VP9_YMODES] [VP9_UV_MODES] = {\n");
-  for (i = 0; i < VP9_YMODES; ++i) {
-    printf("  {");
-    for (t = 0; t < VP9_UV_MODES; ++t) printf("%d, ", cm->fc.uv_mode_counts[i][t]);
-    printf("},\n");
-  }
-  printf("};\n");
-  printf("static const unsigned int\nbmode_counts"
-         "[VP9_BINTRAMODES] = {\n");
-  for (t = 0; t < VP9_BINTRAMODES; ++t) printf("%d, ", cm->fc.bmode_counts[t]);
-  printf("};\n");
-  printf("static const unsigned int\ni8x8_mode_counts"
-         "[VP9_I8X8_MODES] = {\n");
-  for (t = 0; t < VP9_I8X8_MODES; ++t) printf("%d, ", cm->fc.i8x8_mode_counts[t]);
-  printf("};\n");
-  printf("static const unsigned int\nsub_mv_ref_counts"
-         "[SUBMVREF_COUNT] [VP9_SUBMVREFS] = {\n");
-  for (i = 0; i < SUBMVREF_COUNT; ++i) {
-    printf("  {");
-    for (t = 0; t < VP9_SUBMVREFS; ++t) printf("%d, ", cm->fc.sub_mv_ref_counts[i][t]);
-    printf("},\n");
-  }
-  printf("};\n");
-  printf("static const unsigned int\nmbsplit_counts"
-         "[VP9_NUMMBSPLITS] = {\n");
-  for (t = 0; t < VP9_NUMMBSPLITS; ++t) printf("%d, ", cm->fc.mbsplit_counts[t]);
-  printf("};\n");
-#endif
-  vp9_tree_probs_from_distribution(
-    VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
-    ymode_probs, branch_ct, cm->fc.ymode_counts,
-    256, 1);
-  for (t = 0; t < VP9_YMODES - 1; ++t) {
-    int prob;
-    count = branch_ct[t][0] + branch_ct[t][1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    prob = ((int)cm->fc.pre_ymode_prob[t] * (256 - factor) +
-            (int)ymode_probs[t] * factor + 128) >> 8;
-    if (prob <= 0) cm->fc.ymode_prob[t] = 1;
-    else if (prob > 255) cm->fc.ymode_prob[t] = 255;
-    else cm->fc.ymode_prob[t] = prob;
-  }
-  for (i = 0; i < VP9_YMODES; ++i) {
-    vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
-                                     vp9_uv_mode_tree, uvmode_probs, branch_ct,
-                                     cm->fc.uv_mode_counts[i], 256, 1);
-    for (t = 0; t < VP9_UV_MODES - 1; ++t) {
-      int prob;
-      count = branch_ct[t][0] + branch_ct[t][1];
-      count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-      factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-      prob = ((int)cm->fc.pre_uv_mode_prob[i][t] * (256 - factor) +
-              (int)uvmode_probs[t] * factor + 128) >> 8;
-      if (prob <= 0) cm->fc.uv_mode_prob[i][t] = 1;
-      else if (prob > 255) cm->fc.uv_mode_prob[i][t] = 255;
-      else cm->fc.uv_mode_prob[i][t] = prob;
-    }
-  }
-  vp9_tree_probs_from_distribution(VP9_BINTRAMODES, vp9_bmode_encodings,
-                                   vp9_bmode_tree, bmode_probs, branch_ct,
-                                   cm->fc.bmode_counts, 256, 1);
-  for (t = 0; t < VP9_BINTRAMODES - 1; ++t) {
-    int prob;
-    count = branch_ct[t][0] + branch_ct[t][1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    prob = ((int)cm->fc.pre_bmode_prob[t] * (256 - factor) +
-            (int)bmode_probs[t] * factor + 128) >> 8;
-    if (prob <= 0) cm->fc.bmode_prob[t] = 1;
-    else if (prob > 255) cm->fc.bmode_prob[t] = 255;
-    else cm->fc.bmode_prob[t] = prob;
-  }
-  vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
-                                   vp9_i8x8_mode_tree, i8x8_mode_probs,
-                                   branch_ct, cm->fc.i8x8_mode_counts, 256, 1);
-  for (t = 0; t < VP9_I8X8_MODES - 1; ++t) {
-    int prob;
-    count = branch_ct[t][0] + branch_ct[t][1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    prob = ((int)cm->fc.pre_i8x8_mode_prob[t] * (256 - factor) +
-            (int)i8x8_mode_probs[t] * factor + 128) >> 8;
-    if (prob <= 0) cm->fc.i8x8_mode_prob[t] = 1;
-    else if (prob > 255) cm->fc.i8x8_mode_prob[t] = 255;
-    else cm->fc.i8x8_mode_prob[t] = prob;
-  }
-  for (i = 0; i < SUBMVREF_COUNT; ++i) {
-    vp9_tree_probs_from_distribution(VP9_SUBMVREFS,
-                                     vp9_sub_mv_ref_encoding_array,
-                                     vp9_sub_mv_ref_tree, sub_mv_ref_probs,
-                                     branch_ct, cm->fc.sub_mv_ref_counts[i],
-                                     256, 1);
-    for (t = 0; t < VP9_SUBMVREFS - 1; ++t) {
-      int prob;
-      count = branch_ct[t][0] + branch_ct[t][1];
-      count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-      factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-      prob = ((int)cm->fc.pre_sub_mv_ref_prob[i][t] * (256 - factor) +
-              (int)sub_mv_ref_probs[t] * factor + 128) >> 8;
-      if (prob <= 0) cm->fc.sub_mv_ref_prob[i][t] = 1;
-      else if (prob > 255) cm->fc.sub_mv_ref_prob[i][t] = 255;
-      else cm->fc.sub_mv_ref_prob[i][t] = prob;
-    }
-  }
-  vp9_tree_probs_from_distribution(VP9_NUMMBSPLITS, vp9_mbsplit_encodings,
-                                   vp9_mbsplit_tree, mbsplit_probs, branch_ct,
-                                   cm->fc.mbsplit_counts, 256, 1);
-  for (t = 0; t < VP9_NUMMBSPLITS - 1; ++t) {
-    int prob;
-    count = branch_ct[t][0] + branch_ct[t][1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    prob = ((int)cm->fc.pre_mbsplit_prob[t] * (256 - factor) +
-            (int)mbsplit_probs[t] * factor + 128) >> 8;
-    if (prob <= 0) cm->fc.mbsplit_prob[t] = 1;
-    else if (prob > 255) cm->fc.mbsplit_prob[t] = 255;
-    else cm->fc.mbsplit_prob[t] = prob;
-  }
-}
--- a/vp8/common/entropymode.h
+++ /dev/null
@@ -1,102 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ENTROPYMODE_H
-#define __INC_ENTROPYMODE_H
-
-#include "blockd.h"
-#include "treecoder.h"
-
-#define SUBMVREF_COUNT 5
-#define VP9_NUMMBSPLITS 4
-
-typedef const int vp9_mbsplit[16];
-
-extern vp9_mbsplit vp9_mbsplits[VP9_NUMMBSPLITS];
-
-extern const int vp9_mbsplit_count[VP9_NUMMBSPLITS];    /* # of subsets */
-
-extern const vp9_prob vp9_mbsplit_probs[VP9_NUMMBSPLITS - 1];
-
-extern int vp9_mv_cont(const int_mv *l, const int_mv *a);
-
-extern const vp9_prob vp9_sub_mv_ref_prob[VP9_SUBMVREFS - 1];
-
-extern const vp9_prob vp9_sub_mv_ref_prob2[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-
-extern const unsigned int vp9_kf_default_bmode_counts[VP9_BINTRAMODES]
-                                                     [VP9_BINTRAMODES]
-                                                     [VP9_BINTRAMODES];
-
-extern const vp9_tree_index vp9_bmode_tree[];
-
-extern const vp9_tree_index  vp9_ymode_tree[];
-extern const vp9_tree_index  vp9_kf_ymode_tree[];
-extern const vp9_tree_index  vp9_uv_mode_tree[];
-#define vp9_sb_ymode_tree vp9_uv_mode_tree
-extern const vp9_tree_index  vp9_i8x8_mode_tree[];
-extern const vp9_tree_index  vp9_mbsplit_tree[];
-extern const vp9_tree_index  vp9_mv_ref_tree[];
-extern const vp9_tree_index  vp9_sb_mv_ref_tree[];
-extern const vp9_tree_index  vp9_sub_mv_ref_tree[];
-
-extern struct vp9_token_struct vp9_bmode_encodings[VP9_BINTRAMODES];
-extern struct vp9_token_struct vp9_ymode_encodings[VP9_YMODES];
-extern struct vp9_token_struct vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
-extern struct vp9_token_struct vp9_kf_ymode_encodings[VP9_YMODES];
-extern struct vp9_token_struct vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
-extern struct vp9_token_struct vp9_uv_mode_encodings[VP9_UV_MODES];
-extern struct vp9_token_struct vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
-
-/* Inter mode values do not start at zero */
-
-extern struct vp9_token_struct vp9_mv_ref_encoding_array[VP9_MVREFS];
-extern struct vp9_token_struct vp9_sb_mv_ref_encoding_array[VP9_MVREFS];
-extern struct vp9_token_struct vp9_sub_mv_ref_encoding_array[VP9_SUBMVREFS];
-
-void vp9_entropy_mode_init(void);
-
-struct VP9Common;
-
-void vp9_init_mbmode_probs(struct VP9Common *x);
-
-extern void vp9_init_mode_contexts(struct VP9Common *pc);
-
-extern void vp9_update_mode_context(struct VP9Common *pc);
-
-extern void vp9_accum_mv_refs(struct VP9Common *pc,
-                              MB_PREDICTION_MODE m,
-                              const int ct[4]);
-
-void vp9_default_bmode_probs(vp9_prob dest[VP9_BINTRAMODES - 1]);
-
-void vp9_kf_default_bmode_probs(vp9_prob dest[VP9_BINTRAMODES][VP9_BINTRAMODES]
-                                             [VP9_BINTRAMODES - 1]);
-
-void vp9_adapt_mode_probs(struct VP9Common *);
-
-#define VP9_SWITCHABLE_FILTERS 2 /* number of switchable filters */
-
-extern const  INTERPOLATIONFILTERTYPE vp9_switchable_interp
-                  [VP9_SWITCHABLE_FILTERS];
-
-extern const  int vp9_switchable_interp_map[SWITCHABLE + 1];
-
-extern const  vp9_tree_index vp9_switchable_interp_tree
-                  [2 * (VP9_SWITCHABLE_FILTERS - 1)];
-
-extern struct vp9_token_struct vp9_switchable_interp_encodings
-                  [VP9_SWITCHABLE_FILTERS];
-
-extern const  vp9_prob vp9_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
-                                                 [VP9_SWITCHABLE_FILTERS - 1];
-
-#endif
--- a/vp8/common/entropymv.c
+++ /dev/null
@@ -1,465 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "onyxc_int.h"
-#include "entropymv.h"
-
-//#define MV_COUNT_TESTING
-
-#define MV_COUNT_SAT 16
-#define MV_MAX_UPDATE_FACTOR 160
-
-#if CONFIG_NEW_MVREF
-/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
-#define COMPANDED_MVREF_THRESH    1000000
-#else
-/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
-#define COMPANDED_MVREF_THRESH    8
-#endif
-
-/* Smooth or bias the mv-counts before prob computation */
-/* #define SMOOTH_MV_COUNTS */
-
-const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = {
-  -MV_JOINT_ZERO, 2,
-  -MV_JOINT_HNZVZ, 4,
-  -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
-};
-struct vp9_token_struct vp9_mv_joint_encodings[MV_JOINTS];
-
-const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
-  -MV_CLASS_0, 2,
-  -MV_CLASS_1, 4,
-  6, 8,
-  -MV_CLASS_2, -MV_CLASS_3,
-  10, 12,
-  -MV_CLASS_4, -MV_CLASS_5,
-  -MV_CLASS_6, -MV_CLASS_7,
-};
-struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES];
-
-const vp9_tree_index vp9_mv_class0_tree [2 * CLASS0_SIZE - 2] = {
-  -0, -1,
-};
-struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE];
-
-const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = {
-  -0, 2,
-  -1, 4,
-  -2, -3
-};
-struct vp9_token_struct vp9_mv_fp_encodings[4];
-
-const nmv_context vp9_default_nmv_context = {
-  {32, 64, 96},
-  {
-    { /* vert component */
-      128,                                             /* sign */
-      {224, 144, 192, 168, 192, 176, 192},             /* class */
-      {216},                                           /* class0 */
-      {136, 140, 148, 160, 176, 192, 224},             /* bits */
-      {{128, 128, 64}, {96, 112, 64}},                 /* class0_fp */
-      {64, 96, 64},                                    /* fp */
-      160,                                             /* class0_hp bit */
-      128,                                             /* hp */
-    },
-    { /* hor component */
-      128,                                             /* sign */
-      {216, 128, 176, 160, 176, 176, 192},             /* class */
-      {208},                                           /* class0 */
-      {136, 140, 148, 160, 176, 192, 224},             /* bits */
-      {{128, 128, 64}, {96, 112, 64}},                 /* class0_fp */
-      {64, 96, 64},                                    /* fp */
-      160,                                             /* class0_hp bit */
-      128,                                             /* hp */
-    }
-  },
-};
-
-MV_JOINT_TYPE vp9_get_mv_joint(MV mv) {
-  if (mv.row == 0 && mv.col == 0) return MV_JOINT_ZERO;
-  else if (mv.row == 0 && mv.col != 0) return MV_JOINT_HNZVZ;
-  else if (mv.row != 0 && mv.col == 0) return MV_JOINT_HZVNZ;
-  else return MV_JOINT_HNZVNZ;
-}
-
-#define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0)
-
-MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
-  MV_CLASS_TYPE c;
-  if      (z < CLASS0_SIZE * 8)    c = MV_CLASS_0;
-  else if (z < CLASS0_SIZE * 16)   c = MV_CLASS_1;
-  else if (z < CLASS0_SIZE * 32)   c = MV_CLASS_2;
-  else if (z < CLASS0_SIZE * 64)   c = MV_CLASS_3;
-  else if (z < CLASS0_SIZE * 128)  c = MV_CLASS_4;
-  else if (z < CLASS0_SIZE * 256)  c = MV_CLASS_5;
-  else if (z < CLASS0_SIZE * 512)  c = MV_CLASS_6;
-  else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7;
-  else assert(0);
-  if (offset)
-    *offset = z - mv_class_base(c);
-  return c;
-}
-
-int vp9_use_nmv_hp(const MV *ref) {
-  if ((abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH &&
-      (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH)
-    return 1;
-  else
-    return 0;
-}
-
-int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
-  return mv_class_base(c) + offset;
-}
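An illustrative property of the two helpers above: class and offset form an exact decomposition of the magnitude, so the pair round-trips over the whole legal range (a self-check sketch, not part of the original file):

int z, offset;
MV_CLASS_TYPE c;
for (z = 0; z < CLASS0_SIZE * 1024; ++z) {
  c = vp9_get_mv_class(z, &offset);
  assert(vp9_get_mv_mag(c, offset) == z);  /* mv_class_base(c) + offset */
}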
-
-static void increment_nmv_component_count(int v,
-                                          nmv_component_counts *mvcomp,
-                                          int incr,
-                                          int usehp) {
-  assert (v != 0);            /* should not be zero */
-  mvcomp->mvcount[MV_MAX + v] += incr;
-}
-
-static void increment_nmv_component(int v,
-                                    nmv_component_counts *mvcomp,
-                                    int incr,
-                                    int usehp) {
-  int s, z, c, o, d, e, f;
-  assert (v != 0);            /* should not be zero */
-  s = v < 0;
-  mvcomp->sign[s] += incr;
-  z = (s ? -v : v) - 1;       /* magnitude - 1 */
-
-  c = vp9_get_mv_class(z, &o);
-  mvcomp->classes[c] += incr;
-
-  d = (o >> 3);               /* int mv data */
-  f = (o >> 1) & 3;           /* fractional pel mv data */
-  e = (o & 1);                /* high precision mv data */
-  if (c == MV_CLASS_0) {
-    mvcomp->class0[d] += incr;
-  } else {
-    int i, b;
-    b = c + CLASS0_BITS - 1;  /* number of bits */
-    for (i = 0; i < b; ++i)
-      mvcomp->bits[i][((d >> i) & 1)] += incr;
-  }
-
-  /* Code the fractional pel bits */
-  if (c == MV_CLASS_0) {
-    mvcomp->class0_fp[d][f] += incr;
-  } else {
-    mvcomp->fp[f] += incr;
-  }
-
-  /* Code the high precision bit */
-  if (usehp) {
-    if (c == MV_CLASS_0) {
-      mvcomp->class0_hp[e] += incr;
-    } else {
-      mvcomp->hp[e] += incr;
-    }
-  }
-}
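A worked example of the decomposition above, in eighth-pel units (illustrative):

/* v = 75  ->  s = 0 (positive), z = |v| - 1 = 74
 * 64 <= z < 128    ->  c = MV_CLASS_3, base 64, offset o = 10
 * d = o >> 3       = 1   integer-pel remainder, coded in b = 3 bits
 * f = (o >> 1) & 3 = 1   fractional-pel position
 * e = o & 1        = 0   high-precision (eighth-pel) bit
 */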
-
-#ifdef SMOOTH_MV_COUNTS
-static void smooth_counts(nmv_component_counts *mvcomp) {
-  static const int flen = 3;  // (filter_length + 1) / 2
-  static const int fval[] = {8, 3, 1};
-  static const int fvalbits = 4;
-  int i;
-  unsigned int smvcount[MV_VALS];
-  vpx_memcpy(smvcount, mvcomp->mvcount, sizeof(smvcount));
-  smvcount[MV_MAX] = (smvcount[MV_MAX - 1] + smvcount[MV_MAX + 1]) >> 1;
-  for (i = flen - 1; i <= MV_VALS - flen; ++i) {
-    int j, s = smvcount[i] * fval[0];
-    for (j = 1; j < flen; ++j)
-      s += (smvcount[i - j] + smvcount[i + j]) * fval[j];
-    mvcomp->mvcount[i] = (s + (1 << (fvalbits - 1))) >> fvalbits;
-  }
-}
-#endif
-
-static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
-  int v;
-  vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount));
-  for (v = 1; v <= MV_MAX; v++) {
-    increment_nmv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
-    increment_nmv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
-  }
-}
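In effect, counts_to_context() replays every possible magnitude v through increment_nmv_component(), weighted by how often it was observed (mvcount[MV_MAX - v] and mvcount[MV_MAX + v]), converting the raw signed-magnitude histogram into the per-branch sign/class/offset/fp/hp counts consumed by the tree-probability code below.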
-
-void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
-                       int usehp) {
-  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
-  mvctx->joints[j]++;
-  usehp = usehp && vp9_use_nmv_hp(ref);
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    increment_nmv_component_count(mv->row, &mvctx->comps[0], 1, usehp);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    increment_nmv_component_count(mv->col, &mvctx->comps[1], 1, usehp);
-  }
-}
-
-static void adapt_prob(vp9_prob *dest, vp9_prob prep, vp9_prob newp,
-                       unsigned int ct[2]) {
-  int factor;
-  int prob;
-  int count = ct[0] + ct[1];
-  if (count) {
-    count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
-    factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
-    prob = ((int)prep * (256 - factor) + (int)(newp) * factor + 128) >> 8;
-    prob += !prob;
-    prob = (prob > 255 ? 255 : prob);
-    *dest = prob;
-  }
-}
-
-void vp9_counts_to_nmv_context(
-    nmv_context_counts *NMVcount,
-    nmv_context *prob,
-    int usehp,
-    unsigned int (*branch_ct_joint)[2],
-    unsigned int (*branch_ct_sign)[2],
-    unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
-    unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
-    unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
-    unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
-    unsigned int (*branch_ct_fp)[4 - 1][2],
-    unsigned int (*branch_ct_class0_hp)[2],
-    unsigned int (*branch_ct_hp)[2]) {
-  int i, j, k;
-  counts_to_context(&NMVcount->comps[0], usehp);
-  counts_to_context(&NMVcount->comps[1], usehp);
-  vp9_tree_probs_from_distribution(MV_JOINTS,
-                                   vp9_mv_joint_encodings,
-                                   vp9_mv_joint_tree,
-                                   prob->joints,
-                                   branch_ct_joint,
-                                   NMVcount->joints,
-                                   256, 1);
-  for (i = 0; i < 2; ++i) {
-    prob->comps[i].sign =
-        vp9_bin_prob_from_distribution(NMVcount->comps[i].sign);
-    branch_ct_sign[i][0] = NMVcount->comps[i].sign[0];
-    branch_ct_sign[i][1] = NMVcount->comps[i].sign[1];
-    vp9_tree_probs_from_distribution(MV_CLASSES,
-                                     vp9_mv_class_encodings,
-                                     vp9_mv_class_tree,
-                                     prob->comps[i].classes,
-                                     branch_ct_classes[i],
-                                     NMVcount->comps[i].classes,
-                                     256, 1);
-    vp9_tree_probs_from_distribution(CLASS0_SIZE,
-                                     vp9_mv_class0_encodings,
-                                     vp9_mv_class0_tree,
-                                     prob->comps[i].class0,
-                                     branch_ct_class0[i],
-                                     NMVcount->comps[i].class0,
-                                     256, 1);
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      prob->comps[i].bits[j] = vp9_bin_prob_from_distribution(
-          NMVcount->comps[i].bits[j]);
-      branch_ct_bits[i][j][0] = NMVcount->comps[i].bits[j][0];
-      branch_ct_bits[i][j][1] = NMVcount->comps[i].bits[j][1];
-    }
-  }
-  for (i = 0; i < 2; ++i) {
-    for (k = 0; k < CLASS0_SIZE; ++k) {
-      vp9_tree_probs_from_distribution(4,
-                                       vp9_mv_fp_encodings,
-                                       vp9_mv_fp_tree,
-                                       prob->comps[i].class0_fp[k],
-                                       branch_ct_class0_fp[i][k],
-                                       NMVcount->comps[i].class0_fp[k],
-                                       256, 1);
-    }
-    vp9_tree_probs_from_distribution(4,
-                                     vp9_mv_fp_encodings,
-                                     vp9_mv_fp_tree,
-                                     prob->comps[i].fp,
-                                     branch_ct_fp[i],
-                                     NMVcount->comps[i].fp,
-                                     256, 1);
-  }
-  if (usehp) {
-    for (i = 0; i < 2; ++i) {
-      prob->comps[i].class0_hp = vp9_bin_prob_from_distribution(
-          NMVcount->comps[i].class0_hp);
-      branch_ct_class0_hp[i][0] = NMVcount->comps[i].class0_hp[0];
-      branch_ct_class0_hp[i][1] = NMVcount->comps[i].class0_hp[1];
-
-      prob->comps[i].hp =
-          vp9_bin_prob_from_distribution(NMVcount->comps[i].hp);
-      branch_ct_hp[i][0] = NMVcount->comps[i].hp[0];
-      branch_ct_hp[i][1] = NMVcount->comps[i].hp[1];
-    }
-  }
-}
-
-void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) {
-  int i, j, k;
-  nmv_context prob;
-  unsigned int branch_ct_joint[MV_JOINTS - 1][2];
-  unsigned int branch_ct_sign[2][2];
-  unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
-  unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
-  unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
-  unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
-  unsigned int branch_ct_fp[2][4 - 1][2];
-  unsigned int branch_ct_class0_hp[2][2];
-  unsigned int branch_ct_hp[2][2];
-#ifdef MV_COUNT_TESTING
-  printf("joints count: ");
-  for (j = 0; j < MV_JOINTS; ++j) printf("%d ", cm->fc.NMVcount.joints[j]);
-  printf("\n"); fflush(stdout);
-  printf("signs count:\n");
-  for (i = 0; i < 2; ++i)
-    printf("%d/%d ", cm->fc.NMVcount.comps[i].sign[0], cm->fc.NMVcount.comps[i].sign[1]);
-  printf("\n"); fflush(stdout);
-  printf("classes count:\n");
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < MV_CLASSES; ++j)
-      printf("%d ", cm->fc.NMVcount.comps[i].classes[j]);
-    printf("\n"); fflush(stdout);
-  }
-  printf("class0 count:\n");
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j)
-      printf("%d ", cm->fc.NMVcount.comps[i].class0[j]);
-    printf("\n"); fflush(stdout);
-  }
-  printf("bits count:\n");
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < MV_OFFSET_BITS; ++j)
-      printf("%d/%d ", cm->fc.NMVcount.comps[i].bits[j][0],
-                       cm->fc.NMVcount.comps[i].bits[j][1]);
-    printf("\n"); fflush(stdout);
-  }
-  printf("class0_fp count:\n");
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      printf("{");
-      for (k = 0; k < 4; ++k)
-        printf("%d ", cm->fc.NMVcount.comps[i].class0_fp[j][k]);
-      printf("}, ");
-    }
-    printf("\n"); fflush(stdout);
-  }
-  printf("fp count:\n");
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < 4; ++j)
-      printf("%d ", cm->fc.NMVcount.comps[i].fp[j]);
-    printf("\n"); fflush(stdout);
-  }
-  if (usehp) {
-    printf("class0_hp count:\n");
-    for (i = 0; i < 2; ++i)
-      printf("%d/%d ", cm->fc.NMVcount.comps[i].class0_hp[0],
-                       cm->fc.NMVcount.comps[i].class0_hp[1]);
-    printf("\n"); fflush(stdout);
-    printf("hp count:\n");
-    for (i = 0; i < 2; ++i)
-      printf("%d/%d ", cm->fc.NMVcount.comps[i].hp[0],
-                       cm->fc.NMVcount.comps[i].hp[1]);
-    printf("\n"); fflush(stdout);
-  }
-#endif
-#ifdef SMOOTH_MV_COUNTS
-  smooth_counts(&cm->fc.NMVcount.comps[0]);
-  smooth_counts(&cm->fc.NMVcount.comps[1]);
-#endif
-  vp9_counts_to_nmv_context(&cm->fc.NMVcount,
-                            &prob,
-                            usehp,
-                            branch_ct_joint,
-                            branch_ct_sign,
-                            branch_ct_classes,
-                            branch_ct_class0,
-                            branch_ct_bits,
-                            branch_ct_class0_fp,
-                            branch_ct_fp,
-                            branch_ct_class0_hp,
-                            branch_ct_hp);
-
-  for (j = 0; j < MV_JOINTS - 1; ++j) {
-    adapt_prob(&cm->fc.nmvc.joints[j],
-               cm->fc.pre_nmvc.joints[j],
-               prob.joints[j],
-               branch_ct_joint[j]);
-  }
-  for (i = 0; i < 2; ++i) {
-    adapt_prob(&cm->fc.nmvc.comps[i].sign,
-               cm->fc.pre_nmvc.comps[i].sign,
-               prob.comps[i].sign,
-               branch_ct_sign[i]);
-    for (j = 0; j < MV_CLASSES - 1; ++j) {
-      adapt_prob(&cm->fc.nmvc.comps[i].classes[j],
-                 cm->fc.pre_nmvc.comps[i].classes[j],
-                 prob.comps[i].classes[j],
-                 branch_ct_classes[i][j]);
-    }
-    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
-      adapt_prob(&cm->fc.nmvc.comps[i].class0[j],
-                 cm->fc.pre_nmvc.comps[i].class0[j],
-                 prob.comps[i].class0[j],
-                 branch_ct_class0[i][j]);
-    }
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      adapt_prob(&cm->fc.nmvc.comps[i].bits[j],
-                 cm->fc.pre_nmvc.comps[i].bits[j],
-                 prob.comps[i].bits[j],
-                 branch_ct_bits[i][j]);
-    }
-  }
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      for (k = 0; k < 3; ++k) {
-        adapt_prob(&cm->fc.nmvc.comps[i].class0_fp[j][k],
-                   cm->fc.pre_nmvc.comps[i].class0_fp[j][k],
-                   prob.comps[i].class0_fp[j][k],
-                   branch_ct_class0_fp[i][j][k]);
-      }
-    }
-    for (j = 0; j < 3; ++j) {
-      adapt_prob(&cm->fc.nmvc.comps[i].fp[j],
-                 cm->fc.pre_nmvc.comps[i].fp[j],
-                 prob.comps[i].fp[j],
-                 branch_ct_fp[i][j]);
-    }
-  }
-  if (usehp) {
-    for (i = 0; i < 2; ++i) {
-      adapt_prob(&cm->fc.nmvc.comps[i].class0_hp,
-                 cm->fc.pre_nmvc.comps[i].class0_hp,
-                 prob.comps[i].class0_hp,
-                 branch_ct_class0_hp[i]);
-      adapt_prob(&cm->fc.nmvc.comps[i].hp,
-                 cm->fc.pre_nmvc.comps[i].hp,
-                 prob.comps[i].hp,
-                 branch_ct_hp[i]);
-    }
-  }
-}
-
-void vp9_entropy_mv_init() {
-  vp9_tokens_from_tree(vp9_mv_joint_encodings, vp9_mv_joint_tree);
-  vp9_tokens_from_tree(vp9_mv_class_encodings, vp9_mv_class_tree);
-  vp9_tokens_from_tree(vp9_mv_class0_encodings, vp9_mv_class0_tree);
-  vp9_tokens_from_tree(vp9_mv_fp_encodings, vp9_mv_fp_tree);
-}
-
-void vp9_init_mv_probs(VP9_COMMON *cm) {
-  vpx_memcpy(&cm->fc.nmvc, &vp9_default_nmv_context, sizeof(nmv_context));
-}
--- a/vp8/common/entropymv.h
+++ /dev/null
@@ -1,129 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ENTROPYMV_H
-#define __INC_ENTROPYMV_H
-
-#include "treecoder.h"
-#include "vpx_config.h"
-#include "blockd.h"
-
-struct VP9Common;
-
-void vp9_entropy_mv_init();
-void vp9_init_mv_probs(struct VP9Common *cm);
-
-void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp);
-int vp9_use_nmv_hp(const MV *ref);
-
-#define VP9_NMV_UPDATE_PROB  255
-//#define MV_GROUP_UPDATE
-
-#define LOW_PRECISION_MV_UPDATE  /* Use 7 bit forward update */
-
-/* Symbols for coding which components are zero jointly */
-#define MV_JOINTS     4
-typedef enum {
-  MV_JOINT_ZERO = 0,             /* Zero vector */
-  MV_JOINT_HNZVZ = 1,            /* Vert zero, hor nonzero */
-  MV_JOINT_HZVNZ = 2,            /* Hor zero, vert nonzero */
-  MV_JOINT_HNZVNZ = 3,           /* Both components nonzero */
-} MV_JOINT_TYPE;
-
-extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2];
-extern struct vp9_token_struct vp9_mv_joint_encodings [MV_JOINTS];
-
-/* Symbols for coding magnitude class of nonzero components */
-#define MV_CLASSES     8
-typedef enum {
-  MV_CLASS_0 = 0,      /* (0, 2]     integer pel */
-  MV_CLASS_1 = 1,      /* (2, 4]     integer pel */
-  MV_CLASS_2 = 2,      /* (4, 8]     integer pel */
-  MV_CLASS_3 = 3,      /* (8, 16]    integer pel */
-  MV_CLASS_4 = 4,      /* (16, 32]   integer pel */
-  MV_CLASS_5 = 5,      /* (32, 64]   integer pel */
-  MV_CLASS_6 = 6,      /* (64, 128]  integer pel */
-  MV_CLASS_7 = 7,      /* (128, 256] integer pel */
-} MV_CLASS_TYPE;
-
-extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2];
-extern struct vp9_token_struct vp9_mv_class_encodings [MV_CLASSES];
-
-#define CLASS0_BITS    1  /* bits at integer precision for class 0 */
-#define CLASS0_SIZE    (1 << CLASS0_BITS)
-#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
-
-#define MV_MAX_BITS    (MV_CLASSES + CLASS0_BITS + 2)
-#define MV_MAX         ((1 << MV_MAX_BITS) - 1)
-#define MV_VALS        ((MV_MAX << 1) + 1)
-
-extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2];
-extern struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE];
-
-extern const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2];
-extern struct vp9_token_struct vp9_mv_fp_encodings[4];
-
-typedef struct {
-  vp9_prob sign;
-  vp9_prob classes[MV_CLASSES - 1];
-  vp9_prob class0[CLASS0_SIZE - 1];
-  vp9_prob bits[MV_OFFSET_BITS];
-  vp9_prob class0_fp[CLASS0_SIZE][4 - 1];
-  vp9_prob fp[4 - 1];
-  vp9_prob class0_hp;
-  vp9_prob hp;
-} nmv_component;
-
-typedef struct {
-  vp9_prob joints[MV_JOINTS - 1];
-  nmv_component comps[2];
-} nmv_context;
-
-MV_JOINT_TYPE vp9_get_mv_joint(MV mv);
-MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset);
-int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset);
-
-
-typedef struct {
-  unsigned int mvcount[MV_VALS];
-  unsigned int sign[2];
-  unsigned int classes[MV_CLASSES];
-  unsigned int class0[CLASS0_SIZE];
-  unsigned int bits[MV_OFFSET_BITS][2];
-  unsigned int class0_fp[CLASS0_SIZE][4];
-  unsigned int fp[4];
-  unsigned int class0_hp[2];
-  unsigned int hp[2];
-} nmv_component_counts;
-
-typedef struct {
-  unsigned int joints[MV_JOINTS];
-  nmv_component_counts comps[2];
-} nmv_context_counts;
-
-void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
-                       int usehp);
-extern const nmv_context vp9_default_nmv_context;
-void vp9_counts_to_nmv_context(
-    nmv_context_counts *NMVcount,
-    nmv_context *prob,
-    int usehp,
-    unsigned int (*branch_ct_joint)[2],
-    unsigned int (*branch_ct_sign)[2],
-    unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
-    unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
-    unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
-    unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
-    unsigned int (*branch_ct_fp)[4 - 1][2],
-    unsigned int (*branch_ct_class0_hp)[2],
-    unsigned int (*branch_ct_hp)[2]);
-
-#endif
--- a/vp8/common/extend.c
+++ /dev/null
@@ -1,169 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "extend.h"
-#include "vpx_mem/vpx_mem.h"
-
-static void copy_and_extend_plane(unsigned char *s, /* source */
-                                  int sp,           /* source pitch */
-                                  unsigned char *d, /* destination */
-                                  int dp,           /* destination pitch */
-                                  int h,            /* height */
-                                  int w,            /* width */
-                                  int et,           /* extend top border */
-                                  int el,           /* extend left border */
-                                  int eb,           /* extend bottom border */
-                                  int er) {         /* extend right border */
-  int i;
-  unsigned char *src_ptr1, *src_ptr2;
-  unsigned char *dest_ptr1, *dest_ptr2;
-  int linesize;
-
-  /* copy the left and right most columns out */
-  src_ptr1 = s;
-  src_ptr2 = s + w - 1;
-  dest_ptr1 = d - el;
-  dest_ptr2 = d + w;
-
-  for (i = 0; i < h; i++) {
-    vpx_memset(dest_ptr1, src_ptr1[0], el);
-    vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
-    vpx_memset(dest_ptr2, src_ptr2[0], er);
-    src_ptr1  += sp;
-    src_ptr2  += sp;
-    dest_ptr1 += dp;
-    dest_ptr2 += dp;
-  }
-
-  /* Now copy the top and bottom lines into each line of the respective
-   * borders
-   */
-  src_ptr1 = d - el;
-  src_ptr2 = d + dp * (h - 1) - el;
-  dest_ptr1 = d + dp * (-et) - el;
-  dest_ptr2 = d + dp * (h) - el;
-  linesize = el + er + w;
-
-  for (i = 0; i < et; i++) {
-    vpx_memcpy(dest_ptr1, src_ptr1, linesize);
-    dest_ptr1 += dp;
-  }
-
-  for (i = 0; i < eb; i++) {
-    vpx_memcpy(dest_ptr2, src_ptr2, linesize);
-    dest_ptr2 += dp;
-  }
-}
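A minimal usage sketch for the helper above (buffer sizes hypothetical): extend a 4x4 source by a one-pixel border on every side into a padded 6x6 destination.

unsigned char src[4 * 4];  /* 4x4 source plane, pitch 4 */
unsigned char dst[6 * 6];  /* 6x6 padded destination, pitch 6 */
/* ... fill src ... */
copy_and_extend_plane(src, 4,
                      dst + 6 + 1, 6,  /* d points at the interior origin */
                      4, 4,            /* height, width */
                      1, 1, 1, 1);     /* et, el, eb, er */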
-
-void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
-                               YV12_BUFFER_CONFIG *dst) {
-  int et = dst->border;
-  int el = dst->border;
-  int eb = dst->border + dst->y_height - src->y_height;
-  int er = dst->border + dst->y_width - src->y_width;
-
-  copy_and_extend_plane(src->y_buffer, src->y_stride,
-                        dst->y_buffer, dst->y_stride,
-                        src->y_height, src->y_width,
-                        et, el, eb, er);
-
-  et = dst->border >> 1;
-  el = dst->border >> 1;
-  eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
-  er = (dst->border >> 1) + dst->uv_width - src->uv_width;
-
-  copy_and_extend_plane(src->u_buffer, src->uv_stride,
-                        dst->u_buffer, dst->uv_stride,
-                        src->uv_height, src->uv_width,
-                        et, el, eb, er);
-
-  copy_and_extend_plane(src->v_buffer, src->uv_stride,
-                        dst->v_buffer, dst->uv_stride,
-                        src->uv_height, src->uv_width,
-                        et, el, eb, er);
-}
-
-void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
-                                         YV12_BUFFER_CONFIG *dst,
-                                         int srcy, int srcx,
-                                         int srch, int srcw) {
-  int et = dst->border;
-  int el = dst->border;
-  int eb = dst->border + dst->y_height - src->y_height;
-  int er = dst->border + dst->y_width - src->y_width;
-  int src_y_offset = srcy * src->y_stride + srcx;
-  int dst_y_offset = srcy * dst->y_stride + srcx;
-  int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
-  int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
-
-  // If the side is not touching the frame boundary then don't extend.
-  if (srcy)
-    et = 0;
-  if (srcx)
-    el = 0;
-  if (srcy + srch != src->y_height)
-    eb = 0;
-  if (srcx + srcw != src->y_width)
-    er = 0;
-
-  copy_and_extend_plane(src->y_buffer + src_y_offset,
-                        src->y_stride,
-                        dst->y_buffer + dst_y_offset,
-                        dst->y_stride,
-                        srch, srcw,
-                        et, el, eb, er);
-
-  et = (et + 1) >> 1;
-  el = (el + 1) >> 1;
-  eb = (eb + 1) >> 1;
-  er = (er + 1) >> 1;
-  srch = (srch + 1) >> 1;
-  srcw = (srcw + 1) >> 1;
-
-  copy_and_extend_plane(src->u_buffer + src_uv_offset,
-                        src->uv_stride,
-                        dst->u_buffer + dst_uv_offset,
-                        dst->uv_stride,
-                        srch, srcw,
-                        et, el, eb, er);
-
-  copy_and_extend_plane(src->v_buffer + src_uv_offset,
-                        src->uv_stride,
-                        dst->v_buffer + dst_uv_offset,
-                        dst->uv_stride,
-                        srch, srcw,
-                        et, el, eb, er);
-}
-
-/* note: the extension is only for the last row, for intra prediction purposes */
-void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
-                       unsigned char *UPtr, unsigned char *VPtr) {
-  int i;
-
-  YPtr += ybf->y_stride * 14;
-  UPtr += ybf->uv_stride * 6;
-  VPtr += ybf->uv_stride * 6;
-
-  for (i = 0; i < 4; i++) {
-    YPtr[i] = YPtr[-1];
-    UPtr[i] = UPtr[-1];
-    VPtr[i] = VPtr[-1];
-  }
-
-  YPtr += ybf->y_stride;
-  UPtr += ybf->uv_stride;
-  VPtr += ybf->uv_stride;
-
-  for (i = 0; i < 4; i++) {
-    YPtr[i] = YPtr[-1];
-    UPtr[i] = UPtr[-1];
-    VPtr[i] = VPtr[-1];
-  }
-}
--- a/vp8/common/extend.h
+++ /dev/null
@@ -1,27 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_EXTEND_H
-#define __INC_EXTEND_H
-
-#include "vpx_scale/yv12config.h"
-
-void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
-                       unsigned char *UPtr, unsigned char *VPtr);
-
-void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
-                               YV12_BUFFER_CONFIG *dst);
-
-void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
-                                         YV12_BUFFER_CONFIG *dst,
-                                         int srcy, int srcx,
-                                         int srch, int srcw);
-
-#endif  // __INC_EXTEND_H
--- a/vp8/common/filter.c
+++ /dev/null
@@ -1,1159 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdlib.h>
-#include "filter.h"
-#include "vpx_ports/mem.h"
-#include "vpx_rtcd.h"
-
-DECLARE_ALIGNED(16, const short, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = {
-  { 128,   0 },
-  { 120,   8 },
-  { 112,  16 },
-  { 104,  24 },
-  {  96,  32 },
-  {  88,  40 },
-  {  80,  48 },
-  {  72,  56 },
-  {  64,  64 },
-  {  56,  72 },
-  {  48,  80 },
-  {  40,  88 },
-  {  32,  96 },
-  {  24, 104 },
-  {  16, 112 },
-  {   8, 120 }
-};
-
-#define FILTER_ALPHA       0
-#define FILTER_ALPHA_SHARP 1
-DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
-#if FILTER_ALPHA == 0
-  /* Lagrangian interpolation filter */
-  { 0,   0,   0, 128,   0,   0,   0,  0},
-  { 0,   1,  -5, 126,   8,  -3,   1,  0},
-  { -1,   3, -10, 122,  18,  -6,   2,  0},
-  { -1,   4, -13, 118,  27,  -9,   3, -1},
-  { -1,   4, -16, 112,  37, -11,   4, -1},
-  { -1,   5, -18, 105,  48, -14,   4, -1},
-  { -1,   5, -19,  97,  58, -16,   5, -1},
-  { -1,   6, -19,  88,  68, -18,   5, -1},
-  { -1,   6, -19,  78,  78, -19,   6, -1},
-  { -1,   5, -18,  68,  88, -19,   6, -1},
-  { -1,   5, -16,  58,  97, -19,   5, -1},
-  { -1,   4, -14,  48, 105, -18,   5, -1},
-  { -1,   4, -11,  37, 112, -16,   4, -1},
-  { -1,   3,  -9,  27, 118, -13,   4, -1},
-  { 0,   2,  -6,  18, 122, -10,   3, -1},
-  { 0,   1,  -3,   8, 126,  -5,   1,  0}
-#elif FILTER_ALPHA == 50
-  /* Generated using MATLAB:
-   * alpha = 0.5;
-   * b=intfilt(8,4,alpha);
-   * bi=round(128*b);
-   * ba=flipud(reshape([bi 0], 8, 8));
-   * disp(num2str(ba, '%d,'))
-   */
-  { 0,   0,   0, 128,   0,   0,   0,  0},
-  { 0,   1,  -5, 126,   8,  -3,   1,  0},
-  { 0,   2, -10, 122,  18,  -6,   2,  0},
-  { -1,   3, -13, 118,  27,  -9,   3,  0},
-  { -1,   4, -16, 112,  37, -11,   3,  0},
-  { -1,   5, -17, 104,  48, -14,   4, -1},
-  { -1,   5, -18,  96,  58, -16,   5, -1},
-  { -1,   5, -19,  88,  68, -17,   5, -1},
-  { -1,   5, -18,  78,  78, -18,   5, -1},
-  { -1,   5, -17,  68,  88, -19,   5, -1},
-  { -1,   5, -16,  58,  96, -18,   5, -1},
-  { -1,   4, -14,  48, 104, -17,   5, -1},
-  { 0,   3, -11,  37, 112, -16,   4, -1},
-  { 0,   3,  -9,  27, 118, -13,   3, -1},
-  { 0,   2,  -6,  18, 122, -10,   2,  0},
-  { 0,   1,  -3,   8, 126,  -5,   1,  0}
-#endif  /* FILTER_ALPHA */
-};
-
-DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = {
-#if FILTER_ALPHA_SHARP == 1
-  /* dct based filter */
-  {0,   0,   0, 128,   0,   0,   0, 0},
-  {-1,   3,  -7, 127,   8,  -3,   1, 0},
-  {-2,   5, -13, 125,  17,  -6,   3, -1},
-  {-3,   7, -17, 121,  27, -10,   5, -2},
-  {-4,   9, -20, 115,  37, -13,   6, -2},
-  {-4,  10, -23, 108,  48, -16,   8, -3},
-  {-4,  10, -24, 100,  59, -19,   9, -3},
-  {-4,  11, -24,  90,  70, -21,  10, -4},
-  {-4,  11, -23,  80,  80, -23,  11, -4},
-  {-4,  10, -21,  70,  90, -24,  11, -4},
-  {-3,   9, -19,  59, 100, -24,  10, -4},
-  {-3,   8, -16,  48, 108, -23,  10, -4},
-  {-2,   6, -13,  37, 115, -20,   9, -4},
-  {-2,   5, -10,  27, 121, -17,   7, -3},
-  {-1,   3,  -6,  17, 125, -13,   5, -2},
-  {0,   1,  -3,   8, 127,  -7,   3, -1}
-#elif FILTER_ALPHA_SHARP == 75
-  /* alpha = 0.75 */
-  {0,   0,   0, 128,   0,   0,   0, 0},
-  {-1,   2,  -6, 126,   9,  -3,   2, -1},
-  {-1,   4, -11, 123,  18,  -7,   3, -1},
-  {-2,   6, -16, 119,  28, -10,   5, -2},
-  {-2,   7, -19, 113,  38, -13,   6, -2},
-  {-3,   8, -21, 106,  49, -16,   7, -2},
-  {-3,   9, -22,  99,  59, -19,   8, -3},
-  {-3,   9, -23,  90,  70, -21,   9, -3},
-  {-3,   9, -22,  80,  80, -22,   9, -3},
-  {-3,   9, -21,  70,  90, -23,   9, -3},
-  {-3,   8, -19,  59,  99, -22,   9, -3},
-  {-2,   7, -16,  49, 106, -21,   8, -3},
-  {-2,   6, -13,  38, 113, -19,   7, -2},
-  {-2,   5, -10,  28, 119, -16,   6, -2},
-  {-1,   3,  -7,  18, 123, -11,   4, -1},
-  {-1,   2,  -3,   9, 126,  -6,   2, -1}
-#endif  /* FILTER_ALPHA_SHARP */
-};
-
-DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = {
-  {0,   0, 128,   0,   0, 0},
-  {1,  -5, 125,   8,  -2, 1},
-  {1,  -8, 122,  17,  -5, 1},
-  {2, -11, 116,  27,  -8, 2},
-  {3, -14, 110,  37, -10, 2},
-  {3, -15, 103,  47, -12, 2},
-  {3, -16,  95,  57, -14, 3},
-  {3, -16,  86,  67, -15, 3},
-  {3, -16,  77,  77, -16, 3},
-  {3, -15,  67,  86, -16, 3},
-  {3, -14,  57,  95, -16, 3},
-  {2, -12,  47, 103, -15, 3},
-  {2, -10,  37, 110, -14, 3},
-  {2,  -8,  27, 116, -11, 2},
-  {1,  -5,  17, 122,  -8, 1},
-  {1,  -2,   8, 125,  -5, 1}
-};
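One invariant holds across all three tap tables above: every row sums to 128 (VP9_FILTER_WEIGHT), so each sub-pel filter has unity DC gain after the VP9_FILTER_SHIFT normalization. A quick self-check sketch for the 6-tap table:

int i, j, sum;
for (i = 0; i < SUBPEL_SHIFTS; i++) {
  for (sum = 0, j = 0; j < 6; j++)
    sum += vp9_sub_pel_filters_6[i][j];
  assert(sum == 128);  /* unity DC gain */
}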
-
-static void filter_block2d_first_pass_6(unsigned char *src_ptr,
-                                        int *output_ptr,
-                                        unsigned int src_pixels_per_line,
-                                        unsigned int pixel_step,
-                                        unsigned int output_height,
-                                        unsigned int output_width,
-                                        const short *vp9_filter) {
-  unsigned int i, j;
-  int  Temp;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
-             ((int)src_ptr[0]                    * vp9_filter[2]) +
-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
-             ((int)src_ptr[2 * pixel_step]       * vp9_filter[4]) +
-             ((int)src_ptr[3 * pixel_step]       * vp9_filter[5]) +
-             (VP9_FILTER_WEIGHT >> 1);      /* Rounding */
-
-      /* Normalize back to 0-255 */
-      Temp = Temp >> VP9_FILTER_SHIFT;
-
-      if (Temp < 0)
-        Temp = 0;
-      else if (Temp > 255)
-        Temp = 255;
-
-      output_ptr[j] = Temp;
-      src_ptr++;
-    }
-
-    /* Next row... */
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-static void filter_block2d_second_pass_6(int *src_ptr,
-                                         unsigned char *output_ptr,
-                                         int output_pitch,
-                                         unsigned int src_pixels_per_line,
-                                         unsigned int pixel_step,
-                                         unsigned int output_height,
-                                         unsigned int output_width,
-                                         const short *vp9_filter) {
-  unsigned int i, j;
-  int  Temp;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      /* Apply filter */
-      Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
-             ((int)src_ptr[0]                    * vp9_filter[2]) +
-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
-             ((int)src_ptr[2 * pixel_step]       * vp9_filter[4]) +
-             ((int)src_ptr[3 * pixel_step]       * vp9_filter[5]) +
-             (VP9_FILTER_WEIGHT >> 1);   /* Rounding */
-
-      /* Normalize back to 0-255 */
-      Temp = Temp >> VP9_FILTER_SHIFT;
-
-      if (Temp < 0)
-        Temp = 0;
-      else if (Temp > 255)
-        Temp = 255;
-
-      output_ptr[j] = (unsigned char)Temp;
-      src_ptr++;
-    }
-
-    /* Start next row */
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_pitch;
-  }
-}
-
-/*
- * The only functional difference between filter_block2d_second_pass_6()
- * and this function is that filter_block2d_second_pass_6() does a sixtap
- * filter on the input and stores it in the output. This function
- * (filter_block2d_second_pass_avg_6()) does the same sixtap filter on the
- * input, and then averages that with the content already present in the
- * output ((filter_result + dest + 1) >> 1) and stores that in the output.
- */
-static void filter_block2d_second_pass_avg_6(int *src_ptr,
-                                             unsigned char *output_ptr,
-                                             int output_pitch,
-                                             unsigned int src_pixels_per_line,
-                                             unsigned int pixel_step,
-                                             unsigned int output_height,
-                                             unsigned int output_width,
-                                             const short *vp9_filter) {
-  unsigned int i, j;
-  int  Temp;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      /* Apply filter */
-      Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
-             ((int)src_ptr[0]                    * vp9_filter[2]) +
-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
-             ((int)src_ptr[2 * pixel_step]       * vp9_filter[4]) +
-             ((int)src_ptr[3 * pixel_step]       * vp9_filter[5]) +
-             (VP9_FILTER_WEIGHT >> 1);   /* Rounding */
-
-      /* Normalize back to 0-255 */
-      Temp = Temp >> VP9_FILTER_SHIFT;
-
-      if (Temp < 0)
-        Temp = 0;
-      else if (Temp > 255)
-        Temp = 255;
-
-      output_ptr[j] = (unsigned char)((output_ptr[j] + Temp + 1) >> 1);
-      src_ptr++;
-    }
-
-    /* Start next row */
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_pitch;
-  }
-}
-
-#define Interp_Extend 3
-static void filter_block2d_6(unsigned char  *src_ptr,
-                             unsigned char  *output_ptr,
-                             unsigned int src_pixels_per_line,
-                             int output_pitch,
-                             const short  *HFilter,
-                             const short  *VFilter) {
-  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
-                              3 + Interp_Extend * 2, 4, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
-}
-
-
-void vp9_sixtap_predict_c(unsigned char  *src_ptr,
-                          int   src_pixels_per_line,
-                          int  xoffset,
-                          int  yoffset,
-                          unsigned char *dst_ptr,
-                          int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
-}
-
-/*
- * The difference between filter_block2d_6() and filter_block2d_avg_6 is
- * that filter_block2d_6() does a 6-tap filter and stores it in the output
- * buffer, whereas filter_block2d_avg_6() does the same 6-tap filter, and
- * then averages that with the content already present in the output
- * ((filter_result + dest + 1) >> 1) and stores that in the output.
- */
-static void filter_block2d_avg_6(unsigned char  *src_ptr,
-                                 unsigned char  *output_ptr,
-                                 unsigned int src_pixels_per_line,
-                                 int output_pitch,
-                                 const short  *HFilter,
-                                 const short  *VFilter) {
-  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line),
-                              FData, src_pixels_per_line, 1,
-                              3 + Interp_Extend * 2, 4, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_avg_6(FData + 4 * (Interp_Extend - 1), output_ptr,
-                                   output_pitch, 4, 4, 4, 4, VFilter);
-}
-
-void vp9_sixtap_predict_avg_c(unsigned char *src_ptr,
-                              int src_pixels_per_line,
-                              int xoffset,
-                              int yoffset,
-                              unsigned char *dst_ptr,
-                              int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line,
-                       dst_pitch, HFilter, VFilter);
-}
-
-void vp9_sixtap_predict8x8_c(unsigned char *src_ptr,
-                             int src_pixels_per_line,
-                             int xoffset,
-                             int yoffset,
-                             unsigned char *dst_ptr,
-                             int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
-                              7 + Interp_Extend * 2, 8, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
-}
-
-void vp9_sixtap_predict_avg8x8_c(unsigned char *src_ptr,
-                                 int src_pixels_per_line,
-                                 int xoffset,
-                                 int yoffset,
-                                 unsigned char *dst_ptr,
-                                 int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
-                              7 + Interp_Extend * 2, 8, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
-}
-
-void vp9_sixtap_predict8x4_c(unsigned char *src_ptr,
-                             int src_pixels_per_line,
-                             int xoffset,
-                             int yoffset,
-                             unsigned char *dst_ptr,
-                             int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-  int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
-                              3 + Interp_Extend * 2, 8, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
-}
-
-void vp9_sixtap_predict16x16_c(unsigned char *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               unsigned char *dst_ptr,
-                               int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
-                              15 + Interp_Extend * 2, 16, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
-}
-
-void vp9_sixtap_predict_avg16x16_c(unsigned char *src_ptr,
-                                   int src_pixels_per_line,
-                                   int xoffset,
-                                   int yoffset,
-                                   unsigned char *dst_ptr,
-                                   int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */
-
-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
-                              src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);
-
-  /* then filter vertically... */
-  filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch,
-                                   16, 16, 16, 16, VFilter);
-}
-
-typedef enum {
-  VPX_FILTER_4x4 = 0,
-  VPX_FILTER_8x8 = 1,
-  VPX_FILTER_8x4 = 2,
-  VPX_FILTER_16x16 = 3,
-} filter_size_t;
-
-static const unsigned int filter_size_to_wh[][2] = {
-  {4, 4},
-  {8, 8},
-  {8, 4},
-  {16, 16},
-};
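-/* Indexed by filter_size_t: element [0] is the output width and [1] the
- * output height, so e.g. filter_size_to_wh[VPX_FILTER_8x4] is {8, 4}. */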
-
-static const unsigned int filter_max_height = 16;
-static const unsigned int filter_max_width = 16;
-
-static void filter_block2d_8_c(const unsigned char *src_ptr,
-                               const unsigned int   src_stride,
-                               const short *HFilter,
-                               const short *VFilter,
-                               const filter_size_t filter_size,
-                               unsigned char *dst_ptr,
-                               unsigned int   dst_stride) {
-  const unsigned int output_width = filter_size_to_wh[filter_size][0];
-  const unsigned int output_height = filter_size_to_wh[filter_size][1];
-
-  // Between passes, we use an intermediate buffer whose height is extended to
-  // have enough horizontally filtered values as input for the vertical pass.
-  // This buffer is allocated to be big enough for the largest block type we
-  // support.
-  const int kInterp_Extend = 4;
-  const unsigned int intermediate_height =
-    (kInterp_Extend - 1) +     output_height + kInterp_Extend;
-  const unsigned int max_intermediate_height =
-    (kInterp_Extend - 1) + filter_max_height + kInterp_Extend;
-#ifdef _MSC_VER
-  // MSVC does not support C99 style declaration
-  unsigned char intermediate_buffer[23 * 16];
-#else
-  unsigned char intermediate_buffer[max_intermediate_height * filter_max_width];
-#endif
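-  /* For the largest supported block (16x16) with kInterp_Extend == 4,
-   * max_intermediate_height is (4 - 1) + 16 + 4 == 23, which is why the
-   * fixed-size MSVC fallback buffer above is 23 * 16 bytes. */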
-  const int intermediate_next_stride = 1 - intermediate_height * output_width;
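-  /* The intermediate buffer is written transposed: the horizontal pass
-   * advances output_ptr by intermediate_height per output pixel, then this
-   * (negative) stride steps it back to the next transposed row (+1 net). */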
-
-  // Horizontal pass (src -> transposed intermediate).
-  {
-    unsigned char *output_ptr = intermediate_buffer;
-    const int src_next_row_stride = src_stride - output_width;
-    unsigned int i, j;
-    src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-    for (i = 0; i < intermediate_height; i++) {
-      for (j = 0; j < output_width; j++) {
-        // Apply filter...
-        int temp = ((int)src_ptr[0] * HFilter[0]) +
-                   ((int)src_ptr[1] * HFilter[1]) +
-                   ((int)src_ptr[2] * HFilter[2]) +
-                   ((int)src_ptr[3] * HFilter[3]) +
-                   ((int)src_ptr[4] * HFilter[4]) +
-                   ((int)src_ptr[5] * HFilter[5]) +
-                   ((int)src_ptr[6] * HFilter[6]) +
-                   ((int)src_ptr[7] * HFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1); // Rounding
-
-        // Normalize back to 0-255...
-        temp >>= VP9_FILTER_SHIFT;
-        if (temp < 0) {
-          temp = 0;
-        } else if (temp > 255) {
-          temp = 255;
-        }
-        src_ptr++;
-        *output_ptr = temp;
-        output_ptr += intermediate_height;
-      }
-      src_ptr += src_next_row_stride;
-      output_ptr += intermediate_next_stride;
-    }
-  }
-
-  // Vertical pass (transposed intermediate -> dst).
-  {
-    unsigned char *src_ptr = intermediate_buffer;
-    const int dst_next_row_stride = dst_stride - output_width;
-    unsigned int i, j;
-    for (i = 0; i < output_height; i++) {
-      for (j = 0; j < output_width; j++) {
-        // Apply filter...
-        int temp = ((int)src_ptr[0] * VFilter[0]) +
-                   ((int)src_ptr[1] * VFilter[1]) +
-                   ((int)src_ptr[2] * VFilter[2]) +
-                   ((int)src_ptr[3] * VFilter[3]) +
-                   ((int)src_ptr[4] * VFilter[4]) +
-                   ((int)src_ptr[5] * VFilter[5]) +
-                   ((int)src_ptr[6] * VFilter[6]) +
-                   ((int)src_ptr[7] * VFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1); // Rounding
-
-        // Normalize back to 0-255...
-        temp >>= VP9_FILTER_SHIFT;
-        if (temp < 0) {
-          temp = 0;
-        } else if (temp > 255) {
-          temp = 255;
-        }
-
-        src_ptr += intermediate_height;
-        *dst_ptr++ = (unsigned char)temp;
-      }
-      src_ptr += intermediate_next_stride;
-      dst_ptr += dst_next_row_stride;
-    }
-  }
-}
-
-void vp9_filter_block2d_4x4_8_c(const unsigned char *src_ptr,
-                                const unsigned int src_stride,
-                                const short *HFilter_aligned16,
-                                const short *VFilter_aligned16,
-                                unsigned char *dst_ptr,
-                                unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride,
-                     HFilter_aligned16, VFilter_aligned16,
-                     VPX_FILTER_4x4, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_8x4_8_c(const unsigned char *src_ptr,
-                                const unsigned int src_stride,
-                                const short *HFilter_aligned16,
-                                const short *VFilter_aligned16,
-                                unsigned char *dst_ptr,
-                                unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride,
-                     HFilter_aligned16, VFilter_aligned16,
-                     VPX_FILTER_8x4, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_8x8_8_c(const unsigned char *src_ptr,
-                                const unsigned int src_stride,
-                                const short *HFilter_aligned16,
-                                const short *VFilter_aligned16,
-                                unsigned char *dst_ptr,
-                                unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride,
-                     HFilter_aligned16, VFilter_aligned16,
-                     VPX_FILTER_8x8, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_16x16_8_c(const unsigned char *src_ptr,
-                                  const unsigned int src_stride,
-                                  const short *HFilter_aligned16,
-                                  const short *VFilter_aligned16,
-                                  unsigned char *dst_ptr,
-                                  unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride,
-                     HFilter_aligned16, VFilter_aligned16,
-                     VPX_FILTER_16x16, dst_ptr, dst_stride);
-}
-
-static void block2d_average_c(unsigned char *src,
-                              unsigned int   src_stride,
-                              unsigned char *output_ptr,
-                              unsigned int output_stride,
-                              const filter_size_t filter_size) {
-  const unsigned int output_width = filter_size_to_wh[filter_size][0];
-  const unsigned int output_height = filter_size_to_wh[filter_size][1];
-
-  unsigned int i, j;
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;
-    }
-    output_ptr += output_stride;
-  }
-}
-
-#define block2d_average block2d_average_c
-
-void vp9_eighttap_predict_c(unsigned char  *src_ptr,
-                            int   src_pixels_per_line,
-                            int  xoffset,
-                            int  yoffset,
-                            unsigned char *dst_ptr,
-                            int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-
-  HFilter = vp9_sub_pel_filters_8[xoffset];
-  VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg4x4_c(unsigned char  *src_ptr,
-                                   int   src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
-                                   int dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
-  unsigned char tmp[4 * 4];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           tmp, 4);
-  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
-}
-
-void vp9_eighttap_predict_sharp_c(unsigned char  *src_ptr,
-                                  int   src_pixels_per_line,
-                                  int  xoffset,
-                                  int  yoffset,
-                                  unsigned char *dst_ptr,
-                                  int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
-
-  HFilter = vp9_sub_pel_filters_8s[xoffset];
-  VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg4x4_sharp_c(unsigned char  *src_ptr,
-                                         int   src_pixels_per_line,
-                                         int  xoffset,
-                                         int  yoffset,
-                                         unsigned char *dst_ptr,
-                                         int dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
-  unsigned char tmp[4 * 4];
-
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           tmp, 4);
-  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
-}
-
-void vp9_eighttap_predict8x8_c(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x8_sharp_c(unsigned char  *src_ptr,
-                                     int  src_pixels_per_line,
-                                     int  xoffset,
-                                     int  yoffset,
-                                     unsigned char *dst_ptr,
-                                     int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg8x8_c(unsigned char  *src_ptr,
-                                   int  src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
-                                   int  dst_pitch) {
-  unsigned char tmp[8 * 8];
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           tmp, 8);
-  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
-}
-
-void vp9_eighttap_predict_avg8x8_sharp_c(unsigned char  *src_ptr,
-                                         int  src_pixels_per_line,
-                                         int  xoffset,
-                                         int  yoffset,
-                                         unsigned char *dst_ptr,
-                                         int  dst_pitch) {
-  unsigned char tmp[8 * 8];
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           tmp, 8);
-  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
-}
-
-void vp9_eighttap_predict8x4_c(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x4_sharp_c(unsigned char  *src_ptr,
-                                     int  src_pixels_per_line,
-                                     int  xoffset,
-                                     int  yoffset,
-                                     unsigned char *dst_ptr,
-                                     int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict16x16_c(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
-                             HFilter, VFilter,
-                             dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict16x16_sharp_c(unsigned char  *src_ptr,
-                                       int  src_pixels_per_line,
-                                       int  xoffset,
-                                       int  yoffset,
-                                       unsigned char *dst_ptr,
-                                       int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
-                             HFilter, VFilter,
-                             dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg16x16_c(unsigned char  *src_ptr,
-                                     int  src_pixels_per_line,
-                                     int  xoffset,
-                                     int  yoffset,
-                                     unsigned char *dst_ptr,
-                                     int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16);
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
-                             HFilter, VFilter,
-                             tmp, 16);
-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
-}
-
-void vp9_eighttap_predict_avg16x16_sharp_c(unsigned char  *src_ptr,
-                                           int  src_pixels_per_line,
-                                           int  xoffset,
-                                           int  yoffset,
-                                           unsigned char *dst_ptr,
-                                           int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16);
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
-                             HFilter, VFilter,
-                             tmp, 16);
-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_first_pass
- *
- *  INPUTS        : UINT8  *src_ptr    : Pointer to source block.
- *                  UINT32  src_stride : Stride of source block.
- *                  UINT32  height     : Block height.
- *                  UINT32  width      : Block width.
- *                  INT16  *vp9_filter : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : UINT16 *dst_ptr    : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
- *                  in the horizontal direction to produce the filtered output
- *                  block. Used to implement first-pass of 2-D separable filter.
- *
- *  SPECIAL NOTES : Produces UINT16 output to retain precision for next pass.
- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
- *
- ****************************************************************************/
-static void filter_block2d_bil_first_pass(unsigned char  *src_ptr,
-                                          unsigned short *dst_ptr,
-                                          unsigned int    src_stride,
-                                          unsigned int    height,
-                                          unsigned int    width,
-                                          const short    *vp9_filter) {
-  unsigned int i, j;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      /* Apply bilinear filter */
-      dst_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) +
-                    ((int)src_ptr[1] * vp9_filter[1]) +
-                    (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
-      src_ptr++;
-    }
-
-    /* Next row... */
-    src_ptr += src_stride - width;
-    dst_ptr += width;
-  }
-}
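-/*
- * Illustrative half-pel case (assuming the half-pel entry of the bilinear
- * table is {64, 64}, taps summing to VP9_FILTER_WEIGHT == 128): the
- * expression above reduces to
- *   dst = (64 * a + 64 * b + 64) >> 7 == (a + b + 1) >> 1,
- * i.e. a rounded average of the two neighbouring source pixels.
- */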
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_second_pass
- *
- *  INPUTS        : UINT16 *src_ptr    : Pointer to source block.
- *                  UINT32  dst_pitch  : Destination block pitch.
- *                  UINT32  height     : Block height.
- *                  UINT32  width      : Block width.
- *                  INT16  *vp9_filter : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : UINT8  *dst_ptr    : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
- *                  in the vertical direction to produce the filtered output
- *                  block. Used to implement second-pass of 2-D separable filter.
- *
- *  SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass.
- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
- *
- ****************************************************************************/
-static void filter_block2d_bil_second_pass(unsigned short *src_ptr,
-                                           unsigned char  *dst_ptr,
-                                           int             dst_pitch,
-                                           unsigned int    height,
-                                           unsigned int    width,
-                                           const short    *vp9_filter) {
-  unsigned int  i, j;
-  int  Temp;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      /* Apply filter */
-      Temp = ((int)src_ptr[0]     * vp9_filter[0]) +
-             ((int)src_ptr[width] * vp9_filter[1]) +
-             (VP9_FILTER_WEIGHT / 2);
-      dst_ptr[j] = (unsigned char)(Temp >> VP9_FILTER_SHIFT);
-      src_ptr++;
-    }
-
-    /* Next row... */
-    dst_ptr += dst_pitch;
-  }
-}
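-/* Note: in the vertical pass src_ptr[width] is the sample one row below
- * src_ptr[0], because the first pass stored its output in rows of exactly
- * `width` entries. */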
-
-/*
- * As with the earlier filter_block2d_second_pass_avg(), the functional
- * difference between filter_block2d_bil_second_pass() and
- * filter_block2d_bil_second_pass_avg() is that the former stores the
- * bilinear filter result directly in the output, whereas the latter
- * averages the filter result with the value already present in the output
- * ((filter_result + dest + 1) >> 1) and stores that back into the output.
- */
-static void filter_block2d_bil_second_pass_avg(unsigned short *src_ptr,
-                                               unsigned char  *dst_ptr,
-                                               int             dst_pitch,
-                                               unsigned int    height,
-                                               unsigned int    width,
-                                               const short    *vp9_filter) {
-  unsigned int  i, j;
-  int  Temp;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      /* Apply filter */
-      Temp = ((int)src_ptr[0]     * vp9_filter[0]) +
-             ((int)src_ptr[width] * vp9_filter[1]) +
-             (VP9_FILTER_WEIGHT / 2);
-      dst_ptr[j] = (unsigned char)(((Temp >> VP9_FILTER_SHIFT) + dst_ptr[j] + 1) >> 1);
-      src_ptr++;
-    }
-
-    /* Next row... */
-    dst_ptr += dst_pitch;
-  }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil
- *
- *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32  src_pitch        : Stride of source block.
- *                  UINT32  dst_pitch        : Stride of destination block.
- *                  INT16  *HFilter          : Array of 2 horizontal filter taps.
- *                  INT16  *VFilter          : Array of 2 vertical filter taps.
- *                  INT32  Width             : Block width
- *                  INT32  Height            : Block height
- *
- *  OUTPUTS       : UINT8  *dst_ptr       : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : 2-D filters an input block by applying a 2-tap
- *                  bi-linear filter horizontally followed by a 2-tap
- *                  bi-linear filter vertically on the result.
- *
- *  SPECIAL NOTES : The largest block size that can be handled here is 16x16.
- *
- ****************************************************************************/
-static void filter_block2d_bil(unsigned char *src_ptr,
-                               unsigned char *dst_ptr,
-                               unsigned int   src_pitch,
-                               unsigned int   dst_pitch,
-                               const short   *HFilter,
-                               const short   *VFilter,
-                               int            Width,
-                               int            Height) {
-  unsigned short FData[17 * 16];  /* Temp data buffer used in filtering */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
-
-  /* then 1-D vertically... */
-  filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
-}
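-/* The 2-tap vertical pass reads one row beyond the block, so the first pass
- * produces Height + 1 rows; for the largest 16x16 block that is 17 rows of
- * 16 samples, matching the FData[17 * 16] buffer above. */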
-
-static void filter_block2d_bil_avg(unsigned char *src_ptr,
-                                   unsigned char *dst_ptr,
-                                   unsigned int   src_pitch,
-                                   unsigned int   dst_pitch,
-                                   const short   *HFilter,
-                                   const short   *VFilter,
-                                   int            Width,
-                                   int            Height) {
-  unsigned short FData[17 * 16];  /* Temp data buffer used in filtering */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
-
-  /* then 1-D vertically... */
-  filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
-}
-
-void vp9_bilinear_predict4x4_c(unsigned char  *src_ptr,
-                               int   src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
-}
-
-void vp9_bilinear_predict_avg4x4_c(unsigned char  *src_ptr,
-                                   int   src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
-                                   int dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
-                         dst_pitch, HFilter, VFilter, 4, 4);
-}
-
-void vp9_bilinear_predict8x8_c(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
-
-}
-
-void vp9_bilinear_predict_avg8x8_c(unsigned char  *src_ptr,
-                                   int  src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
-                                   int  dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
-                         dst_pitch, HFilter, VFilter, 8, 8);
-}
-
-void vp9_bilinear_predict8x4_c(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
-
-}
-
-void vp9_bilinear_predict16x16_c(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int  dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
-}
-
-void vp9_bilinear_predict_avg16x16_c(unsigned char  *src_ptr,
-                                     int  src_pixels_per_line,
-                                     int  xoffset,
-                                     int  yoffset,
-                                     unsigned char *dst_ptr,
-                                     int  dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
-                         dst_pitch, HFilter, VFilter, 16, 16);
-}
--- a/vp8/common/filter.h
+++ /dev/null
@@ -1,28 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef FILTER_H
-#define FILTER_H
-
-#include "vpx_config.h"
-#include "vpx_scale/yv12config.h"
-
-#define BLOCK_HEIGHT_WIDTH 4
-#define VP9_FILTER_WEIGHT 128
-#define VP9_FILTER_SHIFT  7
-
-#define SUBPEL_SHIFTS 16
-
-extern const short vp9_bilinear_filters[SUBPEL_SHIFTS][2];
-extern const short vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6];
-extern const short vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
-extern const short vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
-
-#endif // FILTER_H
--- a/vp8/common/findnearmv.c
+++ /dev/null
@@ -1,327 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "findnearmv.h"
-#include "vp8/common/sadmxn.h"
-#include <limits.h>
-
-const unsigned char vp9_mbsplit_offset[4][16] = {
-  { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-  { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-  { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
-};
-
-static void lower_mv_precision(int_mv *mv, int usehp) {
-  if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) {
-    if (mv->as_mv.row & 1)
-      mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1);
-    if (mv->as_mv.col & 1)
-      mv->as_mv.col += (mv->as_mv.col > 0 ? -1 : 1);
-  }
-}
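-/* Example: with high precision disabled, a row component of 5 becomes 4 and
- * -5 becomes -4, i.e. the 1/8th-pel bit is truncated towards zero. */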
-
-/* Predict motion vectors using those from already-decoded nearby blocks.
-   Note that we only consider one 4x4 subblock from each candidate 16x16
-   macroblock.   */
-
-void vp9_find_near_mvs(MACROBLOCKD *xd,
-                       const MODE_INFO *here,
-                       const MODE_INFO *lf_here,
-                       int_mv *nearest,
-                       int_mv *nearby,
-                       int_mv *best_mv,
-                       int cnt[4],
-                       int refframe,
-                       int *ref_frame_sign_bias) {
-  const MODE_INFO *above = here - xd->mode_info_stride;
-  const MODE_INFO *left = here - 1;
-  const MODE_INFO *aboveleft = above - 1;
-  const MODE_INFO *third = NULL;
-  int_mv            near_mvs[4];
-  int_mv           *mv = near_mvs;
-  int             *cntx = cnt;
-  enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
-
-  /* Zero accumulators */
-  mv[0].as_int = mv[1].as_int = mv[2].as_int = 0;
-  cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
-
-  /* Process above */
-  if (above->mbmi.ref_frame != INTRA_FRAME) {
-    if (above->mbmi.mv[0].as_int) {
-      ++mv;
-      mv->as_int = above->mbmi.mv[0].as_int;
-      mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame],
-              refframe, mv, ref_frame_sign_bias);
-      ++cntx;
-    }
-    *cntx += 2;
-  }
-
-  /* Process left */
-  if (left->mbmi.ref_frame != INTRA_FRAME) {
-    if (left->mbmi.mv[0].as_int) {
-      int_mv this_mv;
-      this_mv.as_int = left->mbmi.mv[0].as_int;
-      mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame],
-              refframe, &this_mv, ref_frame_sign_bias);
-
-      if (this_mv.as_int != mv->as_int) {
-        ++mv;
-        mv->as_int = this_mv.as_int;
-        ++cntx;
-      }
-      *cntx += 2;
-    } else
-      cnt[CNT_INTRA] += 2;
-  }
-  /* Process above left or the one from last frame */
-  if (aboveleft->mbmi.ref_frame != INTRA_FRAME ||
-      (lf_here->mbmi.ref_frame == LAST_FRAME && refframe == LAST_FRAME)) {
-    if (aboveleft->mbmi.mv[0].as_int) {
-      third = aboveleft;
-    } else if (lf_here->mbmi.mv[0].as_int) {
-      third = lf_here;
-    }
-    if (third) {
-      int_mv this_mv;
-      this_mv.as_int = third->mbmi.mv[0].as_int;
-      mv_bias(ref_frame_sign_bias[third->mbmi.ref_frame],
-              refframe, &this_mv, ref_frame_sign_bias);
-
-      if (this_mv.as_int != mv->as_int) {
-        ++mv;
-        mv->as_int = this_mv.as_int;
-        ++cntx;
-      }
-      *cntx += 1;
-    } else
-      cnt[CNT_INTRA] += 1;
-  }
-
-  /* If we have three distinct MVs... */
-  if (cnt[CNT_SPLITMV]) {
-    /* See if the third MV can be merged with NEAREST */
-    if (mv->as_int == near_mvs[CNT_NEAREST].as_int)
-      cnt[CNT_NEAREST] += 1;
-  }
-
-  cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
-                      + (left->mbmi.mode == SPLITMV)) * 2
-                     + (
-                       lf_here->mbmi.mode == SPLITMV ||
-                       aboveleft->mbmi.mode == SPLITMV);
-
-  /* Swap near and nearest if necessary */
-  if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
-    int tmp;
-    tmp = cnt[CNT_NEAREST];
-    cnt[CNT_NEAREST] = cnt[CNT_NEAR];
-    cnt[CNT_NEAR] = tmp;
-    tmp = near_mvs[CNT_NEAREST].as_int;
-    near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
-    near_mvs[CNT_NEAR].as_int = tmp;
-  }
-
-  /* Use near_mvs[0] to store the "best" MV */
-  if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA])
-    near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
-
-  /* Set up return values */
-  best_mv->as_int = near_mvs[0].as_int;
-  nearest->as_int = near_mvs[CNT_NEAREST].as_int;
-  nearby->as_int = near_mvs[CNT_NEAR].as_int;
-
-  /* Make sure that the 1/8th-pel bits of the MVs are zero if high precision
-   * is not being used, by truncating the last bit towards 0.
-   */
-  lower_mv_precision(best_mv, xd->allow_high_precision_mv);
-  lower_mv_precision(nearest, xd->allow_high_precision_mv);
-  lower_mv_precision(nearby, xd->allow_high_precision_mv);
-
-  // TODO: move clamp outside findnearmv
-  clamp_mv2(nearest, xd);
-  clamp_mv2(nearby, xd);
-  clamp_mv2(best_mv, xd);
-}
-
-vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
-                           vp9_prob p[VP9_MVREFS - 1],
-                           const int near_mv_ref_ct[4]) {
-  p[0] = pc->fc.vp8_mode_contexts[near_mv_ref_ct[0]][0];
-  p[1] = pc->fc.vp8_mode_contexts[near_mv_ref_ct[1]][1];
-  p[2] = pc->fc.vp8_mode_contexts[near_mv_ref_ct[2]][2];
-  p[3] = pc->fc.vp8_mode_contexts[near_mv_ref_ct[3]][3];
-  return p;
-}
-
-#if CONFIG_NEWBESTREFMV
-#define SP(x) (((x) & 7) << 1)
-unsigned int vp9_sad3x16_c(const unsigned char *src_ptr,
-                           int src_stride,
-                           const unsigned char *ref_ptr,
-                           int ref_stride,
-                           int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 3, 16);
-}
-
-unsigned int vp9_sad16x3_c(const unsigned char *src_ptr,
-                           int src_stride,
-                           const unsigned char *ref_ptr,
-                           int ref_stride,
-                           int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 3);
-}
-
-/* Check a list of motion vectors by SAD score, using a number of rows of
- * pixels above and a number of columns of pixels to the left, to select the
- * one with the best score to use as the reference motion vector.
- */
-void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
-                           unsigned char *ref_y_buffer,
-                           int ref_y_stride,
-                           int_mv *mvlist,
-                           int_mv *best_mv,
-                           int_mv *nearest,
-                           int_mv *near) {
-  int i, j;
-  unsigned char *above_src;
-  unsigned char *left_src;
-  unsigned char *above_ref;
-  unsigned char *left_ref;
-  int score;
-  int sse;
-  int ref_scores[MAX_MV_REFS] = {0};
-  int_mv sorted_mvs[MAX_MV_REFS];
-  int zero_seen = FALSE;
-
-  // Default all to 0,0 if nothing else available
-  best_mv->as_int = nearest->as_int = near->as_int = 0;
-  vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs));
-
-#if CONFIG_SUBPELREFMV
-  above_src = xd->dst.y_buffer - xd->dst.y_stride * 2;
-  left_src  = xd->dst.y_buffer - 2;
-  above_ref = ref_y_buffer - ref_y_stride * 2;
-  left_ref  = ref_y_buffer - 2;
-#else
-  above_src = xd->dst.y_buffer - xd->dst.y_stride * 3;
-  left_src  = xd->dst.y_buffer - 3;
-  above_ref = ref_y_buffer - ref_y_stride * 3;
-  left_ref  = ref_y_buffer - 3;
-#endif
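-  /* With CONFIG_SUBPELREFMV the scoring border is 2 pixels wide and scored
-   * with sub-pel variance (16x2 above, 2x16 left); otherwise it is 3 pixels
-   * wide and scored with whole-pel SAD (16x3 above, 3x16 left). */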
-
-  // Limit the search to the 4 best predicted candidates (rather than all
-  // MAX_MV_REFS entries).
-  for (i = 0; i < 4; ++i) {
-    int_mv this_mv;
-    int offset = 0;
-    int row_offset, col_offset;
-
-    this_mv.as_int = mvlist[i].as_int;
-
-    // If we see a 0,0 vector for a second time we have reached the end of
-    // the list of valid candidate vectors.
-    if (!this_mv.as_int && zero_seen)
-      break;
-
-    zero_seen = zero_seen || !this_mv.as_int;
-
-    clamp_mv(&this_mv,
-             xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24,
-             xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
-             xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
-             xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
-#if CONFIG_SUBPELREFMV
-    row_offset = this_mv.as_mv.row >> 3;
-    col_offset = this_mv.as_mv.col >> 3;
-    offset = ref_y_stride * row_offset + col_offset;
-    score = 0;
-    if (xd->up_available) {
-      vp9_sub_pixel_variance16x2_c(above_ref + offset, ref_y_stride,
-                                   SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                                   above_src, xd->dst.y_stride, &sse);
-      score += sse;
-    }
-    if (xd->left_available) {
-      vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride,
-                                   SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                                   left_src, xd->dst.y_stride, &sse);
-      score += sse;
-    }
-#else
-    row_offset = (this_mv.as_mv.row > 0) ?
-        ((this_mv.as_mv.row + 3) >> 3) : ((this_mv.as_mv.row + 4) >> 3);
-    col_offset = (this_mv.as_mv.col > 0) ?
-        ((this_mv.as_mv.col + 3) >> 3) : ((this_mv.as_mv.col + 4) >> 3);
-    offset = ref_y_stride * row_offset + col_offset;
-    score = 0;
-    if (xd->up_available) {
-      score += vp9_sad16x3(above_src, xd->dst.y_stride,
-                           above_ref + offset, ref_y_stride, INT_MAX);
-    }
-    if (xd->left_available) {
-      score += vp9_sad3x16(left_src, xd->dst.y_stride,
-                           left_ref + offset, ref_y_stride, INT_MAX);
-    }
-#endif
-    // Add the entry to our list and then resort the list on score.
-    ref_scores[i] = score;
-    sorted_mvs[i].as_int = this_mv.as_int;
-    j = i;
-    while (j > 0) {
-      if (ref_scores[j] < ref_scores[j-1]) {
-        ref_scores[j] = ref_scores[j-1];
-        sorted_mvs[j].as_int = sorted_mvs[j-1].as_int;
-        ref_scores[j-1] = score;
-        sorted_mvs[j-1].as_int = this_mv.as_int;
-        j--;
-      } else
-        break;
-    }
-  }
-
-  // Make sure all the candidates are properly clamped etc
-  for (i = 0; i < 4; ++i) {
-    lower_mv_precision(&sorted_mvs[i], xd->allow_high_precision_mv);
-    clamp_mv2(&sorted_mvs[i], xd);
-  }
-
-  // Set the best mv to the first entry in the sorted list
-  best_mv->as_int = sorted_mvs[0].as_int;
-
-  // Provided that there are non-zero vectors available, there will not be
-  // more than one 0,0 entry in the sorted list. The best ref mv is always
-  // set to the first entry (which gave the best result). The nearest is set
-  // to the first non-zero vector if available, and near to the second
-  // non-zero vector if available.
-  // We do not use 0,0 as a nearest or near as 0,0 has its own mode.
-  if (sorted_mvs[0].as_int) {
-    nearest->as_int = sorted_mvs[0].as_int;
-    if (sorted_mvs[1].as_int)
-      near->as_int = sorted_mvs[1].as_int;
-    else
-      near->as_int = sorted_mvs[2].as_int;
-  } else {
-    nearest->as_int = sorted_mvs[1].as_int;
-    near->as_int = sorted_mvs[2].as_int;
-  }
-
-  // Copy back the re-ordered mv list
-  vpx_memcpy(mvlist, sorted_mvs, sizeof(sorted_mvs));
-}
-
-#endif  // CONFIG_NEWBESTREFMV
--- a/vp8/common/findnearmv.h
+++ /dev/null
@@ -1,188 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_FINDNEARMV_H
-#define __INC_FINDNEARMV_H
-
-#include "mv.h"
-#include "blockd.h"
-#include "modecont.h"
-#include "treecoder.h"
-#include "onyxc_int.h"
-
-#if CONFIG_NEWBESTREFMV
-/* Check a list of motion vectors by SAD score, using a number of rows of
- * pixels above and a number of columns of pixels to the left, to select the
- * one with the best score to use as the reference motion vector.
- */
-void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
-                           unsigned char *ref_y_buffer,
-                           int ref_y_stride,
-                           int_mv *mvlist,
-                           int_mv *best_mv,
-                           int_mv *nearest,
-                           int_mv *near);
-#endif
-
-static void mv_bias(int refmb_ref_frame_sign_bias, int refframe,
-                    int_mv *mvp, const int *ref_frame_sign_bias) {
-  MV xmv;
-  xmv = mvp->as_mv;
-
-  if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) {
-    xmv.row *= -1;
-    xmv.col *= -1;
-  }
-
-  mvp->as_mv = xmv;
-}
-
-#define LEFT_TOP_MARGIN (16 << 3)
-#define RIGHT_BOTTOM_MARGIN (16 << 3)
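-/* Both margins are in 1/8th-pel units: (16 << 3) == 128, so a motion vector
- * may point at most 16 whole pixels beyond the corresponding frame edge. */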
-
-static void clamp_mv(int_mv *mv,
-                     int mb_to_left_edge,
-                     int mb_to_right_edge,
-                     int mb_to_top_edge,
-                     int mb_to_bottom_edge) {
-  mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
-                  mb_to_left_edge : mv->as_mv.col;
-  mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
-                  mb_to_right_edge : mv->as_mv.col;
-  mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ?
-                  mb_to_top_edge : mv->as_mv.row;
-  mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
-                  mb_to_bottom_edge : mv->as_mv.row;
-}
-
-static void clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
-  clamp_mv(mv,
-           xd->mb_to_left_edge - LEFT_TOP_MARGIN,
-           xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
-           xd->mb_to_top_edge - LEFT_TOP_MARGIN,
-           xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
-}
-
-static unsigned int check_mv_bounds(int_mv *mv,
-                                    int mb_to_left_edge,
-                                    int mb_to_right_edge,
-                                    int mb_to_top_edge,
-                                    int mb_to_bottom_edge) {
-  return (mv->as_mv.col < mb_to_left_edge) ||
-         (mv->as_mv.col > mb_to_right_edge) ||
-         (mv->as_mv.row < mb_to_top_edge) ||
-         (mv->as_mv.row > mb_to_bottom_edge);
-}
-
-void vp9_find_near_mvs(MACROBLOCKD *xd,
-                       const MODE_INFO *here,
-                       const MODE_INFO *lfhere,
-                       int_mv *nearest, int_mv *nearby, int_mv *best,
-                       int near_mv_ref_cts[4],
-                       int refframe,
-                       int *ref_frame_sign_bias);
-
-vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
-                           vp9_prob p[VP9_MVREFS - 1],
-                           const int near_mv_ref_ct[4]);
-
-extern const unsigned char vp9_mbsplit_offset[4][16];
-
-static int left_block_mv(const MODE_INFO *cur_mb, int b) {
-  if (!(b & 3)) {
-    /* On L edge, get from MB to left of us */
-    --cur_mb;
-
-    if (cur_mb->mbmi.mode != SPLITMV)
-      return cur_mb->mbmi.mv[0].as_int;
-    b += 4;
-  }
-
-  return (cur_mb->bmi + b - 1)->as_mv.first.as_int;
-}
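-/* The subblock index b runs 0..15 in raster order within a macroblock, so
- * (b & 3) == 0 marks the leftmost column and (b >> 2) == 0 the top row; the
- * b += 4 / b += 16 adjustments in these helpers re-index into the last
- * column or row of the neighbouring macroblock's bmi array. */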
-
-static int left_block_second_mv(const MODE_INFO *cur_mb, int b) {
-  if (!(b & 3)) {
-    /* On L edge, get from MB to left of us */
-    --cur_mb;
-
-    if (cur_mb->mbmi.mode != SPLITMV)
-      return cur_mb->mbmi.second_ref_frame ? cur_mb->mbmi.mv[1].as_int
-                                           : cur_mb->mbmi.mv[0].as_int;
-    b += 4;
-  }
-
-  return cur_mb->mbmi.second_ref_frame ?
-         (cur_mb->bmi + b - 1)->as_mv.second.as_int :
-         (cur_mb->bmi + b - 1)->as_mv.first.as_int;
-}
-
-static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
-  if (!(b >> 2)) {
-    /* On top edge, get from MB above us */
-    cur_mb -= mi_stride;
-
-    if (cur_mb->mbmi.mode != SPLITMV)
-      return cur_mb->mbmi.mv[0].as_int;
-    b += 16;
-  }
-
-  return (cur_mb->bmi + b - 4)->as_mv.first.as_int;
-}
-
-static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
-  if (!(b >> 2)) {
-    /* On top edge, get from MB above us */
-    cur_mb -= mi_stride;
-
-    if (cur_mb->mbmi.mode != SPLITMV)
-      return cur_mb->mbmi.second_ref_frame ? cur_mb->mbmi.mv[1].as_int
-                                           : cur_mb->mbmi.mv[0].as_int;
-    b += 16;
-  }
-
-  return cur_mb->mbmi.second_ref_frame ?
-         (cur_mb->bmi + b - 4)->as_mv.second.as_int :
-         (cur_mb->bmi + b - 4)->as_mv.first.as_int;
-}
-
-static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
-  if (!(b & 3)) {
-    /* On L edge, get from MB to left of us */
-    --cur_mb;
-
-    if (cur_mb->mbmi.mode < I8X8_PRED) {
-      return pred_mode_conv(cur_mb->mbmi.mode);
-    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
-      return pred_mode_conv((cur_mb->bmi + 3 + b)->as_mode.first);
-    } else if (cur_mb->mbmi.mode == B_PRED) {
-      return ((cur_mb->bmi + 3 + b)->as_mode.first);
-    } else {
-      return B_DC_PRED;
-    }
-  }
-  return (cur_mb->bmi + b - 1)->as_mode.first;
-}
-
-static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
-                                          int b, int mi_stride) {
-  if (!(b >> 2)) {
-    /* On top edge, get from MB above us */
-    cur_mb -= mi_stride;
-
-    if (cur_mb->mbmi.mode < I8X8_PRED) {
-      return pred_mode_conv(cur_mb->mbmi.mode);
-    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
-      return pred_mode_conv((cur_mb->bmi + 12 + b)->as_mode.first);
-    } else if (cur_mb->mbmi.mode == B_PRED) {
-      return ((cur_mb->bmi + 12 + b)->as_mode.first);
-    } else {
-      return B_DC_PRED;
-    }
-  }
-
-  return (cur_mb->bmi + b - 4)->as_mode.first;
-}
-
-#endif
--- a/vp8/common/generic/systemdependent.c
+++ /dev/null
@@ -1,87 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_rtcd.h"
-#include "vp8/common/subpixel.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/onyxc_int.h"
-
-extern void vp9_arch_x86_common_init(VP9_COMMON *ctx);
-extern void vp9_arch_arm_common_init(VP9_COMMON *ctx);
-
-void vp9_machine_specific_config(VP9_COMMON *ctx) {
-#if CONFIG_RUNTIME_CPU_DETECT
-  VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
-
-  rtcd->idct.idct1        = vp9_short_idct4x4llm_1_c;
-  rtcd->idct.idct16       = vp9_short_idct4x4llm_c;
-  rtcd->idct.idct1_scalar_add = vp9_dc_only_idct_add_c;
-  rtcd->idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
-  rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_c;
-  rtcd->idct.idct8        = vp9_short_idct8x8_c;
-  rtcd->idct.idct1_scalar_add_8x8 = vp9_dc_only_idct_add_8x8_c;
-  rtcd->idct.ihaar2       = vp9_short_ihaar2x2_c;
-  rtcd->idct.idct16x16    = vp9_short_idct16x16_c;
-
-  rtcd->subpix.eighttap16x16       = vp9_eighttap_predict16x16_c;
-  rtcd->subpix.eighttap8x8         = vp9_eighttap_predict8x8_c;
-  rtcd->subpix.eighttap_avg16x16   = vp9_eighttap_predict_avg16x16_c;
-  rtcd->subpix.eighttap_avg8x8     = vp9_eighttap_predict_avg8x8_c;
-  rtcd->subpix.eighttap_avg4x4     = vp9_eighttap_predict_avg4x4_c;
-  rtcd->subpix.eighttap8x4         = vp9_eighttap_predict8x4_c;
-  rtcd->subpix.eighttap4x4         = vp9_eighttap_predict_c;
-  rtcd->subpix.eighttap16x16_sharp     = vp9_eighttap_predict16x16_sharp_c;
-  rtcd->subpix.eighttap8x8_sharp       = vp9_eighttap_predict8x8_sharp_c;
-  rtcd->subpix.eighttap_avg16x16_sharp = vp9_eighttap_predict_avg16x16_sharp_c;
-  rtcd->subpix.eighttap_avg8x8_sharp   = vp9_eighttap_predict_avg8x8_sharp_c;
-  rtcd->subpix.eighttap_avg4x4_sharp   = vp9_eighttap_predict_avg4x4_sharp_c;
-  rtcd->subpix.eighttap8x4_sharp       = vp9_eighttap_predict8x4_sharp_c;
-  rtcd->subpix.eighttap4x4_sharp       = vp9_eighttap_predict_sharp_c;
-
-  rtcd->subpix.sixtap16x16       = vp9_sixtap_predict16x16_c;
-  rtcd->subpix.sixtap8x8         = vp9_sixtap_predict8x8_c;
-  rtcd->subpix.sixtap_avg16x16   = vp9_sixtap_predict_avg16x16_c;
-  rtcd->subpix.sixtap_avg8x8     = vp9_sixtap_predict_avg8x8_c;
-  rtcd->subpix.sixtap8x4         = vp9_sixtap_predict8x4_c;
-  rtcd->subpix.sixtap4x4         = vp9_sixtap_predict_c;
-  rtcd->subpix.sixtap_avg4x4     = vp9_sixtap_predict_avg_c;
-  rtcd->subpix.bilinear16x16     = vp9_bilinear_predict16x16_c;
-  rtcd->subpix.bilinear8x8       = vp9_bilinear_predict8x8_c;
-  rtcd->subpix.bilinear_avg16x16 = vp9_bilinear_predict_avg16x16_c;
-  rtcd->subpix.bilinear_avg8x8   = vp9_bilinear_predict_avg8x8_c;
-  rtcd->subpix.bilinear8x4       = vp9_bilinear_predict8x4_c;
-  rtcd->subpix.bilinear4x4       = vp9_bilinear_predict4x4_c;
-  rtcd->subpix.bilinear_avg4x4   = vp9_bilinear_predict_avg4x4_c;
-
-#if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_INTERNAL_STATS)
-  rtcd->postproc.down             = vp9_mbpost_proc_down_c;
-  rtcd->postproc.across           = vp9_mbpost_proc_across_ip_c;
-  rtcd->postproc.downacross       = vp9_post_proc_down_and_across_c;
-  rtcd->postproc.addnoise         = vp9_plane_add_noise_c;
-  rtcd->postproc.blend_mb_inner   = vp9_blend_mb_inner_c;
-  rtcd->postproc.blend_mb_outer   = vp9_blend_mb_outer_c;
-  rtcd->postproc.blend_b          = vp9_blend_b_c;
-#endif
-
-#endif
-
-#if ARCH_X86 || ARCH_X86_64
-  vp9_arch_x86_common_init(ctx);
-#endif
-
-#if ARCH_ARM
-  vp9_arch_arm_common_init(ctx);
-#endif
-
-  vpx_rtcd();
-}
--- a/vp8/common/header.h
+++ /dev/null
@@ -1,42 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_HEADER_H
-#define __INC_HEADER_H
-
-/* 24 bits total */
-typedef struct {
-  unsigned int type: 1;
-  unsigned int version: 3;
-  unsigned int show_frame: 1;
-
-  /* Allow 2^19 bytes = 4 megabits for the first partition */
-
-  unsigned int first_partition_length_in_bytes: 19;
-
-#ifdef PACKET_TESTING
-  unsigned int frame_number;
-  unsigned int update_gold: 1;
-  unsigned int uses_gold: 1;
-  unsigned int update_last: 1;
-  unsigned int uses_last: 1;
-#endif
-
-} VP9_HEADER;
-
-#ifdef PACKET_TESTING
-#define VP9_HEADER_SIZE 8
-#else
-#define VP9_HEADER_SIZE 3
-#endif
-
-
-#endif
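The three bytes of VP9_HEADER_SIZE hold exactly the 1 + 3 + 1 + 19 = 24 bits declared in the struct above. A minimal packing sketch (the helper name and LSB-first bit order are illustrative assumptions, not taken from the bitstream spec):

    #include <stdint.h>

    /* Illustrative only: pack type(1) | version(3) | show_frame(1) |
     * first_partition_length_in_bytes(19) into 3 bytes, LSB first. */
    static void pack_frame_header(uint8_t out[3], unsigned type,
                                  unsigned version, unsigned show_frame,
                                  unsigned first_part_len) {
      uint32_t bits = (type & 1) |
                      ((version & 7) << 1) |
                      ((show_frame & 1) << 4) |
                      ((first_part_len & 0x7FFFFu) << 5);
      out[0] = bits & 0xFF;
      out[1] = (bits >> 8) & 0xFF;
      out[2] = (bits >> 16) & 0xFF;
    }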
--- a/vp8/common/idct.h
+++ /dev/null
@@ -1,144 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_IDCT_H
-#define __INC_IDCT_H
-
-#include "vp8/common/blockd.h"
-
-#define prototype_second_order(sym) \
-  void sym(short *input, short *output)
-
-#define prototype_idct(sym) \
-  void sym(short *input, short *output, int pitch)
-
-#define prototype_idct_scalar_add(sym) \
-  void sym(short input, \
-           unsigned char *pred, unsigned char *output, \
-           int pitch, int stride)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/idct_x86.h"
-#endif
-
-#ifdef _MSC_VER
-/* TODO: remove these after integer implementations are done */
-#define M_PI       3.14159265358979323846
-#define round(x) (((x)>0)? floor((x)+0.5): ceil((x)-0.5))
-#endif
-
-
-#if ARCH_ARM
-#include "arm/idct_arm.h"
-#endif
-
-#if CONFIG_LOSSLESS
-#define WHT_UPSCALE_FACTOR 3
-#define Y2_WHT_UPSCALE_FACTOR 2
-#endif
-
-#ifndef vp9_idct_idct16x16
-#define vp9_idct_idct16x16 vp9_short_idct16x16_c
-#endif
-extern prototype_idct(vp9_idct_idct16x16);
-
-#ifndef vp9_idct_idct8
-#define vp9_idct_idct8 vp9_short_idct8x8_c
-#endif
-extern prototype_idct(vp9_idct_idct8);
-
-#ifndef vp9_idct_idct8_1
-#define vp9_idct_idct8_1 vp9_short_idct8x8_1_c
-#endif
-extern prototype_idct(vp9_idct_idct8_1);
-
-#ifndef vp9_idct_ihaar2
-#define vp9_idct_ihaar2 vp9_short_ihaar2x2_c
-#endif
-extern prototype_idct(vp9_idct_ihaar2);
-
-#ifndef vp9_idct_ihaar2_1
-#define vp9_idct_ihaar2_1 vp9_short_ihaar2x2_1_c
-#endif
-extern prototype_idct(vp9_idct_ihaar2_1);
-
-#ifndef vp9_idct_idct1_scalar_add_8x8
-#define vp9_idct_idct1_scalar_add_8x8 vp9_dc_only_idct_add_8x8_c
-#endif
-extern prototype_idct_scalar_add(vp9_idct_idct1_scalar_add_8x8);
-
-
-
-#ifndef vp9_idct_idct1
-#define vp9_idct_idct1 vp9_short_idct4x4llm_1_c
-#endif
-extern prototype_idct(vp9_idct_idct1);
-
-#ifndef vp9_idct_idct16
-#define vp9_idct_idct16 vp9_short_idct4x4llm_c
-#endif
-extern prototype_idct(vp9_idct_idct16);
-
-#ifndef vp9_idct_idct1_scalar_add
-#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_c
-#endif
-extern prototype_idct_scalar_add(vp9_idct_idct1_scalar_add);
-
-
-#ifndef vp9_idct_iwalsh1
-#define vp9_idct_iwalsh1 vp9_short_inv_walsh4x4_1_c
-#endif
-extern prototype_second_order(vp9_idct_iwalsh1);
-
-#ifndef vp9_idct_iwalsh16
-#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_c
-#endif
-extern prototype_second_order(vp9_idct_iwalsh16);
-
-#if CONFIG_LOSSLESS
-extern prototype_idct(vp9_short_inv_walsh4x4_x8_c);
-extern prototype_idct(vp9_short_inv_walsh4x4_1_x8_c);
-extern prototype_idct_scalar_add(vp9_dc_only_inv_walsh_add_c);
-extern prototype_second_order(vp9_short_inv_walsh4x4_lossless_c);
-extern prototype_second_order(vp9_short_inv_walsh4x4_1_lossless_c);
-#endif
-
-void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
-                  TX_TYPE tx_type, int tx_dim);
-
-typedef prototype_idct((*vp9_idct_fn_t));
-typedef prototype_idct_scalar_add((*vp9_idct_scalar_add_fn_t));
-typedef prototype_second_order((*vp9_second_order_fn_t));
-
-typedef struct {
-  vp9_idct_fn_t            idct1;
-  vp9_idct_fn_t            idct16;
-  vp9_idct_scalar_add_fn_t idct1_scalar_add;
-
-  vp9_second_order_fn_t iwalsh1;
-  vp9_second_order_fn_t iwalsh16;
-
-  vp9_idct_fn_t            idct8;
-  vp9_idct_fn_t            idct8_1;
-  vp9_idct_scalar_add_fn_t idct1_scalar_add_8x8;
-  vp9_idct_fn_t ihaar2;
-  vp9_idct_fn_t ihaar2_1;
-
-  vp9_idct_fn_t            idct16x16;
-} vp9_idct_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IDCT_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define IDCT_INVOKE(ctx,fn) vp9_idct_##fn
-#endif
-
-#endif
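A hedged example of how the macro is consumed (the wrapper name is illustrative; real call sites appear in invtrans.c later in this patch):

    /* With CONFIG_RUNTIME_CPU_DETECT, IDCT_INVOKE(rtcd, idct16)(...) reads
     * the function pointer rtcd->idct16; otherwise the macro pastes
     * vp9_idct_idct16, which the defaults above resolve to
     * vp9_short_idct4x4llm_c. */
    static void inverse_4x4_example(const vp9_idct_rtcd_vtable_t *rtcd,
                                    short *dqcoeff, short *diff) {
      IDCT_INVOKE(rtcd, idct16)(dqcoeff, diff, 32);
    }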
--- a/vp8/common/idctllm.c
+++ /dev/null
@@ -1,1275 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
- * Notes:
- *
- * This implementation makes use of 16 bit fixed point versions of two
- * multiply constants:
- *         1.   sqrt(2) * cos (pi/8)
- *         2.   sqrt(2) * sin (pi/8)
- * Because the first constant is bigger than 1, to maintain the same 16 bit
- * fixed point precision as the second one, we use the trick
- *         x * a = x + x*(a-1)
- * so
- *         x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
- **************************************************************************/
-#include <assert.h>
-#include <math.h>
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/systemdependent.h"
-
-#include "vp8/common/blockd.h"
-
-static const int cospi8sqrt2minus1 = 20091;
-static const int sinpi8sqrt2      = 35468;
-static const int rounding = 0;
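As a quick sanity check of the two Q16 constants above (a standalone sketch, assuming a POSIX M_PI):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      double c = sqrt(2.0) * cos(M_PI / 8) - 1.0;  /* ~0.3065630 */
      double s = sqrt(2.0) * sin(M_PI / 8);        /* ~0.5411961 */

      /* prints "20091 35468", matching the constants above */
      printf("%d %d\n", (int)(c * 65536 + 0.5), (int)(s * 65536 + 0.5));
      return 0;
    }

The trick from the notes then computes x * sqrt(2) * cos(pi/8) as x + ((x * 20091) >> 16), staying within the 16 bit fixed point precision.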
-
-// TODO: these transforms can be further converted into integer forms
-//       for complexity optimization
-static const float idct_4[16] = {
-  0.500000000000000,   0.653281482438188,   0.500000000000000,   0.270598050073099,
-  0.500000000000000,   0.270598050073099,  -0.500000000000000,  -0.653281482438188,
-  0.500000000000000,  -0.270598050073099,  -0.500000000000000,   0.653281482438188,
-  0.500000000000000,  -0.653281482438188,   0.500000000000000,  -0.270598050073099
-};
-
-static const float iadst_4[16] = {
-  0.228013428883779,   0.577350269189626,   0.656538502008139,   0.428525073124360,
-  0.428525073124360,   0.577350269189626,  -0.228013428883779,  -0.656538502008139,
-  0.577350269189626,                   0,  -0.577350269189626,   0.577350269189626,
-  0.656538502008139,  -0.577350269189626,   0.428525073124359,  -0.228013428883779
-};
-
-static const float idct_8[64] = {
-  0.353553390593274,   0.490392640201615,   0.461939766255643,   0.415734806151273,
-  0.353553390593274,   0.277785116509801,   0.191341716182545,   0.097545161008064,
-  0.353553390593274,   0.415734806151273,   0.191341716182545,  -0.097545161008064,
- -0.353553390593274,  -0.490392640201615,  -0.461939766255643,  -0.277785116509801,
-  0.353553390593274,   0.277785116509801,  -0.191341716182545,  -0.490392640201615,
- -0.353553390593274,   0.097545161008064,   0.461939766255643,   0.415734806151273,
-  0.353553390593274,   0.097545161008064,  -0.461939766255643,  -0.277785116509801,
-  0.353553390593274,   0.415734806151273,  -0.191341716182545,  -0.490392640201615,
-  0.353553390593274,  -0.097545161008064,  -0.461939766255643,   0.277785116509801,
-  0.353553390593274,  -0.415734806151273,  -0.191341716182545,   0.490392640201615,
-  0.353553390593274,  -0.277785116509801,  -0.191341716182545,   0.490392640201615,
- -0.353553390593274,  -0.097545161008064,   0.461939766255643,  -0.415734806151273,
-  0.353553390593274,  -0.415734806151273,   0.191341716182545,   0.097545161008064,
- -0.353553390593274,   0.490392640201615,  -0.461939766255643,   0.277785116509801,
-  0.353553390593274,  -0.490392640201615,   0.461939766255643,  -0.415734806151273,
-  0.353553390593274,  -0.277785116509801,   0.191341716182545,  -0.097545161008064
-};
-
-static const float iadst_8[64] = {
-  0.089131608307533,   0.255357107325376,   0.387095214016349,   0.466553967085785,
-  0.483002021635509,   0.434217976756762,   0.326790388032145,   0.175227946595735,
-  0.175227946595735,   0.434217976756762,   0.466553967085785,   0.255357107325376,
- -0.089131608307533,  -0.387095214016348,  -0.483002021635509,  -0.326790388032145,
-  0.255357107325376,   0.483002021635509,   0.175227946595735,  -0.326790388032145,
- -0.466553967085785,  -0.089131608307533,   0.387095214016349,   0.434217976756762,
-  0.326790388032145,   0.387095214016349,  -0.255357107325376,  -0.434217976756762,
-  0.175227946595735,   0.466553967085786,  -0.089131608307534,  -0.483002021635509,
-  0.387095214016349,   0.175227946595735,  -0.483002021635509,   0.089131608307533,
-  0.434217976756762,  -0.326790388032145,  -0.255357107325377,   0.466553967085785,
-  0.434217976756762,  -0.089131608307533,  -0.326790388032145,   0.483002021635509,
- -0.255357107325376,  -0.175227946595735,   0.466553967085785,  -0.387095214016348,
-  0.466553967085785,  -0.326790388032145,   0.089131608307533,   0.175227946595735,
- -0.387095214016348,   0.483002021635509,  -0.434217976756762,   0.255357107325376,
-  0.483002021635509,  -0.466553967085785,   0.434217976756762,  -0.387095214016348,
-  0.326790388032145,  -0.255357107325375,   0.175227946595736,  -0.089131608307532
-};
-
-static const int16_t idct_i4[16] = {
-  8192,  10703,  8192,   4433,
-  8192,   4433, -8192, -10703,
-  8192,  -4433, -8192,  10703,
-  8192, -10703,  8192,  -4433
-};
-
-static const int16_t iadst_i4[16] = {
-   3736,  9459, 10757,   7021,
-   7021,  9459, -3736, -10757,
-   9459,     0, -9459,   9459,
-  10757, -9459,  7021,  -3736
-};
-
-static const int16_t idct_i8[64] = {
-   5793,  8035,  7568,  6811,
-   5793,  4551,  3135,  1598,
-   5793,  6811,  3135, -1598,
-  -5793, -8035, -7568, -4551,
-   5793,  4551, -3135, -8035,
-  -5793,  1598,  7568,  6811,
-   5793,  1598, -7568, -4551,
-   5793,  6811, -3135, -8035,
-   5793, -1598, -7568,  4551,
-   5793, -6811, -3135,  8035,
-   5793, -4551, -3135,  8035,
-  -5793, -1598,  7568, -6811,
-   5793, -6811,  3135,  1598,
-  -5793,  8035, -7568,  4551,
-   5793, -8035,  7568, -6811,
-   5793, -4551,  3135, -1598
-};
-
-static const int16_t iadst_i8[64] = {
-   1460,  4184,  6342,  7644,
-   7914,  7114,  5354,  2871,
-   2871,  7114,  7644,  4184,
-  -1460, -6342, -7914, -5354,
-   4184,  7914,  2871, -5354,
-  -7644, -1460,  6342,  7114,
-   5354,  6342, -4184, -7114,
-   2871,  7644, -1460, -7914,
-   6342,  2871, -7914,  1460,
-   7114, -5354, -4184,  7644,
-   7114, -1460, -5354,  7914,
-  -4184, -2871,  7644, -6342,
-   7644, -5354,  1460,  2871,
-  -6342,  7914, -7114,  4184,
-   7914, -7644,  7114, -6342,
-   5354, -4184,  2871, -1460
-};
-
-static float idct_16[256] = {
-  0.250000,  0.351851,  0.346760,  0.338330,  0.326641,  0.311806,  0.293969,  0.273300,
-  0.250000,  0.224292,  0.196424,  0.166664,  0.135299,  0.102631,  0.068975,  0.034654,
-  0.250000,  0.338330,  0.293969,  0.224292,  0.135299,  0.034654, -0.068975, -0.166664,
- -0.250000, -0.311806, -0.346760, -0.351851, -0.326641, -0.273300, -0.196424, -0.102631,
-  0.250000,  0.311806,  0.196424,  0.034654, -0.135299, -0.273300, -0.346760, -0.338330,
- -0.250000, -0.102631,  0.068975,  0.224292,  0.326641,  0.351851,  0.293969,  0.166664,
-  0.250000,  0.273300,  0.068975, -0.166664, -0.326641, -0.338330, -0.196424,  0.034654,
-  0.250000,  0.351851,  0.293969,  0.102631, -0.135299, -0.311806, -0.346760, -0.224292,
-  0.250000,  0.224292, -0.068975, -0.311806, -0.326641, -0.102631,  0.196424,  0.351851,
-  0.250000, -0.034654, -0.293969, -0.338330, -0.135299,  0.166664,  0.346760,  0.273300,
-  0.250000,  0.166664, -0.196424, -0.351851, -0.135299,  0.224292,  0.346760,  0.102631,
- -0.250000, -0.338330, -0.068975,  0.273300,  0.326641,  0.034654, -0.293969, -0.311806,
-  0.250000,  0.102631, -0.293969, -0.273300,  0.135299,  0.351851,  0.068975, -0.311806,
- -0.250000,  0.166664,  0.346760,  0.034654, -0.326641, -0.224292,  0.196424,  0.338330,
-  0.250000,  0.034654, -0.346760, -0.102631,  0.326641,  0.166664, -0.293969, -0.224292,
-  0.250000,  0.273300, -0.196424, -0.311806,  0.135299,  0.338330, -0.068975, -0.351851,
-  0.250000, -0.034654, -0.346760,  0.102631,  0.326641, -0.166664, -0.293969,  0.224292,
-  0.250000, -0.273300, -0.196424,  0.311806,  0.135299, -0.338330, -0.068975,  0.351851,
-  0.250000, -0.102631, -0.293969,  0.273300,  0.135299, -0.351851,  0.068975,  0.311806,
- -0.250000, -0.166664,  0.346760, -0.034654, -0.326641,  0.224292,  0.196424, -0.338330,
-  0.250000, -0.166664, -0.196424,  0.351851, -0.135299, -0.224292,  0.346760, -0.102631,
- -0.250000,  0.338330, -0.068975, -0.273300,  0.326641, -0.034654, -0.293969,  0.311806,
-  0.250000, -0.224292, -0.068975,  0.311806, -0.326641,  0.102631,  0.196424, -0.351851,
-  0.250000,  0.034654, -0.293969,  0.338330, -0.135299, -0.166664,  0.346760, -0.273300,
-  0.250000, -0.273300,  0.068975,  0.166664, -0.326641,  0.338330, -0.196424, -0.034654,
-  0.250000, -0.351851,  0.293969, -0.102631, -0.135299,  0.311806, -0.346760,  0.224292,
-  0.250000, -0.311806,  0.196424, -0.034654, -0.135299,  0.273300, -0.346760,  0.338330,
- -0.250000,  0.102631,  0.068975, -0.224292,  0.326641, -0.351851,  0.293969, -0.166664,
-  0.250000, -0.338330,  0.293969, -0.224292,  0.135299, -0.034654, -0.068975,  0.166664,
- -0.250000,  0.311806, -0.346760,  0.351851, -0.326641,  0.273300, -0.196424,  0.102631,
-  0.250000, -0.351851,  0.346760, -0.338330,  0.326641, -0.311806,  0.293969, -0.273300,
-  0.250000, -0.224292,  0.196424, -0.166664,  0.135299, -0.102631,  0.068975, -0.034654
-};
-
-static float iadst_16[256] = {
-  0.033094,  0.098087,  0.159534,  0.215215,  0.263118,  0.301511,  0.329007,  0.344612,
-  0.347761,  0.338341,  0.316693,  0.283599,  0.240255,  0.188227,  0.129396,  0.065889,
-  0.065889,  0.188227,  0.283599,  0.338341,  0.344612,  0.301511,  0.215215,  0.098087,
- -0.033094, -0.159534, -0.263118, -0.329007, -0.347761, -0.316693, -0.240255, -0.129396,
-  0.098087,  0.263118,  0.344612,  0.316693,  0.188227,  0.000000, -0.188227, -0.316693,
- -0.344612, -0.263118, -0.098087,  0.098087,  0.263118,  0.344612,  0.316693,  0.188227,
-  0.129396,  0.316693,  0.329007,  0.159534, -0.098087, -0.301511, -0.338341, -0.188227,
-  0.065889,  0.283599,  0.344612,  0.215215, -0.033094, -0.263118, -0.347761, -0.240255,
-  0.159534,  0.344612,  0.240255, -0.065889, -0.316693, -0.301511, -0.033094,  0.263118,
-  0.338341,  0.129396, -0.188227, -0.347761, -0.215215,  0.098087,  0.329007,  0.283599,
-  0.188227,  0.344612,  0.098087, -0.263118, -0.316693, -0.000000,  0.316693,  0.263118,
- -0.098087, -0.344612, -0.188227,  0.188227,  0.344612,  0.098087, -0.263118, -0.316693,
-  0.215215,  0.316693, -0.065889, -0.347761, -0.098087,  0.301511,  0.240255, -0.188227,
- -0.329007,  0.033094,  0.344612,  0.129396, -0.283599, -0.263118,  0.159534,  0.338341,
-  0.240255,  0.263118, -0.215215, -0.283599,  0.188227,  0.301511, -0.159534, -0.316693,
-  0.129396,  0.329007, -0.098087, -0.338341,  0.065889,  0.344612, -0.033094, -0.347761,
-  0.263118,  0.188227, -0.316693, -0.098087,  0.344612,  0.000000, -0.344612,  0.098087,
-  0.316693, -0.188227, -0.263118,  0.263118,  0.188227, -0.316693, -0.098087,  0.344612,
-  0.283599,  0.098087, -0.347761,  0.129396,  0.263118, -0.301511, -0.065889,  0.344612,
- -0.159534, -0.240255,  0.316693,  0.033094, -0.338341,  0.188227,  0.215215, -0.329007,
-  0.301511,  0.000000, -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,  0.000000,
- -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,
-  0.316693, -0.098087, -0.188227,  0.344612, -0.263118, -0.000000,  0.263118, -0.344612,
-  0.188227,  0.098087, -0.316693,  0.316693, -0.098087, -0.188227,  0.344612, -0.263118,
-  0.329007, -0.188227, -0.033094,  0.240255, -0.344612,  0.301511, -0.129396, -0.098087,
-  0.283599, -0.347761,  0.263118, -0.065889, -0.159534,  0.316693, -0.338341,  0.215215,
-  0.338341, -0.263118,  0.129396,  0.033094, -0.188227,  0.301511, -0.347761,  0.316693,
- -0.215215,  0.065889,  0.098087, -0.240255,  0.329007, -0.344612,  0.283599, -0.159534,
-  0.344612, -0.316693,  0.263118, -0.188227,  0.098087,  0.000000, -0.098087,  0.188227,
- -0.263118,  0.316693, -0.344612,  0.344612, -0.316693,  0.263118, -0.188227,  0.098087,
-  0.347761, -0.344612,  0.338341, -0.329007,  0.316693, -0.301511,  0.283599, -0.263118,
-  0.240255, -0.215215,  0.188227, -0.159534,  0.129396, -0.098087,  0.065889, -0.033094
-};
-
-static const int16_t idct_i16[256] = {
-   4096,  5765,  5681,  5543,  5352,  5109,  4816,  4478,
-   4096,  3675,  3218,  2731,  2217,  1682,  1130,   568,
-   4096,  5543,  4816,  3675,  2217,   568, -1130, -2731,
-  -4096, -5109, -5681, -5765, -5352, -4478, -3218, -1682,
-   4096,  5109,  3218,   568, -2217, -4478, -5681, -5543,
-  -4096, -1682,  1130,  3675,  5352,  5765,  4816,  2731,
-   4096,  4478,  1130, -2731, -5352, -5543, -3218,   568,
-   4096,  5765,  4816,  1682, -2217, -5109, -5681, -3675,
-   4096,  3675, -1130, -5109, -5352, -1682,  3218,  5765,
-   4096,  -568, -4816, -5543, -2217,  2731,  5681,  4478,
-   4096,  2731, -3218, -5765, -2217,  3675,  5681,  1682,
-  -4096, -5543, -1130,  4478,  5352,   568, -4816, -5109,
-   4096,  1682, -4816, -4478,  2217,  5765,  1130, -5109,
-  -4096,  2731,  5681,   568, -5352, -3675,  3218,  5543,
-   4096,   568, -5681, -1682,  5352,  2731, -4816, -3675,
-   4096,  4478, -3218, -5109,  2217,  5543, -1130, -5765,
-   4096,  -568, -5681,  1682,  5352, -2731, -4816,  3675,
-   4096, -4478, -3218,  5109,  2217, -5543, -1130,  5765,
-   4096, -1682, -4816,  4478,  2217, -5765,  1130,  5109,
-  -4096, -2731,  5681,  -568, -5352,  3675,  3218, -5543,
-   4096, -2731, -3218,  5765, -2217, -3675,  5681, -1682,
-  -4096,  5543, -1130, -4478,  5352,  -568, -4816,  5109,
-   4096, -3675, -1130,  5109, -5352,  1682,  3218, -5765,
-   4096,   568, -4816,  5543, -2217, -2731,  5681, -4478,
-   4096, -4478,  1130,  2731, -5352,  5543, -3218,  -568,
-   4096, -5765,  4816, -1682, -2217,  5109, -5681,  3675,
-   4096, -5109,  3218,  -568, -2217,  4478, -5681,  5543,
-  -4096,  1682,  1130, -3675,  5352, -5765,  4816, -2731,
-   4096, -5543,  4816, -3675,  2217,  -568, -1130,  2731,
-  -4096,  5109, -5681,  5765, -5352,  4478, -3218,  1682,
-   4096, -5765,  5681, -5543,  5352, -5109,  4816, -4478,
-   4096, -3675,  3218, -2731,  2217, -1682,  1130,  -568
-};
-
-static const int16_t iadst_i16[256] = {
-    542,  1607,  2614,  3526,  4311,  4940,  5390,  5646,
-   5698,  5543,  5189,  4646,  3936,  3084,  2120,  1080,
-   1080,  3084,  4646,  5543,  5646,  4940,  3526,  1607,
-   -542, -2614, -4311, -5390, -5698, -5189, -3936, -2120,
-   1607,  4311,  5646,  5189,  3084,     0, -3084, -5189,
-  -5646, -4311, -1607,  1607,  4311,  5646,  5189,  3084,
-   2120,  5189,  5390,  2614, -1607, -4940, -5543, -3084,
-   1080,  4646,  5646,  3526, -542,  -4311, -5698, -3936,
-   2614,  5646,  3936, -1080, -5189, -4940,  -542,  4311,
-   5543,  2120, -3084, -5698, -3526,  1607,  5390,  4646,
-   3084,  5646,  1607, -4311, -5189,     0,  5189,  4311,
-  -1607, -5646, -3084,  3084,  5646,  1607, -4311, -5189,
-   3526,  5189, -1080, -5698, -1607,  4940,  3936, -3084,
-  -5390,   542,  5646,  2120, -4646, -4311,  2614,  5543,
-   3936,  4311, -3526, -4646,  3084,  4940, -2614, -5189,
-   2120,  5390, -1607, -5543,  1080,  5646,  -542, -5698,
-   4311,  3084, -5189, -1607,  5646,     0, -5646,  1607,
-   5189, -3084, -4311,  4311,  3084, -5189, -1607,  5646,
-   4646,  1607, -5698,  2120,  4311, -4940, -1080,  5646,
-  -2614, -3936,  5189,   542, -5543,  3084,  3526, -5390,
-   4940,     0, -4940,  4940,     0, -4940,  4940,     0,
-  -4940,  4940,     0, -4940,  4940,     0, -4940,  4940,
-   5189, -1607, -3084,  5646, -4311,     0,  4311, -5646,
-   3084,  1607, -5189,  5189, -1607, -3084,  5646, -4311,
-   5390, -3084,  -542,  3936, -5646,  4940, -2120, -1607,
-   4646, -5698,  4311, -1080, -2614,  5189, -5543,  3526,
-   5543, -4311,  2120,   542, -3084,  4940, -5698,  5189,
-  -3526,  1080,  1607, -3936,  5390, -5646,  4646, -2614,
-   5646, -5189,  4311, -3084,  1607,     0, -1607,  3084,
-  -4311,  5189, -5646,  5646, -5189,  4311, -3084,  1607,
-   5698, -5646,  5543, -5390,  5189, -4940,  4646, -4311,
-   3936, -3526,  3084, -2614,  2120, -1607,  1080,  -542
-};
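The int16_t tables above appear to be the float tables scaled by 2^14 and rounded to nearest, e.g. 0.353553390593274 * 16384 ~ 5793 and 0.653281482438188 * 16384 ~ 10703, which is consistent with VERTICAL_SHIFT being 14 below. A generator sketch under that assumption (not the actual tool used):

    #include <math.h>
    #include <stdint.h>

    /* Assumed relationship: int table = round(float table * 2^14), so
     * quantize_table(idct_4, out, 4) reproduces idct_i4. */
    static void quantize_table(const float *src, int16_t *dst, int dim) {
      int i;

      for (i = 0; i < dim * dim; i++)
        dst[i] = (int16_t)floor(src[i] * 16384.0 + 0.5);
    }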
-
-/* For test */
-#define TEST_INT 1
-#if TEST_INT
-#define vp9_ihtllm_int_c vp9_ihtllm_c
-#else
-#define vp9_ihtllm_float_c vp9_ihtllm_c
-#endif
-
-void vp9_ihtllm_float_c(const int16_t *input, int16_t *output, int pitch,
-                  TX_TYPE tx_type, int tx_dim) {
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    int i, j, k;
-    float bufa[256], bufb[256];  // buffers are for floating-point testing;
-                                 // the implementation could be simplified in
-                                 // conjunction with the integer transform
-    const int16_t *ip = input;
-    int16_t *op = output;
-    int shortpitch = pitch >> 1;
-
-    float *pfa = &bufa[0];
-    float *pfb = &bufb[0];
-
-    // pointers to vertical and horizontal transforms
-    const float *ptv, *pth;
-
-    assert(tx_type != DCT_DCT);
-    // load and convert residual array into floating-point
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
-        pfa[i] = (float)ip[i];
-      }
-      pfa += tx_dim;
-      ip  += tx_dim;
-    }
-
-    // vertical transformation
-    pfa = &bufa[0];
-    pfb = &bufb[0];
-
-    switch(tx_type) {
-      case ADST_ADST :
-      case ADST_DCT  :
-        ptv = (tx_dim == 4) ? &iadst_4[0] :
-                              ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-        break;
-
-      default :
-        ptv = (tx_dim == 4) ? &idct_4[0] :
-                              ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-        break;
-    }
-
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
-        pfb[i] = 0;
-        for(k = 0; k < tx_dim; k++) {
-          pfb[i] += ptv[k] * pfa[(k * tx_dim)];
-        }
-        pfa += 1;
-      }
-
-      pfb += tx_dim;
-      ptv += tx_dim;
-      pfa = &bufa[0];
-    }
-
-    // horizontal transformation
-    pfa = &bufa[0];
-    pfb = &bufb[0];
-
-    switch(tx_type) {
-      case ADST_ADST :
-      case  DCT_ADST :
-        pth = (tx_dim == 4) ? &iadst_4[0] :
-                              ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-        break;
-
-      default :
-        pth = (tx_dim == 4) ? &idct_4[0] :
-                              ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-        break;
-    }
-
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
-        pfa[i] = 0;
-        for(k = 0; k < tx_dim; k++) {
-          pfa[i] += pfb[k] * pth[k];
-        }
-        pth += tx_dim;
-      }
-
-      pfa += tx_dim;
-      pfb += tx_dim;
-
-      switch(tx_type) {
-        case ADST_ADST :
-        case  DCT_ADST :
-          pth = (tx_dim == 4) ? &iadst_4[0] :
-                                ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
-          break;
-
-        default :
-          pth = (tx_dim == 4) ? &idct_4[0] :
-                                ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
-          break;
-      }
-    }
-
-    // convert to short integer format and load BLOCKD buffer
-    op  = output;
-    pfa = &bufa[0];
-
-    for(j = 0; j < tx_dim; j++) {
-      for(i = 0; i < tx_dim; i++) {
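-        /* divide by 8 and round to nearest, symmetrically (0.49 ~ 0.5) */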
-        op[i] = (pfa[i] > 0) ? (int16_t)(pfa[i] / 8 + 0.49) :
-                              -(int16_t)(-pfa[i] / 8 + 0.49);
-      }
-
-      op += shortpitch;
-      pfa += tx_dim;
-    }
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
-
-/* Converted the transforms to integer form. */
-#define VERTICAL_SHIFT 14  // 16
-#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
-#define HORIZONTAL_SHIFT 17  // 15
-#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
-void vp9_ihtllm_int_c(const int16_t *input, int16_t *output, int pitch,
-                      TX_TYPE tx_type, int tx_dim) {
-  int i, j, k;
-  int16_t imbuf[256];
-
-  const int16_t *ip = input;
-  int16_t *op = output;
-  int16_t *im = &imbuf[0];
-
-  /* pointers to vertical and horizontal transforms. */
-  const int16_t *ptv = NULL, *pth = NULL;
-  int shortpitch = pitch >> 1;
-
-  switch (tx_type) {
-    case ADST_ADST :
-      ptv = pth = (tx_dim == 4) ? &iadst_i4[0]
-                                  : ((tx_dim == 8) ? &iadst_i8[0]
-                                                     : &iadst_i16[0]);
-      break;
-    case ADST_DCT  :
-      ptv = (tx_dim == 4) ? &iadst_i4[0]
-                            : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
-      pth = (tx_dim == 4) ? &idct_i4[0]
-                            : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
-      break;
-    case  DCT_ADST :
-      ptv = (tx_dim == 4) ? &idct_i4[0]
-                            : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
-      pth = (tx_dim == 4) ? &iadst_i4[0]
-                            : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
-      break;
-    case  DCT_DCT :
-      ptv = pth = (tx_dim == 4) ? &idct_i4[0]
-                                  : ((tx_dim == 8) ? &idct_i8[0]
-                                                     : &idct_i16[0]);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-
-  /* vertical transformation */
-  for (j = 0; j < tx_dim; j++) {
-    for (i = 0; i < tx_dim; i++) {
-      int temp = 0;
-
-      for (k = 0; k < tx_dim; k++) {
-        temp += ptv[k] * ip[(k * tx_dim)];
-      }
-
-      im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
-      ip++;
-    }
-    im += tx_dim;  // 16
-    ptv += tx_dim;
-    ip = input;
-  }
-
-  /* horizontal transformation */
-  im = &imbuf[0];
-
-  for (j = 0; j < tx_dim; j++) {
-    const int16_t *pthc = pth;
-
-    for (i = 0; i < tx_dim; i++) {
-      int temp = 0;
-
-      for (k = 0; k < tx_dim; k++) {
-        temp += im[k] * pthc[k];
-      }
-
-      op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
-      pthc += tx_dim;
-    }
-
-    im += tx_dim;  // 16
-    op += shortpitch;
-  }
-}
-
-void vp9_short_idct4x4llm_c(short *input, short *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-
-  short *ip = input;
-  short *op = output;
-  int temp1, temp2;
-  int shortpitch = pitch >> 1;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[8];
-    b1 = ip[0] - ip[8];
-
-    temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
-    temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
-    c1 = temp1 - temp2;
-
-    temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
-    temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
-    d1 = temp1 + temp2;
-
-    op[shortpitch * 0] = a1 + d1;
-    op[shortpitch * 3] = a1 - d1;
-
-    op[shortpitch * 1] = b1 + c1;
-    op[shortpitch * 2] = b1 - c1;
-
-    ip++;
-    op++;
-  }
-
-  ip = output;
-  op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[2];
-    b1 = ip[0] - ip[2];
-
-    temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
-    temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
-    c1 = temp1 - temp2;
-
-    temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
-    temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
-    d1 = temp1 + temp2;
-
-    op[0] = (a1 + d1 + 16) >> 5;
-    op[3] = (a1 - d1 + 16) >> 5;
-
-    op[1] = (b1 + c1 + 16) >> 5;
-    op[2] = (b1 - c1 + 16) >> 5;
-
-    ip += shortpitch;
-    op += shortpitch;
-  }
-}
-
-void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch) {
-  int i;
-  int a1;
-  short *op = output;
-  int shortpitch = pitch >> 1;
-  a1 = ((input[0] + 16) >> 5);
-  for (i = 0; i < 4; i++) {
-    op[0] = a1;
-    op[1] = a1;
-    op[2] = a1;
-    op[3] = a1;
-    op += shortpitch;
-  }
-}
-
-void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
-                            unsigned char *dst_ptr, int pitch, int stride) {
-  int a1 = ((input_dc + 16) >> 5);
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = a1 + pred_ptr[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
-    }
-
-    dst_ptr += stride;
-    pred_ptr += pitch;
-  }
-}
-
-void vp9_short_inv_walsh4x4_c(short *input, short *output) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ((ip[0] + ip[3]));
-    b1 = ((ip[1] + ip[2]));
-    c1 = ((ip[1] - ip[2]));
-    d1 = ((ip[0] - ip[3]));
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[1] = (c1 + d1) >> 1;
-    op[2] = (a1 - b1) >> 1;
-    op[3] = (d1 - c1) >> 1;
-
-    ip += 4;
-    op += 4;
-  }
-
-  ip = output;
-  op = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[12];
-    b1 = ip[4] + ip[8];
-    c1 = ip[4] - ip[8];
-    d1 = ip[0] - ip[12];
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[4] = (c1 + d1) >> 1;
-    op[8] = (a1 - b1) >> 1;
-    op[12] = (d1 - c1) >> 1;
-    ip++;
-    op++;
-  }
-}
-
-void vp9_short_inv_walsh4x4_1_c(short *in, short *out) {
-  int i;
-  short tmp[4];
-  short *ip = in;
-  short *op = tmp;
-
-  op[0] = (ip[0] + 1) >> 1;
-  op[1] = op[2] = op[3] = (ip[0] >> 1);
-
-  ip = tmp;
-  op = out;
-  for (i = 0; i < 4; i++) {
-    op[0] = (ip[0] + 1) >> 1;
-    op[4] = op[8] = op[12] = (ip[0] >> 1);
-    ip++;
-    op++;
-  }
-}
-
-#if CONFIG_LOSSLESS
-void vp9_short_inv_walsh4x4_lossless_c(short *input, short *output) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
-    b1 = ((ip[1] + ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
-    c1 = ((ip[1] - ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
-    d1 = ((ip[0] - ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[1] = (c1 + d1) >> 1;
-    op[2] = (a1 - b1) >> 1;
-    op[3] = (d1 - c1) >> 1;
-
-    ip += 4;
-    op += 4;
-  }
-
-  ip = output;
-  op = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[12];
-    b1 = ip[4] + ip[8];
-    c1 = ip[4] - ip[8];
-    d1 = ip[0] - ip[12];
-
-
-    op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-
-    ip++;
-    op++;
-  }
-}
-
-void vp9_short_inv_walsh4x4_1_lossless_c(short *in, short *out) {
-  int i;
-  short tmp[4];
-  short *ip = in;
-  short *op = tmp;
-
-  op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1;
-  op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1);
-
-  ip = tmp;
-  op = out;
-  for (i = 0; i < 4; i++) {
-    op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[4] = op[8] = op[12] = ((ip[0] >> 1)) << Y2_WHT_UPSCALE_FACTOR;
-    ip++;
-    op++;
-  }
-}
-
-void vp9_short_inv_walsh4x4_x8_c(short *input, short *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-  int shortpitch = pitch >> 1;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ((ip[0] + ip[3])) >> WHT_UPSCALE_FACTOR;
-    b1 = ((ip[1] + ip[2])) >> WHT_UPSCALE_FACTOR;
-    c1 = ((ip[1] - ip[2])) >> WHT_UPSCALE_FACTOR;
-    d1 = ((ip[0] - ip[3])) >> WHT_UPSCALE_FACTOR;
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[1] = (c1 + d1) >> 1;
-    op[2] = (a1 - b1) >> 1;
-    op[3] = (d1 - c1) >> 1;
-
-    ip += 4;
-    op += shortpitch;
-  }
-
-  ip = output;
-  op = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[shortpitch * 0] + ip[shortpitch * 3];
-    b1 = ip[shortpitch * 1] + ip[shortpitch * 2];
-    c1 = ip[shortpitch * 1] - ip[shortpitch * 2];
-    d1 = ip[shortpitch * 0] - ip[shortpitch * 3];
-
-
-    op[shortpitch * 0] = (a1 + b1 + 1) >> 1;
-    op[shortpitch * 1] = (c1 + d1) >> 1;
-    op[shortpitch * 2] = (a1 - b1) >> 1;
-    op[shortpitch * 3] = (d1 - c1) >> 1;
-
-    ip++;
-    op++;
-  }
-}
-
-void vp9_short_inv_walsh4x4_1_x8_c(short *in, short *out, int pitch) {
-  int i;
-  short tmp[4];
-  short *ip = in;
-  short *op = tmp;
-  int shortpitch = pitch >> 1;
-
-  op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
-  op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1);
-
-
-  ip = tmp;
-  op = out;
-  for (i = 0; i < 4; i++) {
-    op[shortpitch * 0] = (ip[0] + 1) >> 1;
-    op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1;
-    ip++;
-    op++;
-  }
-}
-
-void vp9_dc_only_inv_walsh_add_c(short input_dc, unsigned char *pred_ptr,
-                                 unsigned char *dst_ptr,
-                                 int pitch, int stride) {
-  int r, c;
-  short tmp[16];
-  vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = tmp[r * 4 + c] + pred_ptr[c];
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
-    }
-
-    dst_ptr += stride;
-    pred_ptr += pitch;
-  }
-}
-#endif
-
-void vp9_dc_only_idct_add_8x8_c(short input_dc,
-                                unsigned char *pred_ptr,
-                                unsigned char *dst_ptr,
-                                int pitch, int stride) {
-  int a1 = ((input_dc + 16) >> 5);
-  int r, c, b;
-  unsigned char *orig_pred = pred_ptr;
-  unsigned char *orig_dst = dst_ptr;
-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        int a = a1 + pred_ptr[c];
-
-        if (a < 0)
-          a = 0;
-
-        if (a > 255)
-          a = 255;
-
-        dst_ptr[c] = (unsigned char) a;
-      }
-
-      dst_ptr += stride;
-      pred_ptr += pitch;
-    }
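-    /* advance to the next 4x4 sub-block: top-right, bottom-left, bottom-right */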
-    dst_ptr = orig_dst + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * stride;
-    pred_ptr = orig_pred + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * pitch;
-  }
-}
-
-#define W1 2841                 /* 2048*sqrt(2)*cos(1*pi/16) */
-#define W2 2676                 /* 2048*sqrt(2)*cos(2*pi/16) */
-#define W3 2408                 /* 2048*sqrt(2)*cos(3*pi/16) */
-#define W5 1609                 /* 2048*sqrt(2)*cos(5*pi/16) */
-#define W6 1108                 /* 2048*sqrt(2)*cos(6*pi/16) */
-#define W7 565                  /* 2048*sqrt(2)*cos(7*pi/16) */
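A standalone check of the constants above (assuming M_PI; k = 4 is absent because 2048*sqrt(2)*cos(4*pi/16) is exactly 2048 and folds into the shifts):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      int k;

      /* prints 2841 2676 2408 1609 1108 565, matching W1..W7 above */
      for (k = 1; k <= 7; k++)
        if (k != 4)
          printf("%d ", (int)(2048 * sqrt(2.0) * cos(k * M_PI / 16) + 0.5));
      printf("\n");
      return 0;
    }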
-
-/* row (horizontal) IDCT
- *
- *           7                       pi         1
- * dst[k] = sum c[l] * src[l] * cos( -- * ( k + - ) * l )
- *          l=0                      8          2
- *
- * where: c[0]    = 128
- *        c[1..7] = 128*sqrt(2)
- */
-
-static void idctrow(int *blk) {
-  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
-  /* shortcut */
-  if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |
-        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {
-    blk[0] = blk[1] = blk[2] = blk[3] = blk[4] = blk[5] = blk[6] = blk[7] =
-        blk[0] << 3;
-    return;
-  }
-
-  x0 = (blk[0] << 11) + 128;    /* for proper rounding in the fourth stage */
-  /* first stage */
-  x8 = W7 * (x4 + x5);
-  x4 = x8 + (W1 - W7) * x4;
-  x5 = x8 - (W1 + W7) * x5;
-  x8 = W3 * (x6 + x7);
-  x6 = x8 - (W3 - W5) * x6;
-  x7 = x8 - (W3 + W5) * x7;
-
-  /* second stage */
-  x8 = x0 + x1;
-  x0 -= x1;
-  x1 = W6 * (x3 + x2);
-  x2 = x1 - (W2 + W6) * x2;
-  x3 = x1 + (W2 - W6) * x3;
-  x1 = x4 + x6;
-  x4 -= x6;
-  x6 = x5 + x7;
-  x5 -= x7;
-
-  /* third stage */
-  x7 = x8 + x3;
-  x8 -= x3;
-  x3 = x0 + x2;
-  x0 -= x2;
-  x2 = (181 * (x4 + x5) + 128) >> 8;
-  x4 = (181 * (x4 - x5) + 128) >> 8;
-
-  /* fourth stage */
-  blk[0] = (x7 + x1) >> 8;
-  blk[1] = (x3 + x2) >> 8;
-  blk[2] = (x0 + x4) >> 8;
-  blk[3] = (x8 + x6) >> 8;
-  blk[4] = (x8 - x6) >> 8;
-  blk[5] = (x0 - x4) >> 8;
-  blk[6] = (x3 - x2) >> 8;
-  blk[7] = (x7 - x1) >> 8;
-}
-
-/* column (vertical) IDCT
- *
- *             7                         pi         1
- * dst[8*k] = sum c[l] * src[8*l] * cos( -- * ( k + - ) * l )
- *            l=0                        8          2
- *
- * where: c[0]    = 1/1024
- *        c[1..7] = (1/1024)*sqrt(2)
- */
-static void idctcol(int *blk) {
-  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
-
-  /* shortcut */
-  if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
-        (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
-        (x7 = blk[8 * 3]))) {
-    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] = blk[8 * 4] =
-        blk[8 * 5] = blk[8 * 6] = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);
-    return;
-  }
-
-  x0 = (blk[8 * 0] << 8) + 16384;
-
-  /* first stage */
-  x8 = W7 * (x4 + x5) + 4;
-  x4 = (x8 + (W1 - W7) * x4) >> 3;
-  x5 = (x8 - (W1 + W7) * x5) >> 3;
-  x8 = W3 * (x6 + x7) + 4;
-  x6 = (x8 - (W3 - W5) * x6) >> 3;
-  x7 = (x8 - (W3 + W5) * x7) >> 3;
-
-  /* second stage */
-  x8 = x0 + x1;
-  x0 -= x1;
-  x1 = W6 * (x3 + x2) + 4;
-  x2 = (x1 - (W2 + W6) * x2) >> 3;
-  x3 = (x1 + (W2 - W6) * x3) >> 3;
-  x1 = x4 + x6;
-  x4 -= x6;
-  x6 = x5 + x7;
-  x5 -= x7;
-
-  /* third stage */
-  x7 = x8 + x3;
-  x8 -= x3;
-  x3 = x0 + x2;
-  x0 -= x2;
-  x2 = (181 * (x4 + x5) + 128) >> 8;
-  x4 = (181 * (x4 - x5) + 128) >> 8;
-
-  /* fourth stage */
-  blk[8 * 0] = (x7 + x1) >> 14;
-  blk[8 * 1] = (x3 + x2) >> 14;
-  blk[8 * 2] = (x0 + x4) >> 14;
-  blk[8 * 3] = (x8 + x6) >> 14;
-  blk[8 * 4] = (x8 - x6) >> 14;
-  blk[8 * 5] = (x0 - x4) >> 14;
-  blk[8 * 6] = (x3 - x2) >> 14;
-  blk[8 * 7] = (x7 - x1) >> 14;
-}
-
-#define TX_DIM 8
-void vp9_short_idct8x8_c(short *coefs, short *block, int pitch) {
-  int X[TX_DIM * TX_DIM];
-  int i, j;
-  int shortpitch = pitch >> 1;
-
-  for (i = 0; i < TX_DIM; i++) {
-    for (j = 0; j < TX_DIM; j++) {
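-      /* pre-scale each coefficient: divide by 4, rounded symmetrically about zero */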
-      X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1
-                                + (coefs[i * TX_DIM + j] < 0)) >> 2;
-    }
-  }
-  for (i = 0; i < 8; i++)
-    idctrow(X + 8 * i);
-
-  for (i = 0; i < 8; i++)
-    idctcol(X + i);
-
-  for (i = 0; i < TX_DIM; i++) {
-    for (j = 0; j < TX_DIM; j++) {
-      block[i * shortpitch + j]  = X[i * TX_DIM + j] >> 1;
-    }
-  }
-}
-
-
-void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) {
-  int i;
-  short *ip = input; // 0,1, 4, 8
-  short *op = output;
-  for (i = 0; i < 16; i++) {
-    op[i] = 0;
-  }
-
-  op[0] = (ip[0] + ip[1] + ip[4] + ip[8] + 1) >> 1;
-  op[1] = (ip[0] - ip[1] + ip[4] - ip[8]) >> 1;
-  op[4] = (ip[0] + ip[1] - ip[4] - ip[8]) >> 1;
-  op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1;
-}
-
-
-#if 0
-// Keep a really bad float version as reference for now.
-void vp9_short_idct16x16_c(short *input, short *output, int pitch) {
-
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    double x;
-    const int short_pitch = pitch >> 1;
-    int i, j, k, l;
-    for (l = 0; l < 16; ++l) {
-      for (k = 0; k < 16; ++k) {
-        double s = 0;
-        for (i = 0; i < 16; ++i) {
-          for (j = 0; j < 16; ++j) {
-            x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32;
-            if (i != 0)
-              x *= sqrt(2.0);
-            if (j != 0)
-              x *= sqrt(2.0);
-            s += x;
-          }
-        }
-        output[k*short_pitch+l] = (short)round(s);
-      }
-    }
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
-#endif
-
-static const double C1 = 0.995184726672197;
-static const double C2 = 0.98078528040323;
-static const double C3 = 0.956940335732209;
-static const double C4 = 0.923879532511287;
-static const double C5 = 0.881921264348355;
-static const double C6 = 0.831469612302545;
-static const double C7 = 0.773010453362737;
-static const double C8 = 0.707106781186548;
-static const double C9 = 0.634393284163646;
-static const double C10 = 0.555570233019602;
-static const double C11 = 0.471396736825998;
-static const double C12 = 0.38268343236509;
-static const double C13 = 0.290284677254462;
-static const double C14 = 0.195090322016128;
-static const double C15 = 0.098017140329561;
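The fifteen constants above are cos(k*pi/32) for k = 1..15 (so C8 = cos(pi/4) = 1/sqrt(2)). A generation sketch under that reading:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      int k;

      for (k = 1; k <= 15; k++)  /* reproduces C1..C15 */
        printf("C%d = %.15f\n", k, cos(k * M_PI / 32));
      return 0;
    }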
-
-
-static void butterfly_16x16_idct_1d(double input[16], double output[16]) {
-
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    double step[16];
-    double intermediate[16];
-    double temp1, temp2;
-
-
-    // step 1 and 2
-    step[ 0] = input[0] + input[8];
-    step[ 1] = input[0] - input[8];
-
-    temp1 = input[4]*C12;
-    temp2 = input[12]*C4;
-
-    temp1 -= temp2;
-    temp1 *= C8;
-
-    step[ 2] = 2*(temp1);
-
-    temp1 = input[4]*C4;
-    temp2 = input[12]*C12;
-    temp1 += temp2;
-    temp1 = (temp1);
-    temp1 *= C8;
-    step[ 3] = 2*(temp1);
-
-    temp1 = input[2]*C8;
-    temp1 = 2*(temp1);
-    temp2 = input[6] + input[10];
-
-    step[ 4] = temp1 + temp2;
-    step[ 5] = temp1 - temp2;
-
-    temp1 = input[14]*C8;
-    temp1 = 2*(temp1);
-    temp2 = input[6] - input[10];
-
-    step[ 6] = temp2 - temp1;
-    step[ 7] = temp2 + temp1;
-
-    // for odd input
-    temp1 = input[3]*C12;
-    temp2 = input[13]*C4;
-    temp1 += temp2;
-    temp1 = (temp1);
-    temp1 *= C8;
-    intermediate[ 8] = 2*(temp1);
-
-    temp1 = input[3]*C4;
-    temp2 = input[13]*C12;
-    temp2 -= temp1;
-    temp2 = (temp2);
-    temp2 *= C8;
-    intermediate[ 9] = 2*(temp2);
-
-    intermediate[10] = 2*(input[9]*C8);
-    intermediate[11] = input[15] - input[1];
-    intermediate[12] = input[15] + input[1];
-    intermediate[13] = 2*((input[7]*C8));
-
-    temp1 = input[11]*C12;
-    temp2 = input[5]*C4;
-    temp2 -= temp1;
-    temp2 = (temp2);
-    temp2 *= C8;
-    intermediate[14] = 2*(temp2);
-
-    temp1 = input[11]*C4;
-    temp2 = input[5]*C12;
-    temp1 += temp2;
-    temp1 = (temp1);
-    temp1 *= C8;
-    intermediate[15] = 2*(temp1);
-
-    step[ 8] = intermediate[ 8] + intermediate[14];
-    step[ 9] = intermediate[ 9] + intermediate[15];
-    step[10] = intermediate[10] + intermediate[11];
-    step[11] = intermediate[10] - intermediate[11];
-    step[12] = intermediate[12] + intermediate[13];
-    step[13] = intermediate[12] - intermediate[13];
-    step[14] = intermediate[ 8] - intermediate[14];
-    step[15] = intermediate[ 9] - intermediate[15];
-
-    // step 3
-    output[0] = step[ 0] + step[ 3];
-    output[1] = step[ 1] + step[ 2];
-    output[2] = step[ 1] - step[ 2];
-    output[3] = step[ 0] - step[ 3];
-
-    temp1 = step[ 4]*C14;
-    temp2 = step[ 7]*C2;
-    temp1 -= temp2;
-    output[4] =  (temp1);
-
-    temp1 = step[ 4]*C2;
-    temp2 = step[ 7]*C14;
-    temp1 += temp2;
-    output[7] =  (temp1);
-
-    temp1 = step[ 5]*C10;
-    temp2 = step[ 6]*C6;
-    temp1 -= temp2;
-    output[5] =  (temp1);
-
-    temp1 = step[ 5]*C6;
-    temp2 = step[ 6]*C10;
-    temp1 += temp2;
-    output[6] =  (temp1);
-
-    output[8] = step[ 8] + step[11];
-    output[9] = step[ 9] + step[10];
-    output[10] = step[ 9] - step[10];
-    output[11] = step[ 8] - step[11];
-    output[12] = step[12] + step[15];
-    output[13] = step[13] + step[14];
-    output[14] = step[13] - step[14];
-    output[15] = step[12] - step[15];
-
-    // output 4
-    step[ 0] = output[0] + output[7];
-    step[ 1] = output[1] + output[6];
-    step[ 2] = output[2] + output[5];
-    step[ 3] = output[3] + output[4];
-    step[ 4] = output[3] - output[4];
-    step[ 5] = output[2] - output[5];
-    step[ 6] = output[1] - output[6];
-    step[ 7] = output[0] - output[7];
-
-    temp1 = output[8]*C7;
-    temp2 = output[15]*C9;
-    temp1 -= temp2;
-    step[ 8] = (temp1);
-
-    temp1 = output[9]*C11;
-    temp2 = output[14]*C5;
-    temp1 += temp2;
-    step[ 9] = (temp1);
-
-    temp1 = output[10]*C3;
-    temp2 = output[13]*C13;
-    temp1 -= temp2;
-    step[10] = (temp1);
-
-    temp1 = output[11]*C15;
-    temp2 = output[12]*C1;
-    temp1 += temp2;
-    step[11] = (temp1);
-
-    temp1 = output[11]*C1;
-    temp2 = output[12]*C15;
-    temp2 -= temp1;
-    step[12] = (temp2);
-
-    temp1 = output[10]*C13;
-    temp2 = output[13]*C3;
-    temp1 += temp2;
-    step[13] = (temp1);
-
-    temp1 = output[9]*C5;
-    temp2 = output[14]*C11;
-    temp2 -= temp1;
-    step[14] = (temp2);
-
-    temp1 = output[8]*C9;
-    temp2 = output[15]*C7;
-    temp1 += temp2;
-    step[15] = (temp1);
-
-    // step 5
-    output[0] = (step[0] + step[15]);
-    output[1] = (step[1] + step[14]);
-    output[2] = (step[2] + step[13]);
-    output[3] = (step[3] + step[12]);
-    output[4] = (step[4] + step[11]);
-    output[5] = (step[5] + step[10]);
-    output[6] = (step[6] + step[ 9]);
-    output[7] = (step[7] + step[ 8]);
-
-    output[15] = (step[0] - step[15]);
-    output[14] = (step[1] - step[14]);
-    output[13] = (step[2] - step[13]);
-    output[12] = (step[3] - step[12]);
-    output[11] = (step[4] - step[11]);
-    output[10] = (step[5] - step[10]);
-    output[9] = (step[6] - step[ 9]);
-    output[8] = (step[7] - step[ 8]);
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
-
-// Remove once an int version of iDCT is written
-#if 0
-void reference_16x16_idct_1d(double input[16], double output[16]) {
-
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    const double kPi = 3.141592653589793238462643383279502884;
-    const double kSqrt2 = 1.414213562373095048801688724209698;
-    for (int k = 0; k < 16; k++) {
-      output[k] = 0.0;
-      for (int n = 0; n < 16; n++) {
-        output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0);
-        if (n == 0)
-          output[k] = output[k]/kSqrt2;
-      }
-    }
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
-#endif
-
-void vp9_short_idct16x16_c(short *input, short *output, int pitch) {
-
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    double out[16*16], out2[16*16];
-    const int short_pitch = pitch >> 1;
-    int i, j;
-      // First transform rows
-    for (i = 0; i < 16; ++i) {
-      double temp_in[16], temp_out[16];
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = input[j + i*short_pitch];
-      butterfly_16x16_idct_1d(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        out[j + i*16] = temp_out[j];
-    }
-    // Then transform columns
-    for (i = 0; i < 16; ++i) {
-      double temp_in[16], temp_out[16];
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = out[j*16 + i];
-      butterfly_16x16_idct_1d(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        out2[j*16 + i] = temp_out[j];
-    }
-    for (i = 0; i < 16*16; ++i)
-      output[i] = round(out2[i]/128);
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
--- a/vp8/common/implicit_segmentation.c
+++ /dev/null
@@ -1,255 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/onyxc_int.h"
-
-#define MAX_REGIONS 24000
-#ifndef NULL
-#define NULL 0
-#endif
-
-#define min_mbs_in_region 3
-
-// this linked list structure holds equivalences for connected
-// component labeling
-struct list_el {
-  int label;
-  int seg_value;
-  int count;
-  struct list_el *next;
-};
-typedef struct list_el item;
-
-// connected color segments
-typedef struct {
-  int min_x;
-  int min_y;
-  int max_x;
-  int max_y;
-  long long sum_x;
-  long long sum_y;
-  int pixels;
-  int seg_value;
-  int label;
-} segment_info;
-
-
-typedef enum {
-  SEGMENT_MODE,
-  SEGMENT_MV,
-  SEGMENT_REFFRAME,
-  SEGMENT_SKIPPED
-} SEGMENT_TYPE;
-
-
-// this merges the two equivalence lists and
-// then makes sure that every label points to the same
-// equivalence list
-void merge(item *labels, int u, int v) {
-  item *a = labels[u].next;
-  item *b = labels[v].next;
-  item c;
-  item *it = &c;
-  int count;
-
-  // check if they are already merged
-  if (u == v || a == b)
-    return;
-
-  count = a->count + b->count;
-
-  // merge 2 sorted linked lists.
-  while (a != NULL && b != NULL) {
-    if (a->label < b->label) {
-      it->next = a;
-      a = a->next;
-    } else {
-      it->next = b;
-      b = b->next;
-    }
-
-    it = it->next;
-  }
-
-  if (a == NULL)
-    it->next = b;
-  else
-    it->next = a;
-
-  it = c.next;
-
-  // make sure every equivalence in the linked list points to this new list
-  while (it != NULL) {
-    labels[it->label].next = c.next;
-    it = it->next;
-  }
-  c.next->count = count;
-
-}
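Usage note for merge(): the labelling pass below calls merge(labels, al, ll) whenever the above and left neighbours carry equivalent seg_values under different labels. A hedged illustration of the postcondition:

    /* after two labels, say 2 and 5, are found equivalent: */
    merge(labels, 2, 5);
    /* both now share one sorted class list, i.e.
     * labels[2].next == labels[5].next, and ->count holds the
     * combined macroblock count of the merged region. */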
-
-void segment_via_mode_info(VP9_COMMON *oci, int how) {
-  MODE_INFO *mi = oci->mi;
-  int i, j;
-  int mb_index = 0;
-
-  int label = 1;
-  int pitch = oci->mb_cols;
-
-  // holds linked list equivalences
-  // the max should probably be allocated at a higher level in oci
-  item equivalences[MAX_REGIONS];
-  int eq_ptr = 0;
-  item labels[MAX_REGIONS];
-  segment_info segments[MAX_REGIONS];
-  int label_count = 1;
-  int labeling[400 * 300];
-  int *lp = labeling;
-
-  label_count = 1;
-  memset(labels, 0, sizeof(labels));
-  memset(segments, 0, sizeof(segments));
-
-  /* Go through each macroblock: first-pass labelling */
-  for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
-    for (j = 0; j < oci->mb_cols; j++) {
-      // seg_values: a = above, l = left, n = this macroblock
-      int a = -1, l = -1, n = -1;
-
-      // above label, left label
-      int al = -1, ll = -1;
-      if (i) {
-        al = lp[j - pitch];
-        a = labels[al].next->seg_value;
-      }
-      if (j) {
-        ll = lp[j - 1];
-        l = labels[ll].next->seg_value;
-      }
-
-      // which setting to base the implicit segmentation on
-      switch (how) {
-        case SEGMENT_MODE:
-          n = mi[mb_index].mbmi.mode;
-          break;
-        case SEGMENT_MV:
-          n = mi[mb_index].mbmi.mv[0].as_int;
-          if (mi[mb_index].mbmi.ref_frame == INTRA_FRAME)
-            n = -9999999;
-          break;
-        case SEGMENT_REFFRAME:
-          n = mi[mb_index].mbmi.ref_frame;
-          break;
-        case SEGMENT_SKIPPED:
-          n = mi[mb_index].mbmi.mb_skip_coeff;
-          break;
-      }
-
-      // above and left both have the same seg_value
-      if (n == a && n == l) {
-        // pick the lowest label
-        lp[j] = (al < ll ? al : ll);
-        labels[lp[j]].next->count++;
-
-        // merge the above and left equivalencies
-        merge(labels, al, ll);
-      }
-      // this matches above seg_value
-      else if (n == a) {
-        // give it the same label as above
-        lp[j] = al;
-        labels[al].next->count++;
-      }
-      // this matches left seg_value
-      else if (n == l) {
-        // give it the same label as left
-        lp[j] = ll;
-        labels[ll].next->count++;
-      } else {
-        // new label doesn't match either
-        item *e = &labels[label];
-        item *nl = &equivalences[eq_ptr++];
-        lp[j] = label;
-        nl->label = label;
-        nl->next = 0;
-        nl->seg_value = n;
-        nl->count = 1;
-        e->next = nl;
-        label++;
-      }
-      mb_index++;
-    }
-    mb_index++;
-  }
-  lp = labeling;
-
-  // give new labels to regions
-  for (i = 1; i < label; i++)
-    if (labels[i].next->count > min_mbs_in_region &&
-        labels[labels[i].next->label].label == 0) {
-      segment_info *cs = &segments[label_count];
-      cs->label = label_count;
-      labels[labels[i].next->label].label = label_count++;
-      labels[labels[i].next->label].seg_value  = labels[i].next->seg_value;
-      cs->seg_value = labels[labels[i].next->label].seg_value;
-      cs->min_x = oci->mb_cols;
-      cs->min_y = oci->mb_rows;
-      cs->max_x = 0;
-      cs->max_y = 0;
-      cs->sum_x = 0;
-      cs->sum_y = 0;
-      cs->pixels = 0;
-
-    }
-  lp = labeling;
-
-  // this is just to gather stats...
-  for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
-    for (j = 0; j < oci->mb_cols; j++) {
-      segment_info *cs;
-      int oldlab = labels[lp[j]].next->label;
-      int lab = labels[oldlab].label;
-      lp[j] = lab;
-
-      cs = &segments[lab];
-
-      cs->min_x = (j < cs->min_x ? j : cs->min_x);
-      cs->max_x = (j > cs->max_x ? j : cs->max_x);
-      cs->min_y = (i < cs->min_y ? i : cs->min_y);
-      cs->max_y = (i > cs->max_y ? i : cs->max_y);
-      cs->sum_x += j;
-      cs->sum_y += i;
-      cs->pixels++;
-
-      lp[j] = lab;
-      mb_index++;
-    }
-    mb_index++;
-  }
-
-  {
-    lp = labeling;
-    printf("labelling \n");
-    mb_index = 0;
-    for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
-      for (j = 0; j < oci->mb_cols; j++) {
-        printf("%4d", lp[j]);
-      }
-      printf("            ");
-      for (j = 0; j < oci->mb_cols; j++, mb_index++) {
-        // printf("%3d",mi[mb_index].mbmi.mode );
-        printf("%4d:%4d", mi[mb_index].mbmi.mv[0].as_mv.row,
-            mi[mb_index].mbmi.mv[0].as_mv.col);
-      }
-      printf("\n");
-      ++mb_index;
-    }
-    printf("\n");
-  }
-}
-
--- a/vp8/common/invtrans.c
+++ /dev/null
@@ -1,135 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "invtrans.h"
-
-static void recon_dcblock(MACROBLOCKD *xd) {
-  BLOCKD *b = &xd->block[24];
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    xd->block[i].dqcoeff[0] = b->diff[i];
-  }
-}
-
-static void recon_dcblock_8x8(MACROBLOCKD *xd) {
-  BLOCKD *b = &xd->block[24]; // for coeff 0, 2, 8, 10
-
-  xd->block[0].dqcoeff[0] = b->diff[0];
-  xd->block[4].dqcoeff[0] = b->diff[1];
-  xd->block[8].dqcoeff[0] = b->diff[4];
-  xd->block[12].dqcoeff[0] = b->diff[8];
-}
-
-void vp9_inverse_transform_b_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                 BLOCKD *b, int pitch) {
-  if (b->eob <= 1)
-    IDCT_INVOKE(rtcd, idct1)(b->dqcoeff, b->diff, pitch);
-  else
-    IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch);
-}
-
-void vp9_inverse_transform_mby_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                   MACROBLOCKD *xd) {
-  int i;
-  BLOCKD *blockd = xd->block;
-
-  if (xd->mode_info_context->mbmi.mode != SPLITMV) {
-    /* do 2nd order transform on the dc block */
-    IDCT_INVOKE(rtcd, iwalsh16)(blockd[24].dqcoeff, blockd[24].diff);
-    recon_dcblock(xd);
-  }
-
-  for (i = 0; i < 16; i++) {
-    vp9_inverse_transform_b_4x4(rtcd, &blockd[i], 32);
-  }
-}
-
-void vp9_inverse_transform_mbuv_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                    MACROBLOCKD *xd) {
-  int i;
-  BLOCKD *blockd = xd->block;
-
-  for (i = 16; i < 24; i++) {
-    vp9_inverse_transform_b_4x4(rtcd, &blockd[i], 16);
-  }
-}
-
-void vp9_inverse_transform_mb_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                  MACROBLOCKD *xd) {
-  vp9_inverse_transform_mby_4x4(rtcd, xd);
-  vp9_inverse_transform_mbuv_4x4(rtcd, xd);
-}
-
-void vp9_inverse_transform_b_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                 short *input_dqcoeff, short *output_coeff,
-                                 int pitch) {
-  // int b,i;
-  // if (b->eob > 1)
-  IDCT_INVOKE(rtcd, idct8)(input_dqcoeff, output_coeff, pitch);
-  // else
-  // IDCT_INVOKE(rtcd, idct8_1)(b->dqcoeff, b->diff, pitch);//pitch
-}
-
-void vp9_inverse_transform_mby_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                   MACROBLOCKD *xd) {
-  int i;
-  BLOCKD *blockd = xd->block;
-
-  if (xd->mode_info_context->mbmi.mode != SPLITMV) {
-    // do 2nd order transform on the dc block
-    IDCT_INVOKE(rtcd, ihaar2)(blockd[24].dqcoeff, blockd[24].diff, 8);
-    recon_dcblock_8x8(xd); // need to change for 8x8
-  }
-
-  for (i = 0; i < 9; i += 8) {
-    vp9_inverse_transform_b_8x8(rtcd, &blockd[i].dqcoeff[0],
-                                &blockd[i].diff[0], 32);
-  }
-  for (i = 2; i < 11; i += 8) {
-    vp9_inverse_transform_b_8x8(rtcd, &blockd[i + 2].dqcoeff[0],
-                                &blockd[i].diff[0], 32);
-  }
-}
-
-void vp9_inverse_transform_mbuv_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                    MACROBLOCKD *xd) {
-  int i;
-  BLOCKD *blockd = xd->block;
-
-  for (i = 16; i < 24; i += 4) {
-    vp9_inverse_transform_b_8x8(rtcd, &blockd[i].dqcoeff[0],
-                                &blockd[i].diff[0], 16);
-  }
-}
-
-void vp9_inverse_transform_mb_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                  MACROBLOCKD *xd) {
-  vp9_inverse_transform_mby_8x8(rtcd, xd);
-  vp9_inverse_transform_mbuv_8x8(rtcd, xd);
-}
-
-void vp9_inverse_transform_b_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                   short *input_dqcoeff,
-                                   short *output_coeff, int pitch) {
-  IDCT_INVOKE(rtcd, idct16x16)(input_dqcoeff, output_coeff, pitch);
-}
-
-void vp9_inverse_transform_mby_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                     MACROBLOCKD *xd) {
-  vp9_inverse_transform_b_16x16(rtcd, &xd->block[0].dqcoeff[0],
-                                &xd->block[0].diff[0], 32);
-}
-
-void vp9_inverse_transform_mb_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                    MACROBLOCKD *xd) {
-  vp9_inverse_transform_mby_16x16(rtcd, xd);
-  vp9_inverse_transform_mbuv_8x8(rtcd, xd);
-}
--- a/vp8/common/invtrans.h
+++ /dev/null
@@ -1,53 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_INVTRANS_H
-#define __INC_INVTRANS_H
-
-#include "vpx_ports/config.h"
-#include "idct.h"
-#include "blockd.h"
-
-extern void vp9_inverse_transform_b_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                        BLOCKD *b, int pitch);
-
-extern void vp9_inverse_transform_mb_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                         MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_mby_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                          MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_mbuv_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                           MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_b_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                        short *input_dqcoeff,
-                                        short *output_coeff, int pitch);
-
-extern void vp9_inverse_transform_mb_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                         MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_mby_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                          MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_mbuv_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                           MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_b_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                          short *input_dqcoeff,
-                                          short *output_coeff, int pitch);
-
-extern void vp9_inverse_transform_mb_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                           MACROBLOCKD *xd);
-
-extern void vp9_inverse_transform_mby_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                            MACROBLOCKD *xd);
-
-#endif  // __INC_INVTRANS_H
--- a/vp8/common/loopfilter.c
+++ /dev/null
@@ -1,524 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "loopfilter.h"
-#include "onyxc_int.h"
-#include "vpx_mem/vpx_mem.h"
-
-#include "vp8/common/seg_common.h"
-
-static void lf_init_lut(loop_filter_info_n *lfi) {
-  int filt_lvl;
-
-  for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++) {
-    if (filt_lvl >= 40) {
-      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
-      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
-    } else if (filt_lvl >= 20) {
-      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
-      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
-    } else if (filt_lvl >= 15) {
-      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
-      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
-    } else {
-      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
-      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
-    }
-  }
-
-  lfi->mode_lf_lut[DC_PRED] = 1;
-  lfi->mode_lf_lut[D45_PRED] = 1;
-  lfi->mode_lf_lut[D135_PRED] = 1;
-  lfi->mode_lf_lut[D117_PRED] = 1;
-  lfi->mode_lf_lut[D153_PRED] = 1;
-  lfi->mode_lf_lut[D27_PRED] = 1;
-  lfi->mode_lf_lut[D63_PRED] = 1;
-  lfi->mode_lf_lut[V_PRED] = 1;
-  lfi->mode_lf_lut[H_PRED] = 1;
-  lfi->mode_lf_lut[TM_PRED] = 1;
-  lfi->mode_lf_lut[B_PRED]  = 0;
-  lfi->mode_lf_lut[I8X8_PRED] = 0;
-  lfi->mode_lf_lut[ZEROMV]  = 1;
-  lfi->mode_lf_lut[NEARESTMV] = 2;
-  lfi->mode_lf_lut[NEARMV] = 2;
-  lfi->mode_lf_lut[NEWMV] = 2;
-  lfi->mode_lf_lut[SPLITMV] = 3;
-}
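The level-to-threshold table built above amounts to a small pure mapping; a sketch (the helper name is hypothetical, not from the tree):

    /* Closed form of hev_thr_lut: filter level and frame type in,
     * high-edge-variance threshold out. */
    static int hev_threshold(int filt_lvl, int is_key_frame) {
      if (filt_lvl >= 40) return is_key_frame ? 2 : 3;
      if (filt_lvl >= 20) return is_key_frame ? 1 : 2;
      if (filt_lvl >= 15) return 1;  /* same for both frame types */
      return 0;
    }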
-
-void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
-                                      int sharpness_lvl) {
-  int i;
-
-  /* For each possible value for the loop filter fill out limits */
-  for (i = 0; i <= MAX_LOOP_FILTER; i++) {
-    int filt_lvl = i;
-    int block_inside_limit = 0;
-
-    /* Set loop filter parameters that control sharpness. */
-    block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
-    block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
-
-    if (sharpness_lvl > 0) {
-      if (block_inside_limit > (9 - sharpness_lvl))
-        block_inside_limit = (9 - sharpness_lvl);
-    }
-
-    if (block_inside_limit < 1)
-      block_inside_limit = 1;
-
-    vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
-    vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
-               SIMD_WIDTH);
-    vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
-               SIMD_WIDTH);
-  }
-}
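To make the limit computation above concrete, here is a self-contained rework with one worked input pair (illustration only):

    #include <stdio.h>

    /* Recomputes block_inside_limit exactly as the loop above does. */
    static int inside_limit(int filt_lvl, int sharpness_lvl) {
      int lim = filt_lvl >> (sharpness_lvl > 0);
      lim >>= (sharpness_lvl > 4);
      if (sharpness_lvl > 0 && lim > 9 - sharpness_lvl)
        lim = 9 - sharpness_lvl;
      return lim < 1 ? 1 : lim;
    }

    int main(void) {
      /* filt_lvl 40, sharpness 5: 40>>1 = 20, 20>>1 = 10, capped at 9-5 = 4 */
      int lim = inside_limit(40, 5);
      printf("lim=%d blim=%d mblim=%d\n",
             lim, 2 * 40 + lim, 2 * (40 + 2) + lim);  /* 4, 84, 88 */
      return 0;
    }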
-
-void vp9_loop_filter_init(VP9_COMMON *cm) {
-  loop_filter_info_n *lfi = &cm->lf_info;
-  int i;
-
-  /* init limits for given sharpness */
-  vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
-  cm->last_sharpness_level = cm->sharpness_level;
-
-  /* init LUT for lvl and hev thr picking */
-  lf_init_lut(lfi);
-
-  /* init hev threshold const vectors */
-  for (i = 0; i < 4; i++) {
-    vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
-  }
-}
-
-void vp9_loop_filter_frame_init(VP9_COMMON *cm,
-                                MACROBLOCKD *xd,
-                                int default_filt_lvl) {
-  int seg,  /* segment number */
-      ref,  /* index in ref_lf_deltas */
-      mode; /* index in mode_lf_deltas */
-
-  loop_filter_info_n *lfi = &cm->lf_info;
-
-  /* update limits if sharpness has changed */
-  if (cm->last_sharpness_level != cm->sharpness_level) {
-    vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
-    cm->last_sharpness_level = cm->sharpness_level;
-  }
-
-  for (seg = 0; seg < MAX_MB_SEGMENTS; seg++) {
-    int lvl_seg = default_filt_lvl;
-    int lvl_ref, lvl_mode;
-
-    // Set the baseline filter values for each segment
-    if (vp9_segfeature_active(xd, seg, SEG_LVL_ALT_LF)) {
-      /* Abs value */
-      if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
-        lvl_seg = vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);
-      } else { /* Delta Value */
-        lvl_seg += vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);
-        lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0;
-      }
-    }
-
-    if (!xd->mode_ref_lf_delta_enabled) {
-      /* we could get rid of this if we assume that deltas are set to
-       * zero when not in use; encoder always uses deltas
-       */
-      vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4);
-      continue;
-    }
-
-    lvl_ref = lvl_seg;
-
-    /* INTRA_FRAME */
-    ref = INTRA_FRAME;
-
-    /* Apply delta for reference frame */
-    lvl_ref += xd->ref_lf_deltas[ref];
-
-    /* Apply delta for Intra modes */
-    mode = 0; /* B_PRED */
-    /* Only the split mode B_PRED has a further special case */
-    lvl_mode = lvl_ref +  xd->mode_lf_deltas[mode];
-    lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
-
-    lfi->lvl[seg][ref][mode] = lvl_mode;
-
-    mode = 1; /* all the rest of Intra modes */
-    lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref)  : 0; /* clamp */
-    lfi->lvl[seg][ref][mode] = lvl_mode;
-
-    /* LAST, GOLDEN, ALT */
-    for (ref = 1; ref < MAX_REF_FRAMES; ref++) {
-      int lvl_ref = lvl_seg;
-
-      /* Apply delta for reference frame */
-      lvl_ref += xd->ref_lf_deltas[ref];
-
-      /* Apply delta for Inter modes */
-      for (mode = 1; mode < 4; mode++) {
-        lvl_mode = lvl_ref + xd->mode_lf_deltas[mode];
-        lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
-
-        lfi->lvl[seg][ref][mode] = lvl_mode;
-      }
-    }
-  }
-}
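The clamp-to-[0, 63] ternary above appears four times in this function; a hypothetical helper (not in the tree) expressing it once:

    /* Clamp a loop filter level to the legal 0..MAX_LOOP_FILTER range. */
    static int clamp_filter_level(int lvl) {
      return lvl < 0 ? 0 : (lvl > 63 ? 63 : lvl);
    }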
-
-void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) {
-  YV12_BUFFER_CONFIG *post = cm->frame_to_show;
-  loop_filter_info_n *lfi_n = &cm->lf_info;
-  struct loop_filter_info lfi;
-
-  FRAME_TYPE frame_type = cm->frame_type;
-
-  int mb_row;
-  int mb_col;
-
-  int filter_level;
-
-  unsigned char *y_ptr, *u_ptr, *v_ptr;
-
-  /* Point at base of Mb MODE_INFO list */
-  const MODE_INFO *mode_info_context = cm->mi;
-
-  /* Initialize the loop filter for this frame. */
-  vp9_loop_filter_frame_init(cm, xd, cm->filter_level);
-
-  /* Set up the buffer pointers */
-  y_ptr = post->y_buffer;
-  u_ptr = post->u_buffer;
-  v_ptr = post->v_buffer;
-
-  /* vp9_filter each macro block */
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
-                     mode_info_context->mbmi.mode != I8X8_PRED &&
-                     mode_info_context->mbmi.mode != SPLITMV &&
-                     mode_info_context->mbmi.mb_skip_coeff);
-
-      const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
-      const int seg = mode_info_context->mbmi.segment_id;
-      const int ref_frame = mode_info_context->mbmi.ref_frame;
-      int tx_type = mode_info_context->mbmi.txfm_size;
-      filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
-
-      if (filter_level) {
-        if (cm->filter_type == NORMAL_LOOPFILTER) {
-          const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
-          lfi.mblim = lfi_n->mblim[filter_level];
-          lfi.blim = lfi_n->blim[filter_level];
-          lfi.lim = lfi_n->lim[filter_level];
-          lfi.hev_thr = lfi_n->hev_thr[hev_index];
-
-          if (mb_col > 0
-#if CONFIG_SUPERBLOCKS
-              && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
-                   mode_info_context[0].mbmi.mb_skip_coeff &&
-                   mode_info_context[-1].mbmi.mb_skip_coeff)
-#endif
-              )
-            vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                post->uv_stride, &lfi);
-
-          if (!skip_lf && tx_type != TX_16X16) {
-            if (tx_type == TX_8X8)
-              vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                    post->uv_stride, &lfi);
-            else
-              vp9_loop_filter_bv(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                 post->uv_stride, &lfi);
-
-          }
-
-          /* don't apply across umv border */
-          if (mb_row > 0
-#if CONFIG_SUPERBLOCKS
-              && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
-                   mode_info_context[0].mbmi.mb_skip_coeff &&
-                   mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
-#endif
-              )
-            vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                post->uv_stride, &lfi);
-
-          if (!skip_lf && tx_type != TX_16X16) {
-            if (tx_type == TX_8X8)
-              vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                    post->uv_stride, &lfi);
-            else
-              vp9_loop_filter_bh(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                 post->uv_stride, &lfi);
-          }
-        } else {
-          // FIXME: Not 8x8 aware
-          if (mb_col > 0
-#if CONFIG_SUPERBLOCKS
-              && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
-                   mode_info_context[0].mbmi.mb_skip_coeff &&
-                   mode_info_context[-1].mbmi.mb_skip_coeff)
-#endif
-              )
-            vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
-                                       lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-
-          /* don't apply across umv border */
-          if (mb_row > 0
-#if CONFIG_SUPERBLOCKS
-              && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
-                   mode_info_context[0].mbmi.mb_skip_coeff &&
-                   mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
-#endif
-              )
-            vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
-                                       lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-        }
-      }
-
-      y_ptr += 16;
-      u_ptr += 8;
-      v_ptr += 8;
-
-      mode_info_context++;     /* step to next MB */
-    }
-
-    y_ptr += post->y_stride  * 16 - post->y_width;
-    u_ptr += post->uv_stride *  8 - post->uv_width;
-    v_ptr += post->uv_stride *  8 - post->uv_width;
-
-    mode_info_context++;         /* Skip border mb */
-  }
-}
-
-void vp9_loop_filter_frame_yonly(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                 int default_filt_lvl) {
-  YV12_BUFFER_CONFIG *post = cm->frame_to_show;
-
-  unsigned char *y_ptr;
-  int mb_row;
-  int mb_col;
-
-  loop_filter_info_n *lfi_n = &cm->lf_info;
-  struct loop_filter_info lfi;
-
-  int filter_level;
-  FRAME_TYPE frame_type = cm->frame_type;
-
-  /* Point at base of Mb MODE_INFO list */
-  const MODE_INFO *mode_info_context = cm->mi;
-
-#if 0
-  if (default_filt_lvl == 0) /* no filter applied */
-    return;
-#endif
-
-  /* Initialize the loop filter for this frame. */
-  vp9_loop_filter_frame_init(cm, xd, default_filt_lvl);
-
-  /* Set up the buffer pointers */
-  y_ptr = post->y_buffer;
-
-  /* vp9_filter each macro block */
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
-                     mode_info_context->mbmi.mode != I8X8_PRED &&
-                     mode_info_context->mbmi.mode != SPLITMV &&
-                     mode_info_context->mbmi.mb_skip_coeff);
-
-      const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
-      const int seg = mode_info_context->mbmi.segment_id;
-      const int ref_frame = mode_info_context->mbmi.ref_frame;
-      int tx_type = mode_info_context->mbmi.txfm_size;
-      filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
-
-      if (filter_level) {
-        if (cm->filter_type == NORMAL_LOOPFILTER) {
-          const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
-          lfi.mblim = lfi_n->mblim[filter_level];
-          lfi.blim = lfi_n->blim[filter_level];
-          lfi.lim = lfi_n->lim[filter_level];
-          lfi.hev_thr = lfi_n->hev_thr[hev_index];
-
-          if (mb_col > 0)
-            vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
-          if (!skip_lf && tx_type != TX_16X16) {
-            if (tx_type == TX_8X8)
-              vp9_loop_filter_bv8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-            else
-              vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-          }
-
-          /* don't apply across umv border */
-          if (mb_row > 0)
-            vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
-          if (!skip_lf && tx_type != TX_16X16) {
-            if (tx_type == TX_8X8)
-              vp9_loop_filter_bh8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-            else
-              vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-          }
-        } else {
-          // FIXME: Not 8x8 aware
-          if (mb_col > 0)
-            vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
-                                       lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-
-          /* don't apply across umv border */
-          if (mb_row > 0)
-            vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
-                                       lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-        }
-      }
-
-      y_ptr += 16;
-      mode_info_context++;        /* step to next MB */
-    }
-
-    y_ptr += post->y_stride  * 16 - post->y_width;
-    mode_info_context++;            /* Skip border mb */
-  }
-}
-
-void vp9_loop_filter_partial_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                   int default_filt_lvl) {
-  YV12_BUFFER_CONFIG *post = cm->frame_to_show;
-
-  unsigned char *y_ptr;
-  int mb_row;
-  int mb_col;
-  int mb_cols = post->y_width  >> 4;
-
-  int linestocopy, i;
-
-  loop_filter_info_n *lfi_n = &cm->lf_info;
-  struct loop_filter_info lfi;
-
-  int filter_level;
-  int alt_flt_enabled = xd->segmentation_enabled;
-  FRAME_TYPE frame_type = cm->frame_type;
-
-  const MODE_INFO *mode_info_context;
-
-  int lvl_seg[MAX_MB_SEGMENTS];
-
-  mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
-
-  /* 3 is a magic number. 4 is probably magic too */
-  linestocopy = (post->y_height >> (4 + 3));
-
-  if (linestocopy < 1)
-    linestocopy = 1;
-
-  linestocopy <<= 4;
-
-  /* Note the baseline filter values for each segment */
-  /* See vp9_loop_filter_frame_init. Rather than call that for each change
-   * to default_filt_lvl, copy the relevant calculation here.
-   */
-  if (alt_flt_enabled) {
-    for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-      /* Abs value */
-      if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
-        lvl_seg[i] = vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);
-      }
-      /* Delta Value */
-      else {
-        lvl_seg[i] = default_filt_lvl +
-                     vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);
-        lvl_seg[i] = (lvl_seg[i] > 0) ?
-                     ((lvl_seg[i] > 63) ? 63 : lvl_seg[i]) : 0;
-      }
-    }
-  }
-
-  /* Set up the buffer pointers */
-  y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride;
-
-  /* vp9_filter each macro block */
-  for (mb_row = 0; mb_row < (linestocopy >> 4); mb_row++) {
-    for (mb_col = 0; mb_col < mb_cols; mb_col++) {
-      int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
-                     mode_info_context->mbmi.mode != I8X8_PRED &&
-                     mode_info_context->mbmi.mode != SPLITMV &&
-                     mode_info_context->mbmi.mb_skip_coeff);
-
-      if (alt_flt_enabled)
-        filter_level = lvl_seg[mode_info_context->mbmi.segment_id];
-      else
-        filter_level = default_filt_lvl;
-
-      if (filter_level) {
-        if (cm->filter_type == NORMAL_LOOPFILTER) {
-          const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
-          lfi.mblim = lfi_n->mblim[filter_level];
-          lfi.blim = lfi_n->blim[filter_level];
-          lfi.lim = lfi_n->lim[filter_level];
-          lfi.hev_thr = lfi_n->hev_thr[hev_index];
-
-          if (mb_col > 0)
-            vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
-          if (!skip_lf)
-            vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
-          vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
-          if (!skip_lf)
-            vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-        } else {
-          if (mb_col > 0)
-            vp9_loop_filter_simple_mbv (y_ptr, post->y_stride,
-                                        lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-
-          vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
-                                     lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-        }
-      }
-
-      y_ptr += 16;
-      mode_info_context += 1;      /* step to next MB */
-    }
-
-    y_ptr += post->y_stride  * 16 - post->y_width;
-    mode_info_context += 1;          /* Skip border mb */
-  }
-}
--- a/vp8/common/loopfilter.h
+++ /dev/null
@@ -1,104 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef loopfilter_h
-#define loopfilter_h
-
-#include "vpx_ports/mem.h"
-#include "vpx_config.h"
-#include "blockd.h"
-
-#define MAX_LOOP_FILTER 63
-
-typedef enum {
-  NORMAL_LOOPFILTER = 0,
-  SIMPLE_LOOPFILTER = 1
-} LOOPFILTERTYPE;
-
-#if ARCH_ARM
-#define SIMD_WIDTH 1
-#else
-#define SIMD_WIDTH 16
-#endif
-
-/* Need to align this structure so when it is declared and
- * passed it can be loaded into vector registers.
- */
-typedef struct {
-  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
-                  mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
-  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
-                  blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
-  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
-                  lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
-  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
-                  hev_thr[4][SIMD_WIDTH]);
-  unsigned char lvl[4][4][4];
-  unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
-  unsigned char mode_lf_lut[MB_MODE_COUNT];
-} loop_filter_info_n;
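A usage sketch for the tables above (the helper name is hypothetical; the indexing mirrors the frame loops in loopfilter.c): lvl is indexed by segment, reference frame, and the mode class that mode_lf_lut assigns to each prediction mode.

    static int lookup_filter_level(const loop_filter_info_n *lfi,
                                   int segment_id, int ref_frame, int mode) {
      /* mode_lf_lut collapses MB_MODE_COUNT modes to one of four classes */
      return lfi->lvl[segment_id][ref_frame][lfi->mode_lf_lut[mode]];
    }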
-
-struct loop_filter_info {
-  const unsigned char *mblim;
-  const unsigned char *blim;
-  const unsigned char *lim;
-  const unsigned char *hev_thr;
-};
-
-#define prototype_loopfilter(sym) \
-  void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
-           const unsigned char *limit, const unsigned char *thresh, int count)
-
-#define prototype_loopfilter_block(sym) \
-  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
-           int ystride, int uv_stride, struct loop_filter_info *lfi)
-
-#define prototype_simple_loopfilter(sym) \
-  void sym(unsigned char *y, int ystride, const unsigned char *blimit)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/loopfilter_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/loopfilter_arm.h"
-#endif
-
-typedef void loop_filter_uvfunction(unsigned char *u,   /* source pointer */
-                                    int p,              /* pitch */
-                                    const unsigned char *blimit,
-                                    const unsigned char *limit,
-                                    const unsigned char *thresh,
-                                    unsigned char *v);
-
-/* assorted loopfilter functions which get used elsewhere */
-struct VP9Common;
-struct macroblockd;
-
-void vp9_loop_filter_init(struct VP9Common *cm);
-
-void vp9_loop_filter_frame_init(struct VP9Common *cm,
-                                struct macroblockd *mbd,
-                                int default_filt_lvl);
-
-void vp9_loop_filter_frame(struct VP9Common *cm, struct macroblockd *mbd);
-
-void vp9_loop_filter_partial_frame(struct VP9Common *cm,
-                                   struct macroblockd *mbd,
-                                   int default_filt_lvl);
-
-void vp9_loop_filter_frame_yonly(struct VP9Common *cm,
-                                 struct macroblockd *mbd,
-                                 int default_filt_lvl);
-
-void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
-                                      int sharpness_lvl);
-
-#endif  // loopfilter_h
--- a/vp8/common/loopfilter_filters.c
+++ /dev/null
@@ -1,480 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include "vpx_config.h"
-#include "loopfilter.h"
-#include "onyxc_int.h"
-
-typedef unsigned char uc;
-
-static __inline signed char signed_char_clamp(int t) {
-  t = (t < -128 ? -128 : t);
-  t = (t > 127 ? 127 : t);
-  return (signed char) t;
-}
-
-
-/* should we apply any filter at all (11111111 yes, 00000000 no) */
-static __inline signed char filter_mask(uc limit, uc blimit,
-                                        uc p3, uc p2, uc p1, uc p0,
-                                        uc q0, uc q1, uc q2, uc q3) {
-  signed char mask = 0;
-  mask |= (abs(p3 - p2) > limit) * -1;
-  mask |= (abs(p2 - p1) > limit) * -1;
-  mask |= (abs(p1 - p0) > limit) * -1;
-  mask |= (abs(q1 - q0) > limit) * -1;
-  mask |= (abs(q2 - q1) > limit) * -1;
-  mask |= (abs(q3 - q2) > limit) * -1;
-  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-  mask = ~mask;
-  return mask;
-}
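The * -1 idiom above is a branchless way to turn a comparison into an all-ones or all-zeros byte, so the result can be used directly as a select mask. A self-contained demonstration:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
      unsigned char limit = 10, p1 = 50, p0 = 80;
      signed char mask = 0;
      mask |= (abs(p1 - p0) > limit) * -1;  /* true * -1 == all ones */
      mask = ~mask;                         /* invert: 0x00 now means skip */
      /* the 30-step edge exceeds the limit, so this prints mask = 0x00 */
      printf("mask = 0x%02x\n", (unsigned char)mask);
      return 0;
    }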
-
-/* is there a high-variance internal edge (11111111 yes, 00000000 no) */
-static __inline signed char hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1) {
-  signed char hev = 0;
-  hev  |= (abs(p1 - p0) > thresh) * -1;
-  hev  |= (abs(q1 - q0) > thresh) * -1;
-  return hev;
-}
-
-static __inline void filter(signed char mask, uc hev, uc *op1,
-                            uc *op0, uc *oq0, uc *oq1) {
-  signed char ps0, qs0;
-  signed char ps1, qs1;
-  signed char filter, Filter1, Filter2;
-  signed char u;
-
-  ps1 = (signed char) * op1 ^ 0x80;
-  ps0 = (signed char) * op0 ^ 0x80;
-  qs0 = (signed char) * oq0 ^ 0x80;
-  qs1 = (signed char) * oq1 ^ 0x80;
-
-  /* add outer taps if we have high edge variance */
-  filter = signed_char_clamp(ps1 - qs1);
-  filter &= hev;
-
-  /* inner taps */
-  filter = signed_char_clamp(filter + 3 * (qs0 - ps0));
-  filter &= mask;
-
-  /* save bottom 3 bits so that we round one side +4 and the other +3;
-   * if it equals 4 we adjust by -1 to account for the fact that we
-   * would otherwise round 3 the other way
-   */
-  Filter1 = signed_char_clamp(filter + 4);
-  Filter2 = signed_char_clamp(filter + 3);
-  Filter1 >>= 3;
-  Filter2 >>= 3;
-  u = signed_char_clamp(qs0 - Filter1);
-  *oq0 = u ^ 0x80;
-  u = signed_char_clamp(ps0 + Filter2);
-  *op0 = u ^ 0x80;
-  filter = Filter1;
-
-  /* outer tap adjustments */
-  filter += 1;
-  filter >>= 1;
-  filter &= ~hev;
-
-  u = signed_char_clamp(qs1 - filter);
-  *oq1 = u ^ 0x80;
-  u = signed_char_clamp(ps1 + filter);
-  *op1 = u ^ 0x80;
-}
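A self-contained illustration of the +4/+3 rounding split described in the comment above: q0 is moved by (filter + 4) >> 3 and p0 by (filter + 3) >> 3, so the two sides share the correction with opposite rounding (for example, filter = 12 moves q0 by 2 and p0 by 1):

    #include <stdio.h>

    static signed char clamp_sc(int t) {
      return (signed char)(t < -128 ? -128 : (t > 127 ? 127 : t));
    }

    int main(void) {
      int f;
      for (f = 1; f <= 12; f++)
        printf("f=%2d  Filter1=%d  Filter2=%d\n",
               f, clamp_sc(f + 4) >> 3, clamp_sc(f + 3) >> 3);
      return 0;
    }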
-
-void vp9_loop_filter_horizontal_edge_c(unsigned char *s,
-                                       int p, /* pitch */
-                                       const unsigned char *blimit,
-                                       const unsigned char *limit,
-                                       const unsigned char *thresh,
-                                       int count) {
-  int  hev = 0; /* high edge variance */
-  signed char mask = 0;
-  int i = 0;
-
-  /* loop filter designed to work using chars so that we can make maximum use
-   * of 8 bit simd instructions.
-   */
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                       s[0 * p], s[1 * p], s[2 * p], s[3 * p]);
-
-    hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
-    filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
-
-    ++s;
-  } while (++i < count * 8);
-}
-
-void vp9_loop_filter_vertical_edge_c(unsigned char *s,
-                                     int p,
-                                     const unsigned char *blimit,
-                                     const unsigned char *limit,
-                                     const unsigned char *thresh,
-                                     int count) {
-  int  hev = 0; /* high edge variance */
-  signed char mask = 0;
-  int i = 0;
-
-  /* loop filter designed to work using chars so that we can make maximum use
-   * of 8 bit simd instructions.
-   */
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4], s[-3], s[-2], s[-1],
-                       s[0], s[1], s[2], s[3]);
-
-    hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-
-    filter(mask, hev, s - 2, s - 1, s, s + 1);
-
-    s += p;
-  } while (++i < count * 8);
-}
-
-static __inline signed char flatmask(uc thresh,
-                                     uc p4, uc p3, uc p2, uc p1, uc p0,
-                                     uc q0, uc q1, uc q2, uc q3, uc q4) {
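-  /* note: thresh is accepted but unused here; every difference below is
-   * compared against a hard-coded 1 */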
-  signed char flat = 0;
-  flat |= (abs(p1 - p0) > 1) * -1;
-  flat |= (abs(q1 - q0) > 1) * -1;
-  flat |= (abs(p0 - p2) > 1) * -1;
-  flat |= (abs(q0 - q2) > 1) * -1;
-  flat |= (abs(p3 - p0) > 1) * -1;
-  flat |= (abs(q3 - q0) > 1) * -1;
-  flat |= (abs(p4 - p0) > 1) * -1;
-  flat |= (abs(q4 - q0) > 1) * -1;
-  flat = ~flat;
-  return flat;
-}
-
-static __inline void mbfilter(signed char mask, uc hev, uc flat,
-                              uc *op4, uc *op3, uc *op2, uc *op1, uc *op0,
-                              uc *oq0, uc *oq1, uc *oq2, uc *oq3, uc *oq4) {
-  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
-  if (flat && mask) {
-    unsigned char p0, q0;
-    unsigned char p1, q1;
-    unsigned char p2, q2;
-    unsigned char p3, q3;
-    unsigned char p4, q4;
-
-    p4 = *op4;
-    p3 = *op3;
-    p2 = *op2;
-    p1 = *op1;
-    p0 = *op0;
-    q0 = *oq0;
-    q1 = *oq1;
-    q2 = *oq2;
-    q3 = *oq3;
-    q4 = *oq4;
-
-    *op2 = (p4 + p4 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;
-    *op1 = (p4 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;
-    *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3;
-    *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3;
-    *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q4 + 4) >> 3;
-    *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q4 + q4 + 4) >> 3;
-  } else {
-    signed char ps0, qs0;
-    signed char ps1, qs1;
-    signed char filter, Filter1, Filter2;
-    signed char u;
-
-    ps1 = (signed char) * op1 ^ 0x80;
-    ps0 = (signed char) * op0 ^ 0x80;
-    qs0 = (signed char) * oq0 ^ 0x80;
-    qs1 = (signed char) * oq1 ^ 0x80;
-
-    /* add outer taps if we have high edge variance */
-    filter = signed_char_clamp(ps1 - qs1);
-    filter &= hev;
-
-    /* inner taps */
-    filter = signed_char_clamp(filter + 3 * (qs0 - ps0));
-    filter &= mask;
-
-    Filter1 = signed_char_clamp(filter + 4);
-    Filter2 = signed_char_clamp(filter + 3);
-    Filter1 >>= 3;
-    Filter2 >>= 3;
-
-    u = signed_char_clamp(qs0 - Filter1);
-    *oq0 = u ^ 0x80;
-    u = signed_char_clamp(ps0 + Filter2);
-    *op0 = u ^ 0x80;
-    filter = Filter1;
-
-    /* outer tap adjustments */
-    filter += 1;
-    filter >>= 1;
-    filter &= ~hev;
-
-    u = signed_char_clamp(qs1 - filter);
-    *oq1 = u ^ 0x80;
-    u = signed_char_clamp(ps1 + filter);
-    *op1 = u ^ 0x80;
-  }
-}
-
-void vp9_mbloop_filter_horizontal_edge_c(unsigned char *s,
-                                         int p,
-                                         const unsigned char *blimit,
-                                         const unsigned char *limit,
-                                         const unsigned char *thresh,
-                                         int count) {
-  signed char hev = 0; /* high edge variance */
-  signed char mask = 0;
-  signed char flat = 0;
-  int i = 0;
-
-  /* loop filter designed to work using chars so that we can make maximum use
-   * of 8 bit simd instructions.
-   */
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                       s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
-
-    hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
-    flat = flatmask(thresh[0],
-                    s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                    s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]);
-    mbfilter(mask, hev, flat,
-             s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
-             s,       s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p);
-
-    ++s;
-  } while (++i < count * 8);
-}
-
-void vp9_mbloop_filter_vertical_edge_c(unsigned char *s,
-                                       int p,
-                                       const unsigned char *blimit,
-                                       const unsigned char *limit,
-                                       const unsigned char *thresh,
-                                       int count) {
-  signed char hev = 0; /* high edge variance */
-  signed char mask = 0;
-  signed char flat = 0;
-  int i = 0;
-
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4], s[-3], s[-2], s[-1],
-                       s[0], s[1], s[2], s[3]);
-
-    hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-    flat = flatmask(thresh[0],
-                    s[-5], s[-4], s[-3], s[-2], s[-1],
-                    s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]);
-    mbfilter(mask, hev, flat,
-             s - 5, s - 4, s - 3, s - 2, s - 1,
-             s,     s + 1, s + 2, s + 3, s + 4);
-    s += p;
-  } while (++i < count * 8);
-}
-
-/* should we apply any filter at all (11111111 yes, 00000000 no) */
-static __inline signed char simple_filter_mask(uc blimit,
-                                               uc p1, uc p0,
-                                               uc q0, uc q1) {
-  /* Why does this cause problems for win32?
-   * error C2143: syntax error : missing ';' before 'type'
-   *  (void) limit;
-   */
-  signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= blimit) * -1;
-  return mask;
-}
-
-static __inline void simple_filter(signed char mask,
-                                   uc *op1, uc *op0,
-                                   uc *oq0, uc *oq1) {
-  signed char filter, Filter1, Filter2;
-  signed char p1 = (signed char) * op1 ^ 0x80;
-  signed char p0 = (signed char) * op0 ^ 0x80;
-  signed char q0 = (signed char) * oq0 ^ 0x80;
-  signed char q1 = (signed char) * oq1 ^ 0x80;
-  signed char u;
-
-  filter = signed_char_clamp(p1 - q1);
-  filter = signed_char_clamp(filter + 3 * (q0 - p0));
-  filter &= mask;
-
-  /* save bottom 3 bits so that we round one side +4 and the other +3 */
-  Filter1 = signed_char_clamp(filter + 4);
-  Filter1 >>= 3;
-  u = signed_char_clamp(q0 - Filter1);
-  *oq0  = u ^ 0x80;
-
-  Filter2 = signed_char_clamp(filter + 3);
-  Filter2 >>= 3;
-  u = signed_char_clamp(p0 + Filter2);
-  *op0 = u ^ 0x80;
-}
-
-void vp9_loop_filter_simple_horizontal_edge_c(unsigned char *s, int p,
-                                              const unsigned char *blimit) {
-  signed char mask = 0;
-  int i = 0;
-
-  do {
-    mask = simple_filter_mask(blimit[0],
-                              s[-2 * p], s[-1 * p],
-                              s[0 * p], s[1 * p]);
-    simple_filter(mask,
-                  s - 2 * p, s - 1 * p,
-                  s, s + 1 * p);
-    ++s;
-  } while (++i < 16);
-}
-
-void vp9_loop_filter_simple_vertical_edge_c(unsigned char *s, int p,
-                                            const unsigned char *blimit) {
-  signed char mask = 0;
-  int i = 0;
-
-  do {
-    mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
-    simple_filter(mask, s - 2, s - 1, s, s + 1);
-    s += p;
-  } while (++i < 16);
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                           unsigned char *v_ptr, int y_stride, int uv_stride,
-                           struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_vertical_edge_c(y_ptr, y_stride,
-                                    lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_mbloop_filter_vertical_edge_c(u_ptr, uv_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_vertical_edge_c(v_ptr, uv_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                          unsigned char *v_ptr, int y_stride, int uv_stride,
-                          struct loop_filter_info *lfi) {
-  vp9_loop_filter_vertical_edge_c(y_ptr + 4, y_stride,
-                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_c(y_ptr + 8, y_stride,
-                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_c(y_ptr + 12, y_stride,
-                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                           unsigned char *v_ptr, int y_stride, int uv_stride,
-                           struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_horizontal_edge_c(y_ptr, y_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride,
-                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride,
-                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                          unsigned char *v_ptr, int y_stride, int uv_stride,
-                          struct loop_filter_info *lfi) {
-  vp9_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bh8x8_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                             unsigned char *v_ptr, int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_horizontal_edge_c(
-    y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-}
-
-void vp9_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
-                           const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride,
-                                           y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride,
-                                           y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride,
-                                           y_stride, blimit);
-}
-
-void vp9_loop_filter_bv8x8_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                             unsigned char *v_ptr, int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_vertical_edge_c(
-    y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-}
-
-void vp9_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
-                           const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
-}
--- a/vp8/common/maskingmv.c
+++ /dev/null
@@ -1,806 +1,0 @@
-/*
- ============================================================================
- Name        : maskingmv.c
- Author      : jimbankoski
- Version     :
- Copyright   : Your copyright notice
- Description : Hello World in C, Ansi-style
- ============================================================================
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-extern unsigned int vp9_sad16x16_sse3(
-  unsigned char *src_ptr,
-  int  src_stride,
-  unsigned char *ref_ptr,
-  int  ref_stride,
-  int  max_err);
-
-extern void vp9_sad16x16x3_sse3(
-  unsigned char *src_ptr,
-  int  src_stride,
-  unsigned char *ref_ptr,
-  int  ref_stride,
-  int  *results);
-
-extern int vp8_growmaskmb_sse3(
-  unsigned char *om,
-  unsigned char *nm);
-
-extern void vp8_makemask_sse3(
-  unsigned char *y,
-  unsigned char *u,
-  unsigned char *v,
-  unsigned char *ym,
-  int yp,
-  int uvp,
-  int ys,
-  int us,
-  int vs,
-  int yt,
-  int ut,
-  int vt);
-
-unsigned int vp9_sad16x16_unmasked_wmt(
-  unsigned char *src_ptr,
-  int  src_stride,
-  unsigned char *ref_ptr,
-  int  ref_stride,
-  unsigned char *mask);
-
-unsigned int vp9_sad16x16_masked_wmt(
-  unsigned char *src_ptr,
-  int  src_stride,
-  unsigned char *ref_ptr,
-  int  ref_stride,
-  unsigned char *mask);
-
-unsigned int vp8_masked_predictor_wmt(
-  unsigned char *masked,
-  unsigned char *unmasked,
-  int  src_stride,
-  unsigned char *dst_ptr,
-  int  dst_stride,
-  unsigned char *mask);
-unsigned int vp8_masked_predictor_uv_wmt(
-  unsigned char *masked,
-  unsigned char *unmasked,
-  int  src_stride,
-  unsigned char *dst_ptr,
-  int  dst_stride,
-  unsigned char *mask);
-unsigned int vp8_uv_from_y_mask(
-  unsigned char *ymask,
-  unsigned char *uvmask);
-int yp = 16;
-unsigned char sxy[] = {
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90
-};
-
-unsigned char sts[] = {
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-};
-unsigned char str[] = {
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-};
-
-unsigned char y[] = {
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
-  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
-  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
-  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
-  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40
-};
-int uvp = 8;
-unsigned char u[] = {
-  90, 80, 70, 70, 90, 90, 90, 17,
-  90, 80, 70, 70, 90, 90, 90, 17,
-  84, 70, 70, 90, 90, 90, 17, 17,
-  84, 70, 70, 90, 90, 90, 17, 17,
-  80, 70, 70, 90, 90, 90, 17, 17,
-  90, 80, 70, 70, 90, 90, 90, 17,
-  90, 80, 70, 70, 90, 90, 90, 17,
-  90, 80, 70, 70, 90, 90, 90, 17
-};
-
-unsigned char v[] = {
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80,
-  80, 80, 80, 80, 80, 80, 80, 80
-};
-
-unsigned char ym[256];
-unsigned char uvm[64];
-typedef struct {
-  unsigned char y;
-  unsigned char yt;
-  unsigned char u;
-  unsigned char ut;
-  unsigned char v;
-  unsigned char vt;
-  unsigned char use;
-} COLOR_SEG_ELEMENT;
-
-/*
-COLOR_SEG_ELEMENT segmentation[]=
-{
-    { 60,4,80,17,80,10, 1},
-    { 40,4,15,10,80,10, 1},
-};
-*/
-
-COLOR_SEG_ELEMENT segmentation[] = {
-  { 79, 44, 92, 44, 237, 60, 1},
-};
-
-unsigned char pixel_mask(unsigned char y, unsigned char u, unsigned char v,
-                         COLOR_SEG_ELEMENT sgm[],
-                         int c) {
-  COLOR_SEG_ELEMENT *s = sgm;
-  unsigned char m = 0;
-  int i;
-  for (i = 0; i < c; i++, s++)
-    m |= (abs(y - s->y) < s->yt &&
-          abs(u - s->u) < s->ut &&
-          abs(v - s->v) < s->vt ? 255 : 0);
-
-  return m;
-}
-int neighbors[256][8];
-int makeneighbors(void) {
-  int i, j;
-  for (i = 0; i < 256; i++) {
-    int r = (i >> 4), c = (i & 15);
-    int ni = 0;
-    for (j = 0; j < 8; j++)
-      neighbors[i][j] = i;
-    /* collect the (up to 8) true neighbors; skip i itself, which is
-     * already ORed in by grow_ymask and would overflow the 8 slots */
-    for (j = 0; j < 256; j++) {
-      int nr = (j >> 4), nc = (j & 15);
-      if (j != i && abs(nr - r) < 2 && abs(nc - c) < 2)
-        neighbors[i][ni++] = j;
-    }
-  }
-  return 0;
-}
-
-void grow_ymask(unsigned char *ym) {
-  unsigned char nym[256];
-  int i, j;
-
-  for (i = 0; i < 256; i++) {
-    nym[i] = ym[i];
-    for (j = 0; j < 8; j++) {
-      nym[i] |= ym[neighbors[i][j]];
-    }
-  }
-  for (i = 0; i < 256; i++)
-    ym[i] = nym[i];
-}
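Usage sketch for the two helpers above (values illustrative): makeneighbors must run once before any mask is grown; grow_ymask then dilates a 16x16 mask by one pixel in every direction.

    static void grow_ymask_demo(void) {
      unsigned char mask[256] = {0};
      mask[8 * 16 + 8] = 255;  /* single set pixel at row 8, col 8 */
      makeneighbors();         /* build the neighbor table once */
      grow_ymask(mask);        /* the full 3x3 block around (8, 8) is now set */
    }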
-void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v,
-                  unsigned char *ym, unsigned char *uvm,
-                  int yp, int uvp,
-                  COLOR_SEG_ELEMENT sgm[],
-                  int count) {
-  int r, c;
-  unsigned char *oym = ym;
-
-  memset(ym, 20, 256);
-  for (r = 0; r < 8; r++, uvm += 8, u += uvp, v += uvp, y += (yp + yp), ym += 32)
-    for (c = 0; c < 8; c++) {
-      int y1 = y[c << 1];
-      int u1 = u[c];
-      int v1 = v[c];
-      int m = pixel_mask(y1, u1, v1, sgm, count);
-      uvm[c] = m;
-      ym[c << 1] = uvm[c]; // = pixel_mask(y[c<<1],u[c],v[c],sgm,count);
-      ym[(c << 1) + 1] = pixel_mask(y[1 + (c << 1)], u[c], v[c], sgm, count);
-      ym[(c << 1) + 16] = pixel_mask(y[yp + (c << 1)], u[c], v[c], sgm, count);
-      ym[(c << 1) + 17] = pixel_mask(y[1 + yp + (c << 1)], u[c], v[c], sgm, count);
-    }
-  grow_ymask(oym);
-}
-
-int masked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
-               unsigned char *ym) {
-  int i, j;
-  unsigned sad = 0;
-  for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
-    for (j = 0; j < 16; j++)
-      if (ym[j])
-        sad += abs(src[j] - dst[j]);
-
-  return sad;
-}
-
-int compare_masks(unsigned char *sym, unsigned char *ym) {
-  int i, j;
-  unsigned sad = 0;
-  for (i = 0; i < 16; i++, sym += 16, ym += 16)
-    for (j = 0; j < 16; j++)
-      sad += (sym[j] != ym[j] ? 1 : 0);
-
-  return sad;
-}
-int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
-                 unsigned char *ym) {
-  int i, j;
-  unsigned sad = 0;
-  for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
-    for (j = 0; j < 16; j++)
-      if (!ym[j])
-        sad += abs(src[j] - dst[j]);
-
-  return sad;
-}
-int masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
-                         int yp, int uvp,
-                         unsigned char *dy, unsigned char *du, unsigned char *dv,
-                         int dyp, int duvp,
-                         COLOR_SEG_ELEMENT sgm[],
-                         int count,
-                         int *mi,
-                         int *mj,
-                         int *ui,
-                         int *uj,
-                         int *wm) {
-  int i, j;
-
-  unsigned char ym[256];
-  unsigned char uvm[64];
-  unsigned char dym[256];
-  unsigned char duvm[64];
-  unsigned int e = 0;
-  int beste = 256;
-  int bmi = -32, bmj = -32;
-  int bui = -32, buj = -32;
-  int beste1 = 256;
-  int bmi1 = -32, bmj1 = -32;
-  int bui1 = -32, buj1 = -32;
-  int obeste;
-
-  // first try finding best mask and then unmasked
-  beste = 0xffffffff;
-
-  // find best unmasked mv
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    unsigned char *duz = i / 2 * duvp + du;
-    unsigned char *dvz = i / 2 * duvp + dv;
-    for (j = -32; j < 32; j++) {
-      // 0,0  masked destination
-      make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
-
-      e = unmasked_sad(y, yp, dyz + j, dyp, dym);
-
-      if (e < beste) {
-        bui = i;
-        buj = j;
-        beste = e;
-      }
-    }
-  }
-  // bui=0;buj=0;
-  // best mv masked destination
-  make_mb_mask(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
-               dym, duvm, dyp, duvp, sgm, count);
-
-  obeste = beste;
-  beste = 0xffffffff;
-
-  // find best masked
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    for (j = -32; j < 32; j++) {
-      e = masked_sad(y, yp, dyz + j, dyp, dym);
-
-      if (e < beste) {
-        bmi = i;
-        bmj = j;
-        beste = e;
-      }
-    }
-  }
-  beste1 = beste + obeste;
-  bmi1 = bmi;
-  bmj1 = bmj;
-  bui1 = bui;
-  buj1 = buj;
-
-  beste = 0xffffffff;
-  // source mask
-  make_mb_mask(y, u, v, ym, uvm, yp, uvp, sgm, count);
-
-  // find best mask
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    unsigned char *duz = i / 2 * duvp + du;
-    unsigned char *dvz = i / 2 * duvp + dv;
-    for (j = -32; j < 32; j++) {
-      // 0,0  masked destination
-      make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
-
-      e = compare_masks(ym, dym);
-
-      if (e < beste) {
-        bmi = i;
-        bmj = j;
-        beste = e;
-      }
-    }
-  }
-
-  // best mv masked destination
-  make_mb_mask(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
-               dym, duvm, dyp, duvp, sgm, count);
-
-  obeste = masked_sad(y, yp, dy + bmi * dyp + bmj, dyp, dym);
-
-  beste = 0xffffffff;
-
-  // find best unmasked mv
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    for (j = -32; j < 32; j++) {
-      e = unmasked_sad(y, yp, dyz + j, dyp, dym);
-
-      if (e < beste) {
-        bui = i;
-        buj = j;
-        beste = e;
-      }
-    }
-  }
-  beste += obeste;
-
-  if (beste < beste1) {
-    *mi = bmi;
-    *mj = bmj;
-    *ui = bui;
-    *uj = buj;
-    *wm = 1;
-  } else {
-    *mi = bmi1;
-    *mj = bmj1;
-    *ui = bui1;
-    *uj = buj1;
-    *wm = 0;
-
-  }
-}
-
-int predict(unsigned char *src, int p, unsigned char *dst, int dp,
-            unsigned char *ym, unsigned char *prd) {
-  int i, j;
-  for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16, prd += 16)
-    for (j = 0; j < 16; j++)
-      prd[j] = (ym[j] ? src[j] : dst[j]);
-  return 0;
-}
-
-int fast_masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
-                              int yp, int uvp,
-                              unsigned char *dy, unsigned char *du, unsigned char *dv,
-                              int dyp, int duvp,
-                              COLOR_SEG_ELEMENT sgm[],
-                              int count,
-                              int *mi,
-                              int *mj,
-                              int *ui,
-                              int *uj,
-                              int *wm) {
-  int i, j;
-
-  unsigned char ym[256];
-  unsigned char ym2[256];
-  unsigned char uvm[64];
-  unsigned char dym2[256];
-  unsigned char dym[256];
-  unsigned char duvm[64];
-  unsigned int e = 0;
-  int beste = 256;
-  int bmi = -32, bmj = -32;
-  int bui = -32, buj = -32;
-  int beste1 = 256;
-  int bmi1 = -32, bmj1 = -32;
-  int bui1 = -32, buj1 = -32;
-  int obeste;
-
-  // first try finding best mask and then unmasked
-  beste = 0xffffffff;
-
-#if 0
-  for (i = 0; i < 16; i++) {
-    unsigned char *dy = i * yp + y;
-    for (j = 0; j < 16; j++)
-      printf("%2x", dy[j]);
-    printf("\n");
-  }
-  printf("\n");
-
-  for (i = -32; i < 48; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    for (j = -32; j < 48; j++)
-      printf("%2x", dyz[j]);
-    printf("\n");
-  }
-#endif
-
-  // find best unmasked mv
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    unsigned char *duz = i / 2 * duvp + du;
-    unsigned char *dvz = i / 2 * duvp + dv;
-    for (j = -32; j < 32; j++) {
-      // 0,0  masked destination
-      vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
-                        sgm[0].y, sgm[0].u, sgm[0].v,
-                        sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-      vp8_growmaskmb_sse3(dym, dym2);
-
-      e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
-
-      if (e < beste) {
-        bui = i;
-        buj = j;
-        beste = e;
-      }
-    }
-  }
-  // bui=0;buj=0;
-  // best mv masked destination
-
-  vp8_makemask_sse3(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
-                    dym, dyp, duvp,
-                    sgm[0].y, sgm[0].u, sgm[0].v,
-                    sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-  vp8_growmaskmb_sse3(dym, dym2);
-
-  obeste = beste;
-  beste = 0xffffffff;
-
-  // find best masked
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    for (j = -32; j < 32; j++) {
-      e = vp9_sad16x16_masked_wmt(y, yp, dyz + j, dyp, dym2);
-      if (e < beste) {
-        bmi = i;
-        bmj = j;
-        beste = e;
-      }
-    }
-  }
-  beste1 = beste + obeste;
-  bmi1 = bmi;
-  bmj1 = bmj;
-  bui1 = bui;
-  buj1 = buj;
-
-  // source mask
-  vp8_makemask_sse3(y, u, v,
-                    ym, yp, uvp,
-                    sgm[0].y, sgm[0].u, sgm[0].v,
-                    sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-  vp8_growmaskmb_sse3(ym, ym2);
-
-  // find best mask
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    unsigned char *duz = i / 2 * duvp + du;
-    unsigned char *dvz = i / 2 * duvp + dv;
-    for (j = -32; j < 32; j++) {
-      // 0,0  masked destination
-      vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
-                        sgm[0].y, sgm[0].u, sgm[0].v,
-                        sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-      vp8_growmaskmb_sse3(dym, dym2);
-
-      e = compare_masks(ym2, dym2);
-
-      if (e < beste) {
-        bmi = i;
-        bmj = j;
-        beste = e;
-      }
-    }
-  }
-
-  vp8_makemask_sse3(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
-                    dym, dyp, duvp,
-                    sgm[0].y, sgm[0].u, sgm[0].v,
-                    sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-  vp8_growmaskmb_sse3(dym, dym2);
-
-  obeste = vp9_sad16x16_masked_wmt(y, yp, dy + bmi * dyp + bmj, dyp, dym2);
-
-  beste = 0xffffffff;
-
-  // find best unmasked mv
-  for (i = -32; i < 32; i++) {
-    unsigned char *dyz = i * dyp + dy;
-    for (j = -32; j < 32; j++) {
-      e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
-
-      if (e < beste) {
-        bui = i;
-        buj = j;
-        beste = e;
-      }
-    }
-  }
-  beste += obeste;
-
-  if (beste < beste1) {
-    *mi = bmi;
-    *mj = bmj;
-    *ui = bui;
-    *uj = buj;
-    *wm = 1;
-  } else {
-    *mi = bmi1;
-    *mj = bmj1;
-    *ui = bui1;
-    *uj = buj1;
-    *wm = 0;
-    beste = beste1;
-
-  }
-  return beste;
-}
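For scale: each of the three scans above is an exhaustive +/-32-pel full search, so the per-call cost is easy to bound. A rough count, assuming the five color candidates the caller below tries per macroblock:

    // 64 * 64 = 4096 positions per scan, 3 scans per call, 5 candidate
    // segmentations per macroblock:
    //   4096 * 3 * 5 = 61,440 SAD / mask-compare evaluations per MB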
-
-int predict_all(unsigned char *ym, unsigned char *um, unsigned char *vm,
-                int ymp, int uvmp,
-                unsigned char *yp, unsigned char *up, unsigned char *vp,
-                int ypp, int uvpp,
-                COLOR_SEG_ELEMENT sgm[],
-                int count,
-                int mi,
-                int mj,
-                int ui,
-                int uj,
-                int wm) {
-  int i, j;
-  unsigned char dym[256];
-  unsigned char dym2[256];
-  unsigned char duvm[64];
-  unsigned char *yu = ym, *uu = um, *vu = vm;
-
-  unsigned char *dym3 = dym2;
-
-  ym += mi * ymp + mj;
-  um += mi / 2 * uvmp + mj / 2;
-  vm += mi / 2 * uvmp + mj / 2;
-
-  yu += ui * ymp + uj;
-  uu += ui / 2 * uvmp + uj / 2;
-  vu += ui / 2 * uvmp + uj / 2;
-
-  // best mv masked destination
-  if (wm)
-    vp8_makemask_sse3(ym, um, vm, dym, ymp, uvmp,
-                      sgm[0].y, sgm[0].u, sgm[0].v,
-                      sgm[0].yt, sgm[0].ut, sgm[0].vt);
-  else
-    vp8_makemask_sse3(yu, uu, vu, dym, ymp, uvmp,
-                      sgm[0].y, sgm[0].u, sgm[0].v,
-                      sgm[0].yt, sgm[0].ut, sgm[0].vt);
-
-  vp8_growmaskmb_sse3(dym, dym2);
-  vp8_masked_predictor_wmt(ym, yu, ymp, yp, ypp, dym3);
-  vp8_uv_from_y_mask(dym3, duvm);
-  vp8_masked_predictor_uv_wmt(um, uu, uvmp, up, uvpp, duvm);
-  vp8_masked_predictor_uv_wmt(vm, vu, uvmp, vp, uvpp, duvm);
-
-  return 0;
-}
-
-unsigned char f0p[1280 * 720 * 3 / 2];
-unsigned char f1p[1280 * 720 * 3 / 2];
-unsigned char prd[1280 * 720 * 3 / 2];
-unsigned char msk[1280 * 720 * 3 / 2];
-
-
-int mainz(int argc, char *argv[]) {
-
-  FILE *f = fopen(argv[1], "rb");
-  FILE *g = fopen(argv[2], "wb");
-  int w = atoi(argv[3]), h = atoi(argv[4]);
-  int y_stride = w, uv_stride = w / 2;
-  int r, c;
-  unsigned char *f0 = f0p, *f1 = f1p, *t;
-  unsigned char ym[256], uvm[64];
-  unsigned char ym2[256], uvm2[64];
-  unsigned char ym3[256], uvm3[64];
-  int a, b;
-
-  COLOR_SEG_ELEMENT last = { 20, 20, 20, 20, 230, 20, 1}, best;
-#if 0
-  makeneighbors();
-  COLOR_SEG_ELEMENT segmentation[] = {
-    { 60, 4, 80, 17, 80, 10, 1},
-    { 40, 4, 15, 10, 80, 10, 1},
-  };
-  make_mb_mask(y, u, v, ym2, uvm2, 16, 8, segmentation, 1);
-
-  vp8_makemask_sse3(y, u, v, ym, (int) 16, (int) 8,
-                    (int) segmentation[0].y, (int) segmentation[0].u, (int) segmentation[0].v,
-                    segmentation[0].yt, segmentation[0].ut, segmentation[0].vt);
-
-  vp8_growmaskmb_sse3(ym, ym3);
-
-  a = vp9_sad16x16_masked_wmt(str, 16, sts, 16, ym3);
-  b = vp9_sad16x16_unmasked_wmt(str, 16, sts, 16, ym3);
-
-  vp8_masked_predictor_wmt(str, sts, 16, ym, 16, ym3);
-
-  vp8_uv_from_y_mask(ym3, uvm3);
-
-  return 4;
-#endif
-  makeneighbors();
-
-
-  memset(prd, 128, w * h * 3 / 2);
-
-  fread(f0, w * h * 3 / 2, 1, f);
-
-  while (!feof(f)) {
-    unsigned char *ys = f1, *yd = f0, *yp = prd;
-    unsigned char *us = f1 + w * h, *ud = f0 + w * h, *up = prd + w * h;
-    unsigned char *vs = f1 + w * h * 5 / 4, *vd = f0 + w * h * 5 / 4, *vp = prd + w * h * 5 / 4;
-    fread(f1, w * h * 3 / 2, 1, f);
-
-    ys += 32 * y_stride;
-    yd += 32 * y_stride;
-    yp += 32 * y_stride;
-    us += 16 * uv_stride;
-    ud += 16 * uv_stride;
-    up += 16 * uv_stride;
-    vs += 16 * uv_stride;
-    vd += 16 * uv_stride;
-    vp += 16 * uv_stride;
-    for (r = 32; r < h - 32; r += 16,
-         ys += 16 * w, yd += 16 * w, yp += 16 * w,
-         us += 8 * uv_stride, ud += 8 * uv_stride, up += 8 * uv_stride,
-         vs += 8 * uv_stride, vd += 8 * uv_stride, vp += 8 * uv_stride) {
-      for (c = 32; c < w - 32; c += 16) {
-        int mi, mj, ui, uj, wm;
-        int bmi, bmj, bui, buj, bwm;
-        unsigned char ym[256];
-
-        if (vp9_sad16x16_sse3(ys + c, y_stride, yd + c, y_stride, 0xffff) == 0)
-          bmi = bmj = bui = buj = bwm = 0;
-        else {
-          COLOR_SEG_ELEMENT cs[5];
-          int j;
-          unsigned int beste = 0xffffffff;
-          unsigned int bestj = 0;
-
-          // try color from last mb segmentation
-          cs[0] = last;
-
-          // try color segs from 4 pixels in mb recon as segmentation
-          cs[1].y = yd[c + y_stride + 1];
-          cs[1].u = ud[c / 2 + uv_stride];
-          cs[1].v = vd[c / 2 + uv_stride];
-          cs[1].yt = cs[1].ut = cs[1].vt = 20;
-          cs[2].y = yd[c + y_stride + 14];
-          cs[2].u = ud[c / 2 + uv_stride + 7];
-          cs[2].v = vd[c / 2 + uv_stride + 7];
-          cs[2].yt = cs[2].ut = cs[2].vt = 20;
-          cs[3].y = yd[c + y_stride * 14 + 1];
-          cs[3].u = ud[c / 2 + uv_stride * 7];
-          cs[3].v = vd[c / 2 + uv_stride * 7];
-          cs[3].yt = cs[3].ut = cs[3].vt = 20;
-          cs[4].y = yd[c + y_stride * 14 + 14];
-          cs[4].u = ud[c / 2 + uv_stride * 7 + 7];
-          cs[4].v = vd[c / 2 + uv_stride * 7 + 7];
-          cs[4].yt = cs[4].ut = cs[4].vt = 20;
-
-          for (j = 0; j < 5; j++) {
-            int e;
-
-            e = fast_masked_motion_search(
-                  ys + c, us + c / 2, vs + c / 2, y_stride, uv_stride,
-                  yd + c, ud + c / 2, vd + c / 2, y_stride, uv_stride,
-                  &cs[j], 1, &mi, &mj, &ui, &uj, &wm);
-
-            if (e < beste) {
-              bmi = mi;
-              bmj = mj;
-              bui = ui;
-              buj = uj;
-              bwm = wm;
-              bestj = j;
-              beste = e;
-            }
-          }
-          best = cs[bestj];
-          // best = segmentation[0];
-          last = best;
-        }
-        predict_all(yd + c, ud + c / 2, vd + c / 2, w, uv_stride,
-                    yp + c, up + c / 2, vp + c / 2, w, uv_stride,
-                    &best, 1, bmi, bmj, bui, buj, bwm);
-
-      }
-    }
-    fwrite(prd, w * h * 3 / 2, 1, g);
-    t = f0;
-    f0 = f1;
-    f1 = t;
-
-  }
-  fclose(f);
-  fclose(g);
-  return 0;
-}
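mainz() expects a raw planar 4:2:0 YUV file, an output path, and the frame dimensions as arguments. A hypothetical driver (the binary name and argument values are assumptions, not part of this patch):

    // Equivalent to running: masker input_1280x720.yuv predicted.yuv 1280 720
    char *args[] = { "masker", "input_1280x720.yuv", "predicted.yuv",
                     "1280", "720" };
    mainz(5, args);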
--- a/vp8/common/mbpitch.c
+++ /dev/null
@@ -1,124 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "blockd.h"
-
-typedef enum {
-  PRED = 0,
-  DEST = 1
-} BLOCKSET;
-
-static void setup_block
-(
-  BLOCKD *b,
-  int mv_stride,
-  unsigned char **base,
-  unsigned char **base2,
-  int Stride,
-  int offset,
-  BLOCKSET bs
-) {
-
-  if (bs == DEST) {
-    b->dst_stride = Stride;
-    b->dst = offset;
-    b->base_dst = base;
-  } else {
-    b->pre_stride = Stride;
-    b->pre = offset;
-    b->base_pre = base;
-    b->base_second_pre = base2;
-  }
-
-}
-
-
-static void setup_macroblock(MACROBLOCKD *xd, BLOCKSET bs) {
-  int block;
-
-  unsigned char **y, **u, **v;
-  unsigned char **y2, **u2, **v2;
-  BLOCKD *blockd = xd->block;
-  int stride;
-
-  if (bs == DEST) {
-    y = &xd->dst.y_buffer;
-    u = &xd->dst.u_buffer;
-    v = &xd->dst.v_buffer;
-  } else {
-    y = &xd->pre.y_buffer;
-    u = &xd->pre.u_buffer;
-    v = &xd->pre.v_buffer;
-
-    y2 = &xd->second_pre.y_buffer;
-    u2 = &xd->second_pre.u_buffer;
-    v2 = &xd->second_pre.v_buffer;
-  }
-
-  stride = xd->dst.y_stride;
-  for (block = 0; block < 16; block++) { /* y blocks */
-    setup_block(&blockd[block], stride, y, y2, stride,
-                (block >> 2) * 4 * stride + (block & 3) * 4, bs);
-  }
-
-  stride = xd->dst.uv_stride;
-  for (block = 16; block < 20; block++) { /* U and V blocks */
-    setup_block(&blockd[block], stride, u, u2, stride,
-      ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs);
-
-    setup_block(&blockd[block + 4], stride, v, v2, stride,
-      ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs);
-  }
-}
-
-void vp9_setup_block_dptrs(MACROBLOCKD *xd) {
-  int r, c;
-  BLOCKD *blockd = xd->block;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      blockd[r * 4 + c].diff = &xd->diff[r * 4 * 16 + c * 4];
-      blockd[r * 4 + c].predictor = xd->predictor + r * 4 * 16 + c * 4;
-    }
-  }
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++) {
-      blockd[16 + r * 2 + c].diff = &xd->diff[256 + r * 4 * 8 + c * 4];
-      blockd[16 + r * 2 + c].predictor =
-        xd->predictor + 256 + r * 4 * 8 + c * 4;
-
-    }
-  }
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++) {
-      blockd[20 + r * 2 + c].diff = &xd->diff[320 + r * 4 * 8 + c * 4];
-      blockd[20 + r * 2 + c].predictor =
-        xd->predictor + 320 + r * 4 * 8 + c * 4;
-
-    }
-  }
-
-  blockd[24].diff = &xd->diff[384];
-
-  for (r = 0; r < 25; r++) {
-    blockd[r].qcoeff  = xd->qcoeff  + r * 16;
-    blockd[r].dqcoeff = xd->dqcoeff + r * 16;
-  }
-}
-
-void vp9_build_block_doffsets(MACROBLOCKD *xd) {
-
-  /* handle the destination and prediction pitch features */
-  setup_macroblock(xd, DEST);
-  setup_macroblock(xd, PRED);
-}
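The offset arithmetic in setup_block() maps a raster block index to a pixel offset within the macroblock. A worked example for luma block 5, assuming a Y stride of 32:

    //   (5 >> 2) * 4 * 32 + (5 & 3) * 4  =  128 + 4  =  132
    // i.e. block 5 starts 4 pixel rows down and 4 pixels right of the
    // macroblock origin (grid row 1, column 1 of the 4x4 luma blocks).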
--- a/vp8/common/modecont.c
+++ /dev/null
@@ -1,64 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "entropy.h"
-const int vp9_default_mode_contexts[6][4] = {
-  {
-    /* 0 */
-    7,     1,     1,   183
-  },
-  {
-    /* 1 */
-    14,    18,    14,   147
-  },
-  {
-    /* 2 */
-    135,    64,    57,    68
-  },
-  {
-    /* 3 */
-    60,    56,   128,   65
-  },
-  {
-    /* 4 */
-    159,   134,   128,   34
-  },
-  {
-    /* 5 */
-    234,   188,   128,   28
-  },
-};
-const int vp9_default_mode_contexts_a[6][4] = {
-  {
-    /* 0 */
-    4,     1,    1,   143
-  },
-  {
-    /* 1 */
-    7,     9,    7,   107
-  },
-  {
-    /* 2 */
-    95,    34,   57,    68
-  },
-  {
-    /* 3 */
-    95,    56,   128,   65
-  },
-  {
-    /* 4 */
-    159,   67,   128,   34
-  },
-  {
-    /* 5 */
-    234,   94,   128,   28
-  },
-};
--- a/vp8/common/modecont.h
+++ /dev/null
@@ -1,17 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_MODECONT_H
-#define __INC_MODECONT_H
-
-extern const int vp9_default_mode_contexts[6][4];
-extern const int vp9_default_mode_contexts_a[6][4];
-#endif
--- a/vp8/common/modecontext.c
+++ /dev/null
@@ -1,145 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "entropymode.h"
-
-const unsigned int vp9_kf_default_bmode_counts[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES] = {
-  {
-    /*Above Mode :  0*/
-    { 43438,   2195,    470,    316,    615,    171,    217,    412,    124,    160, }, /* left_mode 0 */
-    {  5722,   2751,    296,    291,     81,     68,     80,    101,    100,    170, }, /* left_mode 1 */
-    {  1629,    201,    307,     25,     47,     16,     34,     72,     19,     28, }, /* left_mode 2 */
-    {   332,    266,     36,    500,     20,     65,     23,     14,    154,    106, }, /* left_mode 3 */
-    {   450,     97,     10,     24,    117,     10,      2,     12,      8,     71, }, /* left_mode 4 */
-    {   384,     49,     29,     44,     12,    162,     51,      5,     87,     42, }, /* left_mode 5 */
-    {   495,     53,    157,     27,     14,     57,    180,     17,     17,     34, }, /* left_mode 6 */
-    {   695,     64,     62,      9,     27,      5,      3,    147,     10,     26, }, /* left_mode 7 */
-    {   230,     54,     20,    124,     16,    125,     29,     12,    283,     37, }, /* left_mode 8 */
-    {   260,     87,     21,    120,     32,     16,     33,     16,     33,    203, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  1*/
-    {  3934,   2573,    355,    137,    128,     87,    133,    117,     37,     27, }, /* left_mode 0 */
-    {  1036,   1929,    278,    135,     27,     37,     48,     55,     41,     91, }, /* left_mode 1 */
-    {   223,    256,    253,     15,     13,      9,     28,     64,      3,      3, }, /* left_mode 2 */
-    {   120,    129,     17,    316,     15,     11,      9,      4,     53,     74, }, /* left_mode 3 */
-    {   129,     58,      6,     11,     38,      2,      0,      5,      2,     67, }, /* left_mode 4 */
-    {    53,     22,     11,     16,      8,     26,     14,      3,     19,     12, }, /* left_mode 5 */
-    {    59,     26,     61,     11,      4,      9,     35,     13,      8,      8, }, /* left_mode 6 */
-    {   101,     52,     40,      8,      5,      2,      8,     59,      2,     20, }, /* left_mode 7 */
-    {    48,     34,     10,     52,      8,     15,      6,      6,     63,     20, }, /* left_mode 8 */
-    {    96,     48,     22,     63,     11,     14,      5,      8,      9,     96, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  2*/
-    {   709,    461,    506,     36,     27,     33,    151,     98,     24,      6, }, /* left_mode 0 */
-    {   201,    375,    442,     27,     13,      8,     46,     58,      6,     19, }, /* left_mode 1 */
-    {   122,    140,    417,      4,     13,      3,     33,     59,      4,      2, }, /* left_mode 2 */
-    {    36,     17,     22,     16,      6,      8,     12,     17,      9,     21, }, /* left_mode 3 */
-    {    51,     15,      7,      1,     14,      0,      4,      5,      3,     22, }, /* left_mode 4 */
-    {    18,     11,     30,      9,      7,     20,     11,      5,      2,      6, }, /* left_mode 5 */
-    {    38,     21,    103,      9,      4,     12,     79,     13,      2,      5, }, /* left_mode 6 */
-    {    64,     17,     66,      2,     12,      4,      2,     65,      4,      5, }, /* left_mode 7 */
-    {    14,      7,      7,     16,      3,     11,      4,     13,     15,     16, }, /* left_mode 8 */
-    {    36,      8,     32,      9,      9,      4,     14,      7,      6,     24, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  3*/
-    {  1340,    173,     36,    119,     30,     10,     13,     10,     20,     26, }, /* left_mode 0 */
-    {   156,    293,     26,    108,      5,     16,      2,      4,     23,     30, }, /* left_mode 1 */
-    {    60,     34,     13,      7,      3,      3,      0,      8,      4,      5, }, /* left_mode 2 */
-    {    72,     64,      1,    235,      3,      9,      2,      7,     28,     38, }, /* left_mode 3 */
-    {    29,     14,      1,      3,      5,      0,      2,      2,      5,     13, }, /* left_mode 4 */
-    {    22,      7,      4,     11,      2,      5,      1,      2,      6,      4, }, /* left_mode 5 */
-    {    18,     14,      5,      6,      4,      3,     14,      0,      9,      2, }, /* left_mode 6 */
-    {    41,     10,      7,      1,      2,      0,      0,     10,      2,      1, }, /* left_mode 7 */
-    {    23,     19,      2,     33,      1,      5,      2,      0,     51,      8, }, /* left_mode 8 */
-    {    33,     26,      7,     53,      3,      9,      3,      3,      9,     19, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  4*/
-    {   410,    165,     43,     31,     66,     15,     30,     54,      8,     17, }, /* left_mode 0 */
-    {   115,     64,     27,     18,     30,      7,     11,     15,      4,     19, }, /* left_mode 1 */
-    {    31,     23,     25,      1,      7,      2,      2,     10,      0,      5, }, /* left_mode 2 */
-    {    17,      4,      1,      6,      8,      2,      7,      5,      5,     21, }, /* left_mode 3 */
-    {   120,     12,      1,      2,     83,      3,      0,      4,      1,     40, }, /* left_mode 4 */
-    {     4,      3,      1,      2,      1,      2,      5,      0,      3,      6, }, /* left_mode 5 */
-    {    10,      2,     13,      6,      6,      6,      8,      2,      4,      5, }, /* left_mode 6 */
-    {    58,     10,      5,      1,     28,      1,      1,     33,      1,      9, }, /* left_mode 7 */
-    {     8,      2,      1,      4,      2,      5,      1,      1,      2,     10, }, /* left_mode 8 */
-    {    76,      7,      5,      7,     18,      2,      2,      0,      5,     45, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  5*/
-    {   444,     46,     47,     20,     14,    110,     60,     14,     60,      7, }, /* left_mode 0 */
-    {    59,     57,     25,     18,      3,     17,     21,      6,     14,      6, }, /* left_mode 1 */
-    {    24,     17,     20,      6,      4,     13,      7,      2,      3,      2, }, /* left_mode 2 */
-    {    13,     11,      5,     14,      4,      9,      2,      4,     15,      7, }, /* left_mode 3 */
-    {     8,      5,      2,      1,      4,      0,      1,      1,      2,     12, }, /* left_mode 4 */
-    {    19,      5,      5,      7,      4,     40,      6,      3,     10,      4, }, /* left_mode 5 */
-    {    16,      5,      9,      1,      1,     16,     26,      2,     10,      4, }, /* left_mode 6 */
-    {    11,      4,      8,      1,      1,      4,      4,      5,      4,      1, }, /* left_mode 7 */
-    {    15,      1,      3,      7,      3,     21,      7,      1,     34,      5, }, /* left_mode 8 */
-    {    18,      5,      1,      3,      4,      3,      7,      1,      2,      9, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  6*/
-    {   476,    149,     94,     13,     14,     77,    291,     27,     23,      3, }, /* left_mode 0 */
-    {    79,     83,     42,     14,      2,     12,     63,      2,      4,     14, }, /* left_mode 1 */
-    {    43,     36,     55,      1,      3,      8,     42,     11,      5,      1, }, /* left_mode 2 */
-    {     9,      9,      6,     16,      1,      5,      6,      3,     11,     10, }, /* left_mode 3 */
-    {    10,      3,      1,      3,     10,      1,      0,      1,      1,      4, }, /* left_mode 4 */
-    {    14,      6,     15,      5,      1,     20,     25,      2,      5,      0, }, /* left_mode 5 */
-    {    28,      7,     51,      1,      0,      8,    127,      6,      2,      5, }, /* left_mode 6 */
-    {    13,      3,      3,      2,      3,      1,      2,      8,      1,      2, }, /* left_mode 7 */
-    {    10,      3,      3,      3,      3,      8,      2,      2,      9,      3, }, /* left_mode 8 */
-    {    13,      7,     11,      4,      0,      4,      6,      2,      5,      8, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  7*/
-    {   376,    135,    119,      6,     32,      8,     31,    224,      9,      3, }, /* left_mode 0 */
-    {    93,     60,     54,      6,     13,      7,      8,     92,      2,     12, }, /* left_mode 1 */
-    {    74,     36,     84,      0,      3,      2,      9,     67,      2,      1, }, /* left_mode 2 */
-    {    19,      4,      4,      8,      8,      2,      4,      7,      6,     16, }, /* left_mode 3 */
-    {    51,      7,      4,      1,     77,      3,      0,     14,      1,     15, }, /* left_mode 4 */
-    {     7,      7,      5,      7,      4,      7,      4,      5,      0,      3, }, /* left_mode 5 */
-    {    18,      2,     19,      2,      2,      4,     12,     11,      1,      2, }, /* left_mode 6 */
-    {   129,      6,     27,      1,     21,      3,      0,    189,      0,      6, }, /* left_mode 7 */
-    {     9,      1,      2,      8,      3,      7,      0,      5,      3,      3, }, /* left_mode 8 */
-    {    20,      4,      5,     10,      4,      2,      7,     17,      3,     16, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  8*/
-    {   617,     68,     34,     79,     11,     27,     25,     14,     75,     13, }, /* left_mode 0 */
-    {    51,     82,     21,     26,      6,     12,     13,      1,     26,     16, }, /* left_mode 1 */
-    {    29,      9,     12,     11,      3,      7,      1,     10,      2,      2, }, /* left_mode 2 */
-    {    17,     19,     11,     74,      4,      3,      2,      0,     58,     13, }, /* left_mode 3 */
-    {    10,      1,      1,      3,      4,      1,      0,      2,      1,      8, }, /* left_mode 4 */
-    {    14,      4,      5,      5,      1,     13,      2,      0,     27,      8, }, /* left_mode 5 */
-    {    10,      3,      5,      4,      1,      7,      6,      4,      5,      1, }, /* left_mode 6 */
-    {    10,      2,      6,      2,      1,      1,      1,      4,      2,      1, }, /* left_mode 7 */
-    {    14,      8,      5,     23,      2,     12,      6,      2,    117,      5, }, /* left_mode 8 */
-    {     9,      6,      2,     19,      1,      6,      3,      2,      9,      9, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  9*/
-    {   680,     73,     22,     38,     42,      5,     11,      9,      6,     28, }, /* left_mode 0 */
-    {   113,    112,     21,     22,     10,      2,      8,      4,      6,     42, }, /* left_mode 1 */
-    {    44,     20,     24,      6,      5,      4,      3,      3,      1,      2, }, /* left_mode 2 */
-    {    40,     23,      7,     71,      5,      2,      4,      1,      7,     22, }, /* left_mode 3 */
-    {    85,      9,      4,      4,     17,      2,      0,      3,      2,     23, }, /* left_mode 4 */
-    {    13,      4,      2,      6,      1,      7,      0,      1,      7,      6, }, /* left_mode 5 */
-    {    26,      6,      8,      3,      2,      3,      8,      1,      5,      4, }, /* left_mode 6 */
-    {    54,      8,      9,      6,      7,      0,      1,     11,      1,      3, }, /* left_mode 7 */
-    {     9,     10,      4,     13,      2,      5,      4,      2,     14,      8, }, /* left_mode 8 */
-    {    92,      9,      5,     19,     15,      3,      3,      1,      6,     58, }, /* left_mode 9 */
-  },
-};
--- a/vp8/common/mv.h
+++ /dev/null
@@ -1,26 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_MV_H
-#define __INC_MV_H
-#include "vpx/vpx_integer.h"
-
-typedef struct {
-  short row;
-  short col;
-} MV;
-
-typedef union {
-  uint32_t  as_int;
-  MV        as_mv;
-} int_mv;        /* facilitates faster equality tests and copies */
-
-#endif
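As the comment says, the union exists so a whole vector can be compared or copied as one 32-bit word. A minimal sketch:

    int_mv a, b;
    a.as_mv.row = 3;
    a.as_mv.col = -4;
    b.as_int = a.as_int;           // one 32-bit copy moves both components
    if (a.as_int == b.as_int) {
      // equal vectors: a single integer compare instead of two short compares
    }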
--- a/vp8/common/mvref_common.c
+++ /dev/null
@@ -1,342 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "mvref_common.h"
-
-#if CONFIG_NEWBESTREFMV
-
-#define MVREF_NEIGHBOURS 8
-static int mv_ref_search[MVREF_NEIGHBOURS][2] =
-  { {0,-1},{-1,0},{-1,-1},{0,-2},{-2,0},{-1,-2},{-2,-1},{-2,-2} };
-static int ref_distance_weight[MVREF_NEIGHBOURS] =
-  { 3,3,2,1,1,1,1,1 };
-
-// clamp_mv
-#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
-static void clamp_mv(const MACROBLOCKD *xd, int_mv *mv) {
-
-  if (mv->as_mv.col < (xd->mb_to_left_edge - MV_BORDER))
-    mv->as_mv.col = xd->mb_to_left_edge - MV_BORDER;
-  else if (mv->as_mv.col > xd->mb_to_right_edge + MV_BORDER)
-    mv->as_mv.col = xd->mb_to_right_edge + MV_BORDER;
-
-  if (mv->as_mv.row < (xd->mb_to_top_edge - MV_BORDER))
-    mv->as_mv.row = xd->mb_to_top_edge - MV_BORDER;
-  else if (mv->as_mv.row > xd->mb_to_bottom_edge + MV_BORDER)
-    mv->as_mv.row = xd->mb_to_bottom_edge + MV_BORDER;
-}
-
-
-// Gets the best matching candidate reference motion vector
-// from the given mode info structure (if available)
-static int get_candidate_mvref(
-  const MODE_INFO *candidate_mi,
-  MV_REFERENCE_FRAME ref_frame,
-  MV_REFERENCE_FRAME *c_ref_frame,
-  int_mv *c_mv,
-  MV_REFERENCE_FRAME *c2_ref_frame,
-  int_mv *c2_mv
-) {
-
-  int ret_val = FALSE;
-  c2_mv->as_int = 0;
-  *c2_ref_frame = INTRA_FRAME;
-
-  // Target ref frame matches candidate first ref frame
-  if (ref_frame == candidate_mi->mbmi.ref_frame) {
-    c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
-    *c_ref_frame = ref_frame;
-    ret_val = TRUE;
-
-    // Is there a second non-zero vector we can use?
-    if ((candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) &&
-        (candidate_mi->mbmi.mv[1].as_int != 0) &&
-        (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
-      c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
-      *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
-    }
-
-  // Target ref frame matches candidate second ref frame
-  } else if (ref_frame == candidate_mi->mbmi.second_ref_frame) {
-    c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
-    *c_ref_frame = ref_frame;
-    ret_val = TRUE;
-
-    // Is there a second non-zero vector we can use?
-    if ((candidate_mi->mbmi.ref_frame != INTRA_FRAME) &&
-        (candidate_mi->mbmi.mv[0].as_int != 0) &&
-        (candidate_mi->mbmi.mv[0].as_int != c_mv->as_int)) {
-      c2_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
-      *c2_ref_frame = candidate_mi->mbmi.ref_frame;
-    }
-
-  // No ref frame matches, so use the first ref mv as the first choice
-  } else if (candidate_mi->mbmi.ref_frame != INTRA_FRAME) {
-    c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
-    *c_ref_frame = candidate_mi->mbmi.ref_frame;
-    ret_val = TRUE;
-
-    // Is there a second non-zero vector we can use?
-    if ((candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) &&
-        (candidate_mi->mbmi.mv[1].as_int != 0) &&
-        (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
-      c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
-      *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
-    }
-
-  // If only the second ref mv is valid. (This should not trigger in the
-  // current code base given the possible compound prediction options.)
-  } else if (candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) {
-    c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
-    *c_ref_frame = candidate_mi->mbmi.second_ref_frame;
-    ret_val = TRUE;
-  }
-
-  return ret_val;
-}
-
-// Performs mv adjustment based on reference frame and clamps the MV
-// if it goes off the edge of the buffer.
-static void scale_mv(
-  MACROBLOCKD *xd,
-  MV_REFERENCE_FRAME this_ref_frame,
-  MV_REFERENCE_FRAME candidate_ref_frame,
-  int_mv *candidate_mv,
-  int *ref_sign_bias
-) {
-
-  if (candidate_ref_frame != this_ref_frame) {
-
-    //int frame_distances[MAX_REF_FRAMES];
-    //int last_distance = 1;
-    //int gf_distance = xd->frames_since_golden;
-    //int arf_distance = xd->frames_till_alt_ref_frame;
-
-    // Sign inversion where appropriate.
-    if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {
-      candidate_mv->as_mv.row = -candidate_mv->as_mv.row;
-      candidate_mv->as_mv.col = -candidate_mv->as_mv.col;
-    }
-
-    // Scale based on frame distance if the reference frames are not the same.
-    /*frame_distances[INTRA_FRAME] = 1;   // should never be used
-    frame_distances[LAST_FRAME] = 1;
-    frame_distances[GOLDEN_FRAME] =
-      (xd->frames_since_golden) ? xd->frames_since_golden : 1;
-    frame_distances[ALTREF_FRAME] =
-      (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1;
-
-    if (frame_distances[this_ref_frame] &&
-        frame_distances[candidate_ref_frame]) {
-      candidate_mv->as_mv.row =
-        (short)(((int)(candidate_mv->as_mv.row) *
-                 frame_distances[this_ref_frame]) /
-                frame_distances[candidate_ref_frame]);
-
-      candidate_mv->as_mv.col =
-        (short)(((int)(candidate_mv->as_mv.col) *
-                 frame_distances[this_ref_frame]) /
-                frame_distances[candidate_ref_frame]);
-    }
-    */
-  }
-
-  // Clamp the MV so it does not point out of the frame buffer
-  clamp_mv(xd, candidate_mv);
-}
-
-// Adds a new candidate reference vector to the list if indeed it is new.
-// If it is not new then the score of the existing candidate that it matches
-// is increased and the list is resorted.
-static void addmv_and_shuffle(
-  int_mv *mv_list,
-  int *mv_scores,
-  int *index,
-  int_mv candidate_mv,
-  int weight
-) {
-
-  int i = *index;
-  int duplicate_found = FALSE;
-
-  // Check for duplicates. If there is one, increment its score.
-  // A duplicate is defined as the same full-pel vector after rounding.
-  while (i > 0) {
-    i--;
-
-    if (candidate_mv.as_int == mv_list[i].as_int) {
-      duplicate_found = TRUE;
-      mv_scores[i] += weight;
-      break;
-    }
-  }
-
-  // If no duplicate was found add the new vector and give it a weight
-  if (!duplicate_found) {
-    mv_list[*index].as_int = candidate_mv.as_int;
-    mv_scores[*index] = weight;
-    i = *index;
-    (*index)++;
-  }
-
-  // Reshuffle the list so that the highest scoring mvs are at the top.
-  while (i > 0) {
-    if (mv_scores[i] > mv_scores[i-1]) {
-      int tmp_score = mv_scores[i-1];
-      int_mv tmp_mv = mv_list[i-1];
-
-      mv_scores[i-1] = mv_scores[i];
-      mv_list[i-1] = mv_list[i];
-      mv_scores[i] = tmp_score;
-      mv_list[i] = tmp_mv;
-      i--;
-    } else
-      break;
-  }
-}
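A short usage sketch (values are illustrative): inserting a duplicate bumps the existing entry's score, and the bubble-up loop keeps the list sorted by descending score.

    int_mv mv_list[MAX_MV_REFS] = {{ 0 }};
    int mv_scores[MAX_MV_REFS] = { 0 };
    int index = 0;
    int_mv mv;

    mv.as_mv.row = 4;
    mv.as_mv.col = -2;
    addmv_and_shuffle(mv_list, mv_scores, &index, mv, 3);  // new entry, score 3
    addmv_and_shuffle(mv_list, mv_scores, &index, mv, 2);  // duplicate, score 5
    // index is still 1; the single entry carries the combined score.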
-
-// This function searches the neighbourhood of a given MB/SB and populates a
-// list of candidate reference vectors.
-//
-void vp9_find_mv_refs(
-  MACROBLOCKD *xd,
-  MODE_INFO *here,
-  MODE_INFO *lf_here,
-  MV_REFERENCE_FRAME ref_frame,
-  int_mv *mv_ref_list,
-  int *ref_sign_bias
-) {
-
-  int i;
-  MODE_INFO *candidate_mi;
-  int_mv candidate_mvs[MAX_MV_REFS];
-  int_mv c_refmv;
-  MV_REFERENCE_FRAME c_ref_frame;
-  int_mv c2_refmv;
-  MV_REFERENCE_FRAME c2_ref_frame;
-  int candidate_scores[MAX_MV_REFS];
-  int index = 0;
-  int ref_weight = 0;
-  int valid_mv_ref;
-
-  // Blank the reference vector lists and other local structures.
-  vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REFS);
-  vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REFS);
-  vpx_memset(candidate_scores, 0, sizeof(candidate_scores));
-
-  // Populate a list with candidate reference vectors from the
-  // spatial neighbours.
-  for (i = 0; i < 2; ++i) {
-    if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
-        ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
-
-      candidate_mi = here + mv_ref_search[i][0] +
-                     (mv_ref_search[i][1] * xd->mode_info_stride);
-
-      valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
-                                         &c_ref_frame, &c_refmv,
-                                         &c2_ref_frame, &c2_refmv);
-
-      // If there is a valid MV candidate then add it to the list
-      if (valid_mv_ref) {
-        scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
-        ref_weight = ref_distance_weight[i] +
-                     ((c_ref_frame == ref_frame) << 4);
-
-        addmv_and_shuffle(candidate_mvs, candidate_scores,
-                          &index, c_refmv, ref_weight);
-
-        // If there is a second valid mv then add it as well.
-        if (c2_ref_frame != INTRA_FRAME) {
-          scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
-          ref_weight = ref_distance_weight[i] +
-                       ((c2_ref_frame == ref_frame) << 4);
-
-          addmv_and_shuffle(candidate_mvs, candidate_scores,
-                            &index, c2_refmv, ref_weight);
-        }
-      }
-    }
-  }
-
-  // Look at the corresponding vector in the last frame
-  candidate_mi = lf_here;
-  valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
-                                     &c_ref_frame, &c_refmv,
-                                     &c2_ref_frame, &c2_refmv);
-
-  // If there is a valid MV candidate then add it to the list
-  if (valid_mv_ref) {
-    scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
-    ref_weight = 2 + ((c_ref_frame == ref_frame) << 4);
-    addmv_and_shuffle(candidate_mvs, candidate_scores,
-                      &index, c_refmv, ref_weight);
-
-    // If there is a second valid mv then add it as well.
-    if (c2_ref_frame != INTRA_FRAME) {
-      scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
-      ref_weight = 2 + ((c2_ref_frame == ref_frame) << 4);
-
-      addmv_and_shuffle(candidate_mvs, candidate_scores,
-                        &index, c2_refmv, ref_weight);
-    }
-  }
-
-  // Populate the list with candidate reference vectors from the
-  // remaining spatial neighbours.
-  for (i = 2; i < MVREF_NEIGHBOURS; ++i) {
-    if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
-        ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
-
-      candidate_mi = here + mv_ref_search[i][0] +
-                     (mv_ref_search[i][1] * xd->mode_info_stride);
-
-      valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
-                                         &c_ref_frame, &c_refmv,
-                                         &c2_ref_frame, &c2_refmv);
-
-      // If there is a valid MV candidate then add it to the list
-      if (valid_mv_ref) {
-        scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
-        ref_weight = ref_distance_weight[i] +
-                     ((c_ref_frame == ref_frame) << 4);
-
-        addmv_and_shuffle(candidate_mvs, candidate_scores,
-                          &index, c_refmv, ref_weight);
-
-        // If there is a second valid mv then add it as well.
-        if (c2_ref_frame != INTRA_FRAME) {
-          scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
-          ref_weight = ref_distance_weight[i] +
-                       ((c2_ref_frame == ref_frame) << 4);
-
-          addmv_and_shuffle(candidate_mvs, candidate_scores,
-                            &index, c2_refmv, ref_weight);
-        }
-      }
-    }
-  }
-
-  // 0,0 is always a valid reference.
-  for (i = 0; i < index; ++i)
-    if (candidate_mvs[i].as_int == 0)
-      break;
-  if (i == index) {
-    c_refmv.as_int = 0;
-    addmv_and_shuffle(candidate_mvs, candidate_scores,
-                      &index, c_refmv, candidate_scores[3] + 1);
-  }
-
-  // Copy over the candidate list.
-  vpx_memcpy(mv_ref_list, candidate_mvs, sizeof(candidate_mvs));
-}
-
-#endif
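The scoring above makes same-reference-frame candidates dominate: the distance weight is at most 3, while a matching reference frame adds 1 << 4. A worked example:

    // Nearest left neighbour (ref_distance_weight[0] == 3) whose reference
    // frame matches the one being searched:
    //   ref_weight = 3 + (1 << 4) = 19
    // The same neighbour pointing at a different reference frame scores 3,
    // so any same-frame candidate sorts above every cross-frame one.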
--- a/vp8/common/mvref_common.h
+++ /dev/null
@@ -1,31 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "onyxc_int.h"
-#include "blockd.h"
-
-// MV reference entropy header file.
-#if CONFIG_NEWBESTREFMV
-
-#ifndef __INC_MVREF_COMMON_H
-#define __INC_MVREF_COMMON_H
-
-void vp9_find_mv_refs(
-  MACROBLOCKD *xd,
-  MODE_INFO *here,
-  MODE_INFO *lf_here,
-  MV_REFERENCE_FRAME ref_frame,
-  int_mv * mv_ref_list,
-  int *ref_sign_bias
-);
-
-#endif
-
-#endif
--- a/vp8/common/onyx.h
+++ /dev/null
@@ -1,225 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ONYX_H
-#define __INC_ONYX_H
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx/vp8cx.h"
-#include "vpx_scale/yv12config.h"
-#include "type_aliases.h"
-#include "ppflags.h"
-  typedef int *VP9_PTR;
-
-  /* Create/destroy static data structures. */
-
-  typedef enum {
-    NORMAL      = 0,
-    FOURFIVE    = 1,
-    THREEFIVE   = 2,
-    ONETWO      = 3
-
-  } VPX_SCALING;
-
-  typedef enum {
-    VP9_LAST_FLAG = 1,
-    VP9_GOLD_FLAG = 2,
-    VP9_ALT_FLAG = 4
-  } VP9_REFFRAME;
-
-
-  typedef enum {
-    USAGE_STREAM_FROM_SERVER    = 0x0,
-    USAGE_LOCAL_FILE_PLAYBACK   = 0x1,
-    USAGE_CONSTRAINED_QUALITY   = 0x2
-  } END_USAGE;
-
-
-  typedef enum {
-    MODE_GOODQUALITY    = 0x1,
-    MODE_BESTQUALITY    = 0x2,
-    MODE_FIRSTPASS      = 0x3,
-    MODE_SECONDPASS     = 0x4,
-    MODE_SECONDPASS_BEST = 0x5,
-  } MODE;
-
-  typedef enum {
-    FRAMEFLAGS_KEY    = 1,
-    FRAMEFLAGS_GOLDEN = 2,
-    FRAMEFLAGS_ALTREF = 4,
-  } FRAMETYPE_FLAGS;
-
-
-#include <assert.h>
-  static __inline void Scale2Ratio(int mode, int *hr, int *hs) {
-    switch (mode) {
-      case    NORMAL:
-        *hr = 1;
-        *hs = 1;
-        break;
-      case    FOURFIVE:
-        *hr = 4;
-        *hs = 5;
-        break;
-      case    THREEFIVE:
-        *hr = 3;
-        *hs = 5;
-        break;
-      case    ONETWO:
-        *hr = 1;
-        *hs = 2;
-        break;
-      default:
-        *hr = 1;
-        *hs = 1;
-        assert(0);
-        break;
-    }
-  }
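Scale2Ratio() turns the VPX_SCALING enum into a numerator/denominator pair. A minimal usage sketch (the width value is illustrative):

    int hr, hs, scaled_width;
    Scale2Ratio(FOURFIVE, &hr, &hs);      // hr = 4, hs = 5
    scaled_width = 1280 * hr / hs;        // 1024: a 4/5 horizontal downscale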
-
-  typedef struct {
-    int Version;            // 4 bitstream versions are defined: 0 = best quality/slowest decode, 3 = lowest quality/fastest decode
-    int Width;              // width of data passed to the compressor
-    int Height;             // height of data passed to the compressor
-    double frame_rate;       // set to the passed-in framerate
-    int target_bandwidth;    // bandwidth to be used in kilobits per second
-
-    int noise_sensitivity;   // parameter used for applying pre-processing blur: recommendation 0
-    int Sharpness;          // parameter used for sharpening output: recommendation 0
-    int cpu_used;
-    unsigned int rc_max_intra_bitrate_pct;
-
-    // mode ->
-    // (0)=Realtime/Live Encoding. This mode is optimized for realtime encoding (for example, capturing
-    //    a television signal or feed from a live camera). ( speed setting controls how fast )
-    // (1)=Good Quality Fast Encoding. The encoder balances quality with the amount of time it takes to
-    //    encode the output. ( speed setting controls how fast )
-    // (2)=One Pass - Best Quality. The encoder places priority on the quality of the output over encoding
-    //    speed. The output is compressed at the highest possible quality. This option takes the longest
-    //    amount of time to encode. ( speed setting ignored )
-    // (3)=Two Pass - First Pass. The encoder generates a file of statistics for use in the second encoding
-    //    pass. ( speed setting controls how fast )
-    // (4)=Two Pass - Second Pass. The encoder uses the statistics that were generated in the first encoding
-    //    pass to create the compressed output. ( speed setting controls how fast )
-    // (5)=Two Pass - Second Pass Best.  The encoder uses the statistics that were generated in the first
-    //    encoding pass to create the compressed output using the highest possible quality, and taking a
-    //    longer amount of time to encode. ( speed setting ignored )
-    int Mode;
-
-    // Key Framing Operations
-    int auto_key;            // automatically detect cut scenes and set the keyframes
-    int key_freq;            // maximum distance to key frame.
-
-    int allow_lag;           // allow lagged compression (if 0, lag_in_frames is ignored)
-    int lag_in_frames;        // how many frames of lag before we start encoding
-
-    // ----------------------------------------------------------------
-    // DATARATE CONTROL OPTIONS
-
-    int end_usage; // vbr or cbr
-
-    // buffer targeting aggressiveness
-    int under_shoot_pct;
-    int over_shoot_pct;
-
-    // buffering parameters
-    int starting_buffer_level;  // in seconds
-    int optimal_buffer_level;
-    int maximum_buffer_size;
-
-    // controlling quality
-    int fixed_q;
-    int worst_allowed_q;
-    int best_allowed_q;
-    int cq_level;
-    int lossless;
-
-    // two pass datarate control
-    int two_pass_vbrbias;        // two pass datarate control tweaks
-    int two_pass_vbrmin_section;
-    int two_pass_vbrmax_section;
-    // END DATARATE CONTROL OPTIONS
-    // ----------------------------------------------------------------
-
-
-    // these parameters aren't to be used in the final build; don't use!
-    int play_alternate;
-    int alt_freq;
-
-    int encode_breakout;  // early breakout encode threshold : for video conf recommend 800
-
-    int arnr_max_frames;
-    int arnr_strength;
-    int arnr_type;
-
-    struct vpx_fixed_buf         two_pass_stats_in;
-    struct vpx_codec_pkt_list  *output_pkt_list;
-
-    vp8e_tuning tuning;
-  } VP9_CONFIG;
-
-
-  void vp9_initialize_enc();
-
-  VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf);
-  void vp9_remove_compressor(VP9_PTR *comp);
-
-  void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf);
-
-// Receive a frame's worth of data. The caller can assume that a copy of this
-// frame is made and not just a copy of the pointer.
-  int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags,
-                            YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
-                            int64_t end_time_stamp);
-
-  int vp9_get_compressed_data(VP9_PTR comp, unsigned int *frame_flags,
-                              unsigned long *size, unsigned char *dest,
-                              int64_t *time_stamp, int64_t *time_end,
-                              int flush);
-
-  int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
-                                vp9_ppflags_t *flags);
-
-  int vp9_use_as_reference(VP9_PTR comp, int ref_frame_flags);
-
-  int vp9_update_reference(VP9_PTR comp, int ref_frame_flags);
-
-  int vp9_get_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,
-                            YV12_BUFFER_CONFIG *sd);
-
-  int vp9_set_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,
-                            YV12_BUFFER_CONFIG *sd);
-
-  int vp9_update_entropy(VP9_PTR comp, int update);
-
-  int vp9_set_roimap(VP9_PTR comp, unsigned char *map,
-                     unsigned int rows, unsigned int cols,
-                     int delta_q[4], int delta_lf[4],
-                     unsigned int threshold[4]);
-
-  int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
-                         unsigned int rows, unsigned int cols);
-
-  int vp9_set_internal_size(VP9_PTR comp,
-                            VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
-
-  int vp9_get_quantizer(VP9_PTR c);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // __INC_ONYX_H
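The declarations above imply the usual create/feed/drain/destroy encoder lifecycle. A minimal sketch, assuming oxcf and raw are populated elsewhere (error handling omitted; not part of this patch):

    VP9_CONFIG oxcf;               // assumed filled in by the caller
    YV12_BUFFER_CONFIG raw;        // assumed to hold one input frame
    unsigned char buf[1024 * 1024];
    unsigned long size = 0;
    unsigned int flags = 0;
    int64_t pts = 0, pts_end = 0;
    VP9_PTR enc;

    vp9_initialize_enc();
    enc = vp9_create_compressor(&oxcf);
    vp9_receive_raw_frame(enc, 0, &raw, 0, 1);
    vp9_get_compressed_data(enc, &flags, &size, buf, &pts, &pts_end, 0);
    vp9_remove_compressor(&enc);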
--- a/vp8/common/onyxc_int.h
+++ /dev/null
@@ -1,314 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ONYXC_INT_H
-#define __INC_ONYXC_INT_H
-
-#include "vpx_config.h"
-#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_rtcd.h"
-#include "loopfilter.h"
-#include "entropymv.h"
-#include "entropy.h"
-#include "entropymode.h"
-#include "idct.h"
-#if CONFIG_POSTPROC
-#include "postproc.h"
-#endif
-
-/*#ifdef PACKET_TESTING*/
-#include "header.h"
-/*#endif*/
-
-/* Create/destroy static data structures. */
-
-void vp9_initialize_common(void);
-
-#define MINQ 0
-
-#define MAXQ 255
-#define QINDEX_BITS 8
-
-#define QINDEX_RANGE (MAXQ + 1)
-
-#define NUM_YV12_BUFFERS 4
-
-#define COMP_PRED_CONTEXTS   2
-
-typedef struct frame_contexts {
-  vp9_prob bmode_prob [VP9_BINTRAMODES - 1];
-  vp9_prob ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
-  vp9_prob uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
-  vp9_prob i8x8_mode_prob [VP9_I8X8_MODES - 1];
-  vp9_prob sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-  vp9_prob mbsplit_prob [VP9_NUMMBSPLITS - 1];
-  vp9_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-
-  nmv_context nmvc;
-  nmv_context pre_nmvc;
-  vp9_prob pre_bmode_prob [VP9_BINTRAMODES - 1];
-  vp9_prob pre_ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
-  vp9_prob pre_uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
-  vp9_prob pre_i8x8_mode_prob [VP9_I8X8_MODES - 1];
-  vp9_prob pre_sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-  vp9_prob pre_mbsplit_prob [VP9_NUMMBSPLITS - 1];
-  unsigned int bmode_counts [VP9_BINTRAMODES];
-  unsigned int ymode_counts [VP9_YMODES];   /* interframe intra mode probs */
-  unsigned int uv_mode_counts [VP9_YMODES][VP9_UV_MODES];
-  unsigned int i8x8_mode_counts [VP9_I8X8_MODES];   /* interframe intra mode probs */
-  unsigned int sub_mv_ref_counts [SUBMVREF_COUNT][VP9_SUBMVREFS];
-  unsigned int mbsplit_counts [VP9_NUMMBSPLITS];
-
-  vp9_prob pre_coef_probs [BLOCK_TYPES] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob pre_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-
-  vp9_prob pre_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob pre_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-
-  vp9_prob pre_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob pre_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-
-  unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-  unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-  unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-  unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-  unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-  unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-  nmv_context_counts NMVcount;
-  vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
-                                 [VP9_SWITCHABLE_FILTERS - 1];
-
-  int mode_context[6][4];
-  int mode_context_a[6][4];
-  int vp8_mode_contexts[6][4];
-  int mv_ref_ct[6][4][2];
-  int mv_ref_ct_a[6][4][2];
-} FRAME_CONTEXT;
-
-typedef enum {
-  RECON_CLAMP_REQUIRED        = 0,
-  RECON_CLAMP_NOTREQUIRED     = 1
-} CLAMP_TYPE;
-
-typedef enum {
-  SINGLE_PREDICTION_ONLY = 0,
-  COMP_PREDICTION_ONLY   = 1,
-  HYBRID_PREDICTION      = 2,
-  NB_PREDICTION_TYPES    = 3,
-} COMPPREDMODE_TYPE;
-
-typedef enum {
-  ONLY_4X4            = 0,
-  ALLOW_8X8           = 1,
-  ALLOW_16X16         = 2,
-  TX_MODE_SELECT      = 3,
-  NB_TXFM_MODES       = 4,
-} TXFM_MODE;
-
-typedef struct VP9_COMMON_RTCD {
-#if CONFIG_RUNTIME_CPU_DETECT
-  vp9_idct_rtcd_vtable_t        idct;
-  vp9_subpix_rtcd_vtable_t      subpix;
-#if CONFIG_POSTPROC
-  vp9_postproc_rtcd_vtable_t    postproc;
-#endif
-  int                           flags;
-#else
-  int unused;
-#endif
-} VP9_COMMON_RTCD;
-
-typedef struct VP9Common {
-  struct vpx_internal_error_info  error;
-
-  DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][16]);
-
-  int Width;
-  int Height;
-  int horiz_scale;
-  int vert_scale;
-
-  YUV_TYPE clr_type;
-  CLAMP_TYPE  clamp_type;
-
-  YV12_BUFFER_CONFIG *frame_to_show;
-
-  YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
-  int fb_idx_ref_cnt[NUM_YV12_BUFFERS];
-  int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx;
-
-  YV12_BUFFER_CONFIG post_proc_buffer;
-  YV12_BUFFER_CONFIG temp_scale_frame;
-
-
-  FRAME_TYPE last_frame_type;  /* Save last frame's frame type for motion search. */
-  FRAME_TYPE frame_type;
-
-  int show_frame;
-
-  int frame_flags;
-  int MBs;
-  int mb_rows;
-  int mb_cols;
-  int mode_info_stride;
-
-  /* profile settings */
-  int experimental;
-  int mb_no_coeff_skip;
-  TXFM_MODE txfm_mode;
-  COMPPREDMODE_TYPE comp_pred_mode;
-  int no_lpf;
-  int use_bilinear_mc_filter;
-  int full_pixel;
-
-  int base_qindex;
-  int last_kf_gf_q;  /* Q used on the last GF or KF */
-
-  int y1dc_delta_q;
-  int y2dc_delta_q;
-  int y2ac_delta_q;
-  int uvdc_delta_q;
-  int uvac_delta_q;
-
-  unsigned int frames_since_golden;
-  unsigned int frames_till_alt_ref_frame;
-
-  /* We allocate a MODE_INFO struct for each macroblock, together with
-     an extra row on top and column on the left to simplify prediction. */
-
-  MODE_INFO *mip; /* Base of allocated array */
-  MODE_INFO *mi;  /* Corresponds to upper left visible macroblock */
-  MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
-  MODE_INFO *prev_mi;  /* 'mi' from last frame (points into prev_mip) */
-
-
-  // Persistent mb segment id map used in prediction.
-  unsigned char *last_frame_seg_map;
-
-  INTERPOLATIONFILTERTYPE mcomp_filter_type;
-  LOOPFILTERTYPE filter_type;
-
-  loop_filter_info_n lf_info;
-
-  int filter_level;
-  int last_sharpness_level;
-  int sharpness_level;
-
-  int refresh_last_frame;       /* Two state 0 = NO, 1 = YES */
-  int refresh_golden_frame;     /* Two state 0 = NO, 1 = YES */
-  int refresh_alt_ref_frame;     /* Two state 0 = NO, 1 = YES */
-
-  int copy_buffer_to_gf;         /* 0 none, 1 Last to GF, 2 ARF to GF */
-  int copy_buffer_to_arf;        /* 0 none, 1 Last to ARF, 2 GF to ARF */
-
-  int refresh_entropy_probs;    /* Two state 0 = NO, 1 = YES */
-
-  int ref_frame_sign_bias[MAX_REF_FRAMES];    /* Two state 0, 1 */
-
-  /* Y,U,V,Y2 */
-  ENTROPY_CONTEXT_PLANES *above_context;   /* row of context for each plane */
-  ENTROPY_CONTEXT_PLANES left_context[2];  /* (up to) 4 contexts "" */
-
-  /* keyframe block modes are predicted by their above, left neighbors */
-
-  vp9_prob kf_bmode_prob [VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES - 1];
-  vp9_prob kf_ymode_prob[8][VP9_YMODES - 1]; /* keyframe "" */
-#if CONFIG_SUPERBLOCKS
-  vp9_prob sb_kf_ymode_prob[8][VP9_I32X32_MODES - 1];
-#endif
-  int kf_ymode_probs_index;
-  int kf_ymode_probs_update;
-  vp9_prob kf_uv_mode_prob[VP9_YMODES] [VP9_UV_MODES - 1];
-
-  vp9_prob prob_intra_coded;
-  vp9_prob prob_last_coded;
-  vp9_prob prob_gf_coded;
-#if CONFIG_SUPERBLOCKS
-  vp9_prob sb_coded;
-#endif
-
-  // Context probabilities when using predictive coding of segment id
-  vp9_prob segment_pred_probs[PREDICTION_PROBS];
-  unsigned char temporal_update;
-
-  // Context probabilities for reference frame prediction
-  unsigned char ref_scores[MAX_REF_FRAMES];
-  vp9_prob ref_pred_probs[PREDICTION_PROBS];
-  vp9_prob mod_refprobs[MAX_REF_FRAMES][PREDICTION_PROBS];
-
-  vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
-
-  // FIXME contextualize
-  vp9_prob prob_tx[TX_SIZE_MAX - 1];
-
-  vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS];
-
-  FRAME_CONTEXT lfc_a; /* last alt ref entropy */
-  FRAME_CONTEXT lfc; /* last frame entropy */
-  FRAME_CONTEXT fc;  /* this frame entropy */
-
-  // int mv_ref_ct[6][4][2];
-  // int mv_ref_ct_a[6][4][2];
-  // int mode_context[6][4];
-  // int mode_context_a[6][4];
-  // int vp8_mode_contexts[6][4];
-
-  unsigned int current_video_frame;
-  int near_boffset[3];
-  int version;
-
-#ifdef PACKET_TESTING
-  VP9_HEADER oh;
-#endif
-  double bitrate;
-  double framerate;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-  VP9_COMMON_RTCD rtcd;
-#endif
-
-#if CONFIG_POSTPROC
-  struct postproc_state  postproc_state;
-#endif
-
-#if CONFIG_PRED_FILTER
-  /* Prediction filter variables */
-  int pred_filter_mode;   // 0=disabled at the frame level (no MB filtered)
-                          // 1=enabled at the frame level (all MB filtered)
-                          // 2=specified per MB (1=filtered, 0=non-filtered)
-  vp9_prob prob_pred_filter_off;
-#endif
-
-} VP9_COMMON;
-
-#endif  // __INC_ONYX_INT_H
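The MODE_INFO comment in the struct above describes one border row on top and one border column on the left of the visible frame. A minimal sketch of the resulting pointer setup, assuming the usual companion field mode_info_stride (declared elsewhere in this struct) and a VP9_COMMON instance cm:

    /* mip points at the full allocation including the border; mi is offset
       to the first visible macroblock so mi[-1] and mi[-stride] are valid. */
    cm->mi      = cm->mip      + cm->mode_info_stride + 1;
    cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;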
--- a/vp8/common/onyxd.h
+++ /dev/null
@@ -1,68 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ONYXD_H
-#define __INC_ONYXD_H
-
-
-/* Create/destroy static data structures. */
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-#include "type_aliases.h"
-#include "vpx_scale/yv12config.h"
-#include "ppflags.h"
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_codec.h"
-
-  typedef void   *VP9D_PTR;
-  typedef struct {
-    int     Width;
-    int     Height;
-    int     Version;
-    int     postprocess;
-    int     max_threads;
-    int     input_partition;
-  } VP9D_CONFIG;
-  typedef enum {
-    VP9_LAST_FLAG = 1,
-    VP9_GOLD_FLAG = 2,
-    VP9_ALT_FLAG = 4
-  } VP9_REFFRAME;
-
-  void vp9_initialize_dec(void);
-
-  int vp9_receive_compressed_data(VP9D_PTR comp, unsigned long size,
-                                  const unsigned char *dest,
-                                  int64_t time_stamp);
-
-  int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
-                        int64_t *time_stamp, int64_t *time_end_stamp,
-                        vp9_ppflags_t *flags);
-
-  vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR comp,
-                                        VP9_REFFRAME ref_frame_flag,
-                                        YV12_BUFFER_CONFIG *sd);
-
-  vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp,
-                                        VP9_REFFRAME ref_frame_flag,
-                                        YV12_BUFFER_CONFIG *sd);
-
-  VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf);
-
-  void vp9_remove_decompressor(VP9D_PTR comp);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // __INC_ONYXD_H
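A hedged usage sketch of the decoder API above; buf, buf_size and pts are placeholders for real bitstream input, and error handling is elided:

    VP9D_CONFIG cfg = { 0 };
    VP9D_PTR dec;
    YV12_BUFFER_CONFIG frame;
    vp9_ppflags_t ppflags = { 0 };
    int64_t pts_out, pts_end;
    /* buf / buf_size / pts come from the demuxer (placeholders here) */
    const unsigned char *buf = 0;
    unsigned long buf_size = 0;
    int64_t pts = 0;

    cfg.max_threads = 1;
    vp9_initialize_dec();
    dec = vp9_create_decompressor(&cfg);
    if (vp9_receive_compressed_data(dec, buf_size, buf, pts) == 0 &&
        vp9_get_raw_frame(dec, &frame, &pts_out, &pts_end, &ppflags) == 0) {
      /* frame now describes the decoded (optionally post-processed) picture */
    }
    vp9_remove_decompressor(dec);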
--- a/vp8/common/postproc.c
+++ /dev/null
@@ -1,1035 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_scale/yv12config.h"
-#include "postproc.h"
-#include "vpx_scale/yv12extend.h"
-#include "vpx_scale/vpxscale.h"
-#include "systemdependent.h"
-
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-
-#define RGB_TO_YUV(t)                                            \
-  ( (0.257*(float)(t >> 16))  + (0.504*(float)(t >> 8 & 0xff)) + \
-    (0.098*(float)(t & 0xff)) + 16),                             \
-  (-(0.148*(float)(t >> 16))  - (0.291*(float)(t >> 8 & 0xff)) + \
-    (0.439*(float)(t & 0xff)) + 128),                            \
-  ( (0.439*(float)(t >> 16))  - (0.368*(float)(t >> 8 & 0xff)) - \
-    (0.071*(float)(t & 0xff)) + 128)
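The macro is the BT.601 studio-swing RGB-to-YUV transform applied to a packed 0xRRGGBB literal, expanding to three comma-separated values. Worked through for pure green 0x00FF00 (R=0, G=255, B=0): Y = 0.504*255 + 16 = 144.5, U = -0.291*255 + 128 = 53.8, V = -0.368*255 + 128 = 34.2, which truncate to {144, 53, 34} when stored in the unsigned char tables below.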
-
-/* global constants */
-#if CONFIG_POSTPROC_VISUALIZER
-static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = {
-  { RGB_TO_YUV(0x98FB98) },   /* PaleGreen */
-  { RGB_TO_YUV(0x00FF00) },   /* Green */
-  { RGB_TO_YUV(0xADFF2F) },   /* GreenYellow */
-  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
-  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
-  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
-  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
-  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
-  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
-  { RGB_TO_YUV(0x228B22) },   /* ForestGreen */
-  { RGB_TO_YUV(0x006400) },   /* DarkGreen */
-  { RGB_TO_YUV(0x98F5FF) },   /* Cadet Blue */
-  { RGB_TO_YUV(0x6CA6CD) },   /* Sky Blue */
-  { RGB_TO_YUV(0x00008B) },   /* Dark blue */
-  { RGB_TO_YUV(0x551A8B) },   /* Purple */
-  { RGB_TO_YUV(0xFF0000) },   /* Red */
-  { RGB_TO_YUV(0xCC33FF) },   /* Magenta */
-};
-
-static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] = {
-  { RGB_TO_YUV(0x6633ff) },   /* Purple */
-  { RGB_TO_YUV(0xcc33ff) },   /* Magenta */
-  { RGB_TO_YUV(0xff33cc) },   /* Pink */
-  { RGB_TO_YUV(0xff3366) },   /* Coral */
-  { RGB_TO_YUV(0x3366ff) },   /* Blue */
-  { RGB_TO_YUV(0xed00f5) },   /* Dark Blue */
-  { RGB_TO_YUV(0x2e00b8) },   /* Dark Purple */
-  { RGB_TO_YUV(0xff6633) },   /* Orange */
-  { RGB_TO_YUV(0x33ccff) },   /* Light Blue */
-  { RGB_TO_YUV(0x8ab800) },   /* Green */
-  { RGB_TO_YUV(0xffcc33) },   /* Light Orange */
-  { RGB_TO_YUV(0x33ffcc) },   /* Aqua */
-  { RGB_TO_YUV(0x66ff33) },   /* Light Green */
-  { RGB_TO_YUV(0xccff33) },   /* Yellow */
-};
-
-static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = {
-  { RGB_TO_YUV(0x00ff00) },   /* Green */
-  { RGB_TO_YUV(0x0000ff) },   /* Blue */
-  { RGB_TO_YUV(0xffff00) },   /* Yellow */
-  { RGB_TO_YUV(0xff0000) },   /* Red */
-};
-#endif
-
-static const short kernel5[] = {
-  1, 1, 4, 1, 1
-};
-
-const short vp9_rv[] = {
-  8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
-  0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
-  10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
-  8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
-  8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
-  1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
-  3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
-  11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
-  14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
-  4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
-  7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
-  0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
-  8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
-  3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
-  3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
-  13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
-  5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
-  9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
-  4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
-  3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
-  11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
-  5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
-  0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
-  10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
-  4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
-  0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
-  8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
-  3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
-  3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
-  13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
-  5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
-  9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
-  4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
-  3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
-  11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
-  5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
-  0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
-  10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
-  4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
-  3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
-  11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
-  14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
-  5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
-  0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
-};
-
-
-extern void vp9_blit_text(const char *msg, unsigned char *address,
-                          const int pitch);
-extern void vp9_blit_line(int x0, int x1, int y0, int y1,
-                          unsigned char *image, const int pitch);
-/****************************************************************************
- *  Thresholded 5-tap {1, 1, 4, 1, 1} smoothing: each row is filtered down
- *  (vertically) into dst, then across (horizontally) in place; pixels whose
- *  neighborhood differs by more than flimit are passed through unchanged.
- ****************************************************************************/
-void vp9_post_proc_down_and_across_c(unsigned char *src_ptr,
-                                     unsigned char *dst_ptr,
-                                     int src_pixels_per_line,
-                                     int dst_pixels_per_line,
-                                     int rows,
-                                     int cols,
-                                     int flimit) {
-  unsigned char *p_src, *p_dst;
-  int row;
-  int col;
-  int i;
-  int v;
-  int pitch = src_pixels_per_line;
-  unsigned char d[8];
-  (void)dst_pixels_per_line;
-
-  for (row = 0; row < rows; row++) {
-    /* post_proc_down for one row */
-    p_src = src_ptr;
-    p_dst = dst_ptr;
-
-    for (col = 0; col < cols; col++) {
-
-      int kernel = 4;
-      int v = p_src[col];
-
-      for (i = -2; i <= 2; i++) {
-        if (abs(v - p_src[col + i * pitch]) > flimit)
-          goto down_skip_convolve;
-
-        kernel += kernel5[2 + i] * p_src[col + i * pitch];
-      }
-
-      v = (kernel >> 3);
-    down_skip_convolve:
-      p_dst[col] = v;
-    }
-
-    /* now post_proc_across */
-    p_src = dst_ptr;
-    p_dst = dst_ptr;
-
-    for (i = 0; i < 8; i++)
-      d[i] = p_src[i];
-
-    for (col = 0; col < cols; col++) {
-      int kernel = 4;
-      v = p_src[col];
-
-      d[col & 7] = v;
-
-      for (i = -2; i <= 2; i++) {
-        if (abs(v - p_src[col + i]) > flimit)
-          goto across_skip_convolve;
-
-        kernel += kernel5[2 + i] * p_src[col + i];
-      }
-
-      d[col & 7] = (kernel >> 3);
-    across_skip_convolve:
-
-      if (col >= 2)
-        p_dst[col - 2] = d[(col - 2) & 7];
-    }
-
-    /* handle the last two pixels */
-    p_dst[col - 2] = d[(col - 2) & 7];
-    p_dst[col - 1] = d[(col - 1) & 7];
-
-
-    /* next row */
-    src_ptr += pitch;
-    dst_ptr += pitch;
-  }
-}
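A 1-D distillation of the kernel logic above, as a hypothetical standalone helper (p points at the center pixel; requires <stdlib.h> for abs): the taps {1, 1, 4, 1, 1} sum to 8, the initial 4 is the rounding bias for the >> 3 divide, and any neighbor further than flimit from the center aborts the convolution so edges pass through unfiltered.

    static unsigned char smooth5(const unsigned char *p, int flimit) {
      static const short k[5] = { 1, 1, 4, 1, 1 };
      int acc = 4;  /* rounding bias: (sum + 4) >> 3 rounds to nearest */
      int i;
      for (i = -2; i <= 2; i++) {
        if (abs(p[0] - p[i]) > flimit)
          return p[0];  /* neighborhood too busy: keep the pixel */
        acc += k[2 + i] * p[i];
      }
      return (unsigned char)(acc >> 3);
    }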
-
-static int q2mbl(int x) {
-  if (x < 20) x = 20;
-
-  x = 50 + (x - 50) * 10 / 8;
-  return x * x / 3;
-}
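q2mbl() turns a quantizer into the squared flatness limit used by the two macroblock post-filters below. Worked through for q = 40: x = 50 + (40 - 50) * 10 / 8 = 38 with integer division, so the limit is 38 * 38 / 3 = 481; inputs below 20 are clamped first, giving a floor of q2mbl(20) = 56.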
-
-void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch,
-                                 int rows, int cols, int flimit) {
-  int r, c, i;
-
-  unsigned char *s = src;
-  unsigned char d[16];
-
-
-  for (r = 0; r < rows; r++) {
-    int sumsq = 0;
-    int sum   = 0;
-
-    for (i = -8; i <= 6; i++) {
-      sumsq += s[i] * s[i];
-      sum   += s[i];
-      d[i + 8] = 0;
-    }
-
-    for (c = 0; c < cols + 8; c++) {
-      int x = s[c + 7] - s[c - 8];
-      int y = s[c + 7] + s[c - 8];
-
-      sum  += x;
-      sumsq += x * y;
-
-      d[c & 15] = s[c];
-
-      if (sumsq * 15 - sum * sum < flimit) {
-        d[c & 15] = (8 + sum + s[c]) >> 4;
-      }
-
-      s[c - 8] = d[(c - 8) & 15];
-    }
-
-    s += pitch;
-  }
-}
-
-void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch,
-                            int rows, int cols, int flimit) {
-  int r, c, i;
-  const short *rv3 = &vp9_rv[63 & rand()];
-
-  for (c = 0; c < cols; c++) {
-    unsigned char *s = &dst[c];
-    int sumsq = 0;
-    int sum   = 0;
-    unsigned char d[16];
-    const short *rv2 = rv3 + ((c * 17) & 127);
-
-    for (i = -8; i <= 6; i++) {
-      sumsq += s[i * pitch] * s[i * pitch];
-      sum   += s[i * pitch];
-    }
-
-    for (r = 0; r < rows + 8; r++) {
-      sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
-      sum  += s[7 * pitch] - s[-8 * pitch];
-      d[r & 15] = s[0];
-
-      if (sumsq * 15 - sum * sum < flimit) {
-        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
-      }
-
-      s[-8 * pitch] = d[(r - 8) & 15];
-      s += pitch;
-    }
-  }
-}
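Both loops above keep a running sum and sum of squares over a sliding 15-pixel window and test sumsq * 15 - sum * sum < flimit. For n samples, n * sum(x^2) - (sum(x))^2 = n^2 * variance, so the (rounding + sum + center) >> 4 average of 16 values is applied only where the window is nearly flat; in the vertical pass the vp9_rv table supplies per-position pseudo-random rounding offsets so the fixed /16 does not band.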
-
-static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG   *source,
-                                       YV12_BUFFER_CONFIG   *post,
-                                       int                   q,
-                                       int                   low_var_thresh,
-                                       int                   flag,
-                                       vp9_postproc_rtcd_vtable_t *rtcd) {
-  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
-  int ppl = (int)(level + .5);
-  (void) low_var_thresh;
-  (void) flag;
-
-  POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer,
-                                    source->y_stride,  post->y_stride,
-                                    source->y_height, source->y_width,  ppl);
-  POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride,
-                                post->y_height, post->y_width, q2mbl(q));
-  POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride,
-                              post->y_height, post->y_width, q2mbl(q));
-
-  POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer,
-                                    source->uv_stride, post->uv_stride,
-                                    source->uv_height, source->uv_width, ppl);
-  POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer,
-                                    source->uv_stride, post->uv_stride,
-                                    source->uv_height, source->uv_width, ppl);
-}
-
-void vp9_deblock(YV12_BUFFER_CONFIG         *source,
-                 YV12_BUFFER_CONFIG         *post,
-                 int                         q,
-                 int                         low_var_thresh,
-                 int                         flag,
-                 vp9_postproc_rtcd_vtable_t *rtcd) {
-  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
-  int ppl = (int)(level + .5);
-  (void) low_var_thresh;
-  (void) flag;
-
-  POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer,
-                                    source->y_stride,  post->y_stride,
-                                    source->y_height, source->y_width,   ppl);
-  POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer,
-                                    source->uv_stride, post->uv_stride,
-                                    source->uv_height, source->uv_width, ppl);
-  POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer,
-                                    source->uv_stride, post->uv_stride,
-                                    source->uv_height, source->uv_width, ppl);
-}
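deblock_and_de_macro_block() and vp9_deblock() share the same strength mapping: level = 6e-5*q^3 - 0.0067*q^2 + 0.306*q + 0.0065, rounded to the nearest integer. For q = 32 that is 1.966 - 6.861 + 9.792 + 0.007 = 4.904, so ppl = 5 becomes the flimit handed to the down/across filter.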
-
-void vp9_de_noise(YV12_BUFFER_CONFIG         *src,
-                  YV12_BUFFER_CONFIG         *post,
-                  int                         q,
-                  int                         low_var_thresh,
-                  int                         flag,
-                  vp9_postproc_rtcd_vtable_t *rtcd) {
-  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
-  int ppl = (int)(level + .5);
-  (void) post;
-  (void) low_var_thresh;
-  (void) flag;
-
-  POSTPROC_INVOKE(rtcd, downacross)(src->y_buffer + 2 * src->y_stride + 2,
-                                    src->y_buffer + 2 * src->y_stride + 2,
-                                    src->y_stride,
-                                    src->y_stride,
-                                    src->y_height - 4,
-                                    src->y_width - 4,
-                                    ppl);
-  POSTPROC_INVOKE(rtcd, downacross)(src->u_buffer + 2 * src->uv_stride + 2,
-                                    src->u_buffer + 2 * src->uv_stride + 2,
-                                    src->uv_stride,
-                                    src->uv_stride,
-                                    src->uv_height - 4,
-                                    src->uv_width - 4, ppl);
-  POSTPROC_INVOKE(rtcd, downacross)(src->v_buffer + 2 * src->uv_stride + 2,
-                                    src->v_buffer + 2 * src->uv_stride + 2,
-                                    src->uv_stride,
-                                    src->uv_stride,
-                                    src->uv_height - 4,
-                                    src->uv_width - 4, ppl);
-}
-
-double vp9_gaussian(double sigma, double mu, double x) {
-  return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
-         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
-}
-
-static void fillrd(struct postproc_state *state, int q, int a) {
-  char char_dist[300];
-
-  double sigma;
-  int ai = a, qi = q, i;
-
-  vp9_clear_system_state();
-
-  sigma = ai + .5 + .6 * (63 - qi) / 63.0;
-
-  /* set up a lookup table of 256 entries that matches
-   * a gaussian distribution with sigma determined by q.
-   */
-  {
-    double i;
-    int next, j;
-
-    next = 0;
-
-    for (i = -32; i < 32; i++) {
-      int a = (int)(.5 + 256 * vp9_gaussian(sigma, 0, i));
-
-      if (a) {
-        for (j = 0; j < a; j++) {
-          char_dist[next + j] = (char) i;
-        }
-
-        next = next + j;
-      }
-
-    }
-
-    for (; next < 256; next++)
-      char_dist[next] = 0;
-  }
-
-  for (i = 0; i < 3072; i++) {
-    state->noise[i] = char_dist[rand() & 0xff];
-  }
-
-  for (i = 0; i < 16; i++) {
-    state->blackclamp[i] = -char_dist[0];
-    state->whiteclamp[i] = -char_dist[0];
-    state->bothclamp[i] = -2 * char_dist[0];
-  }
-
-  state->last_q = q;
-  state->last_noise = a;
-}
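The table construction above is an inverse-CDF draw: each value i gets round(256 * N(i; 0, sigma)) consecutive slots of char_dist and leftover slots are zeroed, so indexing with a uniform rand() & 0xff (as the noise loop below does) samples an approximately Gaussian distribution. For example, at the minimum sigma = 0.5 + 0.6 = 1.1 (q = 0, a = 0), the value 0 alone occupies round(256 / (1.1 * sqrt(2*pi))) = 93 of the 256 slots. All three clamps derive from char_dist[0], the most negative value generated, relying on the distribution's symmetry.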
-
-/****************************************************************************
- *
- *  ROUTINE       : plane_add_noise_c
- *
- *  INPUTS        : unsigned char *Start  starting address of buffer to
- *                                        add gaussian noise to
- *                  unsigned int Width    width of plane
- *                  unsigned int Height   height of plane
- *                  int  Pitch    distance between subsequent lines of frame
- *                  int  q        quantizer used to determine amount of noise
- *                                  to add
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void.
- *
- *  FUNCTION      : adds gaussian noise to a plane of pixels
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void vp9_plane_add_noise_c(unsigned char *Start, char *noise,
-                           char blackclamp[16],
-                           char whiteclamp[16],
-                           char bothclamp[16],
-                           unsigned int Width, unsigned int Height, int Pitch) {
-  unsigned int i, j;
-
-  for (i = 0; i < Height; i++) {
-    unsigned char *Pos = Start + i * Pitch;
-    char  *Ref = (char *)(noise + (rand() & 0xff));
-
-    for (j = 0; j < Width; j++) {
-      if (Pos[j] < blackclamp[0])
-        Pos[j] = blackclamp[0];
-
-      /* whiteclamp[0] is positive (fillrd() stores -char_dist[0]); clamp
-         bright pixels down so adding positive noise cannot exceed 255 */
-      if (Pos[j] > 255 - whiteclamp[0])
-        Pos[j] = 255 - whiteclamp[0];
-
-      Pos[j] += Ref[j];
-    }
-  }
-}
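A bound check on the add above, using the values fillrd() stored: blackclamp[0] = whiteclamp[0] = -char_dist[0], the magnitude of the most negative noise sample (by symmetry also roughly the most positive one). After the two pre-clamps,

    -char_dist[0] <= Pos[j] <= 255 - whiteclamp[0]
     char_dist[0] <= Ref[j] <= whiteclamp[0]

so 0 <= Pos[j] + Ref[j] <= 255 and the unsigned addition cannot wrap.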
-
-/* Blend the macroblock with a solid colored square, leaving the
- * edges unblended so that adjacent macroblocks drawn in the same
- * color remain visually distinct.
- */
-void vp9_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v,
-                          int y1, int u1, int v1, int alpha, int stride) {
-  int i, j;
-  int y1_const = y1 * ((1 << 16) - alpha);
-  int u1_const = u1 * ((1 << 16) - alpha);
-  int v1_const = v1 * ((1 << 16) - alpha);
-
-  y += 2 * stride + 2;
-  for (i = 0; i < 12; i++) {
-    for (j = 0; j < 12; j++) {
-      y[j] = (y[j] * alpha + y1_const) >> 16;
-    }
-    y += stride;
-  }
-
-  stride >>= 1;
-
-  u += stride + 1;
-  v += stride + 1;
-
-  for (i = 0; i < 6; i++) {
-    for (j = 0; j < 6; j++) {
-      u[j] = (u[j] * alpha + u1_const) >> 16;
-      v[j] = (v[j] * alpha + v1_const) >> 16;
-    }
-    u += stride;
-    v += stride;
-  }
-}
-
-/* Blend only the edge of the macro block.  Leave center
- * unblended to allow for other visualizations to be layered.
- */
-void vp9_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v,
-                          int y1, int u1, int v1, int alpha, int stride) {
-  int i, j;
-  int y1_const = y1 * ((1 << 16) - alpha);
-  int u1_const = u1 * ((1 << 16) - alpha);
-  int v1_const = v1 * ((1 << 16) - alpha);
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 16; j++) {
-      y[j] = (y[j] * alpha + y1_const) >> 16;
-    }
-    y += stride;
-  }
-
-  for (i = 0; i < 12; i++) {
-    y[0]  = (y[0] * alpha  + y1_const) >> 16;
-    y[1]  = (y[1] * alpha  + y1_const) >> 16;
-    y[14] = (y[14] * alpha + y1_const) >> 16;
-    y[15] = (y[15] * alpha + y1_const) >> 16;
-    y += stride;
-  }
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 16; j++) {
-      y[j] = (y[j] * alpha + y1_const) >> 16;
-    }
-    y += stride;
-  }
-
-  stride >>= 1;
-
-  for (j = 0; j < 8; j++) {
-    u[j] = (u[j] * alpha + u1_const) >> 16;
-    v[j] = (v[j] * alpha + v1_const) >> 16;
-  }
-  u += stride;
-  v += stride;
-
-  for (i = 0; i < 6; i++) {
-    u[0] = (u[0] * alpha + u1_const) >> 16;
-    v[0] = (v[0] * alpha + v1_const) >> 16;
-
-    u[7] = (u[7] * alpha + u1_const) >> 16;
-    v[7] = (v[7] * alpha + v1_const) >> 16;
-
-    u += stride;
-    v += stride;
-  }
-
-  for (j = 0; j < 8; j++) {
-    u[j] = (u[j] * alpha + u1_const) >> 16;
-    v[j] = (v[j] * alpha + v1_const) >> 16;
-  }
-}
-
-void vp9_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v,
-                   int y1, int u1, int v1, int alpha, int stride) {
-  int i, j;
-  int y1_const = y1 * ((1 << 16) - alpha);
-  int u1_const = u1 * ((1 << 16) - alpha);
-  int v1_const = v1 * ((1 << 16) - alpha);
-
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      y[j] = (y[j] * alpha + y1_const) >> 16;
-    }
-    y += stride;
-  }
-
-  stride >>= 1;
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      u[j] = (u[j] * alpha + u1_const) >> 16;
-      v[j] = (v[j] * alpha + v1_const) >> 16;
-    }
-    u += stride;
-    v += stride;
-  }
-}
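All three blend routines above use the same Q16 fixed-point mix, shown here as a hypothetical scalar helper; with the callers' alpha = 0xc000 (49152/65536 = 0.75) the result is 75% source pixel, 25% overlay color:

    static unsigned char blend_q16(unsigned char pixel, unsigned char overlay,
                                   int alpha) {
      /* out = alpha*pixel + (1 - alpha)*overlay, in 16.16 fixed point */
      return (unsigned char)((pixel * alpha +
                              overlay * ((1 << 16) - alpha)) >> 16);
    }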
-
-static void constrain_line(int x0, int *x1, int y0, int *y1,
-                           int width, int height) {
-  int dx;
-  int dy;
-
-  if (*x1 > width) {
-    dx = *x1 - x0;
-    dy = *y1 - y0;
-
-    *x1 = width;
-    if (dx)
-      *y1 = ((width - x0) * dy) / dx + y0;
-  }
-  if (*x1 < 0) {
-    dx = *x1 - x0;
-    dy = *y1 - y0;
-
-    *x1 = 0;
-    if (dx)
-      *y1 = ((0 - x0) * dy) / dx + y0;
-  }
-  if (*y1 > height) {
-    dx = *x1 - x0;
-    dy = *y1 - y0;
-
-    *y1 = height;
-    if (dy)
-      *x1 = ((height - y0) * dx) / dy + x0;
-  }
-  if (*y1 < 0) {
-    dx = *x1 - x0;
-    dy = *y1 - y0;
-
-    *y1 = 0;
-    if (dy)
-      *x1 = ((0 - y0) * dx) / dy + x0;
-  }
-}
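constrain_line() clips the far endpoint of each motion-vector ray against a frame edge by similar triangles: an overshoot x1 > width is pulled back to x1 = width with y1 = y0 + (width - x0) * dy / dx, and likewise for the other three edges. For example, clipping (10,10)-(30,20) against width = 20 gives y1 = 10 + (20 - 10) * 10 / 20 = 15, so the segment ends at (20, 15).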
-
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define RTCD_VTABLE(oci) (&(oci)->rtcd.postproc)
-#else
-#define RTCD_VTABLE(oci) NULL
-#endif
-
-int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
-                        vp9_ppflags_t *ppflags) {
-  int q = oci->filter_level * 10 / 6;
-  int flags = ppflags->post_proc_flag;
-  int deblock_level = ppflags->deblocking_level;
-  int noise_level = ppflags->noise_level;
-
-  if (!oci->frame_to_show)
-    return -1;
-
-  if (q > 63)
-    q = 63;
-
-  if (!flags) {
-    *dest = *oci->frame_to_show;
-
-    /* handle problem with extending borders */
-    dest->y_width = oci->Width;
-    dest->y_height = oci->Height;
-    dest->uv_height = dest->y_height / 2;
-    return 0;
-
-  }
-
-#if ARCH_X86 || ARCH_X86_64
-  vpx_reset_mmx_state();
-#endif
-
-  if (flags & VP9D_DEMACROBLOCK) {
-    deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
-                               q + (deblock_level - 5) * 10, 1, 0,
-                               RTCD_VTABLE(oci));
-  } else if (flags & VP9D_DEBLOCK) {
-    vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer,
-                q, 1, 0, RTCD_VTABLE(oci));
-  } else {
-    vp8_yv12_copy_frame_ptr(oci->frame_to_show, &oci->post_proc_buffer);
-  }
-
-  if (flags & VP9D_ADDNOISE) {
-    /* fillrd() records its strength argument, 63 - q, as last_q */
-    if (oci->postproc_state.last_q != 63 - q
-        || oci->postproc_state.last_noise != noise_level) {
-      fillrd(&oci->postproc_state, 63 - q, noise_level);
-    }
-
-    POSTPROC_INVOKE(RTCD_VTABLE(oci), addnoise)(oci->post_proc_buffer.y_buffer,
-                                                oci->postproc_state.noise,
-                                                oci->postproc_state.blackclamp,
-                                                oci->postproc_state.whiteclamp,
-                                                oci->postproc_state.bothclamp,
-                                                oci->post_proc_buffer.y_width,
-                                                oci->post_proc_buffer.y_height,
-                                                oci->post_proc_buffer.y_stride);
-  }
-
-#if CONFIG_POSTPROC_VISUALIZER
-  if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
-    char message[512];
-    sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
-            (oci->frame_type == KEY_FRAME),
-            oci->refresh_golden_frame,
-            oci->base_qindex,
-            oci->filter_level,
-            flags,
-            oci->mb_cols, oci->mb_rows);
-    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
-                  oci->post_proc_buffer.y_stride);
-  }
-
-  if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
-    int i, j;
-    unsigned char *y_ptr;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
-    int mb_rows = post->y_height >> 4;
-    int mb_cols = post->y_width  >> 4;
-    int mb_index = 0;
-    MODE_INFO *mi = oci->mi;
-
-    y_ptr = post->y_buffer + 4 * post->y_stride + 4;
-
-    /* vp9_filter each macro block */
-    for (i = 0; i < mb_rows; i++) {
-      for (j = 0; j < mb_cols; j++) {
-        char zz[4];
-
-        sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
-
-        vp9_blit_text(zz, y_ptr, post->y_stride);
-        mb_index++;
-        y_ptr += 16;
-      }
-
-      mb_index++; /* border */
-      y_ptr += post->y_stride  * 16 - post->y_width;
-
-    }
-  }
-
-  if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
-    int i, j;
-    unsigned char *y_ptr;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
-    int mb_rows = post->y_height >> 4;
-    int mb_cols = post->y_width  >> 4;
-    int mb_index = 0;
-    MODE_INFO *mi = oci->mi;
-
-    y_ptr = post->y_buffer + 4 * post->y_stride + 4;
-
-    /* vp9_filter each macro block */
-    for (i = 0; i < mb_rows; i++) {
-      for (j = 0; j < mb_cols; j++) {
-        char zz[4];
-        int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
-                        mi[mb_index].mbmi.mode != SPLITMV &&
-                        mi[mb_index].mbmi.mb_skip_coeff);
-
-        if (oci->frame_type == KEY_FRAME)
-          sprintf(zz, "a");
-        else
-          sprintf(zz, "%c", dc_diff + '0');
-
-        vp9_blit_text(zz, y_ptr, post->y_stride);
-        mb_index++;
-        y_ptr += 16;
-      }
-
-      mb_index++; /* border */
-      y_ptr += post->y_stride  * 16 - post->y_width;
-
-    }
-  }
-
-  if (flags & VP9D_DEBUG_TXT_RATE_INFO) {
-    char message[512];
-    snprintf(message, sizeof(message),
-             "Bitrate: %10.2f frame_rate: %10.2f ",
-             oci->bitrate, oci->framerate);
-    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
-                  oci->post_proc_buffer.y_stride);
-  }
-
-  /* Draw motion vectors */
-  if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
-    int width  = post->y_width;
-    int height = post->y_height;
-    unsigned char *y_buffer = oci->post_proc_buffer.y_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
-    int x0, y0;
-
-    for (y0 = 0; y0 < height; y0 += 16) {
-      for (x0 = 0; x0 < width; x0 += 16) {
-        int x1, y1;
-
-        if (!(ppflags->display_mv_flag & (1 << mi->mbmi.mode))) {
-          mi++;
-          continue;
-        }
-
-        if (mi->mbmi.mode == SPLITMV) {
-          switch (mi->mbmi.partitioning) {
-            case PARTITIONING_16X8 : {  /* mv_top_bottom */
-              union b_mode_info *bmi = &mi->bmi[0];
-              MV *mv = &bmi->mv.as_mv;
-
-              x1 = x0 + 8 + (mv->col >> 3);
-              y1 = y0 + 4 + (mv->row >> 3);
-
-              constrain_line(x0 + 8, &x1, y0 + 4, &y1, width, height);
-              vp9_blit_line(x0 + 8,  x1, y0 + 4,  y1, y_buffer, y_stride);
-
-              bmi = &mi->bmi[8];
-              mv = &bmi->mv.as_mv;  /* vector of the bottom half-block */
-
-              x1 = x0 + 8 + (mv->col >> 3);
-              y1 = y0 + 12 + (mv->row >> 3);
-
-              constrain_line(x0 + 8, &x1, y0 + 12, &y1, width, height);
-              vp9_blit_line(x0 + 8,  x1, y0 + 12,  y1, y_buffer, y_stride);
-
-              break;
-            }
-            case PARTITIONING_8X16 : {  /* mv_left_right */
-              union b_mode_info *bmi = &mi->bmi[0];
-              MV *mv = &bmi->mv.as_mv;
-
-              x1 = x0 + 4 + (mv->col >> 3);
-              y1 = y0 + 8 + (mv->row >> 3);
-
-              constrain_line(x0 + 4, &x1, y0 + 8, &y1, width, height);
-              vp9_blit_line(x0 + 4,  x1, y0 + 8,  y1, y_buffer, y_stride);
-
-              bmi = &mi->bmi[2];
-              mv = &bmi->mv.as_mv;  /* vector of the right half-block */
-
-              x1 = x0 + 12 + (mv->col >> 3);
-              y1 = y0 + 8 + (mv->row >> 3);
-
-              constrain_line(x0 + 12, &x1, y0 + 8, &y1, width, height);
-              vp9_blit_line(x0 + 12,  x1, y0 + 8,  y1, y_buffer, y_stride);
-
-              break;
-            }
-            case PARTITIONING_8X8 : {  /* mv_quarters   */
-              union b_mode_info *bmi = &mi->bmi[0];
-              MV *mv = &bmi->mv.as_mv;
-
-              x1 = x0 + 4 + (mv->col >> 3);
-              y1 = y0 + 4 + (mv->row >> 3);
-
-              constrain_line(x0 + 4, &x1, y0 + 4, &y1, width, height);
-              vp9_blit_line(x0 + 4,  x1, y0 + 4,  y1, y_buffer, y_stride);
-
-              bmi = &mi->bmi[2];
-              mv = &bmi->mv.as_mv;  /* top-right quarter */
-
-              x1 = x0 + 12 + (mv->col >> 3);
-              y1 = y0 + 4 + (mv->row >> 3);
-
-              constrain_line(x0 + 12, &x1, y0 + 4, &y1, width, height);
-              vp9_blit_line(x0 + 12,  x1, y0 + 4,  y1, y_buffer, y_stride);
-
-              bmi = &mi->bmi[8];
-              mv = &bmi->mv.as_mv;  /* bottom-left quarter */
-
-              x1 = x0 + 4 + (mv->col >> 3);
-              y1 = y0 + 12 + (mv->row >> 3);
-
-              constrain_line(x0 + 4, &x1, y0 + 12, &y1, width, height);
-              vp9_blit_line(x0 + 4,  x1, y0 + 12,  y1, y_buffer, y_stride);
-
-              bmi = &mi->bmi[10];
-              mv = &bmi->mv.as_mv;  /* bottom-right quarter */
-
-              x1 = x0 + 12 + (mv->col >> 3);
-              y1 = y0 + 12 + (mv->row >> 3);
-
-              constrain_line(x0 + 12, &x1, y0 + 12, &y1, width, height);
-              vp9_blit_line(x0 + 12,  x1, y0 + 12,  y1, y_buffer, y_stride);
-              break;
-            }
-            case PARTITIONING_4X4:
-            default : {
-              union b_mode_info *bmi = mi->bmi;
-              int bx0, by0;
-
-              for (by0 = y0; by0 < (y0 + 16); by0 += 4) {
-                for (bx0 = x0; bx0 < (x0 + 16); bx0 += 4) {
-                  MV *mv = &bmi->mv.as_mv;
-
-                  x1 = bx0 + 2 + (mv->col >> 3);
-                  y1 = by0 + 2 + (mv->row >> 3);
-
-                  constrain_line(bx0 + 2, &x1, by0 + 2, &y1, width, height);
-                  vp9_blit_line(bx0 + 2,  x1, by0 + 2,  y1, y_buffer, y_stride);
-
-                  bmi++;
-                }
-              }
-            }
-          }
-        } else if (mi->mbmi.mode >= NEARESTMV) {
-          MV *mv = &mi->mbmi.mv.as_mv;
-          const int lx0 = x0 + 8;
-          const int ly0 = y0 + 8;
-
-          x1 = lx0 + (mv->col >> 3);
-          y1 = ly0 + (mv->row >> 3);
-
-          if (x1 != lx0 && y1 != ly0) {
-            constrain_line(lx0, &x1, ly0 - 1, &y1, width, height);
-            vp9_blit_line(lx0,  x1, ly0 - 1,  y1, y_buffer, y_stride);
-
-            constrain_line(lx0, &x1, ly0 + 1, &y1, width, height);
-            vp9_blit_line(lx0,  x1, ly0 + 1,  y1, y_buffer, y_stride);
-          } else
-            vp9_blit_line(lx0,  x1, ly0,  y1, y_buffer, y_stride);
-        }
-
-        mi++;
-      }
-      mi++;
-    }
-  }
-
-  /* Color in block modes */
-  if ((flags & VP9D_DEBUG_CLR_BLK_MODES)
-      && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
-    int y, x;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
-    int width  = post->y_width;
-    int height = post->y_height;
-    unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
-    unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
-    unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
-
-    for (y = 0; y < height; y += 16) {
-      for (x = 0; x < width; x += 16) {
-        int Y = 0, U = 0, V = 0;
-
-        if (mi->mbmi.mode == B_PRED &&
-            ((ppflags->display_mb_modes_flag & B_PRED) ||
-             ppflags->display_b_modes_flag)) {
-          int by, bx;
-          unsigned char *yl, *ul, *vl;
-          union b_mode_info *bmi = mi->bmi;
-
-          yl = y_ptr + x;
-          ul = u_ptr + (x >> 1);
-          vl = v_ptr + (x >> 1);
-
-          for (by = 0; by < 16; by += 4) {
-            for (bx = 0; bx < 16; bx += 4) {
-              if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode))
-                  || (ppflags->display_mb_modes_flag & B_PRED)) {
-                Y = B_PREDICTION_MODE_colors[bmi->as_mode.first][0];
-                U = B_PREDICTION_MODE_colors[bmi->as_mode.first][1];
-                V = B_PREDICTION_MODE_colors[bmi->as_mode.first][2];
-
-                POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)(yl + bx,
-                                                           ul + (bx >> 1),
-                                                           vl + (bx >> 1),
-                                                           Y, U, V,
-                                                           0xc000, y_stride);
-              }
-              bmi++;
-            }
-
-            yl += y_stride * 4;
-            ul += y_stride * 1;
-            vl += y_stride * 1;
-          }
-        } else if (ppflags->display_mb_modes_flag & (1 << mi->mbmi.mode)) {
-          Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
-          U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
-          V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
-
-          POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner)(y_ptr + x,
-                                                            u_ptr + (x >> 1),
-                                                            v_ptr + (x >> 1),
-                                                            Y, U, V,
-                                                            0xc000, y_stride);
-        }
-
-        mi++;
-      }
-      y_ptr += y_stride * 16;
-      u_ptr += y_stride * 4;
-      v_ptr += y_stride * 4;
-
-      mi++;
-    }
-  }
-
-  /* Color in frame reference blocks */
-  if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) &&
-      ppflags->display_ref_frame_flag) {
-    int y, x;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
-    int width  = post->y_width;
-    int height = post->y_height;
-    unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
-    unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
-    unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
-
-    for (y = 0; y < height; y += 16) {
-      for (x = 0; x < width; x += 16) {
-        int Y = 0, U = 0, V = 0;
-
-        if (ppflags->display_ref_frame_flag & (1 << mi->mbmi.ref_frame)) {
-          Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
-          U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
-          V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
-
-          POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)(y_ptr + x,
-                                                            u_ptr + (x >> 1),
-                                                            v_ptr + (x >> 1),
-                                                            Y, U, V,
-                                                            0xc000, y_stride);
-        }
-
-        mi++;
-      }
-      y_ptr += y_stride * 16;
-      u_ptr += y_stride * 4;
-      v_ptr += y_stride * 4;
-
-      mi++;
-    }
-  }
-#endif
-
-  *dest = oci->post_proc_buffer;
-
-  /* handle problem with extending borders */
-  dest->y_width = oci->Width;
-  dest->y_height = oci->Height;
-  dest->uv_height = dest->y_height / 2;
-
-  return 0;
-}
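A hedged usage sketch for the function above; cm stands for an initialized VP9_COMMON whose frame_to_show has been decoded, and is not part of the patch:

    vp9_ppflags_t flags = { 0 };
    YV12_BUFFER_CONFIG display;

    flags.post_proc_flag   = VP9D_DEBLOCK | VP9D_ADDNOISE;
    flags.deblocking_level = 5;
    flags.noise_level      = 1;
    if (vp9_post_proc_frame(&cm, &display, &flags) == 0) {
      /* display aliases cm.post_proc_buffer, cropped to Width x Height */
    }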
--- a/vp8/common/postproc.h
+++ /dev/null
@@ -1,128 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef POSTPROC_H
-#define POSTPROC_H
-
-#define prototype_postproc_inplace(sym)\
-  void sym(unsigned char *dst, int pitch, int rows, int cols, int flimit)
-
-#define prototype_postproc(sym)\
-  void sym(unsigned char *src, unsigned char *dst, int src_pitch, \
-           int dst_pitch, int rows, int cols, int flimit)
-
-#define prototype_postproc_addnoise(sym) \
-  void sym(unsigned char *s, char *noise, char blackclamp[16], \
-           char whiteclamp[16], char bothclamp[16], \
-           unsigned int w, unsigned int h, int pitch)
-
-#define prototype_postproc_blend_mb_inner(sym)\
-  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
-           int y1, int u1, int v1, int alpha, int stride)
-
-#define prototype_postproc_blend_mb_outer(sym)\
-  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
-           int y1, int u1, int v1, int alpha, int stride)
-
-#define prototype_postproc_blend_b(sym)\
-  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
-           int y1, int u1, int v1, int alpha, int stride)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/postproc_x86.h"
-#endif
-
-#ifndef vp9_postproc_down
-#define vp9_postproc_down vp9_mbpost_proc_down_c
-#endif
-extern prototype_postproc_inplace(vp9_postproc_down);
-
-#ifndef vp9_postproc_across
-#define vp9_postproc_across vp9_mbpost_proc_across_ip_c
-#endif
-extern prototype_postproc_inplace(vp9_postproc_across);
-
-#ifndef vp9_postproc_downacross
-#define vp9_postproc_downacross vp9_post_proc_down_and_across_c
-#endif
-extern prototype_postproc(vp9_postproc_downacross);
-
-#ifndef vp9_postproc_addnoise
-#define vp9_postproc_addnoise vp9_plane_add_noise_c
-#endif
-extern prototype_postproc_addnoise(vp9_postproc_addnoise);
-
-#ifndef vp9_postproc_blend_mb_inner
-#define vp9_postproc_blend_mb_inner vp9_blend_mb_inner_c
-#endif
-extern prototype_postproc_blend_mb_inner(vp9_postproc_blend_mb_inner);
-
-#ifndef vp9_postproc_blend_mb_outer
-#define vp9_postproc_blend_mb_outer vp9_blend_mb_outer_c
-#endif
-extern prototype_postproc_blend_mb_outer(vp9_postproc_blend_mb_outer);
-
-#ifndef vp9_postproc_blend_b
-#define vp9_postproc_blend_b vp9_blend_b_c
-#endif
-extern prototype_postproc_blend_b(vp9_postproc_blend_b);
-
-typedef prototype_postproc((*vp9_postproc_fn_t));
-typedef prototype_postproc_inplace((*vp9_postproc_inplace_fn_t));
-typedef prototype_postproc_addnoise((*vp9_postproc_addnoise_fn_t));
-typedef prototype_postproc_blend_mb_inner((*vp9_postproc_blend_mb_inner_fn_t));
-typedef prototype_postproc_blend_mb_outer((*vp9_postproc_blend_mb_outer_fn_t));
-typedef prototype_postproc_blend_b((*vp9_postproc_blend_b_fn_t));
-typedef struct {
-  vp9_postproc_inplace_fn_t           down;
-  vp9_postproc_inplace_fn_t           across;
-  vp9_postproc_fn_t                   downacross;
-  vp9_postproc_addnoise_fn_t          addnoise;
-  vp9_postproc_blend_mb_inner_fn_t    blend_mb_inner;
-  vp9_postproc_blend_mb_outer_fn_t    blend_mb_outer;
-  vp9_postproc_blend_b_fn_t           blend_b;
-} vp9_postproc_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define POSTPROC_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define POSTPROC_INVOKE(ctx,fn) vp9_postproc_##fn
-#endif
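Expanding the dispatch pair above for the `down` entry (the RTCD_VTABLE macro comes from postproc.c earlier in this patch):

    /* CONFIG_RUNTIME_CPU_DETECT: indirect through the per-instance vtable */
    POSTPROC_INVOKE(RTCD_VTABLE(oci), down)(...)
        /* => (&(oci)->rtcd.postproc)->down(...) */

    /* otherwise: a direct call, bound at compile time to vp9_postproc_down,
       which the #ifndef defaults above resolve to vp9_mbpost_proc_down_c
       unless an arch header (e.g. x86/postproc_x86.h) overrode it */
    POSTPROC_INVOKE(RTCD_VTABLE(oci), down)(...)
        /* => vp9_mbpost_proc_down_c(...) */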
-
-#include "vpx_ports/mem.h"
-struct postproc_state {
-  int           last_q;
-  int           last_noise;
-  char          noise[3072];
-  DECLARE_ALIGNED(16, char, blackclamp[16]);
-  DECLARE_ALIGNED(16, char, whiteclamp[16]);
-  DECLARE_ALIGNED(16, char, bothclamp[16]);
-};
-#include "onyxc_int.h"
-#include "ppflags.h"
-int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest,
-                        vp9_ppflags_t *flags);
-
-
-void vp9_de_noise(YV12_BUFFER_CONFIG         *source,
-                  YV12_BUFFER_CONFIG         *post,
-                  int                         q,
-                  int                         low_var_thresh,
-                  int                         flag,
-                  vp9_postproc_rtcd_vtable_t *rtcd);
-
-void vp9_deblock(YV12_BUFFER_CONFIG         *source,
-                 YV12_BUFFER_CONFIG         *post,
-                 int                         q,
-                 int                         low_var_thresh,
-                 int                         flag,
-                 vp9_postproc_rtcd_vtable_t *rtcd);
-#endif
--- a/vp8/common/ppc/copy_altivec.asm
+++ /dev/null
@@ -1,47 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl copy_mem16x16_ppc
-
-;# r3 unsigned char *src
-;# r4 int src_stride
-;# r5 unsigned char *dst
-;# r6 int dst_stride
-
-;# Make the assumption that input will not be aligned,
-;#  but the output will be.  So two reads and a perm
-;#  for the input, but only one store for the output.
-copy_mem16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xe000
-    mtspr   256, r12            ;# set VRSAVE
-
-    li      r10, 16
-    mtctr   r10
-
-cp_16x16_loop:
-    lvsl    v0,  0, r3          ;# permutate value for alignment
-
-    lvx     v1,   0, r3
-    lvx     v2, r10, r3
-
-    vperm   v1, v1, v2, v0
-
-    stvx    v1,  0, r5
-
-    add     r3, r3, r4          ;# increment source pointer
-    add     r5, r5, r6          ;# increment destination pointer
-
-    bdnz    cp_16x16_loop
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
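A C sketch of the lvsl/lvx/vperm idiom in the loop above, assuming only that the source address may be unaligned: two aligned 16-byte loads bracket the address, and a byte select (the job vperm does with the lvsl-generated permute vector) extracts the unaligned 16 bytes.

    #include <stdint.h>

    static void load_unaligned16(const unsigned char *src,
                                 unsigned char out[16]) {
      const unsigned char *base =
          (const unsigned char *)((uintptr_t)src & ~(uintptr_t)15);
      unsigned int shift = (unsigned int)((uintptr_t)src & 15);
      unsigned int i;
      for (i = 0; i < 16; i++)
        out[i] = base[shift + i];  /* spans at most two aligned vectors */
    }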
--- a/vp8/common/ppc/filter_altivec.asm
+++ /dev/null
@@ -1,1013 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl sixtap_predict_ppc
-    .globl sixtap_predict8x4_ppc
-    .globl sixtap_predict8x8_ppc
-    .globl sixtap_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-.macro load_hfilter V0, V1
-    load_c \V0, HFilter, r5, r9, r10
-
-    addi    r5,  r5, 16
-    lvx     \V1, r5, r10
-.endm
-
-;# Vertical filtering
-.macro Vprolog
-    load_c v0, VFilter, r6, r3, r10
-
-    vspltish v5, 8
-    vspltish v6, 3
-    vslh    v6, v5, v6      ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v1, v0, 1
-    vspltb  v2, v0, 2
-    vspltb  v3, v0, 3
-    vspltb  v4, v0, 4
-    vspltb  v5, v0, 5
-    vspltb  v0, v0, 0
-.endm
-
-.macro vpre_load
-    Vprolog
-    li      r10,  16
-    lvx     v10,   0, r9    ;# v10..v14 = first 5 rows
-    lvx     v11, r10, r9
-    addi    r9,   r9, 32
-    lvx     v12,   0, r9
-    lvx     v13, r10, r9
-    addi    r9,   r9, 32
-    lvx     v14,   0, r9
-.endm
-
-.macro Msum Re, Ro, V, T, TMP
-                                ;# (Re,Ro) += (V*T)
-    vmuleub \TMP, \V, \T        ;# trashes v8
-    vadduhm \Re, \Re, \TMP      ;# Re = evens, saturation unnecessary
-    vmuloub \TMP, \V, \T
-    vadduhm \Ro, \Ro, \TMP      ;# Ro = odds
-.endm
-
-.macro vinterp_no_store P0 P1 P2 P3 P4 P5
-    vmuleub  v8, \P0, v0        ;# 64 + 4 positive taps
-    vadduhm v16, v6, v8
-    vmuloub  v8, \P0, v0
-    vadduhm v17, v6, v8
-    Msum v16, v17, \P2, v2, v8
-    Msum v16, v17, \P3, v3, v8
-    Msum v16, v17, \P5, v5, v8
-
-    vmuleub v18, \P1, v1        ;# 2 negative taps
-    vmuloub v19, \P1, v1
-    Msum v18, v19, \P4, v4, v8
-
-    vsubuhs v16, v16, v18       ;# subtract neg from pos
-    vsubuhs v17, v17, v19
-    vsrh    v16, v16, v7        ;# divide by 128
-    vsrh    v17, v17, v7        ;# v16 v17 = evens, odds
-    vmrghh  v18, v16, v17       ;# v18 v19 = 16-bit result in order
-    vmrglh  v19, v16, v17
-    vpkuhus  \P0, v18, v19      ;# P0 = 8-bit result
-.endm
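A scalar sketch of what one vinterp_no_store invocation computes per output pixel. The tap magnitudes t[0..5] come from the VFilter table (outside this hunk); taps 1 and 4 are the filter's negative coefficients, kept apart so the vector code can accumulate in unsigned registers, subtract with saturation, round with the +64 bias, shift by 7 (the "divide by 128"), and saturate to a byte as vpkuhus does:

    static unsigned char sixtap_pixel(const unsigned char *row[6], int x,
                                      const short t[6]) {
      int pos = 64 + t[0] * row[0][x] + t[2] * row[2][x]
                   + t[3] * row[3][x] + t[5] * row[5][x];
      int neg = t[1] * row[1][x] + t[4] * row[4][x];
      int v = pos > neg ? (pos - neg) >> 7 : 0;  /* vsubuhs clamps at 0 */
      return (unsigned char)(v > 255 ? 255 : v); /* vpkuhus saturates   */
    }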
-
-.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
-    vmuleub v24, \P0, v13       ;# 64 + 4 positive taps
-    vadduhm v21, v20, v24
-    vmuloub v24, \P0, v13
-    vadduhm v22, v20, v24
-    Msum v21, v22, \P2, v15, v25
-    Msum v21, v22, \P3, v16, v25
-    Msum v21, v22, \P5, v18, v25
-
-    vmuleub v23, \P1, v14       ;# 2 negative taps
-    vmuloub v24, \P1, v14
-    Msum v23, v24, \P4, v17, v25
-
-    vsubuhs v21, v21, v23       ;# subtract neg from pos
-    vsubuhs v22, v22, v24
-    vsrh    v21, v21, v19       ;# divide by 128
-    vsrh    v22, v22, v19       ;# v16 v17 = evens, odds
-    vmrghh  v23, v21, v22       ;# v18 v19 = 16-bit result in order
-    vmrglh  v24, v21, v22
-    vpkuhus \P0, v23, v24       ;# P0 = 8-bit result
-.endm
-
-
-.macro Vinterp P0 P1 P2 P3 P4 P5
-    vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
-    stvx    \P0, 0, r7
-    add     r7, r7, r8      ;# 33 ops per 16 pels
-.endm
-
-
-.macro luma_v P0, P1, P2, P3, P4, P5
-    addi    r9,   r9, 16        ;# P5 = newest input row
-    lvx     \P5,   0, r9
-    Vinterp \P0, \P1, \P2, \P3, \P4, \P5
-.endm
-
-.macro luma_vtwo
-    luma_v v10, v11, v12, v13, v14, v15
-    luma_v v11, v12, v13, v14, v15, v10
-.endm
-
-.macro luma_vfour
-    luma_vtwo
-    luma_v v12, v13, v14, v15, v10, v11
-    luma_v v13, v14, v15, v10, v11, v12
-.endm
-
-.macro luma_vsix
-    luma_vfour
-    luma_v v14, v15, v10, v11, v12, v13
-    luma_v v15, v10, v11, v12, v13, v14
-.endm
-
-.macro Interp4 R I I4
-    vmsummbm \R, v13, \I, v15
-    vmsummbm \R, v14, \I4, \R
-.endm
-
-.macro Read8x8 VD, RS, RP, increment_counter
-    lvsl    v21,  0, \RS        ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input can span three vectors if not aligned correctly.
-    lvx     \VD,   0, \RS
-    lvx     v20, r10, \RS
-
-.if \increment_counter
-    add     \RS, \RS, \RP
-.endif
-
-    vperm   \VD, \VD, v20, v21
-.endm
-
-.macro interp_8x8 R
-    vperm   v20, \R, \R, v16    ;# v20 = 0123 1234 2345 3456
-    vperm   v21, \R, \R, v17    ;# v21 = 4567 5678 6789 789A
-    Interp4 v20, v20,  v21      ;# v20 = result 0 1 2 3
-    vperm   \R, \R, \R, v18     ;# R   = 89AB 9ABC ABCx BCxx
-    Interp4 v21, v21, \R        ;# v21 = result 4 5 6 7
-
-    vpkswus \R, v20, v21        ;#  R = 0 1 2 3 4 5 6 7
-    vsrh    \R, \R, v19
-
-    vpkuhus \R, \R, \R          ;# saturate and pack
-
-.endm
-
-.macro Read4x4 VD, RS, RP, increment_counter
-    lvsl    v21,  0, \RS        ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input can span three vectors if not aligned correctly.
-    lvx     v20,   0, \RS
-
-.if \increment_counter
-    add     \RS, \RS, \RP
-.endif
-
-    vperm   \VD, v20, v20, v21
-.endm
-    .text
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-sixtap_predict_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xff87
-    ori     r12, r12, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq-    vertical_only_4x4
-
-    ;# load up horizontal filter
-    load_hfilter v13, v14
-
-    ;# rounding added in on the multiply
-    vspltisw v16, 8
-    vspltisw v15, 3
-    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
-
-    ;# Load up permutation constants
-    load_c v16, B_0123, 0, r9, r10
-    load_c v17, B_4567, 0, r9, r10
-    load_c v18, B_89AB, 0, r9, r10
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    addi    r9, r3, 0
-    li      r10, 16
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# filter a line
-    interp_8x8 v2
-    interp_8x8 v3
-    interp_8x8 v4
-    interp_8x8 v5
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional 5 lines that are needed
-    ;#  for the vertical filter.
-    beq-    store_4x4
-
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r9, r9, r4
-    sub     r9, r9, r4
-
-    Read8x8 v0, r9, r4, 1
-    Read8x8 v1, r9, r4, 0
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 0
-
-    interp_8x8 v0
-    interp_8x8 v1
-    interp_8x8 v6
-    interp_8x8 v7
-    interp_8x8 v8
-
-    b       second_pass_4x4
-
-vertical_only_4x4:
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-    li      r10, 16
-
-    Read8x8 v0, r3, r4, 1
-    Read8x8 v1, r3, r4, 1
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 0
-
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-second_pass_4x4:
-    load_c   v20, b_hilo_4x4, 0, r9, r10
-    load_c   v21, b_hilo, 0, r9, r10
-
-    ;# reposition input so that it can go through the
-    ;# filtering phase with one pass.
-    vperm   v0, v0, v1, v20     ;# 0 1 x x
-    vperm   v2, v2, v3, v20     ;# 2 3 x x
-    vperm   v4, v4, v5, v20     ;# 4 5 x x
-    vperm   v6, v6, v7, v20     ;# 6 7 x x
-
-    vperm   v0, v0, v2, v21     ;# 0 1 2 3
-    vperm   v4, v4, v6, v21     ;# 4 5 6 7
-
-    vsldoi  v1, v0, v4, 4
-    vsldoi  v2, v0, v4, 8
-    vsldoi  v3, v0, v4, 12
-
-    vsldoi  v5, v4, v8, 4
-
-    load_c   v13, VFilter, r6, r9, r10
-
-    vspltish v15, 8
-    vspltish v20, 3
-    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v14, v13, 1
-    vspltb  v15, v13, 2
-    vspltb  v16, v13, 3
-    vspltb  v17, v13, 4
-    vspltb  v18, v13, 5
-    vspltb  v13, v13, 0
-
-    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
-
-    stvx    v0, 0, r1
-
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    lwz     r0, 4(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    lwz     r0, 8(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    lwz     r0, 12(r1)
-    stw     r0, 0(r7)
-
-    b       exit_4x4
-
-store_4x4:
-
-    stvx    v2, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v3, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v4, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v5, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-
-exit_4x4:
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro w_8x8 V, D, R, P
-    stvx    \V, 0, r1
-    lwz     \R, 0(r1)
-    stw     \R, 0(r7)
-    lwz     \R, 4(r1)
-    stw     \R, 4(r7)
-    add     \D, \D, \P
-.endm
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-sixtap_predict8x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq-    second_pass_pre_copy_8x4
-
-    load_hfilter v13, v14
-
-    ;# rounding added in on the multiply
-    vspltisw v16, 8
-    vspltisw v15, 3
-    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
-
-    ;# Load up permutation constants
-    load_c v16, B_0123, 0, r9, r10
-    load_c v17, B_4567, 0, r9, r10
-    load_c v18, B_89AB, 0, r9, r10
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    addi    r9, r3, 0
-    li      r10, 16
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# filter a line
-    interp_8x8 v2
-    interp_8x8 v3
-    interp_8x8 v4
-    interp_8x8 v5
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional 5 lines that are needed
-    ;#  for the vertical filter.
-    beq-    store_8x4
-
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r9, r9, r4
-    sub     r9, r9, r4
-
-    Read8x8 v0, r9, r4, 1
-    Read8x8 v1, r9, r4, 0
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 0
-
-    interp_8x8 v0
-    interp_8x8 v1
-    interp_8x8 v6
-    interp_8x8 v7
-    interp_8x8 v8
-
-    b       second_pass_8x4
-
-second_pass_pre_copy_8x4:
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-    li      r10, 16
-
-    Read8x8 v0,  r3, r4, 1
-    Read8x8 v1,  r3, r4, 1
-    Read8x8 v2,  r3, r4, 1
-    Read8x8 v3,  r3, r4, 1
-    Read8x8 v4,  r3, r4, 1
-    Read8x8 v5,  r3, r4, 1
-    Read8x8 v6,  r3, r4, 1
-    Read8x8 v7,  r3, r4, 1
-    Read8x8 v8,  r3, r4, 1
-
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-second_pass_8x4:
-    load_c v13, VFilter, r6, r9, r10
-
-    vspltish v15, 8
-    vspltish v20, 3
-    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v14, v13, 1
-    vspltb  v15, v13, 2
-    vspltb  v16, v13, 3
-    vspltb  v17, v13, 4
-    vspltb  v18, v13, 5
-    vspltb  v13, v13, 0
-
-    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
-    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
-    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
-    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x4
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-
-    b       exit_8x4
-
-store_aligned_8x4:
-
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-
-    b       exit_8x4
-
-store_8x4:
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned2_8x4
-
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-
-    b       exit_8x4
-
-store_aligned2_8x4:
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-
-exit_8x4:
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Because the width that needs to be filtered fits in a single AltiVec
-;#  register, there is no need to loop.  Everything can stay in registers.
-sixtap_predict8x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq-    second_pass_pre_copy_8x8
-
-    load_hfilter v13, v14
-
-    ;# rounding added in on the multiply
-    vspltisw v16, 8
-    vspltisw v15, 3
-    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
-
-    ;# Load up permutation constants
-    load_c v16, B_0123, 0, r9, r10
-    load_c v17, B_4567, 0, r9, r10
-    load_c v18, B_89AB, 0, r9, r10
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    addi    r9, r3, 0
-    li      r10, 16
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 1
-    Read8x8 v9, r3, r4, 1
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# filter a line
-    interp_8x8 v2
-    interp_8x8 v3
-    interp_8x8 v4
-    interp_8x8 v5
-    interp_8x8 v6
-    interp_8x8 v7
-    interp_8x8 v8
-    interp_8x8 v9
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional 5 lines that are needed
-    ;#  for the vertical filter.
-    beq-    store_8x8
-
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r9, r9, r4
-    sub     r9, r9, r4
-
-    Read8x8 v0,  r9, r4, 1
-    Read8x8 v1,  r9, r4, 0
-    Read8x8 v10, r3, r4, 1
-    Read8x8 v11, r3, r4, 1
-    Read8x8 v12, r3, r4, 0
-
-    interp_8x8 v0
-    interp_8x8 v1
-    interp_8x8 v10
-    interp_8x8 v11
-    interp_8x8 v12
-
-    b       second_pass_8x8
-
-second_pass_pre_copy_8x8:
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-    li      r10, 16
-
-    Read8x8 v0,  r3, r4, 1
-    Read8x8 v1,  r3, r4, 1
-    Read8x8 v2,  r3, r4, 1
-    Read8x8 v3,  r3, r4, 1
-    Read8x8 v4,  r3, r4, 1
-    Read8x8 v5,  r3, r4, 1
-    Read8x8 v6,  r3, r4, 1
-    Read8x8 v7,  r3, r4, 1
-    Read8x8 v8,  r3, r4, 1
-    Read8x8 v9,  r3, r4, 1
-    Read8x8 v10, r3, r4, 1
-    Read8x8 v11, r3, r4, 1
-    Read8x8 v12, r3, r4, 0
-
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-second_pass_8x8:
-    load_c v13, VFilter, r6, r9, r10
-
-    vspltish v15, 8
-    vspltish v20, 3
-    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v14, v13, 1
-    vspltb  v15, v13, 2
-    vspltb  v16, v13, 3
-    vspltb  v17, v13, 4
-    vspltb  v18, v13, 5
-    vspltb  v13, v13, 0
-
-    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
-    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
-    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
-    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8
-    vinterp_no_store_8x8 v4, v5, v6, v7,  v8,  v9
-    vinterp_no_store_8x8 v5, v6, v7, v8,  v9,  v10
-    vinterp_no_store_8x8 v6, v7, v8, v9,  v10, v11
-    vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x8
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-    w_8x8   v6, r7, r0, r8
-    w_8x8   v7, r7, r0, r8
-
-    b       exit_8x8
-
-store_aligned_8x8:
-
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-    vperm   v6, v6, v7, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-    addi    r7, r7, 16
-    stvx    v6, 0, r7
-
-    b       exit_8x8
-
-store_8x8:
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned2_8x8
-
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-    w_8x8   v6, r7, r0, r8
-    w_8x8   v7, r7, r0, r8
-    w_8x8   v8, r7, r0, r8
-    w_8x8   v9, r7, r0, r8
-
-    b       exit_8x8
-
-store_aligned2_8x8:
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-    vperm   v6, v6, v7, v10
-    vperm   v8, v8, v9, v10
-
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-    addi    r7, r7, 16
-    stvx    v6, 0, r7
-    addi    r7, r7, 16
-    stvx    v8, 0, r7
-
-exit_8x8:
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Two pass filtering.  First pass is Horizontal edges, second pass is vertical
-;#  edges.  One of the filters can be null, but both won't be.  Needs to use a
-;#  temporary buffer because the source buffer can't be modified and the buffer
-;#  for the destination is not large enough to hold the temporary data.
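For reference, a minimal scalar model of the two-pass six-tap predictor implemented below (an illustrative sketch, not part of the original source; it assumes Q7 taps with +64 rounding and a >>7 shift, and a first pass that produces h+5 rows so the vertical six-tap has its 2 rows before and 3 after):

/* Scalar sketch of the two-pass six-tap predictor. */
static unsigned char clamp255(int v) {
    return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
}

static void sixtap_predict_scalar(const unsigned char *src, int src_pitch,
                                  const int *hfilter, const int *vfilter,
                                  unsigned char *dst, int dst_pitch,
                                  int w, int h) {
    unsigned char tmp[21 * 16];                    /* 16x16 plus 5 extra rows */
    const unsigned char *s = src - 2 * src_pitch;  /* back off by 2*pitch */
    int r, c, k;

    /* First pass: horizontal six-tap into the temporary buffer. */
    for (r = 0; r < h + 5; r++, s += src_pitch)
        for (c = 0; c < w; c++) {
            int sum = 64;                          /* rounding */
            for (k = 0; k < 6; k++)
                sum += hfilter[k] * s[c - 2 + k];  /* 2 before, 3 after */
            tmp[r * w + c] = clamp255(sum >> 7);
        }

    /* Second pass: vertical six-tap from the temporary buffer into dst. */
    for (r = 0; r < h; r++)
        for (c = 0; c < w; c++) {
            int sum = 64;
            for (k = 0; k < 6; k++)
                sum += vfilter[k] * tmp[(r + k) * w + c];
            dst[r * dst_pitch + c] = clamp255(sum >> 7);
        }
}

When either tap set is the identity {0, 0, 128, 0, 0, 0} the corresponding pass degenerates to a copy, which is exactly the three-case dispatch described next.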
-sixtap_predict16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xf000
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-416(r1)         ;# create space on the stack
-
-    ;# Three possibilities
-    ;#  1. First filter is null.  Don't use a temp buffer.
-    ;#  2. Second filter is null.  Don't use a temp buffer.
-    ;#  3. Neither are null, use temp buffer.
-
-    ;# First Pass (horizontal edge)
-    ;#  setup pointers for src
-    ;#  if possibility (1) then set up the src pointer to be the original and
-    ;#  jump to the second pass.  This is based on whether x_offset is 0.
-
-    ;# load up horizontal filter
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    load_hfilter v4, v5
-
-    beq-    copy_horizontal_16x21
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# setup constants
-    ;# v14 permutation value for alignment
-    load_c v14, b_hperm, 0, r9, r10
-
-    ;# These statements assume that there won't be a second pass;
-    ;#  if there is one, they are set again before the bypass label.
-    li      r0, 16              ;# prepare for no vertical filter
-
-    ;# Change the output pointer and pitch to be the actual
-    ;#  destination instead of a temporary buffer.
-    addi    r9, r7, 0
-    addi    r5, r8, 0
-
-    ;# no vertical filter, so write the output from the first pass
-    ;#  directly into the output buffer.
-    beq-    no_vertical_filter_bypass
-
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-
-    ;# setup counter for the number of lines that are going to be filtered
-    li      r0, 21
-
-    ;# use the stack as temporary storage
-    la      r9, 48(r1)
-    li      r5, 16
-
-no_vertical_filter_bypass:
-
-    mtctr   r0
-
-    ;# rounding added in on the multiply
-    vspltisw v10, 8
-    vspltisw v12, 3
-    vslw    v12, v10, v12       ;# 0x00000040000000400000004000000040
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v13, 7
-
-    ;# index to the next set of vectors in the row.
-    li      r10, 16
-    li      r12, 32
-
-horizontal_loop_16x16:
-
-    lvsl    v15,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input can span three vectors if not aligned correctly.
-    lvx     v1,   0, r3
-    lvx     v2, r10, r3
-    lvx     v3, r12, r3
-
-    vperm   v8, v1, v2, v15
-    vperm   v9, v2, v3, v15     ;# v8 v9 = 21 input pixels left-justified
-
-    vsldoi  v11, v8, v9, 4
-
-    ;# set 0
-    vmsummbm v6, v4, v8, v12    ;# taps times elements
-    vmsummbm v0, v5, v11, v6
-
-    ;# set 1
-    vsldoi  v10, v8, v9, 1
-    vsldoi  v11, v8, v9, 5
-
-    vmsummbm v6, v4, v10, v12
-    vmsummbm v1, v5, v11, v6
-
-    ;# set 2
-    vsldoi  v10, v8, v9, 2
-    vsldoi  v11, v8, v9, 6
-
-    vmsummbm v6, v4, v10, v12
-    vmsummbm v2, v5, v11, v6
-
-    ;# set 3
-    vsldoi  v10, v8, v9, 3
-    vsldoi  v11, v8, v9, 7
-
-    vmsummbm v6, v4, v10, v12
-    vmsummbm v3, v5, v11, v6
-
-    vpkswus v0, v0, v1          ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
-    vpkswus v1, v2, v3          ;# v1 = 2 6 A E 3 7 B F
-
-    vsrh    v0, v0, v13         ;# divide v0, v1 by 128
-    vsrh    v1, v1, v13
-
-    vpkuhus v0, v0, v1          ;# v0 = scrambled 8-bit result
-    vperm   v0, v0, v0, v14     ;# v0 = correctly-ordered result
-
-    stvx    v0,  0, r9
-    add     r9, r9, r5
-
-    add     r3, r3, r4
-
-    bdnz    horizontal_loop_16x16
-
-    ;# check again to see if vertical filter needs to be done.
-    cmpi    cr0, r6, 0
-    beq     cr0, end_16x16
-
-    ;# yes there is, so go to the second pass
-    b       second_pass_16x16
-
-copy_horizontal_16x21:
-    li      r10, 21
-    mtctr   r10
-
-    li      r10, 16
-
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-
-    ;# this is done above if there is a horizontal filter,
-    ;#  if not it needs to be done down here.
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-    ;# always write to the stack when doing a horizontal copy
-    la      r9, 48(r1)
-
-copy_horizontal_loop_16x21:
-    lvsl    v15,  0, r3         ;# permutate value for alignment
-
-    lvx     v1,   0, r3
-    lvx     v2, r10, r3
-
-    vperm   v8, v1, v2, v15
-
-    stvx    v8,  0, r9
-    addi    r9, r9, 16
-
-    add     r3, r3, r4
-
-    bdnz    copy_horizontal_loop_16x21
-
-second_pass_16x16:
-
-    ;# always read from the stack when doing a vertical filter
-    la      r9, 48(r1)
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v7, 7
-
-    vpre_load
-
-    luma_vsix
-    luma_vsix
-    luma_vfour
-
-end_16x16:
-
-    addi    r1, r1, 416         ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-    .align 4
-HFilter:
-    .byte     0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0
-    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12
-    .byte    -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0
-    .byte     2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36
-    .byte    -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0
-    .byte     0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50
-    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0
-    .byte     3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77
-    .byte   -16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0
-    .byte     0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93
-    .byte    -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0
-    .byte     1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108
-    .byte   -11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0
-    .byte     0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123
-    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0
-
-    .align 4
-VFilter:
-    .byte     0,  0,128,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  6,123, 12,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     2, 11,108, 36,  8,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  9, 93, 50,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     3, 16, 77, 77, 16,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  6, 50, 93,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     1,  8, 36,108, 11,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  1, 12,123,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-
-    .align 4
-b_hperm:
-    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-
-    .align 4
-B_0123:
-    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-
-    .align 4
-B_4567:
-    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-
-    .align 4
-B_89AB:
-    .byte     8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-
-    .align 4
-b_hilo:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
-
-    .align 4
-b_hilo_4x4:
-    .byte     0,  1,  2,  3, 16, 17, 18, 19,  0,  0,  0,  0,  0,  0,  0,  0
--- a/vp8/common/ppc/filter_bilinear_altivec.asm
+++ /dev/null
@@ -1,677 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl bilinear_predict4x4_ppc
-    .globl bilinear_predict8x4_ppc
-    .globl bilinear_predict8x8_ppc
-    .globl bilinear_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
-    load_c \V0, vfilter_b, r6, r9, r10
-
-    addi    r6,  r6, 16
-    lvx     \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
-    ;# load up horizontal filter
-    slwi.   r5, r5, 4           ;# index into horizontal filter array
-
-    ;# index to the next set of vectors in the row.
-    li      r10, 16
-    li      r12, 32
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq     \jump_label
-
-    load_c v20, hfilter_b, r5, r9, r0
-
-    ;# setup constants
-    ;# v14 permutation value for alignment
-    load_c v28, b_hperm_b, 0, r9, r0
-
-    ;# rounding added in on the multiply
-    vspltisw v21, 8
-    vspltisw v18, 3
-    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040
-
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm input
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro HFilter V
-    vperm   v24, v21, v21, v10  ;# v24 = 0123 1234 2345 3456
-    vperm   v25, v21, v21, v11  ;# v25 = 4567 5678 6789 789A
-
-    vmsummbm v24, v20, v24, v18
-    vmsummbm v25, v20, v25, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
-    vsrh    v24, v24, v19       ;# divide v24 by 128
-
-    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
-.endm
-
-.macro hfilter_8 V, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 9 bytes wide, output is 8 bytes.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-
-    HFilter \V
-.endm
-
-
-.macro load_and_align_8 V, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 9 bytes wide, output is 8 bytes.
-    ;#  input can span two vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-
-    vperm   \V, v21, v22, v17
-.endm
-
-.macro write_aligned_8 V, increment_counter
-    stvx    \V,  0, r7
-
-.if \increment_counter
-    add     r7, r7, r8
-.endif
-.endm
-
-.macro vfilter_16 P0 P1
-    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
-    vadduhm v22, v18, v22
-    vmuloub v23, \P0, v20
-    vadduhm v23, v18, v23
-
-    vmuleub v24, \P1, v21
-    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
-    vmuloub v25, \P1, v21
-    vadduhm v23, v23, v25       ;# Ro = odds
-
-    vsrh    v22, v22, v19       ;# divide by 128
-    vsrh    v23, v23, v19       ;# v22 v23 = evens, odds
-    vmrghh  \P0, v22, v23       ;# merge to get 16-bit result in order
-    vmrglh  v23, v22, v23
-    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
-.endm
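Both bilinear passes reduce to the same two-tap blend; vfilter_16 above computes it vertically in even/odd byte halves. In scalar form (a sketch, not part of the original file; the hfilter_b/vfilter_b tables below hold the taps 128-16k and 16k for offset k):

/* Two-tap bilinear blend, Q7 taps (128 - f, f), +64 rounding. */
static unsigned char bilinear_tap(unsigned char a, unsigned char b, int f) {
    return (unsigned char)((a * (128 - f) + b * f + 64) >> 7);
}

The horizontal pass applies the same blend to horizontally adjacent pixels with f = 16 * x_offset; the vertical pass uses f = 16 * y_offset on the first-pass rows.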
-
-
-;# Write the 8-byte result in V to dst (r7) when dst_pitch != 8:
-;#  bounce the vector through the stack, copy 8 bytes with scalar
-;#  loads/stores, then advance the dst pointer by the pitch.
-.macro w_8x8 V, D, R, P
-    stvx    \V, 0, r1           ;# spill vector to the stack
-    lwz     \R, 0(r1)           ;# copy low 8 bytes via a scalar reg
-    stw     \R, 0(r7)
-    lwz     \R, 4(r1)
-    stw     \R, 4(r7)
-    add     \D, \D, \P          ;# dst += dst_pitch
-.endm
-
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict4x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf830
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_4x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_4x4_b
-
-    hfilter_8 v4, 0
-
-    b   second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-
-second_pass_4x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-
-store_out_4x4_b:
-
-    stvx    v0, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v1, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v2, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v3, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-
-exit_4x4:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf830
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_8x4_b
-
-    hfilter_8 v4, 0
-
-    b   second_pass_8x4_b
-
-second_pass_8x4_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-
-second_pass_8x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-
-store_out_8x4_b:
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x4_b
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-
-    b       exit_8x4
-
-store_aligned_8x4_b:
-    load_c v10, b_hilo_b, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-
-exit_8x4:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff0
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x8_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-    hfilter_8 v4, 1
-    hfilter_8 v5, 1
-    hfilter_8 v6, 1
-    hfilter_8 v7, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_8x8_b
-
-    hfilter_8 v8, 0
-
-    b   second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-    load_and_align_8  v5, 1
-    load_and_align_8  v6, 1
-    load_and_align_8  v7, 1
-    load_and_align_8  v8, 0
-
-second_pass_8x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-
-store_out_8x8_b:
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x8_b
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-    w_8x8   v6, r7, r0, r8
-    w_8x8   v7, r7, r0, r8
-
-    b       exit_8x8
-
-store_aligned_8x8_b:
-    load_c v10, b_hilo_b, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-    vperm   v6, v6, v7, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-    addi    r7, r7, 16
-    stvx    v6, 0, r7
-
-exit_8x8:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm input
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-    lvx     v23, r12, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-    vperm   v22, v22, v23, v17  ;# v21 v22 = 21 input pixels left-justified
-
-    ;# set 0
-    vmsummbm v24, v20, v21, v18 ;# taps times elements
-
-    ;# set 1
-    vsldoi  v23, v21, v22, 1
-    vmsummbm v25, v20, v23, v18
-
-    ;# set 2
-    vsldoi  v23, v21, v22, 2
-    vmsummbm v26, v20, v23, v18
-
-    ;# set 3
-    vsldoi  v23, v21, v22, 3
-    vmsummbm v27, v20, v23, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F
-
-    vsrh    v24, v24, v19       ;# divide v24, v25 by 128
-    vsrh    v25, v25, v19
-
-    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
-    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
-.endm
-
-.macro load_and_align_16 V, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input is 16 bytes wide and can span two vectors
-    ;#  if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-
-    vperm   \V, v21, v22, v17
-.endm
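The lvsl/lvx/vperm triple used throughout these macros is the standard AltiVec misaligned-load idiom. With <altivec.h> intrinsics it reads roughly as follows (a sketch; the function name is illustrative):

#include <altivec.h>

/* Misaligned 16-byte load: lvsl mask + two aligned loads + vperm. */
static vector unsigned char load_unaligned(const unsigned char *p) {
    vector unsigned char perm = vec_lvsl(0, p);  /* alignment shuffle mask */
    vector unsigned char lo = vec_ld(0, p);      /* aligned vector holding p */
    vector unsigned char hi = vec_ld(15, p);     /* next aligned vector */
    return vec_perm(lo, hi, perm);               /* 16 bytes, left-justified */
}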
-
-.macro write_16 V, increment_counter
-    stvx    \V,  0, r7
-
-.if \increment_counter
-    add     r7, r7, r8
-.endif
-.endm
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    HProlog second_pass_16x16_pre_copy_b
-
-    hfilter_16 v0,  1
-    hfilter_16 v1,  1
-    hfilter_16 v2,  1
-    hfilter_16 v3,  1
-    hfilter_16 v4,  1
-    hfilter_16 v5,  1
-    hfilter_16 v6,  1
-    hfilter_16 v7,  1
-    hfilter_16 v8,  1
-    hfilter_16 v9,  1
-    hfilter_16 v10, 1
-    hfilter_16 v11, 1
-    hfilter_16 v12, 1
-    hfilter_16 v13, 1
-    hfilter_16 v14, 1
-    hfilter_16 v15, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_16x16_b
-
-    hfilter_16 v16, 0
-
-    b   second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16  v0,  1
-    load_and_align_16  v1,  1
-    load_and_align_16  v2,  1
-    load_and_align_16  v3,  1
-    load_and_align_16  v4,  1
-    load_and_align_16  v5,  1
-    load_and_align_16  v6,  1
-    load_and_align_16  v7,  1
-    load_and_align_16  v8,  1
-    load_and_align_16  v9,  1
-    load_and_align_16  v10, 1
-    load_and_align_16  v11, 1
-    load_and_align_16  v12, 1
-    load_and_align_16  v13, 1
-    load_and_align_16  v14, 1
-    load_and_align_16  v15, 1
-    load_and_align_16  v16, 0
-
-second_pass_16x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-    vfilter_16 v8,  v9
-    vfilter_16 v9,  v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-store_out_16x16_b:
-
-    write_16 v0,  1
-    write_16 v1,  1
-    write_16 v2,  1
-    write_16 v3,  1
-    write_16 v4,  1
-    write_16 v5,  1
-    write_16 v6,  1
-    write_16 v7,  1
-    write_16 v8,  1
-    write_16 v9,  1
-    write_16 v10, 1
-    write_16 v11, 1
-    write_16 v12, 1
-    write_16 v13, 1
-    write_16 v14, 1
-    write_16 v15, 0
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-    .align 4
-hfilter_b:
-    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
-    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
-    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
-    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
-    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
-    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
-    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
-    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
-
-    .align 4
-vfilter_b:
-    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
-    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
-    .align 4
-b_hperm_b:
-    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-
-    .align 4
-b_0123_b:
-    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-
-    .align 4
-b_4567_b:
-    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-
-    .align 4
-b_hilo_b:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
--- a/vp8/common/ppc/idctllm_altivec.asm
+++ /dev/null
@@ -1,189 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl short_idct4x4llm_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-    .align 2
-short_idct4x4llm_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    load_c v8, sinpi8sqrt2, 0, r9, r10
-    load_c v9, cospi8sqrt2minus1, 0, r9, r10
-    load_c v10, hi_hi, 0, r9, r10
-    load_c v11, lo_lo, 0, r9, r10
-    load_c v12, shift_16, 0, r9, r10
-
-    li      r10,  16
-    lvx     v0,   0, r3         ;# input ip[0], ip[ 4]
-    lvx     v1, r10, r3         ;# input ip[8], ip[12]
-
-    ;# first pass
-    vupkhsh v2, v0
-    vupkhsh v3, v1
-    vaddsws v6, v2, v3          ;# a1 = ip[0]+ip[8]
-    vsubsws v7, v2, v3          ;# b1 = ip[0]-ip[8]
-
-    vupklsh v0, v0
-    vmulosh v4, v0, v8
-    vsraw   v4, v4, v12
-    vaddsws v4, v4, v0          ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
-    vupklsh v1, v1
-    vmulosh v5, v1, v9
-    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v1
-
-    vsubsws v4, v4, v5          ;# c1
-
-    vmulosh v3, v1, v8
-    vsraw   v3, v3, v12
-    vaddsws v3, v3, v1          ;# ip[12] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v0, v9
-    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v0
-
-    vaddsws v3, v3, v5          ;# d1
-
-    vaddsws v0, v6, v3          ;# a1 + d1
-    vsubsws v3, v6, v3          ;# a1 - d1
-
-    vaddsws v1, v7, v4          ;# b1 + c1
-    vsubsws v2, v7, v4          ;# b1 - c1
-
-    ;# transpose input
-    vmrghw  v4, v0, v1          ;# a0 b0 a1 b1
-    vmrghw  v5, v2, v3          ;# c0 d0 c1 d1
-
-    vmrglw  v6, v0, v1          ;# a2 b2 a3 b3
-    vmrglw  v7, v2, v3          ;# c2 d2 c3 d3
-
-    vperm   v0, v4, v5, v10     ;# a0 b0 c0 d0
-    vperm   v1, v4, v5, v11     ;# a1 b1 c1 d1
-
-    vperm   v2, v6, v7, v10     ;# a2 b2 c2 d2
-    vperm   v3, v6, v7, v11     ;# a3 b3 c3 d3
-
-    ;# second pass
-    vaddsws v6, v0, v2          ;# a1 = ip[0]+ip[8]
-    vsubsws v7, v0, v2          ;# b1 = ip[0]-ip[8]
-
-    vmulosh v4, v1, v8
-    vsraw   v4, v4, v12
-    vaddsws v4, v4, v1          ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v3, v9
-    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v3
-
-    vsubsws v4, v4, v5          ;# c1
-
-    vmulosh v2, v3, v8
-    vsraw   v2, v2, v12
-    vaddsws v2, v2, v3          ;# ip[12] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v1, v9
-    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v1
-
-    vaddsws v3, v2, v5          ;# d1
-
-    vaddsws v0, v6, v3          ;# a1 + d1
-    vsubsws v3, v6, v3          ;# a1 - d1
-
-    vaddsws v1, v7, v4          ;# b1 + c1
-    vsubsws v2, v7, v4          ;# b1 - c1
-
-    vspltish v6, 4
-    vspltish v7, 3
-
-    vpkswss v0, v0, v1
-    vpkswss v1, v2, v3
-
-    vaddshs v0, v0, v6
-    vaddshs v1, v1, v6
-
-    vsrah   v0, v0, v7
-    vsrah   v1, v1, v7
-
-    ;# transpose output
-    vmrghh  v2, v0, v1          ;# a0 c0 a1 c1 a2 c2 a3 c3
-    vmrglh  v3, v0, v1          ;# b0 d0 b1 d1 b2 d2 b3 d3
-
-    vmrghh  v0, v2, v3          ;# a0 b0 c0 d0 a1 b1 c1 d1
-    vmrglh  v1, v2, v3          ;# a2 b2 c2 d2 a3 b3 c3 d3
-
-    stwu    r1,-416(r1)         ;# create space on the stack
-
-    stvx    v0,  0, r1
-    lwz     r6, 0(r1)
-    stw     r6, 0(r4)
-    lwz     r6, 4(r1)
-    stw     r6, 4(r4)
-
-    add     r4, r4, r5
-
-    lwz     r6,  8(r1)
-    stw     r6,  0(r4)
-    lwz     r6, 12(r1)
-    stw     r6,  4(r4)
-
-    add     r4, r4, r5
-
-    stvx    v1,  0, r1
-    lwz     r6, 0(r1)
-    stw     r6, 0(r4)
-    lwz     r6, 4(r1)
-    stw     r6, 4(r4)
-
-    add     r4, r4, r5
-
-    lwz     r6,  8(r1)
-    stw     r6,  0(r4)
-    lwz     r6, 12(r1)
-    stw     r6,  4(r4)
-
-    addi    r1, r1, 416         ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
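The two vector passes above compute the standard VP8 4x4 inverse-transform butterfly per column and then per row. A scalar sketch of one pass (illustrative, not lifted from this file), using the Q16 constants defined below:

#define SINPI8SQRT2       35468   /* sin(pi/8)*sqrt(2) in Q16 */
#define COSPI8SQRT2MINUS1 20091   /* cos(pi/8)*sqrt(2) - 1 in Q16 */

/* One 4-point butterfly over in[0..3] = ip[0], ip[4], ip[8], ip[12]. */
static void idct4_pass(const int in[4], int out[4]) {
    int a1 = in[0] + in[2];
    int b1 = in[0] - in[2];
    int t1 = (in[1] * SINPI8SQRT2) >> 16;
    int t2 = in[3] + ((in[3] * COSPI8SQRT2MINUS1) >> 16);
    int c1 = t1 - t2;
    int d1;
    t1 = in[1] + ((in[1] * COSPI8SQRT2MINUS1) >> 16);
    t2 = (in[3] * SINPI8SQRT2) >> 16;
    d1 = t1 + t2;
    out[0] = a1 + d1;
    out[3] = a1 - d1;
    out[1] = b1 + c1;
    out[2] = b1 - c1;   /* after the second pass, each value is (x+4)>>3 */
}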
-
-    .align 4
-sinpi8sqrt2:
-    .short  35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
-
-    .align 4
-cospi8sqrt2minus1:
-    .short  20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
-
-    .align 4
-shift_16:
-    .long      16,    16,    16,    16
-
-    .align 4
-hi_hi:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
-
-    .align 4
-lo_lo:
-    .byte     8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
--- a/vp8/common/ppc/loopfilter_altivec.c
+++ /dev/null
@@ -1,127 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "loopfilter.h"
-#include "onyxc_int.h"
-
-typedef void loop_filter_function_y_ppc
-(
-  unsigned char *s,   // source pointer
-  int p,              // pitch
-  const signed char *flimit,
-  const signed char *limit,
-  const signed char *thresh
-);
-
-typedef void loop_filter_function_uv_ppc
-(
-  unsigned char *u,   // source pointer
-  unsigned char *v,   // source pointer
-  int p,              // pitch
-  const signed char *flimit,
-  const signed char *limit,
-  const signed char *thresh
-);
-
-typedef void loop_filter_function_s_ppc
-(
-  unsigned char *s,   // source pointer
-  int p,              // pitch
-  const signed char *flimit
-);
-
-loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
-
-loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
-
-loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
-loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
-
-// Horizontal MB filtering
-void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi) {
-  mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
-  if (u_ptr)
-    mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi) {
-  (void)u_ptr;
-  (void)v_ptr;
-  (void)uv_stride;
-  loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Vertical MB Filtering
-void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi) {
-  mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
-  if (u_ptr)
-    mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi) {
-  (void)u_ptr;
-  (void)v_ptr;
-  (void)uv_stride;
-  loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Horizontal B Filtering
-void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi) {
-  // These should all be done at once with one call, instead of 3
-  loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-  loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-  loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
-  if (u_ptr)
-    loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi) {
-  (void)u_ptr;
-  (void)v_ptr;
-  (void)uv_stride;
-  loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
-  loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
-  loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
-}
-
-// Vertical B Filtering
-void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi) {
-  loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
-  if (u_ptr)
-    loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi) {
-  (void)u_ptr;
-  (void)v_ptr;
-  (void)uv_stride;
-  loop_filter_simple_vertical_edge_ppc(y_ptr + 4,  y_stride, lfi->flim);
-  loop_filter_simple_vertical_edge_ppc(y_ptr + 8,  y_stride, lfi->flim);
-  loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
-}
--- a/vp8/common/ppc/loopfilter_filters_altivec.asm
+++ /dev/null
@@ -1,1253 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl mbloop_filter_horizontal_edge_y_ppc
-    .globl loop_filter_horizontal_edge_y_ppc
-    .globl mbloop_filter_vertical_edge_y_ppc
-    .globl loop_filter_vertical_edge_y_ppc
-
-    .globl mbloop_filter_horizontal_edge_uv_ppc
-    .globl loop_filter_horizontal_edge_uv_ppc
-    .globl mbloop_filter_vertical_edge_uv_ppc
-    .globl loop_filter_vertical_edge_uv_ppc
-
-    .globl loop_filter_simple_horizontal_edge_ppc
-    .globl loop_filter_simple_vertical_edge_ppc
-
-    .text
-;# We often need to perform transposes (and other transpose-like operations)
-;#   on matrices of data.  This is simplified by the fact that we usually
-;#   operate on hunks of data whose dimensions are powers of 2, or at least
-;#   divisible by highish powers of 2.
-;#
-;#   These operations can be very confusing.  They become more straightforward
-;#   when we think of them as permutations of address bits: Concatenate a
-;#   group of vector registers and think of it as occupying a block of
-;#   memory beginning at address zero.  The low four bits 0...3 of the
-;#   address then correspond to position within a register, the higher-order
-;#   address bits select the register.
-;#
-;#   Although register selection, at the code level, is arbitrary, things
-;#   are simpler if we use contiguous ranges of register numbers, simpler
-;#   still if the low-order bits of the register number correspond to
-;#   conceptual address bits.  We do this whenever reasonable.
-;#
-;#   A 16x16 transpose can then be thought of as an operation on
-;#   a 256-element block of memory.  It takes 8 bits 0...7 to address this
-;#   memory and the effect of a transpose is to interchange address bit
-;#   0 with 4, 1 with 5, 2 with 6, and 3 with 7.  Bits 0...3 index the
-;#   column, which is interchanged with the row addressed by bits 4..7.
-;#
-;#   The altivec merge instructions provide a rapid means of effecting
-;#   many of these transforms.  They operate at three widths (8,16,32).
-;#   Writing V(x) for vector register #x, paired merges permute address
-;#   indices as follows.
-;#
-;#   0->1  1->2  2->3  3->(4+d)  (4+s)->0:
-;#
-;#      vmrghb  V( x),          V( y), V( y + (1<<s))
-;#      vmrglb  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;#   =0=   1->2  2->3  3->(4+d)  (4+s)->1:
-;#
-;#      vmrghh  V( x),          V( y), V( y + (1<<s))
-;#      vmrglh  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;#   =0=   =1=   2->3  3->(4+d)  (4+s)->2:
-;#
-;#      vmrghw  V( x),          V( y), V( y + (1<<s))
-;#      vmrglw  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;#   Unfortunately, there is no doubleword merge instruction.
-;#   The following sequence uses "vperm" as a substitute.
-;#   Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
-;#   are in registers Vhihi and Vlolo, we can also effect the permutation
-;#
-;#   =0=   =1=   =2=   3->(4+d)  (4+s)->3   by the sequence:
-;#
-;#      vperm   V( x),          V( y), V( y + (1<<s)), Vhihi
-;#      vperm   V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
-;#
-;#
-;#   Except for bits s and d, the other relationships between register
-;#   number (= high-order part of address) bits are at the disposal of
-;#   the programmer.
-;#
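Concretely, the address-bit view says a full 16x16 byte transpose of 16 registers is nothing more than swapping the low (position) nibble of each element's 8-bit address with the high (register) nibble. A throwaway C illustration (sketch):

/* A 16x16 byte transpose is a swap of the two address nibbles. */
static void transpose_by_address_bits(const unsigned char in[256],
                                      unsigned char out[256]) {
    int a;
    for (a = 0; a < 256; a++)
        out[((a & 0x0f) << 4) | (a >> 4)] = in[a];
}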
-
-;# To avoid excess transposes, we filter all 3 vertical luma subblock
-;#   edges together.  This requires a single 16x16 transpose, which, in
-;#   the above language, amounts to the following permutation of address
-;#   indices:  0<->4   1<->5  2<->6  3<->7, which we accomplish by
-;#   4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
-;#
-;#   Except for the fact that the destination registers get written
-;#   before we are done referencing the old contents, the cyclic transform
-;#   is effected by
-;#
-;#      x = 0;  do {
-;#          vmrghb V(2x),   V(x), V(x+8);
-;#          vmrglb V(2x+1), V(x), V(x+8);
-;#      } while( ++x < 8);
-;#
-;#   For clarity, and because we can afford it, we do this transpose
-;#   using all 32 registers, alternating the banks 0..15  and  16 .. 31,
-;#   leaving the final result in 16 .. 31, as the lower registers are
-;#   used in the filtering itself.
-;#
-.macro Tpair A, B, X, Y
-    vmrghb  \A, \X, \Y
-    vmrglb  \B, \X, \Y
-.endm
-
-;# Each step takes 8*2 = 16 instructions
-
-.macro t16_even
-    Tpair v16,v17,  v0,v8
-    Tpair v18,v19,  v1,v9
-    Tpair v20,v21,  v2,v10
-    Tpair v22,v23,  v3,v11
-    Tpair v24,v25,  v4,v12
-    Tpair v26,v27,  v5,v13
-    Tpair v28,v29,  v6,v14
-    Tpair v30,v31,  v7,v15
-.endm
-
-.macro t16_odd
-    Tpair v0,v1, v16,v24
-    Tpair v2,v3, v17,v25
-    Tpair v4,v5, v18,v26
-    Tpair v6,v7, v19,v27
-    Tpair v8,v9, v20,v28
-    Tpair v10,v11, v21,v29
-    Tpair v12,v13, v22,v30
-    Tpair v14,v15, v23,v31
-.endm
-
-;# Whole transpose takes 4*16 = 64 instructions
-
-.macro t16_full
-    t16_odd
-    t16_even
-    t16_odd
-    t16_even
-.endm
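A C model of the scheme (a sketch using the reg[16][16] layout implied above): one Tpair pass rotates the 8 address bits left by one, so four passes rotate by four, i.e. swap the two nibbles and hence transpose. The asm alternates register banks instead of copying, but the data movement is the same:

#include <string.h>

/* One Tpair pass: reg[2x] gets the high-half interleave of reg[x] and
 * reg[x+8] (vmrghb), reg[2x+1] the low-half interleave (vmrglb). */
static void merge_pass(unsigned char reg[16][16]) {
    unsigned char out[16][16];
    int x, j;
    for (x = 0; x < 8; x++)
        for (j = 0; j < 8; j++) {
            out[2 * x][2 * j]         = reg[x][j];       /* vmrghb */
            out[2 * x][2 * j + 1]     = reg[x + 8][j];
            out[2 * x + 1][2 * j]     = reg[x][j + 8];   /* vmrglb */
            out[2 * x + 1][2 * j + 1] = reg[x + 8][j + 8];
        }
    memcpy(reg, out, sizeof(out));
}

/* Four passes rotate by four bits: reg[r][c] ends up as the old reg[c][r]. */
static void t16_full_model(unsigned char reg[16][16]) {
    int i;
    for (i = 0; i < 4; i++)
        merge_pass(reg);
}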
-
-;# Vertical edge filtering requires transposes.  For the simple filter,
-;#   we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
-;#   each.  Writing 0 ... 63 for the pixel indices, the desired result is:
-;#
-;#  v0 =  0  1 ... 14 15
-;#  v1 = 16 17 ... 30 31
-;#  v2 = 32 33 ... 46 47
-;#  v3 = 48 49 ... 62 63
-;#
-;#  In frame-buffer memory, the layout is:
-;#
-;#     0  16  32  48
-;#     1  17  33  49
-;#     ...
-;#    15  31  47  63.
-;#
-;#  We begin by reading the data 32 bits at a time (using scalar operations)
-;#  into a temporary array, reading the rows of the array into vector registers,
-;#  with the following layout:
-;#
-;#  v0 =  0 16 32 48  4 20 36 52  8 24 40 56  12 28 44 60
-;#  v1 =  1 17 33 49  5 21 ...                      45 61
-;#  v2 =  2 18 ...                                  46 62
-;#  v3 =  3 19 ...                                  47 63
-;#
-;#  From the "address-bit" perspective discussed above, we simply need to
-;#  interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
-;#  In other words, we transpose each of the four 4x4 submatrices.
-;#
-;#  This transformation is its own inverse, and we need to perform it
-;#  again before writing the pixels back into the frame buffer.
-;#
-;#  It acts in place on registers v0...v3, uses v4...v7 as temporaries,
-;#  and assumes that v14/v15 contain the b_hihi/b_lolo selectors
-;#  defined above.  We think of both groups of 4 registers as having
-;#  "addresses" {0,1,2,3} * 16.
-;#
-.macro Transpose4times4x4 Vlo, Vhi
-
-    ;# d=s=0        0->1  1->2  2->3  3->4  4->0  =5=
-
-    vmrghb  v4, v0, v1
-    vmrglb  v5, v0, v1
-    vmrghb  v6, v2, v3
-    vmrglb  v7, v2, v3
-
-    ;# d=0 s=1      =0=   1->2  2->3  3->4  4->5  5->1
-
-    vmrghh  v0, v4, v6
-    vmrglh  v1, v4, v6
-    vmrghh  v2, v5, v7
-    vmrglh  v3, v5, v7
-
-    ;# d=s=0        =0=   =1=   2->3  3->4  4->2  =5=
-
-    vmrghw  v4, v0, v1
-    vmrglw  v5, v0, v1
-    vmrghw  v6, v2, v3
-    vmrglw  v7, v2, v3
-
-    ;# d=0  s=1     =0=   =1=   =2=   3->4  4->5  5->3
-
-    vperm   v0, v4, v6, \Vlo
-    vperm   v1, v4, v6, \Vhi
-    vperm   v2, v5, v7, \Vlo
-    vperm   v3, v5, v7, \Vhi
-.endm
-;# end Transpose4times4x4
-
-
-;# Normal mb vertical edge filter transpose.
-;#
-;#   We read 8 columns of data, initially in the following pattern:
-;#
-;#  (0,0)  (1,0) ... (7,0)  (0,1)  (1,1) ... (7,1)
-;#  (0,2)  (1,2) ... (7,2)  (0,3)  (1,3) ... (7,3)
-;#  ...
-;#  (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
-;#
-;#   and wish to convert to:
-;#
-;#  (0,0) ... (0,15)
-;#  (1,0) ... (1,15)
-;#  ...
-;#  (7,0) ... (7,15).
-;#
-;#  In "address bit" language, we wish to map
-;#
-;#  0->4  1->5  2->6  3->0  4->1  5->2  6->3, i.e., I -> (I+4) mod 7.
-;#
-;#  This can be accomplished by 4 iterations of the cyclic transform
-;#
-;#  I -> (I+1) mod 7;
-;#
-;#  each iteration can be realized by (d=0, s=2):
-;#
-;#  x = 0;  do  Tpair( V(2x),V(2x+1),  V(x),V(x+4))  while( ++x < 4);
-;#
-;#  The input/output is in registers v0...v7.  We use v10...v17 as mirrors;
-;#  preserving v8 = sign converter.
-;#
-;#  Inverse transpose is similar, except here I -> (I+3) mod 7 and the
-;#  result lands in the "mirror" registers v10...v17
-;#
-.macro t8x16_odd
-    Tpair v10, v11,  v0, v4
-    Tpair v12, v13,  v1, v5
-    Tpair v14, v15,  v2, v6
-    Tpair v16, v17,  v3, v7
-.endm
-
-.macro t8x16_even
-    Tpair v0, v1,  v10, v14
-    Tpair v2, v3,  v11, v15
-    Tpair v4, v5,  v12, v16
-    Tpair v6, v7,  v13, v17
-.endm
-
-.macro transpose8x16_fwd
-    t8x16_odd
-    t8x16_even
-    t8x16_odd
-    t8x16_even
-.endm
-
-.macro transpose8x16_inv
-    t8x16_odd
-    t8x16_even
-    t8x16_odd
-.endm
-
-.macro Transpose16x16
-    vmrghb  v0, v16, v24
-    vmrglb  v1, v16, v24
-    vmrghb  v2, v17, v25
-    vmrglb  v3, v17, v25
-    vmrghb  v4, v18, v26
-    vmrglb  v5, v18, v26
-    vmrghb  v6, v19, v27
-    vmrglb  v7, v19, v27
-    vmrghb  v8, v20, v28
-    vmrglb  v9, v20, v28
-    vmrghb  v10, v21, v29
-    vmrglb  v11, v21, v29
-    vmrghb  v12, v22, v30
-    vmrglb  v13, v22, v30
-    vmrghb  v14, v23, v31
-    vmrglb  v15, v23, v31
-    vmrghb  v16, v0, v8
-    vmrglb  v17, v0, v8
-    vmrghb  v18, v1, v9
-    vmrglb  v19, v1, v9
-    vmrghb  v20, v2, v10
-    vmrglb  v21, v2, v10
-    vmrghb  v22, v3, v11
-    vmrglb  v23, v3, v11
-    vmrghb  v24, v4, v12
-    vmrglb  v25, v4, v12
-    vmrghb  v26, v5, v13
-    vmrglb  v27, v5, v13
-    vmrghb  v28, v6, v14
-    vmrglb  v29, v6, v14
-    vmrghb  v30, v7, v15
-    vmrglb  v31, v7, v15
-    vmrghb  v0, v16, v24
-    vmrglb  v1, v16, v24
-    vmrghb  v2, v17, v25
-    vmrglb  v3, v17, v25
-    vmrghb  v4, v18, v26
-    vmrglb  v5, v18, v26
-    vmrghb  v6, v19, v27
-    vmrglb  v7, v19, v27
-    vmrghb  v8, v20, v28
-    vmrglb  v9, v20, v28
-    vmrghb  v10, v21, v29
-    vmrglb  v11, v21, v29
-    vmrghb  v12, v22, v30
-    vmrglb  v13, v22, v30
-    vmrghb  v14, v23, v31
-    vmrglb  v15, v23, v31
-    vmrghb  v16, v0, v8
-    vmrglb  v17, v0, v8
-    vmrghb  v18, v1, v9
-    vmrglb  v19, v1, v9
-    vmrghb  v20, v2, v10
-    vmrglb  v21, v2, v10
-    vmrghb  v22, v3, v11
-    vmrglb  v23, v3, v11
-    vmrghb  v24, v4, v12
-    vmrglb  v25, v4, v12
-    vmrghb  v26, v5, v13
-    vmrglb  v27, v5, v13
-    vmrghb  v28, v6, v14
-    vmrglb  v29, v6, v14
-    vmrghb  v30, v7, v15
-    vmrglb  v31, v7, v15
-.endm
-
-;# load_g loads a global vector (whose address is in the local variable Gptr)
-;#   into vector register Vreg.  Trashes r0
-.macro load_g Vreg, Gptr
-    lwz     r0, \Gptr
-    lvx     \Vreg, 0, r0
-.endm
-
-;# Exploit the saturation here: if the difference is negative it
-;# is clamped to 0, and ORing 0 with the positive difference
-;# yields the absolute value.
-;# RES = abs( A-B), trashes TMP
-.macro Abs RES, TMP, A, B
-    vsububs \RES, \A, \B
-    vsububs \TMP, \B, \A
-    vor     \RES, \RES, \TMP
-.endm
-
-;# RES = Max( RES, abs( A-B)), trashes TMP
-.macro max_abs RES, TMP, A, B
-    vsububs \TMP, \A, \B
-    vmaxub  \RES, \RES, \TMP
-    vsububs \TMP, \B, \A
-    vmaxub  \RES, \RES, \TMP
-.endm
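In scalar terms the Abs/max_abs trick is (sketch):

/* Unsigned saturating subtract clamps the negative direction to zero,
 * so OR-ing both directions yields |a - b|. */
static unsigned char sat_sub_u8(unsigned char a, unsigned char b) {
    return (unsigned char)(a > b ? a - b : 0);
}

static unsigned char abs_diff_u8(unsigned char a, unsigned char b) {
    return sat_sub_u8(a, b) | sat_sub_u8(b, a);  /* one side is always 0 */
}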
-
-.macro Masks
-    ;# build masks
-    ;# input is all 8-bit unsigned (0-255).  We need to test
-    ;# abs(vala-valb) > limit, but there is no need to compare each
-    ;# value to the limit: find the max of the absolute differences
-    ;# and compare that to the limit.
-    ;# First hev
-    Abs     v14, v13, v2, v3    ;# |P1 - P0|
-    max_abs  v14, v13, v5, v4    ;# |Q1 - Q0|
-
-    vcmpgtub v10, v14, v10      ;# HEV = true if thresh exceeded
-
-    ;# Next limit
-    max_abs  v14, v13, v0, v1    ;# |P3 - P2|
-    max_abs  v14, v13, v1, v2    ;# |P2 - P1|
-    max_abs  v14, v13, v6, v5    ;# |Q2 - Q1|
-    max_abs  v14, v13, v7, v6    ;# |Q3 - Q2|
-
-    vcmpgtub v9, v14, v9        ;# R = true if limit exceeded
-
-    ;# flimit
-    Abs     v14, v13, v3, v4    ;# |P0 - Q0|
-
-    vcmpgtub v8, v14, v8        ;# X = true if flimit exceeded
-
-    vor     v8, v8, v9          ;# R = true if flimit or limit exceeded
-    ;# done building masks
-.endm
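Per pixel lane, Masks therefore computes the following predicates (a scalar sketch reusing abs_diff_u8 from above; note the sense of the combined mask: where it is true, the later vandc zeroes the filter value, so the edge is left unfiltered):

/* Sketch of the hev/mask computation; names are illustrative. */
static void build_masks(unsigned char p3, unsigned char p2, unsigned char p1,
                        unsigned char p0, unsigned char q0, unsigned char q1,
                        unsigned char q2, unsigned char q3,
                        unsigned char flimit, unsigned char limit,
                        unsigned char thresh,
                        int *hev, int *dont_filter) {
    *hev = abs_diff_u8(p1, p0) > thresh || abs_diff_u8(q1, q0) > thresh;
    *dont_filter =
        abs_diff_u8(p3, p2) > limit || abs_diff_u8(p2, p1) > limit ||
        abs_diff_u8(p1, p0) > limit || abs_diff_u8(q1, q0) > limit ||
        abs_diff_u8(q2, q1) > limit || abs_diff_u8(q3, q2) > limit ||
        abs_diff_u8(p0, q0) > flimit;
}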
-
-.macro build_constants RFL, RLI, RTH, FL, LI, TH
-    ;# build constants
-    lvx     \FL, 0, \RFL        ;# flimit
-    lvx     \LI, 0, \RLI        ;# limit
-    lvx     \TH, 0, \RTH        ;# thresh
-
-    vspltisb v11, 8
-    vspltisb v12, 4
-    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
-.endm
-
-.macro load_data_y
-    ;# setup strides/pointers to be able to access
-    ;# all of the data
-    add     r5, r4, r4          ;# r5 = 2 * stride
-    sub     r6, r3, r5          ;# r6 -> 2 rows back
-    neg     r7, r4              ;# r7 = -stride
-
-    ;# load 16 pixels worth of data to work on
-    sub     r0, r6, r5          ;# r0 -> 4 rows back (temp)
-    lvx     v0,  0, r0          ;# P3  (read only)
-    lvx     v1, r7, r6          ;# P2
-    lvx     v2,  0, r6          ;# P1
-    lvx     v3, r7, r3          ;# P0
-    lvx     v4,  0, r3          ;# Q0
-    lvx     v5, r4, r3          ;# Q1
-    lvx     v6, r5, r3          ;# Q2
-    add     r0, r3, r5          ;# r0 -> 2 rows fwd (temp)
-    lvx     v7, r4, r0          ;# Q3  (read only)
-.endm
-
-;# Expects
-;#  v10 == HEV
-;#  v13 == tmp
-;#  v14 == tmp
-.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
-    vxor    \P1, \P1, v11       ;# SP1
-    vxor    \P0, \P0, v11       ;# SP0
-    vxor    \Q0, \Q0, v11       ;# SQ0
-    vxor    \Q1, \Q1, v11       ;# SQ1
-
-    vsubsbs v13, \P1, \Q1       ;# f  = c (P1 - Q1)
-.if \HEV_PRESENT
-    vand    v13, v13, v10       ;# f &= hev
-.endif
-    vsubsbs v14, \Q0, \P0       ;# -126 <=  X = Q0-P0  <= +126
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-
-    vandc   v13, v13, v8        ;# f &= ~mask (mask = limit/flimit exceeded)
-
-    vspltisb v8, 3
-    vspltisb v9, 4
-
-    vaddsbs v14, v13, v9        ;# f1 = c (f+4)
-    vaddsbs v15, v13, v8        ;# f2 = c (f+3)
-
-    vsrab   v13, v14, v8        ;# f1 >>= 3
-    vsrab   v15, v15, v8        ;# f2 >>= 3
-
-    vsubsbs \Q0, \Q0, v13       ;# u1 = c (SQ0 - f1)
-    vaddsbs \P0, \P0, v15       ;# u2 = c (SP0 + f2)
-.endm
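In scalar form, common_adjust is the familiar VP8 inner-edge adjustment (a sketch; pixels are already signed via the XOR with 0x80, the three saturating adds are collapsed into one clamp for brevity, and the "f &= ~mask" step is omitted):

static signed char clamp_s8(int v) {
    return (signed char)(v < -128 ? -128 : v > 127 ? 127 : v);
}

static void common_adjust_scalar(signed char *p0, signed char *q0,
                                 signed char p1, signed char q1,
                                 int use_hev, int hev) {
    int f = clamp_s8(p1 - q1);
    if (use_hev && !hev)
        f = 0;                                         /* f &= hev */
    f = clamp_s8(f + 3 * (*q0 - *p0));
    /* +4/+3 rounding so the two sides round toward each other */
    *q0 = clamp_s8(*q0 - (clamp_s8(f + 4) >> 3));      /* u1 = SQ0 - f1 */
    *p0 = clamp_s8(*p0 + (clamp_s8(f + 3) >> 3));      /* u2 = SP0 + f2 */
}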
-
-.macro vp8_mbfilter
-    Masks
-
-    ;# start the filtering here
-    vxor    v1, v1, v11         ;# SP2
-    vxor    v2, v2, v11         ;# SP1
-    vxor    v3, v3, v11         ;# SP0
-    vxor    v4, v4, v11         ;# SQ0
-    vxor    v5, v5, v11         ;# SQ1
-    vxor    v6, v6, v11         ;# SQ2
-
-    ;# add outer taps if we have high edge variance
-    vsubsbs v13, v2, v5         ;# f  = c (SP1-SQ1)
-
-    vsubsbs v14, v4, v3         ;# SQ0-SP0
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14       ;# f  = c( c(SP1-SQ1) + 3*(SQ0-SP0))
-
-    vandc   v13, v13, v8        ;# f &= ~mask (mask = limit/flimit exceeded)
-    vand    v15, v13, v10       ;# f2 = f & hev
-
-    ;# save bottom 3 bits so that we round one side +4 and the other +3
-    vspltisb v8, 3
-    vspltisb v9, 4
-
-    vaddsbs v14, v15, v9        ;# f1 = c (f+4)
-    vaddsbs v15, v15, v8        ;# f2 = c (f+3)
-
-    vsrab   v14, v14, v8        ;# f1 >>= 3
-    vsrab   v15, v15, v8        ;# f2 >>= 3
-
-    vsubsbs v4, v4, v14         ;# u1 = c (SQ0 - f1)
-    vaddsbs v3, v3, v15         ;# u2 = c (SP0 + f2)
-
-    ;# only apply wider filter if not high edge variance
-    vandc   v13, v13, v10       ;# f &= ~hev
-
-    vspltisb v9, 2
-    vnor    v8, v8, v8
-    vsrb    v9, v8, v9          ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
-    vupkhsb v9, v9              ;# 0x003f003f003f003f003f003f003f003f
-    vspltisb v8, 9
-
-    ;# roughly 1/7th difference across boundary
-    vspltish v10, 7
-    vmulosb v14, v8, v13        ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-    vmulesb v15, v8, v13
-    vaddshs v14, v14, v9        ;# +=  63
-    vaddshs v15, v15, v9
-    vsrah   v14, v14, v10       ;# >>= 7
-    vsrah   v15, v15, v10
-    vmrglh  v10, v15, v14
-    vmrghh  v15, v15, v14
-
-    vpkshss v10, v15, v10       ;# X = saturated down to bytes
-
-    vsubsbs v6, v6, v10         ;# subtract from Q and add to P
-    vaddsbs v1, v1, v10
-
-    vxor    v6, v6, v11
-    vxor    v1, v1, v11
-
-    ;# roughly 2/7th difference across boundary
-    vspltish v10, 7
-    vaddubm v12, v8, v8
-    vmulosb v14, v12, v13       ;# 18 * f, odd bytes
-    vmulesb v15, v12, v13       ;# 18 * f, even bytes
-    vaddshs v14, v14, v9
-    vaddshs v15, v15, v9
-    vsrah   v14, v14, v10       ;# >>= 7
-    vsrah   v15, v15, v10
-    vmrglh  v10, v15, v14
-    vmrghh  v15, v15, v14
-
-    vpkshss v10, v15, v10       ;# X = saturated down to bytes
-
-    vsubsbs v5, v5, v10         ;# subtract from Q and add to P
-    vaddsbs v2, v2, v10
-
-    vxor    v5, v5, v11
-    vxor    v2, v2, v11
-
-    ;# roughly 3/7th difference across boundary
-    vspltish v10, 7
-    vaddubm v12, v12, v8
-    vmulosb v14, v12, v13       ;# 27 * f, odd bytes
-    vmulesb v15, v12, v13       ;# 27 * f, even bytes
-    vaddshs v14, v14, v9
-    vaddshs v15, v15, v9
-    vsrah   v14, v14, v10       ;# >>= 7
-    vsrah   v15, v15, v10
-    vmrglh  v10, v15, v14
-    vmrghh  v15, v15, v14
-
-    vpkshss v10, v15, v10       ;# X = saturated down to bytes
-
-    vsubsbs v4, v4, v10         ;# subtract from Q and add to P
-    vaddsbs v3, v3, v10
-
-    vxor    v4, v4, v11
-    vxor    v3, v3, v11
-.endm
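The three "roughly k/7th" passes above share one shape: multiply f by 9k in 16 bits, add 63, arithmetic-shift right by 7, saturate back to bytes. Since Q drops by u while P rises by u, the difference across the boundary shrinks by 2u = 2*9k*f/128, which is roughly (k/7)*f; that is where the comments' fractions come from. A hedged scalar sketch, reusing clamp8 from the sketch above (f here is already masked to the non-HEV pels, as in the code):

    /* clamp16 models the saturating halfword add (vaddshs). */
    static int clamp16(int v) { return v < -32768 ? -32768 : v > 32767 ? 32767 : v; }

    /* One wide-filter pass: k = 1 adjusts P2/Q2, k = 2 P1/Q1, k = 3 P0/Q0. */
    static void wide_pass(int *p, int *q, int f, int k) {
      int u = clamp16(9 * k * f + 63) >> 7;  /* vmulosb/vmulesb, vaddshs, vsrah */
      u = clamp8(u);                         /* vpkshss                         */
      *q = clamp8(*q - u);                   /* subtract from Q ...             */
      *p = clamp8(*p + u);                   /* ... and add to P                */
    }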
-
-.macro SBFilter
-    Masks
-
-    common_adjust v3, v4, v2, v5, 1
-
-    ;# outer tap adjustments
-    vspltisb v8, 1
-
-    vaddubm v13, v13, v8        ;# f  += 1
-    vsrab   v13, v13, v8        ;# f >>= 1
-
-    vandc   v13, v13, v10       ;# f &= ~hev
-
-    vsubsbs v5, v5, v13         ;# u1 = c (SQ1 - f)
-    vaddsbs v2, v2, v13         ;# u2 = c (SP1 + f)
-
-    vxor    v2, v2, v11
-    vxor    v3, v3, v11
-    vxor    v4, v4, v11
-    vxor    v5, v5, v11
-.endm
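After common_adjust returns, v13 still holds f1 = c(f+4) >> 3, so the outer-tap step of SBFilter reduces to the following sketch (clamp8 as above; hev again a boolean stand-in for the byte mask):

    /* Outer taps move half of f1, rounded, and only where HEV is false. */
    static void outer_taps(int *p1, int *q1, int f1, int hev) {
      int outer = (f1 + 1) >> 1;          /* vaddubm +1; vsrab >> 1 */
      if (!hev) {
        *q1 = clamp8(*q1 - outer);        /* u1 = c(SQ1 - f)        */
        *p1 = clamp8(*p1 + outer);        /* u2 = c(SP1 + f)        */
      }
    }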
-
-    .align 2
-mbloop_filter_horizontal_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r5, r6, r7, v8, v9, v10
-
-    load_data_y
-
-    vp8_mbfilter
-
-    stvx     v1, r7, r6         ;# P2
-    stvx     v2,  0, r6         ;# P1
-    stvx     v3, r7, r3         ;# P0
-    stvx     v4,  0, r3         ;# Q0
-    stvx     v5, r4, r3         ;# Q1
-    stvx     v6, r5, r3         ;# Q2
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-;#  r6 const signed char *limit
-;#  r7 const signed char *thresh
-loop_filter_horizontal_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r5, r6, r7, v8, v9, v10
-
-    load_data_y
-
-    SBFilter
-
-    stvx     v2,  0, r6         ;# P1
-    stvx     v3, r7, r3         ;# P0
-    stvx     v4,  0, r3         ;# Q0
-    stvx     v5, r4, r3         ;# Q1
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# Filtering a vertical mb.  Each mb is aligned on a 16 byte boundary,
-;#  so we can read in an entire mb aligned.  However, if we want to filter the
-;#  mb edge we run into problems: the loopfilter requires 4 bytes before the mb
-;#  and 4 after, for a total of 8 bytes.  Reading 16 bytes in order to get 4 is
-;#  a bit of a waste, so this is an even uglier way to get around that.
-;# Using the regular register file, words are read in and then saved back out
-;#  to memory to align and order them.  Only then are they read in through the
-;#  vector register file.  (A scalar C sketch follows the RLVmb/WLVmb macros.)
-.macro RLVmb V, R
-    lwzux   r0, r3, r4
-    stw     r0, 4(\R)
-    lwz     r0,-4(r3)
-    stw     r0, 0(\R)
-    lwzux   r0, r3, r4
-    stw     r0,12(\R)
-    lwz     r0,-4(r3)
-    stw     r0, 8(\R)
-    lvx     \V, 0, \R
-.endm
-
-.macro WLVmb V, R
-    stvx    \V, 0, \R
-    lwz     r0,12(\R)
-    stwux   r0, r3, r4
-    lwz     r0, 8(\R)
-    stw     r0,-4(r3)
-    lwz     r0, 4(\R)
-    stwux   r0, r3, r4
-    lwz     r0, 0(\R)
-    stw     r0,-4(r3)
-.endm
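A scalar C sketch of the staging trick described above: the two unaligned 4-byte reads either side of the edge are parked in an aligned 16-byte buffer so a single aligned lvx can pick up two rows at once. rlv_mb is a hypothetical name for illustration; WLVmb is the exact reverse of this.

    #include <stdint.h>
    #include <string.h>

    /* s points at the edge column of the current row; buf is 16-byte aligned. */
    static void rlv_mb(const uint8_t *s, int pitch, uint8_t buf[16]) {
      memcpy(buf +  0, s - 4,         4);   /* row n, 4 pels before the edge */
      memcpy(buf +  4, s,             4);   /* row n, 4 pels after the edge  */
      memcpy(buf +  8, s + pitch - 4, 4);   /* row n + 1                     */
      memcpy(buf + 12, s + pitch,     4);   /* one lvx then loads all 16     */
    }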
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-;#  r6 const signed char *limit
-;#  r7 const signed char *thresh
-mbloop_filter_vertical_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    la      r9, -48(r1)         ;# temporary space for reading in vectors
-    sub     r3, r3, r4
-
-    RLVmb v0, r9
-    RLVmb v1, r9
-    RLVmb v2, r9
-    RLVmb v3, r9
-    RLVmb v4, r9
-    RLVmb v5, r9
-    RLVmb v6, r9
-    RLVmb v7, r9
-
-    transpose8x16_fwd
-
-    build_constants r5, r6, r7, v8, v9, v10
-
-    vp8_mbfilter
-
-    transpose8x16_inv
-
-    add r3, r3, r4
-    neg r4, r4
-
-    WLVmb v17, r9
-    WLVmb v16, r9
-    WLVmb v15, r9
-    WLVmb v14, r9
-    WLVmb v13, r9
-    WLVmb v12, r9
-    WLVmb v11, r9
-    WLVmb v10, r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro RL V, R, P
-    lvx     \V, 0,  \R
-    add     \R, \R, \P
-.endm
-
-.macro WL V, R, P
-    stvx    \V, 0,  \R
-    add     \R, \R, \P
-.endm
-
-.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
-                                ;# K = |P0-P1| already
-    Abs     v14, v13, \Q0, \Q1  ;# M = |Q0-Q1|
-    vmaxub  v14, v14, v4        ;# M = max( |P0-P1|, |Q0-Q1|)
-    vcmpgtub v10, v14, v0
-
-    Abs     v4, v5, \Q2, \Q3    ;# K = |Q2-Q3| = next |P0-P1|
-
-    max_abs  v14, v13, \Q1, \Q2  ;# M = max( M, |Q1-Q2|)
-    max_abs  v14, v13, \P1, \P2  ;# M = max( M, |P1-P2|)
-    max_abs  v14, v13, \P2, \P3  ;# M = max( M, |P2-P3|)
-
-    vmaxub   v14, v14, v4       ;# M = max interior abs diff
-    vcmpgtub v9, v14, v2        ;# M = true if int_l exceeded
-
-    Abs     v14, v13, \P0, \Q0  ;# X = Abs( P0-Q0)
-    vcmpgtub v8, v14, v3        ;# X = true if edge_l exceeded
-    vor     v8, v8, v9          ;# M = true if edge_l or int_l exceeded
-
-    ;# replace P1,Q1 w/signed versions
-    common_adjust \P0, \Q0, \P1, \Q1, 1
-
-    vaddubm v13, v13, v1        ;# -16 <= M <= 15, saturation irrelevant
-    vsrab   v13, v13, v1
-    vandc   v13, v13, v10       ;# adjust P1,Q1 by (M+1)>>1  if ! hev
-    vsubsbs \Q1, \Q1, v13
-    vaddsbs \P1, \P1, v13
-
-    vxor    \P1, \P1, v11       ;# P1
-    vxor    \P0, \P0, v11       ;# P0
-    vxor    \Q0, \Q0, v11       ;# Q0
-    vxor    \Q1, \Q1, v11       ;# Q1
-.endm
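In boolean terms, the mask Fil rebuilds per group of columns amounts to the model below. This is a hedged scalar reading of this PPC version (flimit, limit and thresh are the function arguments loaded by build_constants), not a bitstream-exact restatement of the VP8 mask:

    #include <stdlib.h>

    /* Nonzero where the edge may be filtered; the asm keeps this as a byte
     * mask and clears f with vandc where it is zero. */
    static int fil_mask(int p3, int p2, int p1, int p0,
                        int q0, int q1, int q2, int q3,
                        int limit, int flimit) {
      int m = abs(p0 - p1);
      if (abs(q0 - q1) > m) m = abs(q0 - q1);
      if (abs(q1 - q2) > m) m = abs(q1 - q2);
      if (abs(p1 - p2) > m) m = abs(p1 - p2);
      if (abs(p2 - p3) > m) m = abs(p2 - p3);
      if (abs(q2 - q3) > m) m = abs(q2 - q3);
      return !(m > limit || abs(p0 - q0) > flimit);
    }

    /* HEV (high edge variance) is the separate, tighter test:
     *   hev = max(|p0 - p1|, |q0 - q1|) > thresh                */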
-
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-;#  r6 const signed char *limit
-;#  r7 const signed char *thresh
-loop_filter_vertical_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    addi    r9, r3, 0
-    RL      v16, r9, r4
-    RL      v17, r9, r4
-    RL      v18, r9, r4
-    RL      v19, r9, r4
-    RL      v20, r9, r4
-    RL      v21, r9, r4
-    RL      v22, r9, r4
-    RL      v23, r9, r4
-    RL      v24, r9, r4
-    RL      v25, r9, r4
-    RL      v26, r9, r4
-    RL      v27, r9, r4
-    RL      v28, r9, r4
-    RL      v29, r9, r4
-    RL      v30, r9, r4
-    lvx     v31, 0, r9
-
-    Transpose16x16
-
-    vspltisb v1, 1
-
-    build_constants r5, r6, r7, v3, v2, v0
-
-    Abs v4, v5, v19, v18                            ;# K(v14) = first |P0-P1|
-
-    Fil v16, v17, v18, v19,  v20, v21, v22, v23
-    Fil v20, v21, v22, v23,  v24, v25, v26, v27
-    Fil v24, v25, v26, v27,  v28, v29, v30, v31
-
-    Transpose16x16
-
-    addi    r9, r3, 0
-    WL      v16, r9, r4
-    WL      v17, r9, r4
-    WL      v18, r9, r4
-    WL      v19, r9, r4
-    WL      v20, r9, r4
-    WL      v21, r9, r4
-    WL      v22, r9, r4
-    WL      v23, r9, r4
-    WL      v24, r9, r4
-    WL      v25, r9, r4
-    WL      v26, r9, r4
-    WL      v27, r9, r4
-    WL      v28, r9, r4
-    WL      v29, r9, r4
-    WL      v30, r9, r4
-    stvx    v31, 0, r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-.macro active_chroma_sel V
-    andi.   r7, r3, 8       ;# row origin modulo 16
-    add     r7, r7, r7      ;# selects selectors
-    lis     r12, _chromaSelectors@ha
-    la      r0,  _chromaSelectors@l(r12)
-    lwzux   r0, r7, r0      ;# leave selector addr in r7
-
-    lvx     \V, 0, r0       ;# mask to concatenate active U,V pels
-.endm
-
-.macro hread_uv Dest, U, V, Offs, VMask
-    lvx     \U, \Offs, r3
-    lvx     \V, \Offs, r4
-    vperm   \Dest, \U, \V, \VMask   ;# Dest = active part of U then V
-.endm
-
-.macro hwrite_uv New, U, V, Offs, Umask, Vmask
-    vperm   \U, \New, \U, \Umask    ;# Combine new pels with siblings
-    vperm   \V, \New, \V, \Vmask
-    stvx    \U, \Offs, r3           ;# Write to frame buffer
-    stvx    \V, \Offs, r4
-.endm
-
-;# Process U,V in parallel.
-.macro load_chroma_h
-    neg     r9, r5          ;# r9 = -1 * stride
-    add     r8, r9, r9      ;# r8 = -2 * stride
-    add     r10, r5, r5     ;# r10 = 2 * stride
-
-    active_chroma_sel v12
-
-    ;# P3, Q3 are read-only; need not save addresses or sibling pels
-    add     r6, r8, r8      ;# r6 = -4 * stride
-    hread_uv v0, v14, v15, r6, v12
-    add     r6, r10, r5     ;# r6 =  3 * stride
-    hread_uv v7, v14, v15, r6, v12
-
-    ;# Others are read/write; save addresses and sibling pels
-
-    add     r6, r8, r9      ;# r6 = -3 * stride
-    hread_uv v1, v16, v17, r6,  v12
-    hread_uv v2, v18, v19, r8,  v12
-    hread_uv v3, v20, v21, r9,  v12
-    hread_uv v4, v22, v23, 0,   v12
-    hread_uv v5, v24, v25, r5,  v12
-    hread_uv v6, v26, v27, r10, v12
-.endm
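hread_uv leans on vperm semantics: mask byte i selects result byte i from the 32-byte concatenation of the two sources, with indices 0-15 taken from the first operand and 16-31 from the second. The _B_hihi selector ({0..7, 16..23}, defined in the .data section at the end of this file) therefore packs 8 active U pels and 8 active V pels into a single vector, which is what lets both chroma planes go through the filter in one pass. A sketch of that selection, under the assumptions above:

    #include <stdint.h>

    /* vperm model: dest[i] = (u:v)[sel[i]] */
    static void hread_uv_sketch(uint8_t dest[16], const uint8_t u[16],
                                const uint8_t v[16], const uint8_t sel[16]) {
      int i;
      for (i = 0; i < 16; i++)
        dest[i] = sel[i] < 16 ? u[sel[i]] : v[sel[i] - 16];
    }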
-
-.macro uresult_sel V
-    load_g   \V, 4(r7)
-.endm
-
-.macro vresult_sel V
-    load_g   \V, 8(r7)
-.endm
-
-;# always write P1,P0,Q0,Q1
-.macro store_chroma_h
-    uresult_sel v11
-    vresult_sel v12
-    hwrite_uv v2, v18, v19, r8, v11, v12
-    hwrite_uv v3, v20, v21, r9, v11, v12
-    hwrite_uv v4, v22, v23, 0,  v11, v12
-    hwrite_uv v5, v24, v25, r5, v11, v12
-.endm
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-mbloop_filter_horizontal_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    load_chroma_h
-
-    vp8_mbfilter
-
-    store_chroma_h
-
-    hwrite_uv v1, v16, v17, r6,  v11, v12    ;# v1 == P2
-    hwrite_uv v6, v26, v27, r10, v11, v12    ;# v6 == Q2
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-loop_filter_horizontal_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    load_chroma_h
-
-    SBFilter
-
-    store_chroma_h
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro R V, R
-    lwzux   r0, r3, r5
-    stw     r0, 4(\R)
-    lwz     r0,-4(r3)
-    stw     r0, 0(\R)
-    lwzux   r0, r4, r5
-    stw     r0,12(\R)
-    lwz     r0,-4(r4)
-    stw     r0, 8(\R)
-    lvx     \V, 0, \R
-.endm
-
-
-.macro W V, R
-    stvx    \V, 0, \R
-    lwz     r0,12(\R)
-    stwux   r0, r4, r5
-    lwz     r0, 8(\R)
-    stw     r0,-4(r4)
-    lwz     r0, 4(\R)
-    stwux   r0, r3, r5
-    lwz     r0, 0(\R)
-    stw     r0,-4(r3)
-.endm
-
-.macro chroma_vread R
-    sub r3, r3, r5          ;# back up one line for simplicity
-    sub r4, r4, r5
-
-    R v0, \R
-    R v1, \R
-    R v2, \R
-    R v3, \R
-    R v4, \R
-    R v5, \R
-    R v6, \R
-    R v7, \R
-
-    transpose8x16_fwd
-.endm
-
-.macro chroma_vwrite R
-
-    transpose8x16_inv
-
-    add     r3, r3, r5
-    add     r4, r4, r5
-    neg     r5, r5          ;# Write rows back in reverse order
-
-    W v17, \R
-    W v16, \R
-    W v15, \R
-    W v14, \R
-    W v13, \R
-    W v12, \R
-    W v11, \R
-    W v10, \R
-.endm
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-mbloop_filter_vertical_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    la      r9, -48(r1)         ;# temporary space for reading in vectors
-
-    chroma_vread r9
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    vp8_mbfilter
-
-    chroma_vwrite r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-loop_filter_vertical_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    la      r9, -48(r1)         ;# temporary space for reading in vectors
-
-    chroma_vread r9
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    SBFilter
-
-    chroma_vwrite r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
-
-.macro vp8_simple_filter
-    Abs v14, v13, v1, v2    ;# M = abs( P0 - Q0)
-    vcmpgtub v8, v14, v8    ;# v8 = true if _over_ limit
-
-    ;# preserve unsigned v0 and v3
-    common_adjust v1, v2, v0, v3, 0
-
-    vxor v1, v1, v11
-    vxor v2, v2, v11        ;# cvt Q0, P0 back to pels
-.endm
-
-.macro simple_vertical
-    addi    r8,  0, 16
-    addi    r7, r5, 32
-
-    lvx     v0,  0, r5
-    lvx     v1, r8, r5
-    lvx     v2,  0, r7
-    lvx     v3, r8, r7
-
-    lis     r12, _B_hihi@ha
-    la      r0,  _B_hihi@l(r12)
-    lvx     v16, 0, r0
-
-    lis     r12, _B_lolo@ha
-    la      r0,  _B_lolo@l(r12)
-    lvx     v17, 0, r0
-
-    Transpose4times4x4 v16, v17
-    vp8_simple_filter
-
-    vxor v0, v0, v11
-    vxor v3, v3, v11        ;# cvt P1, Q1 back to pels
-
-    Transpose4times4x4 v16, v17
-
-    stvx    v0,  0, r5
-    stvx    v1, r8, r5
-    stvx    v2,  0, r7
-    stvx    v3, r8, r7
-.endm
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-loop_filter_simple_horizontal_edge_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    ;# build constants
-    lvx     v8, 0, r5           ;# flimit
-
-    vspltisb v11, 8
-    vspltisb v12, 4
-    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
-
-    neg     r5, r4              ;# r5 = -1 * stride
-    add     r6, r5, r5          ;# r6 = -2 * stride
-
-    lvx     v0, r6, r3          ;# v0 = P1 = 16 pels two rows above edge
-    lvx     v1, r5, r3          ;# v1 = P0 = 16 pels one row  above edge
-    lvx     v2,  0, r3          ;# v2 = Q0 = 16 pels one row  below edge
-    lvx     v3, r4, r3          ;# v3 = Q1 = 16 pels two rows below edge
-
-    vp8_simple_filter
-
-    stvx    v1, r5, r3          ;# store P0
-    stvx    v2,  0, r3          ;# store Q0
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro RLV Offs
-    stw     r0, (\Offs*4)(r5)
-    lwzux   r0, r7, r4
-.endm
-
-.macro WLV Offs
-    lwz     r0, (\Offs*4)(r5)
-    stwux   r0, r7, r4
-.endm
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-loop_filter_simple_vertical_edge_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    ;# build constants
-    lvx     v8, 0, r5           ;# flimit
-
-    vspltisb v11, 8
-    vspltisb v12, 4
-    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
-
-    la r5, -96(r1)              ;# temporary space for reading in vectors
-
-    ;# Store 4 pels at word "Offs" in temp array, then advance r7
-    ;#   to next row and read another 4 pels from the frame buffer.
-
-    subi    r7, r3,  2          ;# r7 -> 2 pels before start
-    lwzx    r0,  0, r7          ;# read first 4 pels
-
-    ;# 16 unaligned word accesses
-    RLV 0
-    RLV 4
-    RLV 8
-    RLV 12
-    RLV 1
-    RLV 5
-    RLV 9
-    RLV 13
-    RLV 2
-    RLV 6
-    RLV 10
-    RLV 14
-    RLV 3
-    RLV 7
-    RLV 11
-
-    stw     r0, (15*4)(r5)      ;# write last 4 pels
-
-    simple_vertical
-
-    ;# Read temp array, write frame buffer.
-    subi    r7, r3,  2          ;# r7 -> 2 pels before start
-    lwzx    r0,  0, r5          ;# read/write first 4 pels
-    stwx    r0,  0, r7
-
-    WLV 4
-    WLV 8
-    WLV 12
-    WLV 1
-    WLV 5
-    WLV 9
-    WLV 13
-    WLV 2
-    WLV 6
-    WLV 10
-    WLV 14
-    WLV 3
-    WLV 7
-    WLV 11
-    WLV 15
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
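The interleaved RLV order above (0, 4, 8, 12, 1, 5, ...) is not arbitrary: the 4 pels straddling the edge in row r land at word slot (r & 3)*4 + (r >> 2) of the temp array, so each of the four vectors that simple_vertical later loads holds rows r, r+4, r+8 and r+12, ready for the Transpose4times4x4 step. A C sketch of the gather under that reading (rlv_gather is a hypothetical name):

    #include <stdint.h>
    #include <string.h>

    static void rlv_gather(const uint8_t *s, int pitch, uint8_t buf[64]) {
      int r;
      for (r = 0; r < 16; r++)   /* 2 pels before the edge, 2 after */
        memcpy(buf + 4 * ((r & 3) * 4 + (r >> 2)), s + r * pitch - 2, 4);
    }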
-
-    .data
-
-_chromaSelectors:
-    .long   _B_hihi
-    .long   _B_Ures0
-    .long   _B_Vres0
-    .long   0
-    .long   _B_lolo
-    .long   _B_Ures8
-    .long   _B_Vres8
-    .long   0
-
-    .align 4
-_B_Vres8:
-    .byte   16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15
-
-    .align 4
-_B_Ures8:
-    .byte   16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7
-
-    .align 4
-_B_lolo:
-    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
-
-    .align 4
-_B_Vres0:
-    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
-    .align 4
-_B_Ures0:
-    .byte    0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31
-
-    .align 4
-_B_hihi:
-    .byte    0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
--- a/vp8/common/ppc/platform_altivec.asm
+++ /dev/null
@@ -1,59 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl save_platform_context
-    .globl restore_platform_context
-
-.macro W V P
-    stvx    \V,  0, \P
-    addi    \P, \P, 16
-.endm
-
-.macro R V P
-    lvx     \V,  0, \P
-    addi    \P, \P, 16
-.endm
-
-;# r3 context_ptr
-    .align 2
-save_platform_context:
-    W v20, r3
-    W v21, r3
-    W v22, r3
-    W v23, r3
-    W v24, r3
-    W v25, r3
-    W v26, r3
-    W v27, r3
-    W v28, r3
-    W v29, r3
-    W v30, r3
-    W v31, r3
-
-    blr
-
-;# r3 context_ptr
-    .align 2
-restore_platform_context:
-    R v20, r3
-    R v21, r3
-    R v22, r3
-    R v23, r3
-    R v24, r3
-    R v25, r3
-    R v26, r3
-    R v27, r3
-    R v28, r3
-    R v29, r3
-    R v30, r3
-    R v31, r3
-
-    blr
--- a/vp8/common/ppc/recon_altivec.asm
+++ /dev/null
@@ -1,175 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl recon4b_ppc
-    .globl recon2b_ppc
-    .globl recon_b_ppc
-
-.macro row_of16 Diff Pred Dst Stride
-    lvx     v1,  0, \Pred           ;# v1 = pred = p0..p15
-    addi    \Pred, \Pred, 16        ;# next pred
-    vmrghb  v2, v0, v1              ;# v2 = 16-bit p0..p7
-    lvx     v3,  0, \Diff           ;# v3 = d0..d7
-    vaddshs v2, v2, v3              ;# v2 = r0..r7
-    vmrglb  v1, v0, v1              ;# v1 = 16-bit p8..p15
-    lvx     v3, r8, \Diff           ;# v3 = d8..d15
-    addi    \Diff, \Diff, 32        ;# next diff
-    vaddshs v3, v3, v1              ;# v3 = r8..r15
-    vpkshus v2, v2, v3              ;# v2 = 8-bit r0..r15
-    stvx    v2,  0, \Dst            ;# to dst
-    add     \Dst, \Dst, \Stride     ;# next dst
-.endm
-
-    .text
-    .align 2
-;#  r3 = short *diff_ptr,
-;#  r4 = unsigned char *pred_ptr,
-;#  r5 = unsigned char *dst_ptr,
-;#  r6 = int stride
-recon4b_ppc:
-    mfspr   r0, 256                     ;# get old VRSAVE
-    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
-    oris    r0, r0, 0xf000
-    mtspr   256,r0                      ;# set VRSAVE
-
-    vxor    v0, v0, v0
-    li      r8, 16
-
-    row_of16 r3, r4, r5, r6
-    row_of16 r3, r4, r5, r6
-    row_of16 r3, r4, r5, r6
-    row_of16 r3, r4, r5, r6
-
-    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
-    mtspr   256, r12                    ;# reset old VRSAVE
-
-    blr
-
-.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
-    lvx     v1,  0, \Pred       ;# v1 = pred = p0..p15
-    vmrghb  v2, v0, v1          ;# v2 = 16-bit p0..p7
-    lvx     v3,  0, \Diff       ;# v3 = d0..d7
-    vaddshs v2, v2, v3          ;# v2 = r0..r7
-    vmrglb  v1, v0, v1          ;# v1 = 16-bit p8..p15
-    lvx     v3, r8, \Diff       ;# v3 = d8..d15
-    vaddshs v3, v3, v1          ;# v3 = r8..r15
-    vpkshus v2, v2, v3          ;# v2 = 8-bit r0..r15
-    stvx    v2,  0, r10         ;# 2 rows to dst from buf
-    lwz     r0, 0(r10)
-.if \write_first_four_pels
-    stw     r0, 0(\Dst)
-    .else
-    stwux   r0, \Dst, \Stride
-.endif
-    lwz     r0, 4(r10)
-    stw     r0, 4(\Dst)
-    lwz     r0, 8(r10)
-    stwux   r0, \Dst, \Stride       ;# advance dst to next row
-    lwz     r0, 12(r10)
-    stw     r0, 4(\Dst)
-.endm
-
-    .align 2
-;#  r3 = short *diff_ptr,
-;#  r4 = unsigned char *pred_ptr,
-;#  r5 = unsigned char *dst_ptr,
-;#  r6 = int stride
-
-recon2b_ppc:
-    mfspr   r0, 256                     ;# get old VRSAVE
-    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
-    oris    r0, r0, 0xf000
-    mtspr   256,r0                      ;# set VRSAVE
-
-    vxor    v0, v0, v0
-    li      r8, 16
-
-    la      r10, -48(r1)                ;# buf
-
-    two_rows_of8 r3, r4, r5, r6, 1
-
-    addi    r4, r4, 16                  ;# next pred
-    addi    r3, r3, 32                  ;# next diff
-
-    two_rows_of8 r3, r4, r5, r6, 0
-
-    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
-    mtspr   256, r12                    ;# reset old VRSAVE
-
-    blr
-
-.macro get_two_diff_rows
-    stw     r0, 0(r10)
-    lwz     r0, 4(r3)
-    stw     r0, 4(r10)
-    lwzu    r0, 32(r3)
-    stw     r0, 8(r10)
-    lwz     r0, 4(r3)
-    stw     r0, 12(r10)
-    lvx     v3, 0, r10
-.endm
-
-    .align 2
-;#  r3 = short *diff_ptr,
-;#  r4 = unsigned char *pred_ptr,
-;#  r5 = unsigned char *dst_ptr,
-;#  r6 = int stride
-recon_b_ppc:
-    mfspr   r0, 256                     ;# get old VRSAVE
-    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
-    oris    r0, r0, 0xf000
-    mtspr   256,r0                      ;# set VRSAVE
-
-    vxor    v0, v0, v0
-
-    la      r10, -48(r1)    ;# buf
-
-    lwz     r0, 0(r4)
-    stw     r0, 0(r10)
-    lwz     r0, 16(r4)
-    stw     r0, 4(r10)
-    lwz     r0, 32(r4)
-    stw     r0, 8(r10)
-    lwz     r0, 48(r4)
-    stw     r0, 12(r10)
-
-    lvx     v1,  0, r10     ;# v1 = pred = p0..p15
-
-    lwz r0, 0(r3)           ;# v3 = d0..d7
-
-    get_two_diff_rows
-
-    vmrghb  v2, v0, v1      ;# v2 = 16-bit p0..p7
-    vaddshs v2, v2, v3      ;# v2 = r0..r7
-
-    lwzu r0, 32(r3)         ;# v3 = d8..d15
-
-    get_two_diff_rows
-
-    vmrglb  v1, v0, v1      ;# v1 = 16-bit p8..p15
-    vaddshs v3, v3, v1      ;# v3 = r8..r15
-
-    vpkshus v2, v2, v3      ;# v2 = 8-bit r0..r15
-    stvx    v2,  0, r10     ;# 16 pels to dst from buf
-
-    lwz     r0, 0(r10)
-    stw     r0, 0(r5)
-    lwz     r0, 4(r10)
-    stwux   r0, r5, r6
-    lwz     r0, 8(r10)
-    stwux   r0, r5, r6
-    lwz     r0, 12(r10)
-    stwx    r0, r5, r6
-
-    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
-    mtspr   256, r12                    ;# reset old VRSAVE
-
-    blr
--- a/vp8/common/ppc/systemdependent.c
+++ /dev/null
@@ -1,167 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "subpixel.h"
-#include "loopfilter.h"
-#include "recon.h"
-#include "idct.h"
-#include "onyxc_int.h"
-
-void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
-void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
-void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
-
-extern void (*vp9_post_proc_down_and_across)(
-  unsigned char *src_ptr,
-  unsigned char *dst_ptr,
-  int src_pixels_per_line,
-  int dst_pixels_per_line,
-  int rows,
-  int cols,
-  int flimit
-);
-
-extern void (*vp9_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
-extern void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit);
-extern void (*vp9_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
-extern void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
-
-extern void vp9_post_proc_down_and_across_c
-(
-  unsigned char *src_ptr,
-  unsigned char *dst_ptr,
-  int src_pixels_per_line,
-  int dst_pixels_per_line,
-  int rows,
-  int cols,
-  int flimit
-);
-void vp9_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
-
-extern copy_mem_block_function *vp9_copy_mem16x16;
-extern copy_mem_block_function *vp9_copy_mem8x8;
-extern copy_mem_block_function *vp9_copy_mem8x4;
-
-// PPC
-extern subpixel_predict_function sixtap_predict_ppc;
-extern subpixel_predict_function sixtap_predict8x4_ppc;
-extern subpixel_predict_function sixtap_predict8x8_ppc;
-extern subpixel_predict_function sixtap_predict16x16_ppc;
-extern subpixel_predict_function bilinear_predict4x4_ppc;
-extern subpixel_predict_function bilinear_predict8x4_ppc;
-extern subpixel_predict_function bilinear_predict8x8_ppc;
-extern subpixel_predict_function bilinear_predict16x16_ppc;
-
-extern copy_mem_block_function copy_mem16x16_ppc;
-
-void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-
-extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);
-
-// Generic C
-extern subpixel_predict_function vp9_sixtap_predict_c;
-extern subpixel_predict_function vp9_sixtap_predict8x4_c;
-extern subpixel_predict_function vp9_sixtap_predict8x8_c;
-extern subpixel_predict_function vp9_sixtap_predict16x16_c;
-extern subpixel_predict_function vp9_bilinear_predict4x4_c;
-extern subpixel_predict_function vp9_bilinear_predict8x4_c;
-extern subpixel_predict_function vp9_bilinear_predict8x8_c;
-extern subpixel_predict_function vp9_bilinear_predict16x16_c;
-
-extern copy_mem_block_function vp9_copy_mem16x16_c;
-extern copy_mem_block_function vp9_copy_mem8x8_c;
-extern copy_mem_block_function vp9_copy_mem8x4_c;
-
-void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-
-extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch);
-extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
-
-// PPC
-extern loop_filter_block_function loop_filter_mbv_ppc;
-extern loop_filter_block_function loop_filter_bv_ppc;
-extern loop_filter_block_function loop_filter_mbh_ppc;
-extern loop_filter_block_function loop_filter_bh_ppc;
-
-extern loop_filter_block_function loop_filter_mbvs_ppc;
-extern loop_filter_block_function loop_filter_bvs_ppc;
-extern loop_filter_block_function loop_filter_mbhs_ppc;
-extern loop_filter_block_function loop_filter_bhs_ppc;
-
-// Generic C
-extern loop_filter_block_function vp9_loop_filter_mbv_c;
-extern loop_filter_block_function vp9_loop_filter_bv_c;
-extern loop_filter_block_function vp9_loop_filter_mbh_c;
-extern loop_filter_block_function vp9_loop_filter_bh_c;
-
-extern loop_filter_block_function vp9_loop_filter_mbvs_c;
-extern loop_filter_block_function vp9_loop_filter_bvs_c;
-extern loop_filter_block_function vp9_loop_filter_mbhs_c;
-extern loop_filter_block_function vp9_loop_filter_bhs_c;
-
-extern loop_filter_block_function *vp8_lf_mbvfull;
-extern loop_filter_block_function *vp8_lf_mbhfull;
-extern loop_filter_block_function *vp8_lf_bvfull;
-extern loop_filter_block_function *vp8_lf_bhfull;
-
-extern loop_filter_block_function *vp8_lf_mbvsimple;
-extern loop_filter_block_function *vp8_lf_mbhsimple;
-extern loop_filter_block_function *vp8_lf_bvsimple;
-extern loop_filter_block_function *vp8_lf_bhsimple;
-
-void vp9_clear_c(void) {
-}
-
-void vp9_machine_specific_config(void) {
-  // Pure C:
-  vp9_clear_system_state        = vp9_clear_c;
-  vp9_recon_b                   = vp9_recon_b_c;
-  vp9_recon4b                   = vp9_recon4b_c;
-  vp9_recon2b                   = vp9_recon2b_c;
-
-  vp9_bilinear_predict16x16     = bilinear_predict16x16_ppc;
-  vp9_bilinear_predict8x8       = bilinear_predict8x8_ppc;
-  vp9_bilinear_predict8x4       = bilinear_predict8x4_ppc;
-  vp8_bilinear_predict          = bilinear_predict4x4_ppc;
-
-  vp9_sixtap_predict16x16       = sixtap_predict16x16_ppc;
-  vp9_sixtap_predict8x8         = sixtap_predict8x8_ppc;
-  vp9_sixtap_predict8x4         = sixtap_predict8x4_ppc;
-  vp9_sixtap_predict            = sixtap_predict_ppc;
-
-  vp8_short_idct4x4_1           = vp9_short_idct4x4llm_1_c;
-  vp8_short_idct4x4             = short_idct4x4llm_ppc;
-  vp8_dc_only_idct              = vp8_dc_only_idct_c;
-
-  vp8_lf_mbvfull                = loop_filter_mbv_ppc;
-  vp8_lf_bvfull                 = loop_filter_bv_ppc;
-  vp8_lf_mbhfull                = loop_filter_mbh_ppc;
-  vp8_lf_bhfull                 = loop_filter_bh_ppc;
-
-  vp8_lf_mbvsimple              = loop_filter_mbvs_ppc;
-  vp8_lf_bvsimple               = loop_filter_bvs_ppc;
-  vp8_lf_mbhsimple              = loop_filter_mbhs_ppc;
-  vp8_lf_bhsimple               = loop_filter_bhs_ppc;
-
-  vp9_post_proc_down_and_across = vp9_post_proc_down_and_across_c;
-  vp9_mbpost_proc_down          = vp9_mbpost_proc_down_c;
-  vp9_mbpost_proc_across_ip     = vp9_mbpost_proc_across_ip_c;
-  vp9_plane_add_noise           = vp9_plane_add_noise_c;
-
-  vp9_copy_mem16x16             = copy_mem16x16_ppc;
-  vp9_copy_mem8x8               = vp9_copy_mem8x8_c;
-  vp9_copy_mem8x4               = vp9_copy_mem8x4_c;
-
-}
--- a/vp8/common/ppflags.h
+++ /dev/null
@@ -1,38 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_PPFLAGS_H
-#define __INC_PPFLAGS_H
-enum {
-  VP9D_NOFILTERING            = 0,
-  VP9D_DEBLOCK                = 1 << 0,
-  VP9D_DEMACROBLOCK           = 1 << 1,
-  VP9D_ADDNOISE               = 1 << 2,
-  VP9D_DEBUG_TXT_FRAME_INFO   = 1 << 3,
-  VP9D_DEBUG_TXT_MBLK_MODES   = 1 << 4,
-  VP9D_DEBUG_TXT_DC_DIFF      = 1 << 5,
-  VP9D_DEBUG_TXT_RATE_INFO    = 1 << 6,
-  VP9D_DEBUG_DRAW_MV          = 1 << 7,
-  VP9D_DEBUG_CLR_BLK_MODES    = 1 << 8,
-  VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9
-};
-
-typedef struct {
-  int post_proc_flag;
-  int deblocking_level;
-  int noise_level;
-  int display_ref_frame_flag;
-  int display_mb_modes_flag;
-  int display_b_modes_flag;
-  int display_mv_flag;
-} vp9_ppflags_t;
-
-#endif
--- a/vp8/common/pragmas.h
+++ /dev/null
@@ -1,19 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-
-
-#ifdef __INTEL_COMPILER
-#pragma warning(disable:997 1011 170)
-#endif
-#ifdef _MSC_VER
-#pragma warning(disable:4799)
-#endif
--- a/vp8/common/pred_common.c
+++ /dev/null
@@ -1,463 +1,0 @@
-
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/pred_common.h"
-#include "vp8/common/seg_common.h"
-
-// TBD prediction functions for various bitstream signals
-
-// Returns a context number for the given MB prediction signal
-unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
-                                   const MACROBLOCKD *const xd,
-                                   PRED_ID pred_id) {
-  int pred_context;
-  MODE_INFO *m = xd->mode_info_context;
-
-  // Note:
-  // The mode info data structure has a one element border above and to the
-  // left of the entries corresponding to real macroblocks.
-  // The prediction flags in these dummy entries are initialised to 0.
-  switch (pred_id) {
-    case PRED_SEG_ID:
-      pred_context = (m - 1)->mbmi.seg_id_predicted +
-                     (m - cm->mode_info_stride)->mbmi.seg_id_predicted;
-      break;
-
-
-    case PRED_REF:
-      pred_context = (m - 1)->mbmi.ref_predicted +
-                     (m - cm->mode_info_stride)->mbmi.ref_predicted;
-      break;
-
-    case PRED_COMP:
-      // Context based on use of comp pred flag by neighbours
-      // pred_context =
-      //   ((m - 1)->mbmi.second_ref_frame != INTRA_FRAME) +
-      //    ((m - cm->mode_info_stride)->mbmi.second_ref_frame != INTRA_FRAME);
-
-      // Context based on mode and reference frame
-      // if ( m->mbmi.ref_frame == LAST_FRAME )
-      //    pred_context = 0 + (m->mbmi.mode != ZEROMV);
-      // else if ( m->mbmi.ref_frame == GOLDEN_FRAME )
-      //    pred_context = 2 + (m->mbmi.mode != ZEROMV);
-      // else
-      //    pred_context = 4 + (m->mbmi.mode != ZEROMV);
-
-      if (m->mbmi.ref_frame == LAST_FRAME)
-        pred_context = 0;
-      else
-        pred_context = 1;
-
-      break;
-
-    case PRED_MBSKIP:
-      pred_context = (m - 1)->mbmi.mb_skip_coeff +
-                     (m - cm->mode_info_stride)->mbmi.mb_skip_coeff;
-      break;
-
-    case PRED_SWITCHABLE_INTERP:
-      {
-        int left_in_image = (m - 1)->mbmi.mb_in_image;
-        int above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
-        int left_mode = (m - 1)->mbmi.mode;
-        int above_mode = (m - cm->mode_info_stride)->mbmi.mode;
-        int left_interp, above_interp;
-        if (left_in_image && left_mode >= NEARESTMV && left_mode <= SPLITMV)
-          left_interp = vp9_switchable_interp_map[(m - 1)->mbmi.interp_filter];
-        else
-          left_interp = VP9_SWITCHABLE_FILTERS;
-        if (above_in_image && above_mode >= NEARESTMV && above_mode <= SPLITMV)
-          above_interp = vp9_switchable_interp_map[
-              (m - cm->mode_info_stride)->mbmi.interp_filter];
-        else
-          above_interp = VP9_SWITCHABLE_FILTERS;
-
-        if (left_interp == above_interp)
-          pred_context = left_interp;
-        else if (left_interp == VP9_SWITCHABLE_FILTERS &&
-                 above_interp != VP9_SWITCHABLE_FILTERS)
-          pred_context = above_interp;
-        else if (left_interp != VP9_SWITCHABLE_FILTERS &&
-                 above_interp == VP9_SWITCHABLE_FILTERS)
-          pred_context = left_interp;
-        else
-          pred_context = VP9_SWITCHABLE_FILTERS;
-      }
-      break;
-
-    default:
-      // TODO *** add error trap code.
-      pred_context = 0;
-      break;
-  }
-
-  return pred_context;
-}
-
-// This function returns a context probability for coding a given
-// prediction signal
-vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
-                          const MACROBLOCKD *const xd,
-                          PRED_ID pred_id) {
-  vp9_prob pred_probability;
-  int pred_context;
-
-  // Get the appropriate prediction context
-  pred_context = vp9_get_pred_context(cm, xd, pred_id);
-
-  switch (pred_id) {
-    case PRED_SEG_ID:
-      pred_probability = cm->segment_pred_probs[pred_context];
-      break;
-
-    case PRED_REF:
-      pred_probability = cm->ref_pred_probs[pred_context];
-      break;
-
-    case PRED_COMP:
-      // In keeping with convention elsewhere the probability returned is
-      // the probability of a "0" outcome which in this case means the
-      // probability of comp pred off.
-      pred_probability = cm->prob_comppred[pred_context];
-      break;
-
-    case PRED_MBSKIP:
-      pred_probability = cm->mbskip_pred_probs[pred_context];
-      break;
-
-    default:
-      // TODO *** add error trap code.
-      pred_probability = 128;
-      break;
-  }
-
-  return pred_probability;
-}
-
-// This function returns a context probability ptr for coding a given
-// prediction signal
-const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
-                                   const MACROBLOCKD *const xd,
-                                   PRED_ID pred_id) {
-  const vp9_prob *pred_probability;
-  int pred_context;
-
-  // Get the appropriate prediction context
-  pred_context = vp9_get_pred_context(cm, xd, pred_id);
-
-  switch (pred_id) {
-    case PRED_SEG_ID:
-      pred_probability = &cm->segment_pred_probs[pred_context];
-      break;
-
-    case PRED_REF:
-      pred_probability = &cm->ref_pred_probs[pred_context];
-      break;
-
-    case PRED_COMP:
-      // In keeping with convention elsewhere the probability returned is
-      // the probability of a "0" outcome which in this case means the
-      // probability of comp pred off.
-      pred_probability = &cm->prob_comppred[pred_context];
-      break;
-
-    case PRED_MBSKIP:
-      pred_probability = &cm->mbskip_pred_probs[pred_context];
-      break;
-
-    case PRED_SWITCHABLE_INTERP:
-      pred_probability = &cm->fc.switchable_interp_prob[pred_context][0];
-      break;
-
-    default:
-      // TODO *** add error trap code.
-      pred_probability = NULL;
-      break;
-  }
-
-  return pred_probability;
-}
-
-// This function returns the status of the given prediction signal,
-// i.e. whether the predicted value for the given signal is correct.
-unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
-                                PRED_ID pred_id) {
-  unsigned char pred_flag = 0;
-
-  switch (pred_id) {
-    case PRED_SEG_ID:
-      pred_flag = xd->mode_info_context->mbmi.seg_id_predicted;
-      break;
-
-    case PRED_REF:
-      pred_flag = xd->mode_info_context->mbmi.ref_predicted;
-      break;
-
-    case PRED_MBSKIP:
-      pred_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
-      break;
-
-    default:
-      // TODO *** add error trap code.
-      pred_flag = 0;
-      break;
-  }
-
-  return pred_flag;
-}
-
-// This function sets the status of the given prediction signal,
-// i.e. whether the predicted value for the given signal was correct.
-void vp9_set_pred_flag(MACROBLOCKD *const xd,
-                       PRED_ID pred_id,
-                       unsigned char pred_flag) {
-#if CONFIG_SUPERBLOCKS
-  const int mis = xd->mode_info_stride;
-#endif
-
-  switch (pred_id) {
-    case PRED_SEG_ID:
-      xd->mode_info_context->mbmi.seg_id_predicted = pred_flag;
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        if (xd->mb_to_right_edge > 0)
-          xd->mode_info_context[1].mbmi.seg_id_predicted = pred_flag;
-        if (xd->mb_to_bottom_edge > 0) {
-          xd->mode_info_context[mis].mbmi.seg_id_predicted = pred_flag;
-          if (xd->mb_to_right_edge > 0)
-            xd->mode_info_context[mis + 1].mbmi.seg_id_predicted = pred_flag;
-        }
-      }
-#endif
-      break;
-
-    case PRED_REF:
-      xd->mode_info_context->mbmi.ref_predicted = pred_flag;
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        if (xd->mb_to_right_edge > 0)
-          xd->mode_info_context[1].mbmi.ref_predicted = pred_flag;
-        if (xd->mb_to_bottom_edge > 0) {
-          xd->mode_info_context[mis].mbmi.ref_predicted = pred_flag;
-          if (xd->mb_to_right_edge > 0)
-            xd->mode_info_context[mis + 1].mbmi.ref_predicted = pred_flag;
-        }
-      }
-#endif
-      break;
-
-    case PRED_MBSKIP:
-      xd->mode_info_context->mbmi.mb_skip_coeff = pred_flag;
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        if (xd->mb_to_right_edge > 0)
-          xd->mode_info_context[1].mbmi.mb_skip_coeff = pred_flag;
-        if (xd->mb_to_bottom_edge > 0) {
-          xd->mode_info_context[mis].mbmi.mb_skip_coeff = pred_flag;
-          if (xd->mb_to_right_edge > 0)
-            xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = pred_flag;
-        }
-      }
-#endif
-      break;
-
-    default:
-      // TODO *** add error trap code.
-      break;
-  }
-}
-
-
-// The following functions contain the guts of the prediction code used to
-// predict various bitstream signals.
-
-// Macroblock segment id prediction function
-unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
-                                    const MACROBLOCKD *const xd, int MbIndex) {
-  // Currently the prediction for the macroblock segment ID is
-  // the value stored for this macroblock in the previous frame.
-#if CONFIG_SUPERBLOCKS
-  if (!xd->mode_info_context->mbmi.encoded_as_sb) {
-#endif
-    return cm->last_frame_seg_map[MbIndex];
-#if CONFIG_SUPERBLOCKS
-  } else {
-    int seg_id = cm->last_frame_seg_map[MbIndex];
-    int mb_col = MbIndex % cm->mb_cols;
-    int mb_row = MbIndex / cm->mb_cols;
-    if (mb_col + 1 < cm->mb_cols)
-      seg_id = seg_id && cm->last_frame_seg_map[MbIndex + 1];
-    if (mb_row + 1 < cm->mb_rows) {
-      seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols];
-      if (mb_col + 1 < cm->mb_cols)
-        seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols + 1];
-    }
-    return seg_id;
-  }
-#endif
-}
-
-MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
-                                    const MACROBLOCKD *const xd) {
-  MODE_INFO *m = xd->mode_info_context;
-
-  MV_REFERENCE_FRAME left;
-  MV_REFERENCE_FRAME above;
-  MV_REFERENCE_FRAME above_left;
-  MV_REFERENCE_FRAME pred_ref = LAST_FRAME;
-
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-  int seg_ref_active;
-  int i;
-
-  unsigned char frame_allowed[MAX_REF_FRAMES] = {1, 1, 1, 1};
-  unsigned char ref_score[MAX_REF_FRAMES];
-  unsigned char best_score = 0;
-  unsigned char left_in_image;
-  unsigned char above_in_image;
-  unsigned char above_left_in_image;
-
-  // Is segment coding enabled?
-  seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
-
-  // Special case treatment if segment coding is enabled.
-  // Don't allow prediction of a reference frame that the segment
-  // does not allow.
-  if (seg_ref_active) {
-    for (i = 0; i < MAX_REF_FRAMES; i++) {
-      frame_allowed[i] =
-        vp9_check_segref(xd, segment_id, i);
-
-      // Score set to 0 if ref frame not allowed
-      ref_score[i] = cm->ref_scores[i] * frame_allowed[i];
-    }
-  } else
-    vpx_memcpy(ref_score, cm->ref_scores, sizeof(ref_score));
-
-  // Reference frames used by neighbours
-  left = (m - 1)->mbmi.ref_frame;
-  above = (m - cm->mode_info_stride)->mbmi.ref_frame;
-  above_left = (m - 1 - cm->mode_info_stride)->mbmi.ref_frame;
-
-  // Are neighbours in image
-  left_in_image = (m - 1)->mbmi.mb_in_image;
-  above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
-  above_left_in_image = (m - 1 - cm->mode_info_stride)->mbmi.mb_in_image;
-
-  // Adjust scores for candidate reference frames based on neighbours
-  if (frame_allowed[left] && left_in_image) {
-    ref_score[left] += 16;
-    if (above_left_in_image && (left == above_left))
-      ref_score[left] += 4;
-  }
-  if (frame_allowed[above] && above_in_image) {
-    ref_score[above] += 16;
-    if (above_left_in_image && (above == above_left))
-      ref_score[above] += 4;
-  }
-
-  // Now choose the candidate with the highest score
-  for (i = 0; i < MAX_REF_FRAMES; i++) {
-    if (ref_score[i] > best_score) {
-      pred_ref = i;
-      best_score = ref_score[i];
-    }
-  }
-
-  return pred_ref;
-}
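As a hypothetical worked example: with base scores {INTRA 5, LAST 9, GOLDEN 4, ALTREF 3}, a LAST left neighbour and a GOLDEN above neighbour both in the image, and a LAST above-left neighbour, LAST scores 9 + 16 + 4 = 29 while GOLDEN scores 4 + 16 = 20, so LAST is returned as the predicted reference frame.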
-
-// Function to compute a set of modified reference frame probabilities
-// to use when the prediction of the reference frame value fails.
-void vp9_calc_ref_probs(int *count, vp9_prob *probs) {
-  int tot_count;
-
-  tot_count = count[0] + count[1] + count[2] + count[3];
-  if (tot_count) {
-    probs[0] = (vp9_prob)((count[0] * 255 + (tot_count >> 1)) / tot_count);
-    probs[0] += !probs[0];
-  } else
-    probs[0] = 128;
-
-  tot_count -= count[0];
-  if (tot_count) {
-    probs[1] = (vp9_prob)((count[1] * 255 + (tot_count >> 1)) / tot_count);
-    probs[1] += !probs[1];
-  } else
-    probs[1] = 128;
-
-  tot_count -= count[1];
-  if (tot_count) {
-    probs[2] = (vp9_prob)((count[2] * 255 + (tot_count >> 1)) / tot_count);
-    probs[2] += !probs[2];
-  } else
-    probs[2] = 128;
-
-}
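As a worked example with hypothetical counts {10, 20, 30, 40}: tot_count = 100, so probs[0] = (10*255 + 50)/100 = 26; then tot_count = 90 and probs[1] = (20*255 + 45)/90 = 57; then tot_count = 70 and probs[2] = (30*255 + 35)/70 = 109. Each entry is the probability of taking the first branch at successive nodes of the reference-frame tree, conditioned on the branches before it.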
-
-// Computes a set of modified conditional probabilities for the reference
-// frame. Values will be set to 0 for reference frame options that are not
-// possible because either they were predicted and prediction has failed, or
-// because they are not allowed for a given segment.
-void vp9_compute_mod_refprobs(VP9_COMMON *const cm) {
-  int norm_cnt[MAX_REF_FRAMES];
-  int intra_count;
-  int inter_count;
-  int last_count;
-  int gfarf_count;
-  int gf_count;
-  int arf_count;
-
-  intra_count = cm->prob_intra_coded;
-  inter_count = (255 - intra_count);
-  last_count = (inter_count * cm->prob_last_coded) / 255;
-  gfarf_count = inter_count - last_count;
-  gf_count = (gfarf_count * cm->prob_gf_coded) / 255;
-  arf_count = gfarf_count - gf_count;
-
-  // Work out modified reference frame probabilities to use where prediction
-  // of the reference frame fails
-  norm_cnt[0] = 0;
-  norm_cnt[1] = last_count;
-  norm_cnt[2] = gf_count;
-  norm_cnt[3] = arf_count;
-  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[INTRA_FRAME]);
-  cm->mod_refprobs[INTRA_FRAME][0] = 0;    // This branch implicit
-
-  norm_cnt[0] = intra_count;
-  norm_cnt[1] = 0;
-  norm_cnt[2] = gf_count;
-  norm_cnt[3] = arf_count;
-  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[LAST_FRAME]);
-  cm->mod_refprobs[LAST_FRAME][1] = 0;    // This branch implicit
-
-  norm_cnt[0] = intra_count;
-  norm_cnt[1] = last_count;
-  norm_cnt[2] = 0;
-  norm_cnt[3] = arf_count;
-  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[GOLDEN_FRAME]);
-  cm->mod_refprobs[GOLDEN_FRAME][2] = 0;  // This branch implicit
-
-  norm_cnt[0] = intra_count;
-  norm_cnt[1] = last_count;
-  norm_cnt[2] = gf_count;
-  norm_cnt[3] = 0;
-  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[ALTREF_FRAME]);
-  cm->mod_refprobs[ALTREF_FRAME][2] = 0;  // This branch implicit
-
-  // Score the reference frames based on overall frequency.
-  // These scores contribute to the prediction choices.
-  // Max score 17, min 1.
-  cm->ref_scores[INTRA_FRAME] = 1 + (intra_count * 16 / 255);
-  cm->ref_scores[LAST_FRAME] = 1 + (last_count * 16 / 255);
-  cm->ref_scores[GOLDEN_FRAME] = 1 + (gf_count * 16 / 255);
-  cm->ref_scores[ALTREF_FRAME] = 1 + (arf_count * 16 / 255);
-}
--- a/vp8/common/pred_common.h
+++ /dev/null
@@ -1,56 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "type_aliases.h"
-#include "onyxc_int.h"
-#include "vp8/common/blockd.h"
-
-#ifndef __INC_PRED_COMMON_H__
-#define __INC_PRED_COMMON_H__ 1
-
-
-// Predicted items
-typedef enum {
-  PRED_SEG_ID = 0,               // Segment identifier
-  PRED_REF = 1,
-  PRED_COMP = 2,
-  PRED_MBSKIP = 3,
-  PRED_SWITCHABLE_INTERP = 4
-} PRED_ID;
-
-extern unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
-                                          const MACROBLOCKD *const xd,
-                                          PRED_ID pred_id);
-
-extern vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
-                                  const MACROBLOCKD *const xd,
-                                  PRED_ID pred_id);
-
-extern const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
-                                          const MACROBLOCKD *const xd,
-                                          PRED_ID pred_id);
-
-extern unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
-                                       PRED_ID pred_id);
-
-extern void vp9_set_pred_flag(MACROBLOCKD *const xd,
-                              PRED_ID pred_id,
-                              unsigned char pred_flag);
-
-
-extern unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
-                                           const MACROBLOCKD *const xd,
-                                           int MbIndex);
-
-extern MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
-                                       const MACROBLOCKD *const xd);
-extern void vp9_compute_mod_refprobs(VP9_COMMON *const cm);
-
-#endif /* __INC_PRED_COMMON_H__ */
--- a/vp8/common/quant_common.c
+++ /dev/null
@@ -1,125 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "quant_common.h"
-
-static int dc_qlookup[QINDEX_RANGE];
-static int ac_qlookup[QINDEX_RANGE];
-
-#define ACDC_MIN 4
-
-void vp9_init_quant_tables() {
-  int i;
-  int current_val = 4;
-  int last_val = 4;
-  int ac_val;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    ac_qlookup[i] = current_val;
-    current_val = (int)((double)current_val * 1.02);
-    if (current_val == last_val)
-      current_val++;
-    last_val = current_val;
-
-    ac_val = ac_qlookup[i];
-    dc_qlookup[i] = (0.000000305 * ac_val * ac_val * ac_val) +
-                    (-0.00065 * ac_val * ac_val) +
-                    (0.9 * ac_val) + 0.5;
-    if (dc_qlookup[i] < ACDC_MIN)
-      dc_qlookup[i] = ACDC_MIN;
-  }
-}
-
-int vp9_dc_quant(int QIndex, int Delta) {
-  int retval;
-
-  QIndex = QIndex + Delta;
-
-  if (QIndex > MAXQ)
-    QIndex = MAXQ;
-  else if (QIndex < 0)
-    QIndex = 0;
-
-  retval = dc_qlookup[ QIndex ];
-  return retval;
-}
-
-int vp9_dc2quant(int QIndex, int Delta) {
-  int retval;
-
-  QIndex = QIndex + Delta;
-
-  if (QIndex > MAXQ)
-    QIndex = MAXQ;
-  else if (QIndex < 0)
-    QIndex = 0;
-
-  retval = dc_qlookup[ QIndex ];
-
-  return retval;
-
-}
-int vp9_dc_uv_quant(int QIndex, int Delta) {
-  int retval;
-
-  QIndex = QIndex + Delta;
-
-  if (QIndex > MAXQ)
-    QIndex = MAXQ;
-  else if (QIndex < 0)
-    QIndex = 0;
-
-  retval = dc_qlookup[ QIndex ];
-
-  return retval;
-}
-
-int vp9_ac_yquant(int QIndex) {
-  int retval;
-
-  if (QIndex > MAXQ)
-    QIndex = MAXQ;
-  else if (QIndex < 0)
-    QIndex = 0;
-
-  retval = ac_qlookup[ QIndex ];
-  return retval;
-}
-
-int vp9_ac2quant(int QIndex, int Delta) {
-  int retval;
-
-  QIndex = QIndex + Delta;
-
-  if (QIndex > MAXQ)
-    QIndex = MAXQ;
-  else if (QIndex < 0)
-    QIndex = 0;
-
-  retval = (ac_qlookup[ QIndex ] * 775) / 1000;
-  if (retval < 4)
-    retval = 4;
-
-  return retval;
-}
-int vp9_ac_uv_quant(int QIndex, int Delta) {
-  int retval;
-
-  QIndex = QIndex + Delta;
-
-  if (QIndex > MAXQ)
-    QIndex = MAXQ;
-  else if (QIndex < 0)
-    QIndex = 0;
-
-  retval = ac_qlookup[ QIndex ];
-  return retval;
-}
--- a/vp8/common/quant_common.h
+++ /dev/null
@@ -1,22 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "string.h"
-#include "blockd.h"
-#include "onyxc_int.h"
-
-extern void vp9_init_quant_tables();
-extern int vp9_ac_yquant(int QIndex);
-extern int vp9_dc_quant(int QIndex, int Delta);
-extern int vp9_dc2quant(int QIndex, int Delta);
-extern int vp9_ac2quant(int QIndex, int Delta);
-extern int vp9_dc_uv_quant(int QIndex, int Delta);
-extern int vp9_ac_uv_quant(int QIndex, int Delta);
--- a/vp8/common/recon.c
+++ /dev/null
@@ -1,197 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_rtcd.h"
-#include "blockd.h"
-
-void vp9_recon_b_c
-(
-  unsigned char *pred_ptr,
-  short *diff_ptr,
-  unsigned char *dst_ptr,
-  int stride
-) {
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = diff_ptr[c] + pred_ptr[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
-    }
-
-    dst_ptr += stride;
-    diff_ptr += 16;
-    pred_ptr += 16;
-  }
-}
-
-void vp9_recon_uv_b_c
-(
-  unsigned char *pred_ptr,
-  short *diff_ptr,
-  unsigned char *dst_ptr,
-  int stride
-) {
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = diff_ptr[c] + pred_ptr[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
-    }
-
-    dst_ptr += stride;
-    diff_ptr += 8;
-    pred_ptr += 8;
-  }
-}
-void vp9_recon4b_c
-(
-  unsigned char *pred_ptr,
-  short *diff_ptr,
-  unsigned char *dst_ptr,
-  int stride
-) {
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 16; c++) {
-      int a = diff_ptr[c] + pred_ptr[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
-    }
-
-    dst_ptr += stride;
-    diff_ptr += 16;
-    pred_ptr += 16;
-  }
-}
-
-void vp9_recon2b_c
-(
-  unsigned char *pred_ptr,
-  short *diff_ptr,
-  unsigned char *dst_ptr,
-  int stride
-) {
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 8; c++) {
-      int a = diff_ptr[c] + pred_ptr[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
-    }
-
-    dst_ptr += stride;
-    diff_ptr += 8;
-    pred_ptr += 8;
-  }
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_recon_mby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
-  int x, y;
-  BLOCKD *b = &xd->block[0];
-  int stride = b->dst_stride;
-  short *diff = b->diff;
-
-  for (y = 0; y < 16; y++) {
-    for (x = 0; x < 16; x++) {
-      int a = dst[x] + diff[x];
-      if (a < 0)
-        a = 0;
-      else if (a > 255)
-        a = 255;
-      dst[x] = a;
-    }
-    dst += stride;
-    diff += 16;
-  }
-}
-
-void vp9_recon_mbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
-  int x, y, i;
-  uint8_t *dst = udst;
-
-  for (i = 0; i < 2; i++, dst = vdst) {
-    BLOCKD *b = &xd->block[16 + 4 * i];
-    int stride = b->dst_stride;
-    short *diff = b->diff;
-
-    for (y = 0; y < 8; y++) {
-      for (x = 0; x < 8; x++) {
-        int a = dst[x] + diff[x];
-        if (a < 0)
-          a = 0;
-        else if (a > 255)
-          a = 255;
-        dst[x] = a;
-      }
-      dst += stride;
-      diff += 8;
-    }
-  }
-}
-#endif
-
-void vp9_recon_mby_c(MACROBLOCKD *xd) {
-  int i;
-
-  for (i = 0; i < 16; i += 4) {
-    BLOCKD *b = &xd->block[i];
-
-    vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-  }
-}
-
-void vp9_recon_mb_c(MACROBLOCKD *xd) {
-  int i;
-
-  for (i = 0; i < 16; i += 4) {
-    BLOCKD *b = &xd->block[i];
-
-    vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-  }
-
-  for (i = 16; i < 24; i += 2) {
-    BLOCKD *b = &xd->block[i];
-
-    vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-  }
-}
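All the recon variants above are the same kernel at different block widths: add the inverse-transformed residual to the prediction and clip each sample to [0, 255]. A self-contained sketch with the size and strides as parameters (purely illustrative, not the codec's own API):

#include <stdint.h>

static uint8_t clip_byte(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

static void recon_block(const uint8_t *pred, const int16_t *diff,
                        uint8_t *dst, int width, int height,
                        int diff_stride, int dst_stride) {
  int r, c;
  for (r = 0; r < height; r++) {
    for (c = 0; c < width; c++)
      dst[c] = clip_byte(pred[c] + diff[c]);  /* prediction + residual */
    dst += dst_stride;
    diff += diff_stride;   /* predictor and diff share a row pitch */
    pred += diff_stride;
  }
}

int main(void) {
  uint8_t pred[16] = { 0 }, dst[16];
  int16_t diff[16] = { -5, 300, 128, 0 };  /* values that clip both ways */
  recon_block(pred, diff, dst, 4, 4, 4, 4);
  return (dst[0] == 0 && dst[1] == 255 && dst[2] == 128) ? 0 : 1;
}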
--- a/vp8/common/reconinter.c
+++ /dev/null
@@ -1,1145 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx/vpx_integer.h"
-#include "subpixel.h"
-#include "blockd.h"
-#include "reconinter.h"
-#if CONFIG_RUNTIME_CPU_DETECT
-#include "onyxc_int.h"
-#endif
-
-void vp9_setup_interp_filters(MACROBLOCKD *xd,
-                              INTERPOLATIONFILTERTYPE mcomp_filter_type,
-                              VP9_COMMON *cm) {
-  if (mcomp_filter_type == SIXTAP) {
-    xd->subpixel_predict        = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap4x4);
-    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap8x4);
-    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap8x8);
-    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap16x16);
-    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap_avg4x4);
-    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap_avg8x8);
-    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap_avg16x16);
-  } else if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {
-    xd->subpixel_predict        = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap4x4);
-    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap8x4);
-    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap8x8);
-    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap16x16);
-    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg4x4);
-    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg8x8);
-    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg16x16);
-  } else if (mcomp_filter_type == EIGHTTAP_SHARP) {
-    xd->subpixel_predict        = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap4x4_sharp);
-    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap8x4_sharp);
-    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap8x8_sharp);
-    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap16x16_sharp);
-    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg4x4_sharp);
-    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg8x8_sharp);
-    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg16x16_sharp);
-  } else {
-    xd->subpixel_predict        = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear4x4);
-    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear8x4);
-    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear8x8);
-    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear16x16);
-    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear_avg4x4);
-    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear_avg8x8);
-    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear_avg16x16);
-  }
-}
-
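vp9_setup_interp_filters() above is a straight function-pointer dispatch: one branch per INTERPOLATIONFILTERTYPE, each wiring up the seven subpel entry points. A hedged sketch of the same selection done as a table indexed by filter type; the enum, struct, and function type here are illustrative stand-ins for the real RTCD machinery:

typedef void (*demo_subpix_fn)(const unsigned char *src, int src_stride,
                               int xoffset, int yoffset,
                               unsigned char *dst, int dst_stride);

enum demo_filter_type { DEMO_SIXTAP, DEMO_EIGHTTAP, DEMO_EIGHTTAP_SHARP,
                        DEMO_BILINEAR, DEMO_FILTER_COUNT };

struct demo_subpix_set {
  demo_subpix_fn predict4x4;
  demo_subpix_fn predict8x8;
  demo_subpix_fn predict16x16;
};

/* One row per filter type; a real implementation would fill this with
 * the RTCD-selected kernels. */
static struct demo_subpix_set demo_filters[DEMO_FILTER_COUNT];

static void demo_setup_interp_filters(struct demo_subpix_set *xd_fns,
                                      enum demo_filter_type t) {
  *xd_fns = demo_filters[t];  /* one indexed copy replaces the if chain */
}

int main(void) {
  struct demo_subpix_set fns;
  demo_setup_interp_filters(&fns, DEMO_EIGHTTAP);
  return fns.predict8x8 == demo_filters[DEMO_EIGHTTAP].predict8x8 ? 0 : 1;
}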
-void vp9_copy_mem16x16_c(unsigned char *src,
-                         int src_stride,
-                         unsigned char *dst,
-                         int dst_stride) {
-  int r;
-
-  for (r = 0; r < 16; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
-    dst[0] = src[0];
-    dst[1] = src[1];
-    dst[2] = src[2];
-    dst[3] = src[3];
-    dst[4] = src[4];
-    dst[5] = src[5];
-    dst[6] = src[6];
-    dst[7] = src[7];
-    dst[8] = src[8];
-    dst[9] = src[9];
-    dst[10] = src[10];
-    dst[11] = src[11];
-    dst[12] = src[12];
-    dst[13] = src[13];
-    dst[14] = src[14];
-    dst[15] = src[15];
-
-#else
-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
-    ((uint32_t *)dst)[2] = ((uint32_t *)src)[2];
-    ((uint32_t *)dst)[3] = ((uint32_t *)src)[3];
-
-#endif
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_avg_mem16x16_c(unsigned char *src,
-                        int src_stride,
-                        unsigned char *dst,
-                        int dst_stride) {
-  int r;
-
-  for (r = 0; r < 16; r++) {
-    int n;
-
-    for (n = 0; n < 16; n++) {
-      dst[n] = (dst[n] + src[n] + 1) >> 1;
-    }
-
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_copy_mem8x8_c(unsigned char *src,
-                       int src_stride,
-                       unsigned char *dst,
-                       int dst_stride) {
-  int r;
-
-  for (r = 0; r < 8; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
-    dst[0] = src[0];
-    dst[1] = src[1];
-    dst[2] = src[2];
-    dst[3] = src[3];
-    dst[4] = src[4];
-    dst[5] = src[5];
-    dst[6] = src[6];
-    dst[7] = src[7];
-#else
-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
-#endif
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_avg_mem8x8_c(unsigned char *src,
-                      int src_stride,
-                      unsigned char *dst,
-                      int dst_stride) {
-  int r;
-
-  for (r = 0; r < 8; r++) {
-    int n;
-
-    for (n = 0; n < 8; n++) {
-      dst[n] = (dst[n] + src[n] + 1) >> 1;
-    }
-
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
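The *_avg_* helpers above blend with (dst + src + 1) >> 1, the round-to-nearest average (ties round up). A tiny check:

#include <assert.h>

static int avg_round(int a, int b) {
  return (a + b + 1) >> 1;   /* average rounded to nearest */
}

int main(void) {
  assert(avg_round(4, 5) == 5);      /* 4.5 rounds up */
  assert(avg_round(4, 4) == 4);
  assert(avg_round(0, 255) == 128);  /* 127.5 rounds up */
  return 0;
}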
-void vp9_copy_mem8x4_c(unsigned char *src,
-                       int src_stride,
-                       unsigned char *dst,
-                       int dst_stride) {
-  int r;
-
-  for (r = 0; r < 4; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
-    dst[0] = src[0];
-    dst[1] = src[1];
-    dst[2] = src[2];
-    dst[3] = src[3];
-    dst[4] = src[4];
-    dst[5] = src[5];
-    dst[6] = src[6];
-    dst[7] = src[7];
-#else
-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
-#endif
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
-  int r;
-  unsigned char *ptr_base;
-  unsigned char *ptr;
-  unsigned char *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_pre);
-  mv.as_int = d->bmi.as_mv.first.as_int;
-
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-          (mv.as_mv.col >> 3);
-    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
-         pred_ptr, pitch);
-  } else {
-    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-                (mv.as_mv.col >> 3);
-    ptr = ptr_base;
-
-    for (r = 0; r < 4; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
-      pred_ptr[0]  = ptr[0];
-      pred_ptr[1]  = ptr[1];
-      pred_ptr[2]  = ptr[2];
-      pred_ptr[3]  = ptr[3];
-#else
-      *(uint32_t *)pred_ptr = *(uint32_t *)ptr;
-#endif
-      pred_ptr     += pitch;
-      ptr         += d->pre_stride;
-    }
-  }
-}
-
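Motion vectors here are in eighth-pel units: mv >> 3 selects the whole-pixel source position and mv & 7 the subpel phase, which is then doubled on the way into the subpel predictor (suggesting the filter tables are indexed in sixteenth-pel steps). A sketch of the split; like the code above, it relies on arithmetic right shift for negative components:

#include <stdio.h>

static void split_mv(int mv_q3, int *whole, int *frac) {
  *whole = mv_q3 >> 3;   /* whole pixels (floors toward -inf) */
  *frac  = mv_q3 & 7;    /* 0..7 eighth-pel phase */
}

int main(void) {
  int w, f;
  split_mv(19, &w, &f);          /* 19/8 = 2 pixels + 3/8 */
  printf("%d + %d/8\n", w, f);   /* prints: 2 + 3/8 */
  split_mv(-19, &w, &f);
  printf("%d + %d/8\n", w, f);   /* prints: -3 + 5/8 (= -19/8) */
  return 0;
}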
-/*
- * Similar to vp9_build_inter_predictors_b(), but instead of storing the
- * results in d->predictor, we average the contents of d->predictor (which
- * come from an earlier call to vp9_build_inter_predictors_b()) with the
- * predictor of the second reference frame / motion vector.
- */
-void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
-                                      vp9_subpix_fn_t sppf) {
-  int r;
-  unsigned char *ptr_base;
-  unsigned char *ptr;
-  unsigned char *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_second_pre);
-  mv.as_int = d->bmi.as_mv.second.as_int;
-
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-          (mv.as_mv.col >> 3);
-    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
-         pred_ptr, pitch);
-  } else {
-    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-                (mv.as_mv.col >> 3);
-    ptr = ptr_base;
-
-    for (r = 0; r < 4; r++) {
-      pred_ptr[0]  = (pred_ptr[0] + ptr[0] + 1) >> 1;
-      pred_ptr[1]  = (pred_ptr[1] + ptr[1] + 1) >> 1;
-      pred_ptr[2]  = (pred_ptr[2] + ptr[2] + 1) >> 1;
-      pred_ptr[3]  = (pred_ptr[3] + ptr[3] + 1) >> 1;
-      pred_ptr    += pitch;
-      ptr         += d->pre_stride;
-    }
-  }
-}
-
-void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
-  unsigned char *ptr_base;
-  unsigned char *ptr;
-  unsigned char *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_pre);
-  mv.as_int = d->bmi.as_mv.first.as_int;
-  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-        (mv.as_mv.col >> 3);
-
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    xd->subpixel_predict8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
-                            (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
-  } else {
-    vp9_copy_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
-  }
-}
-
-/*
- * Similar to build_inter_predictors_4b(), but instead of storing the
- * results in d->predictor, we average the contents of d->predictor (which
- * come from an earlier call to build_inter_predictors_4b()) with the
- * predictor of the second reference frame / motion vector.
- */
-void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
-                                      BLOCKD *d, int pitch) {
-  unsigned char *ptr_base;
-  unsigned char *ptr;
-  unsigned char *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_second_pre);
-  mv.as_int = d->bmi.as_mv.second.as_int;
-  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-        (mv.as_mv.col >> 3);
-
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    xd->subpixel_predict_avg8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
-                               (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
-  } else {
-    vp9_avg_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
-  }
-}
-
-static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
-  unsigned char *ptr_base;
-  unsigned char *ptr;
-  unsigned char *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_pre);
-  mv.as_int = d->bmi.as_mv.first.as_int;
-  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-        (mv.as_mv.col >> 3);
-
-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
-    xd->subpixel_predict8x4(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
-                           (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
-  } else {
-    vp9_copy_mem8x4(ptr, d->pre_stride, pred_ptr, pitch);
-  }
-}
-
-
-/*encoder only*/
-#if CONFIG_PRED_FILTER
-
-// Select the thresholded or non-thresholded filter
-#define USE_THRESH_FILTER 0
-
-#define PRED_FILT_LEN 5
-
-static const int filt_shift = 4;
-static const int pred_filter[PRED_FILT_LEN] = {1, 2, 10, 2, 1};
-// Alternative filter {1, 1, 4, 1, 1}
-
-#if !USE_THRESH_FILTER
-void filter_mb(unsigned char *src, int src_stride,
-               unsigned char *dst, int dst_stride,
-               int width, int height) {
-  int i, j, k;
-  unsigned int Temp[32 * 32];
-  unsigned int  *pTmp = Temp;
-  unsigned char *pSrc = src - (1 + src_stride) * (PRED_FILT_LEN / 2);
-
-  // Horizontal
-  for (i = 0; i < height + PRED_FILT_LEN - 1; i++) {
-    for (j = 0; j < width; j++) {
-      int sum = 0;
-      for (k = 0; k < PRED_FILT_LEN; k++)
-        sum += pSrc[j + k] * pred_filter[k];
-      pTmp[j] = sum;
-    }
-
-    pSrc += src_stride;
-    pTmp += width;
-  }
-
-  // Vertical
-  pTmp = Temp;
-  for (i = 0; i < width; i++) {
-    unsigned char *pDst = dst + i;
-    for (j = 0; j < height; j++) {
-      int sum = 0;
-      for (k = 0; k < PRED_FILT_LEN; k++)
-        sum += pTmp[(j + k) * width] * pred_filter[k];
-      // Round
-      sum = (sum + ((1 << (filt_shift << 1)) >> 1)) >> (filt_shift << 1);
-      pDst[j * dst_stride] = (sum < 0 ? 0 : sum > 255 ? 255 : sum);
-    }
-    ++pTmp;
-  }
-}
-#else
-// Based on vp9_post_proc_down_and_across_c (postproc.c)
-void filter_mb(unsigned char *src, int src_stride,
-               unsigned char *dst, int dst_stride,
-               int width, int height) {
-  unsigned char *pSrc, *pDst;
-  int row;
-  int col;
-  int i;
-  int v;
-  unsigned char d[8];
-
-  /* TODO flimit should be linked to the quantizer value */
-  int flimit = 7;
-
-  for (row = 0; row < height; row++) {
-    /* post_proc_down for one row */
-    pSrc = src;
-    pDst = dst;
-
-    for (col = 0; col < width; col++) {
-      int kernel = (1 << (filt_shift - 1));
-      int v = pSrc[col];
-
-      for (i = -2; i <= 2; i++) {
-        if (abs(v - pSrc[col + i * src_stride]) > flimit)
-          goto down_skip_convolve;
-
-        kernel += pred_filter[2 + i] * pSrc[col + i * src_stride];
-      }
-
-      v = (kernel >> filt_shift);
-    down_skip_convolve:
-      pDst[col] = v;
-    }
-
-    /* now post_proc_across */
-    pSrc = dst;
-    pDst = dst;
-
-    for (i = 0; i < 8; i++)
-      d[i] = pSrc[i];
-
-    for (col = 0; col < width; col++) {
-      int kernel = (1 << (filt_shift - 1));
-      v = pSrc[col];
-
-      d[col & 7] = v;
-
-      for (i = -2; i <= 2; i++) {
-        if (abs(v - pSrc[col + i]) > flimit)
-          goto across_skip_convolve;
-
-        kernel += pred_filter[2 + i] * pSrc[col + i];
-      }
-
-      d[col & 7] = (kernel >> filt_shift);
-    across_skip_convolve:
-
-      if (col >= 2)
-        pDst[col - 2] = d[(col - 2) & 7];
-    }
-
-    /* handle the last two pixels */
-    pDst[col - 2] = d[(col - 2) & 7];
-    pDst[col - 1] = d[(col - 1) & 7];
-
-    /* next row */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-#endif  // !USE_THRESH_FILTER
-
-#endif  // CONFIG_PRED_FILTER
-
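filter_mb() above is a separable 5-tap filter: a horizontal pass into a temporary buffer, a vertical pass out of it, and a single rounded shift by 2 * filt_shift at the end. That works because the taps {1, 2, 10, 2, 1} sum to 16 = 1 << filt_shift, so the two passes together scale every sample by exactly 256. A small check of that normalization:

#include <assert.h>

static const int taps[5] = { 1, 2, 10, 2, 1 };

int main(void) {
  int sum = 0, i;
  for (i = 0; i < 5; i++)
    sum += taps[i];
  assert(sum == 16);                      /* one pass: gain of 16    */
  assert(sum * sum == 256);               /* two passes: gain of 256 */
  /* a constant-128 input survives the combined rounded shift exactly: */
  assert(((128 * 256) + 128) >> 8 == 128);
  return 0;
}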
-/*encoder only*/
-void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
-  int i, j;
-  BLOCKD *blockd = xd->block;
-
-  /* build uv mvs */
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      int yoffset = i * 8 + j * 2;
-      int uoffset = 16 + i * 2 + j;
-      int voffset = 20 + i * 2 + j;
-      int temp;
-
-      temp = blockd[yoffset  ].bmi.as_mv.first.as_mv.row
-             + blockd[yoffset + 1].bmi.as_mv.first.as_mv.row
-             + blockd[yoffset + 4].bmi.as_mv.first.as_mv.row
-             + blockd[yoffset + 5].bmi.as_mv.first.as_mv.row;
-
-      if (temp < 0) temp -= 4;
-      else temp += 4;
-
-      xd->block[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &
-        xd->fullpixel_mask;
-
-      temp = blockd[yoffset  ].bmi.as_mv.first.as_mv.col
-             + blockd[yoffset + 1].bmi.as_mv.first.as_mv.col
-             + blockd[yoffset + 4].bmi.as_mv.first.as_mv.col
-             + blockd[yoffset + 5].bmi.as_mv.first.as_mv.col;
-
-      if (temp < 0) temp -= 4;
-      else temp += 4;
-
-      blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &
-        xd->fullpixel_mask;
-
-      blockd[voffset].bmi.as_mv.first.as_mv.row =
-        blockd[uoffset].bmi.as_mv.first.as_mv.row;
-      blockd[voffset].bmi.as_mv.first.as_mv.col =
-        blockd[uoffset].bmi.as_mv.first.as_mv.col;
-
-      if (xd->mode_info_context->mbmi.second_ref_frame) {
-        temp = blockd[yoffset  ].bmi.as_mv.second.as_mv.row
-               + blockd[yoffset + 1].bmi.as_mv.second.as_mv.row
-               + blockd[yoffset + 4].bmi.as_mv.second.as_mv.row
-               + blockd[yoffset + 5].bmi.as_mv.second.as_mv.row;
-
-        if (temp < 0) {
-          temp -= 4;
-        } else {
-          temp += 4;
-        }
-
-        blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &
-          xd->fullpixel_mask;
-
-        temp = blockd[yoffset  ].bmi.as_mv.second.as_mv.col
-               + blockd[yoffset + 1].bmi.as_mv.second.as_mv.col
-               + blockd[yoffset + 4].bmi.as_mv.second.as_mv.col
-               + blockd[yoffset + 5].bmi.as_mv.second.as_mv.col;
-
-        if (temp < 0) {
-          temp -= 4;
-        } else {
-          temp += 4;
-        }
-
-        blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &
-          xd->fullpixel_mask;
-
-        blockd[voffset].bmi.as_mv.second.as_mv.row =
-          blockd[uoffset].bmi.as_mv.second.as_mv.row;
-        blockd[voffset].bmi.as_mv.second.as_mv.col =
-          blockd[uoffset].bmi.as_mv.second.as_mv.col;
-      }
-    }
-  }
-
-  for (i = 16; i < 24; i += 2) {
-    BLOCKD *d0 = &blockd[i];
-    BLOCKD *d1 = &blockd[i + 1];
-
-    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
-      build_inter_predictors2b(xd, d0, 8);
-    else {
-      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict);
-      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict);
-    }
-
-    if (xd->mode_info_context->mbmi.second_ref_frame) {
-      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg);
-      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg);
-    }
-  }
-}
-
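The chroma MVs above are derived, not coded: the four luma MVs covering each 8x8 chroma block are summed and divided by 8 (an average of four, halved again for 4:2:0 subsampling), with a +/-4 bias so the truncating division rounds away from zero; the result is then masked by fullpixel_mask. A small sketch of just the rounding arithmetic, with the masking step omitted:

#include <stdio.h>

static int chroma_mv_component(int y0, int y1, int y2, int y3) {
  int temp = y0 + y1 + y2 + y3;
  if (temp < 0)
    temp -= 4;   /* bias so the truncating /8 rounds away from zero */
  else
    temp += 4;
  return temp / 8;
}

int main(void) {
  printf("%d\n", chroma_mv_component(3, 3, 3, 3));      /* 16/8  -> 2  */
  printf("%d\n", chroma_mv_component(-3, -3, -3, -3));  /* -16/8 -> -2 */
  return 0;
}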
-static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
-  /* If the MV points so far into the UMV border that no visible pixels
-   * are used for reconstruction, the subpel part of the MV can be
-   * discarded and the MV limited to 16 pixels with equivalent results.
-   *
-   * This limit kicks in at 19 pixels for the top and left edges, for
-   * the 16 pixels plus 3 taps right of the central pixel when subpel
-   * filtering. The bottom and right edges use 16 pixels plus 2 pixels
-   * left of the central pixel when filtering.
-   */
-  if (mv->col < (xd->mb_to_left_edge - ((16 + INTERP_EXTEND) << 3)))
-    mv->col = xd->mb_to_left_edge - (16 << 3);
-  else if (mv->col > xd->mb_to_right_edge + ((15 + INTERP_EXTEND) << 3))
-    mv->col = xd->mb_to_right_edge + (16 << 3);
-
-  if (mv->row < (xd->mb_to_top_edge - ((16 + INTERP_EXTEND) << 3)))
-    mv->row = xd->mb_to_top_edge - (16 << 3);
-  else if (mv->row > xd->mb_to_bottom_edge + ((15 + INTERP_EXTEND) << 3))
-    mv->row = xd->mb_to_bottom_edge + (16 << 3);
-}
-
-/* A version of the above function for chroma block MVs.*/
-static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
-  mv->col = (2 * mv->col < (xd->mb_to_left_edge - ((16 + INTERP_EXTEND) << 3))) ?
-            (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col;
-  mv->col = (2 * mv->col > xd->mb_to_right_edge + ((15 + INTERP_EXTEND) << 3)) ?
-            (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col;
-
-  mv->row = (2 * mv->row < (xd->mb_to_top_edge - ((16 + INTERP_EXTEND) << 3))) ?
-            (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row;
-  mv->row = (2 * mv->row > xd->mb_to_bottom_edge + ((15 + INTERP_EXTEND) << 3)) ?
-            (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row;
-}
-
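clamp_mv_to_umv_border() and its chroma twin above work entirely in eighth-pel units, hence the << 3 on pixel distances: an MV pointing more than 16 + INTERP_EXTEND pixels into the extended border is snapped back to exactly 16 pixels, which the comment notes is visually equivalent. A one-edge, one-component sketch; DEMO_INTERP_EXTEND is a stand-in for the real constant:

#include <stdio.h>

#define DEMO_INTERP_EXTEND 4

static int demo_clamp_left(int mv_col, int mb_to_left_edge) {
  /* past the point where no visible pixel is referenced? */
  if (mv_col < mb_to_left_edge - ((16 + DEMO_INTERP_EXTEND) << 3))
    mv_col = mb_to_left_edge - (16 << 3);  /* snap to a 16-px overhang */
  return mv_col;
}

int main(void) {
  /* an MV reaching 30 px past a left edge at 0 snaps back to 16 px: */
  printf("%d\n", demo_clamp_left(-30 * 8, 0));  /* prints -128 (-16 px) */
  return 0;
}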
-/*encoder only*/
-void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                             unsigned char *dst_y,
-                                             int dst_ystride,
-                                             int clamp_mvs) {
-  unsigned char *ptr_base = xd->pre.y_buffer;
-  unsigned char *ptr;
-  int pre_stride = xd->block[0].pre_stride;
-  int_mv ymv;
-
-  ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
-
-  if (clamp_mvs)
-    clamp_mv_to_umv_border(&ymv.as_mv, xd);
-
-  ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3);
-
-#if CONFIG_PRED_FILTER
-  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
-    if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
-      // Sub-pel filter needs extended input
-      int len = 15 + (INTERP_EXTEND << 1);
-      unsigned char Temp[32 * 32]; // Data required by sub-pel filter
-      unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
-
-      // Copy extended MB into Temp array, applying the spatial filter
-      filter_mb(ptr - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
-                Temp, len, len, len);
-
-      // Sub-pel interpolation
-      xd->subpixel_predict16x16(pTemp, len,
-                                (ymv.as_mv.col & 7) << 1,
-                                (ymv.as_mv.row & 7) << 1,
-                                dst_y, dst_ystride);
-    } else {
-      // Apply spatial filter to create the prediction directly
-      filter_mb(ptr, pre_stride, dst_y, dst_ystride, 16, 16);
-    }
-  } else
-#endif
-    if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
-      xd->subpixel_predict16x16(ptr, pre_stride,
-                                (ymv.as_mv.col & 7) << 1,
-                                (ymv.as_mv.row & 7) << 1,
-                                dst_y, dst_ystride);
-    } else {
-      vp9_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
-    }
-}
-
-void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                              unsigned char *dst_u,
-                                              unsigned char *dst_v,
-                                              int dst_uvstride) {
-  int offset;
-  unsigned char *uptr, *vptr;
-  int pre_stride = xd->block[0].pre_stride;
-  int_mv _o16x16mv;
-  int_mv _16x16mv;
-
-  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
-
-  if (xd->mode_info_context->mbmi.need_to_clamp_mvs)
-    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
-  _o16x16mv = _16x16mv;
-  /* calc uv motion vectors */
-  if (_16x16mv.as_mv.row < 0)
-    _16x16mv.as_mv.row -= 1;
-  else
-    _16x16mv.as_mv.row += 1;
-
-  if (_16x16mv.as_mv.col < 0)
-    _16x16mv.as_mv.col -= 1;
-  else
-    _16x16mv.as_mv.col += 1;
-
-  _16x16mv.as_mv.row /= 2;
-  _16x16mv.as_mv.col /= 2;
-
-  _16x16mv.as_mv.row &= xd->fullpixel_mask;
-  _16x16mv.as_mv.col &= xd->fullpixel_mask;
-
-  pre_stride >>= 1;
-  offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
-  uptr = xd->pre.u_buffer + offset;
-  vptr = xd->pre.v_buffer + offset;
-
-#if CONFIG_PRED_FILTER
-  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
-    int i;
-    unsigned char *pSrc = uptr;
-    unsigned char *pDst = dst_u;
-    int len = 7 + (INTERP_EXTEND << 1);
-    unsigned char Temp[32 * 32]; // Data required by the sub-pel filter
-    unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
-
-    // U & V
-    for (i = 0; i < 2; i++) {
-      if (_o16x16mv.as_int & 0x000f000f) {
-        // Copy extended MB into Temp array, applying the spatial filter
-        filter_mb(pSrc - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
-                  Temp, len, len, len);
-
-        // Sub-pel filter
-        xd->subpixel_predict8x8(pTemp, len,
-                                _o16x16mv.as_mv.col & 15,
-                                _o16x16mv.as_mv.row & 15,
-                                pDst, dst_uvstride);
-      } else {
-        filter_mb(pSrc, pre_stride, pDst, dst_uvstride, 8, 8);
-      }
-
-      // V
-      pSrc = vptr;
-      pDst = dst_v;
-    }
-  } else
-#endif
-    if (_o16x16mv.as_int & 0x000f000f) {
-      xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15,
-                              _o16x16mv.as_mv.row & 15, dst_u, dst_uvstride);
-      xd->subpixel_predict8x8(vptr, pre_stride, _o16x16mv.as_mv.col & 15,
-                              _o16x16mv.as_mv.row & 15, dst_v, dst_uvstride);
-    } else {
-      vp9_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
-      vp9_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
-    }
-}
-
-
-void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                            unsigned char *dst_y,
-                                            unsigned char *dst_u,
-                                            unsigned char *dst_v,
-                                            int dst_ystride, int dst_uvstride) {
-  vp9_build_1st_inter16x16_predictors_mby(xd, dst_y, dst_ystride,
-      xd->mode_info_context->mbmi.need_to_clamp_mvs);
-  vp9_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
-                                        unsigned char *dst_y,
-                                        unsigned char *dst_u,
-                                        unsigned char *dst_v,
-                                        int dst_ystride,
-                                        int dst_uvstride) {
-  uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
-  uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,
-          *v2 = x->second_pre.v_buffer;
-  int n;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->pre.y_buffer = y1 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;
-    x->pre.u_buffer = u1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
-    x->pre.v_buffer = v1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
-
-    vp9_build_1st_inter16x16_predictors_mb(x,
-      dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
-      dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
-      dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
-      dst_ystride, dst_uvstride);
-    if (x->mode_info_context->mbmi.second_ref_frame) {
-      x->second_pre.y_buffer = y2 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;
-      x->second_pre.u_buffer = u2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
-      x->second_pre.v_buffer = v2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
-
-      vp9_build_2nd_inter16x16_predictors_mb(x,
-        dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
-        dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_ystride, dst_uvstride);
-    }
-  }
-
-  x->pre.y_buffer = y1;
-  x->pre.u_buffer = u1;
-  x->pre.v_buffer = v1;
-
-  if (x->mode_info_context->mbmi.second_ref_frame) {
-    x->second_pre.y_buffer = y2;
-    x->second_pre.u_buffer = u2;
-    x->second_pre.v_buffer = v2;
-  }
-}
-#endif
-
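The superblock builder above tiles a 32x32 prediction as four 16x16 calls, using bit 0 of the loop counter as the column index and bit 1 as the row index before restoring the saved base pointers. The indexing in isolation:

#include <stdio.h>

int main(void) {
  int n;
  for (n = 0; n < 4; n++) {
    const int x_idx = n & 1, y_idx = n >> 1;  /* walk the 2x2 quadrants */
    printf("quadrant %d: x offset %2d, y offset %2d\n",
           n, x_idx * 16, y_idx * 16);
  }
  return 0;
}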
-/*
- * The following functions should be called after an initial
- * call to vp9_build_1st_inter16x16_predictors_mb() or _mby()/_mbuv().
- * They run a second subpel filter on a (different) ref
- * frame and average the result with the output of the
- * first filter. The second reference frame is stored
- * in x->second_pre (the reference frame index is in
- * x->mode_info_context->mbmi.second_ref_frame). The second
- * motion vector is x->mode_info_context->mbmi.second_mv.
- *
- * This allows blending prediction from two reference frames,
- * which sometimes gives better prediction than a
- * single reference frame.
- */
-void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                             unsigned char *dst_y,
-                                             int dst_ystride) {
-  unsigned char *ptr;
-
-  int_mv _16x16mv;
-  int mv_row;
-  int mv_col;
-
-  unsigned char *ptr_base = xd->second_pre.y_buffer;
-  int pre_stride = xd->block[0].pre_stride;
-
-  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
-
-  if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)
-    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
-  mv_row = _16x16mv.as_mv.row;
-  mv_col = _16x16mv.as_mv.col;
-
-  ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
-
-#if CONFIG_PRED_FILTER
-  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
-    if ((mv_row | mv_col) & 7) {
-      // Sub-pel filter needs extended input
-      int len = 15 + (INTERP_EXTEND << 1);
-      unsigned char Temp[32 * 32]; // Data required by sub-pel filter
-      unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
-
-      // Copy extended MB into Temp array, applying the spatial filter
-      filter_mb(ptr - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
-                Temp, len, len, len);
-
-      // Sub-pel filter
-      xd->subpixel_predict_avg16x16(pTemp, len, (mv_col & 7) << 1,
-                                    (mv_row & 7) << 1, dst_y, dst_ystride);
-    } else {
-      // TODO Needs to AVERAGE with the dst_y
-      // For now, do not apply the prediction filter in these cases!
-      vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
-    }
-  } else
-#endif  // CONFIG_PRED_FILTER
-  {
-    if ((mv_row | mv_col) & 7) {
-      xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1,
-                                    (mv_row & 7) << 1, dst_y, dst_ystride);
-    } else {
-      vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
-    }
-  }
-}
-
-void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                              unsigned char *dst_u,
-                                              unsigned char *dst_v,
-                                              int dst_uvstride) {
-  int offset;
-  unsigned char *uptr, *vptr;
-
-  int_mv _16x16mv;
-  int mv_row;
-  int mv_col;
-  int omv_row, omv_col;
-
-  int pre_stride = xd->block[0].pre_stride;
-
-  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
-
-  if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)
-    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
-  mv_row = _16x16mv.as_mv.row;
-  mv_col = _16x16mv.as_mv.col;
-
-  /* calc uv motion vectors */
-  omv_row = mv_row;
-  omv_col = mv_col;
-  mv_row = (mv_row + (mv_row > 0)) >> 1;
-  mv_col = (mv_col + (mv_col > 0)) >> 1;
-
-  mv_row &= xd->fullpixel_mask;
-  mv_col &= xd->fullpixel_mask;
-
-  pre_stride >>= 1;
-  offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
-  uptr = xd->second_pre.u_buffer + offset;
-  vptr = xd->second_pre.v_buffer + offset;
-
-#if CONFIG_PRED_FILTER
-  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
-    int i;
-    int len = 7 + (INTERP_EXTEND << 1);
-    unsigned char Temp[32 * 32]; // Data required by sub-pel filter
-    unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
-    unsigned char *pSrc = uptr;
-    unsigned char *pDst = dst_u;
-
-    // U & V
-    for (i = 0; i < 2; i++) {
-      if ((omv_row | omv_col) & 15) {
-        // Copy extended MB into Temp array, applying the spatial filter
-        filter_mb(pSrc - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
-                  Temp, len, len, len);
-
-        // Sub-pel filter
-        xd->subpixel_predict_avg8x8(pTemp, len, omv_col & 15,
-                                    omv_row & 15, pDst, dst_uvstride);
-      } else {
-        // TODO Needs to AVERAGE with the dst_[u|v]
-        // For now, do not apply the prediction filter here!
-        vp9_avg_mem8x8(pSrc, pre_stride, pDst, dst_uvstride);
-      }
-
-      // V
-      pSrc = vptr;
-      pDst = dst_v;
-    }
-  } else
-#endif  // CONFIG_PRED_FILTER
-    if ((omv_row | omv_col) & 15) {
-      xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15,
-                                  omv_row & 15, dst_u, dst_uvstride);
-      xd->subpixel_predict_avg8x8(vptr, pre_stride, omv_col & 15,
-                                  omv_row & 15, dst_v, dst_uvstride);
-    } else {
-      vp9_avg_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
-      vp9_avg_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
-    }
-}
-
-void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                            unsigned char *dst_y,
-                                            unsigned char *dst_u,
-                                            unsigned char *dst_v,
-                                            int dst_ystride,
-                                            int dst_uvstride) {
-  vp9_build_2nd_inter16x16_predictors_mby(xd, dst_y, dst_ystride);
-  vp9_build_2nd_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
-}
-
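Worth noting: the first-predictor UV path halves the luma MV by adding +/-1 and dividing by 2, while the second-predictor path above writes it as (mv + (mv > 0)) >> 1. A small sketch verifying the two idioms are the same round-away-from-zero halving, assuming arithmetic right shift on negative values as the code itself does:

#include <assert.h>

static int halve_v1(int mv) {   /* +/-1 bias, then truncating /2 */
  if (mv < 0) mv -= 1; else mv += 1;
  return mv / 2;
}

static int halve_v2(int mv) {   /* bias by sign bit, then >> 1 */
  return (mv + (mv > 0)) >> 1;
}

int main(void) {
  int mv;
  for (mv = -9; mv <= 9; mv++)
    assert(halve_v1(mv) == halve_v2(mv));
  return 0;
}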
-static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
-  int i;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-  BLOCKD *blockd = xd->block;
-
-  if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
-    blockd[ 0].bmi = xd->mode_info_context->bmi[ 0];
-    blockd[ 2].bmi = xd->mode_info_context->bmi[ 2];
-    blockd[ 8].bmi = xd->mode_info_context->bmi[ 8];
-    blockd[10].bmi = xd->mode_info_context->bmi[10];
-
-    if (mbmi->need_to_clamp_mvs) {
-      clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.first.as_mv, xd);
-      clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.first.as_mv, xd);
-      clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.first.as_mv, xd);
-      clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.first.as_mv, xd);
-      if (mbmi->second_ref_frame) {
-        clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.second.as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.second.as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.second.as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.second.as_mv, xd);
-      }
-    }
-
-
-    vp9_build_inter_predictors4b(xd, &blockd[ 0], 16);
-    vp9_build_inter_predictors4b(xd, &blockd[ 2], 16);
-    vp9_build_inter_predictors4b(xd, &blockd[ 8], 16);
-    vp9_build_inter_predictors4b(xd, &blockd[10], 16);
-
-    if (mbmi->second_ref_frame) {
-      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 0], 16);
-      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 2], 16);
-      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 8], 16);
-      vp9_build_2nd_inter_predictors4b(xd, &blockd[10], 16);
-    }
-  } else {
-    for (i = 0; i < 16; i += 2) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 1];
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
-
-      if (mbmi->need_to_clamp_mvs) {
-        clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.first.as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.first.as_mv, xd);
-        if (mbmi->second_ref_frame) {
-          clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.second.as_mv, xd);
-          clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.second.as_mv, xd);
-        }
-      }
-
-      if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
-        build_inter_predictors2b(xd, d0, 16);
-      else {
-        vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict);
-        vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict);
-      }
-
-      if (mbmi->second_ref_frame) {
-        vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg);
-        vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg);
-      }
-    }
-  }
-
-  for (i = 16; i < 24; i += 2) {
-    BLOCKD *d0 = &blockd[i];
-    BLOCKD *d1 = &blockd[i + 1];
-
-    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
-      build_inter_predictors2b(xd, d0, 8);
-    else {
-      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict);
-      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict);
-    }
-
-    if (mbmi->second_ref_frame) {
-      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg);
-      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg);
-    }
-  }
-}
-
-static void build_4x4uvmvs(MACROBLOCKD *xd) {
-  int i, j;
-  BLOCKD *blockd = xd->block;
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      int yoffset = i * 8 + j * 2;
-      int uoffset = 16 + i * 2 + j;
-      int voffset = 20 + i * 2 + j;
-
-      int temp;
-
-      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.row
-             + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.row
-             + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.row
-             + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.row;
-
-      if (temp < 0) temp -= 4;
-      else temp += 4;
-
-      blockd[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &
-                                                  xd->fullpixel_mask;
-
-      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.col
-             + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.col
-             + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.col
-             + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.col;
-
-      if (temp < 0) temp -= 4;
-      else temp += 4;
-
-      blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &
-        xd->fullpixel_mask;
-
-      // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
-      clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd);
-
-      blockd[voffset].bmi.as_mv.first.as_mv.row =
-        blockd[uoffset].bmi.as_mv.first.as_mv.row;
-      blockd[voffset].bmi.as_mv.first.as_mv.col =
-        blockd[uoffset].bmi.as_mv.first.as_mv.col;
-
-      if (xd->mode_info_context->mbmi.second_ref_frame) {
-        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.row
-               + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.row
-               + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.row
-               + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.row;
-
-        if (temp < 0) {
-          temp -= 4;
-        } else {
-          temp += 4;
-        }
-
-        blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &
-                                                     xd->fullpixel_mask;
-
-        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.col
-               + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.col
-               + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.col
-               + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.col;
-
-        if (temp < 0) {
-          temp -= 4;
-        } else {
-          temp += 4;
-        }
-
-        blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &
-                                                     xd->fullpixel_mask;
-
-        // if (mbmi->need_to_clamp_mvs)
-        clamp_uvmv_to_umv_border(
-          &blockd[uoffset].bmi.as_mv.second.as_mv, xd);
-
-        blockd[voffset].bmi.as_mv.second.as_mv.row =
-          blockd[uoffset].bmi.as_mv.second.as_mv.row;
-        blockd[voffset].bmi.as_mv.second.as_mv.col =
-          blockd[uoffset].bmi.as_mv.second.as_mv.col;
-      }
-    }
-  }
-}
-
-void vp9_build_inter_predictors_mb(MACROBLOCKD *xd) {
-  if (xd->mode_info_context->mbmi.mode != SPLITMV) {
-    vp9_build_1st_inter16x16_predictors_mb(xd, xd->predictor,
-                                           &xd->predictor[256],
-                                           &xd->predictor[320], 16, 8);
-
-    if (xd->mode_info_context->mbmi.second_ref_frame) {
-      /* 256 = offset of U plane in Y+U+V buffer;
-       * 320 = offset of V plane in Y+U+V buffer.
-       * (256=16x16, 320=16x16+8x8). */
-      vp9_build_2nd_inter16x16_predictors_mb(xd, xd->predictor,
-                                             &xd->predictor[256],
-                                             &xd->predictor[320], 16, 8);
-    }
-  } else {
-    build_4x4uvmvs(xd);
-    build_inter4x4_predictors_mb(xd);
-  }
-}
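The 256/320 offsets explained in the comment above come straight from the packed predictor layout: a 16x16 Y plane followed by two 8x8 chroma planes in one contiguous buffer. A trivial check of that arithmetic:

#include <assert.h>

int main(void) {
  assert(16 * 16 == 256);           /* U starts after the Y plane  */
  assert(16 * 16 + 8 * 8 == 320);   /* V starts after Y + U        */
  assert(320 + 8 * 8 == 384);       /* total packed predictor size */
  return 0;
}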
--- a/vp8/common/reconinter.h
+++ /dev/null
@@ -1,78 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_RECONINTER_H
-#define __INC_RECONINTER_H
-
-#include "onyxc_int.h"
-
-extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                                    unsigned char *dst_y,
-                                                    int dst_ystride,
-                                                    int clamp_mvs);
-
-extern void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                     unsigned char *dst_u,
-                                                     unsigned char *dst_v,
-                                                     int dst_uvstride);
-
-extern void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                                   unsigned char *dst_y,
-                                                   unsigned char *dst_u,
-                                                   unsigned char *dst_v,
-                                                   int dst_ystride,
-                                                   int dst_uvstride);
-
-extern void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                                    unsigned char *dst_y,
-                                                    int dst_ystride);
-
-extern void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                     unsigned char *dst_u,
-                                                     unsigned char *dst_v,
-                                                     int dst_uvstride);
-
-extern void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                                   unsigned char *dst_y,
-                                                   unsigned char *dst_u,
-                                                   unsigned char *dst_v,
-                                                   int dst_ystride,
-                                                   int dst_uvstride);
-
-#if CONFIG_SUPERBLOCKS
-extern void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
-                                               unsigned char *dst_y,
-                                               unsigned char *dst_u,
-                                               unsigned char *dst_v,
-                                               int dst_ystride,
-                                               int dst_uvstride);
-#endif
-
-extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);
-
-extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
-                                         vp9_subpix_fn_t sppf);
-
-extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
-                                             vp9_subpix_fn_t sppf);
-
-extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d,
-                                         int pitch);
-
-extern void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
-                                             BLOCKD *d, int pitch);
-
-extern void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd);
-
-extern void vp9_setup_interp_filters(MACROBLOCKD *xd,
-                                     INTERPOLATIONFILTERTYPE filter,
-                                     VP9_COMMON *cm);
-
-#endif  // __INC_RECONINTER_H
--- a/vp8/common/reconintra.c
+++ /dev/null
@@ -1,490 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdio.h>
-#include "vpx_ports/config.h"
-#include "vpx_rtcd.h"
-#include "reconintra.h"
-#include "vpx_mem/vpx_mem.h"
-
-/* For skip_recon_mb(), add vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd)
- * and vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd).
- */
-
-static void d27_predictor(uint8_t *ypred_ptr, int y_stride, int n,
-                          uint8_t *yabove_row, uint8_t *yleft_col) {
-  int r, c, h, w, v;
-  int a, b;
-  r = 0;
-  for (c = 0; c < n - 2; c++) {
-    if (c & 1)
-      a = yleft_col[r + 1];
-    else
-      a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
-    b = yabove_row[c + 2];
-    ypred_ptr[c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
-  }
-  for (r = 1; r < n / 2 - 1; r++) {
-    for (c = 0; c < n - 2 - 2 * r; c++) {
-      if (c & 1)
-        a = yleft_col[r + 1];
-      else
-        a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
-      b = ypred_ptr[(r - 1) * y_stride + c + 2];
-      ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
-    }
-  }
-  for (; r < n - 1; ++r) {
-    for (c = 0; c < n; c++) {
-      v = (c & 1 ? yleft_col[r + 1] : (yleft_col[r] + yleft_col[r + 1] + 1) >> 1);
-      h = r - c / 2;
-      ypred_ptr[h * y_stride + c] = v;
-    }
-  }
-  c = 0;
-  r = n - 1;
-  ypred_ptr[r * y_stride] = (ypred_ptr[(r - 1) * y_stride] +
-                             yleft_col[r] + 1) >> 1;
-  for (r = n - 2; r >= n / 2; --r) {
-    w = c + (n - 1 - r) * 2;
-    ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +
-                                   ypred_ptr[r * y_stride + w - 1] + 1) >> 1;
-  }
-  for (c = 1; c < n; c++) {
-    for (r = n - 1; r >= n / 2 + c / 2; --r) {
-      w = c + (n - 1 - r) * 2;
-      ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +
-                                     ypred_ptr[r * y_stride + w - 1] + 1) >> 1;
-    }
-  }
-}
-
-static void d63_predictor(uint8_t *ypred_ptr, int y_stride, int n,
-                          uint8_t *yabove_row, uint8_t *yleft_col) {
-  int r, c, h, w, v;
-  int a, b;
-  c = 0;
-  for (r = 0; r < n - 2; r++) {
-    if (r & 1)
-      a = yabove_row[c + 1];
-    else
-      a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
-    b = yleft_col[r + 2];
-    ypred_ptr[r * y_stride] = (2 * a + (r + 1) * b + (r + 3) / 2) / (r + 3);
-  }
-  for (c = 1; c < n / 2 - 1; c++) {
-    for (r = 0; r < n - 2 - 2 * c; r++) {
-      if (r & 1)
-        a = yabove_row[c + 1];
-      else
-        a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
-      b = ypred_ptr[(r + 2) * y_stride + c - 1];
-      ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
-    }
-  }
-  for (; c < n - 1; ++c) {
-    for (r = 0; r < n; r++) {
-      v = (r & 1 ? yabove_row[c + 1] : (yabove_row[c] + yabove_row[c + 1] + 1) >> 1);
-      w = c - r / 2;
-      ypred_ptr[r * y_stride + w] = v;
-    }
-  }
-  r = 0;
-  c = n - 1;
-  ypred_ptr[c] = (ypred_ptr[(c - 1)] + yabove_row[c] + 1) >> 1;
-  for (c = n - 2; c >= n / 2; --c) {
-    h = r + (n - 1 - c) * 2;
-    ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] +
-                                   ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1;
-  }
-  for (r = 1; r < n; r++) {
-    for (c = n - 1; c >= n / 2 + r / 2; --c) {
-      h = r + (n - 1 - c) * 2;
-      ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] +
-                                     ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1;
-    }
-  }
-}
-
-static void d45_predictor(uint8_t *ypred_ptr, int y_stride, int n,
-                          uint8_t *yabove_row, uint8_t *yleft_col) {
-  int r, c;
-  for (r = 0; r < n - 1; ++r) {
-    for (c = 0; c <= r; ++c) {
-      ypred_ptr[(r - c) * y_stride + c] =
-        (yabove_row[r + 1] * (c + 1) +
-         yleft_col[r + 1] * (r - c + 1) + r / 2 + 1) / (r + 2);
-    }
-  }
-  for (c = 0; c <= r; ++c) {
-    int yabove_ext = yabove_row[r]; // 2*yabove_row[r] - yabove_row[r-1];
-    int yleft_ext = yleft_col[r]; // 2*yleft_col[r] - yleft_col[r-1];
-    yabove_ext = (yabove_ext > 255 ? 255 : (yabove_ext < 0 ? 0 : yabove_ext));
-    yleft_ext = (yleft_ext > 255 ? 255 : (yleft_ext < 0 ? 0 : yleft_ext));
-    ypred_ptr[(r - c) * y_stride + c] =
-      (yabove_ext * (c + 1) +
-       yleft_ext * (r - c + 1) + r / 2 + 1) / (r + 2);
-  }
-  for (r = 1; r < n; ++r) {
-    for (c = n - r; c < n; ++c)
-      ypred_ptr[r * y_stride + c] = (ypred_ptr[(r - 1) * y_stride + c] +
-                                     ypred_ptr[r * y_stride + c - 1] + 1) >> 1;
-  }
-}
-
-static void d117_predictor(uint8_t *ypred_ptr, int y_stride, int n,
-                           uint8_t *yabove_row, uint8_t *yleft_col) {
-  int r, c;
-  for (c = 0; c < n; c++)
-    ypred_ptr[c] = (yabove_row[c - 1] + yabove_row[c] + 1) >> 1;
-  ypred_ptr += y_stride;
-  for (c = 0; c < n; c++)
-    ypred_ptr[c] = yabove_row[c - 1];
-  ypred_ptr += y_stride;
-  for (r = 2; r < n; ++r) {
-    ypred_ptr[0] = yleft_col[r - 2];
-    for (c = 1; c < n; c++)
-      ypred_ptr[c] = ypred_ptr[-2 * y_stride + c - 1];
-    ypred_ptr += y_stride;
-  }
-}
-
-static void d135_predictor(uint8_t *ypred_ptr, int y_stride, int n,
-                           uint8_t *yabove_row, uint8_t *yleft_col) {
-  int r, c;
-  ypred_ptr[0] = yabove_row[-1];
-  for (c = 1; c < n; c++)
-    ypred_ptr[c] = yabove_row[c - 1];
-  for (r = 1; r < n; ++r)
-    ypred_ptr[r * y_stride] = yleft_col[r - 1];
-
-  ypred_ptr += y_stride;
-  for (r = 1; r < n; ++r) {
-    for (c = 1; c < n; c++) {
-      ypred_ptr[c] = ypred_ptr[-y_stride + c - 1];
-    }
-    ypred_ptr += y_stride;
-  }
-}
-
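Of the directional predictors above, D135 is the easiest to see whole: every pixel copies its up-left neighbour, so the shifted above row and left column propagate down the 135-degree diagonal. A self-contained 4x4 demo with made-up border samples:

#include <stdio.h>

int main(void) {
  int above[5] = { 9, 1, 2, 3, 4 };  /* above[0] plays yabove_row[-1] */
  int left[4]  = { 5, 6, 7, 8 };     /* plays yleft_col[0..3] */
  int pred[4][4];
  int r, c;

  pred[0][0] = above[0];                             /* top-left corner  */
  for (c = 1; c < 4; c++) pred[0][c] = above[c];     /* shifted top row  */
  for (r = 1; r < 4; r++) pred[r][0] = left[r - 1];  /* shifted left col */
  for (r = 1; r < 4; r++)
    for (c = 1; c < 4; c++)
      pred[r][c] = pred[r - 1][c - 1];               /* copy up-left */

  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++)
      printf("%2d ", pred[r][c]);
    printf("\n");
  }
  return 0;
}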
-static void d153_predictor(uint8_t *ypred_ptr, int y_stride, int n,
-                           uint8_t *yabove_row, uint8_t *yleft_col) {
-  int r, c;
-  ypred_ptr[0] = (yabove_row[-1] + yleft_col[0] + 1) >> 1;
-  for (r = 1; r < n; r++)
-    ypred_ptr[r * y_stride] = (yleft_col[r - 1] + yleft_col[r] + 1) >> 1;
-  ypred_ptr++;
-  ypred_ptr[0] = yabove_row[-1];
-  for (r = 1; r < n; r++)
-    ypred_ptr[r * y_stride] = yleft_col[r - 1];
-  ypred_ptr++;
-
-  for (c = 0; c < n - 2; c++)
-    ypred_ptr[c] = yabove_row[c];
-  ypred_ptr += y_stride;
-  for (r = 1; r < n; ++r) {
-    for (c = 0; c < n - 2; c++)
-      ypred_ptr[c] = ypred_ptr[-y_stride + c - 2];
-    ypred_ptr += y_stride;
-  }
-}
-
-void vp9_recon_intra_mbuv(MACROBLOCKD *xd) {
-  int i;
-
-  for (i = 16; i < 24; i += 2) {
-    BLOCKD *b = &xd->block[i];
-    vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-  }
-}
-
-void vp9_build_intra_predictors_internal(unsigned char *src, int src_stride,
-                                         unsigned char *ypred_ptr,
-                                         int y_stride, int mode, int bsize,
-                                         int up_available, int left_available) {
-
-  unsigned char *yabove_row = src - src_stride;
-  unsigned char yleft_col[32];
-  unsigned char ytop_left = yabove_row[-1];
-  int r, c, i;
-
-  for (i = 0; i < bsize; i++) {
-    yleft_col[i] = src[i * src_stride - 1];
-  }
-
-  /* for Y */
-  switch (mode) {
-    case DC_PRED: {
-      int expected_dc;
-      int i;
-      int shift;
-      int average = 0;
-      int log2_bsize_minus_1;
-
-      assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32);
-      if (bsize == 4) {
-        log2_bsize_minus_1 = 1;
-      } else if (bsize == 8) {
-        log2_bsize_minus_1 = 2;
-      } else if (bsize == 16) {
-        log2_bsize_minus_1 = 3;
-      } else /* bsize == 32 */ {
-        log2_bsize_minus_1 = 4;
-      }
-
-      if (up_available || left_available) {
-        if (up_available) {
-          for (i = 0; i < bsize; i++) {
-            average += yabove_row[i];
-          }
-        }
-
-        if (left_available) {
-          for (i = 0; i < bsize; i++) {
-            average += yleft_col[i];
-          }
-        }
-        shift = log2_bsize_minus_1 + up_available + left_available;
-        expected_dc = (average + (1 << (shift - 1))) >> shift;
-      } else {
-        expected_dc = 128;
-      }
-
-      for (r = 0; r < bsize; r++) {
-        vpx_memset(ypred_ptr, expected_dc, bsize);
-        ypred_ptr += y_stride;
-      }
-    }
-    break;
-    case V_PRED: {
-      for (r = 0; r < bsize; r++) {
-        memcpy(ypred_ptr, yabove_row, bsize);
-        ypred_ptr += y_stride;
-      }
-    }
-    break;
-    case H_PRED: {
-      for (r = 0; r < bsize; r++) {
-        vpx_memset(ypred_ptr, yleft_col[r], bsize);
-        ypred_ptr += y_stride;
-      }
-    }
-    break;
-    case TM_PRED: {
-      for (r = 0; r < bsize; r++) {
-        for (c = 0; c < bsize; c++) {
-          int pred = yleft_col[r] + yabove_row[c] - ytop_left;
-
-          if (pred < 0)
-            pred = 0;
-
-          if (pred > 255)
-            pred = 255;
-
-          ypred_ptr[c] = pred;
-        }
-
-        ypred_ptr += y_stride;
-      }
-    }
-    break;
-    case D45_PRED: {
-      d45_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
-    case D135_PRED: {
-      d135_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
-    case D117_PRED: {
-      d117_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
-    case D153_PRED: {
-      d153_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
-    case D27_PRED: {
-      d27_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
-    case D63_PRED: {
-      d63_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
-    }
-    break;
-    case I8X8_PRED:
-    case B_PRED:
-    case NEARESTMV:
-    case NEARMV:
-    case ZEROMV:
-    case NEWMV:
-    case SPLITMV:
-    case MB_MODE_COUNT:
-      break;
-  }
-}
-
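The DC_PRED shift arithmetic above is chosen so that 1 << shift equals the number of border samples actually summed: log2(bsize) - 1, plus one for each available edge. A quick check for the 16x16 both-edges case (the values here are illustrative):

#include <assert.h>

int main(void) {
  const int bsize = 16;
  const int log2_bsize_minus_1 = 3;   /* log2(16) - 1 */
  const int up = 1, left = 1;
  const int shift = log2_bsize_minus_1 + up + left;
  const int samples = bsize * (up + left);
  const int sum = 100 * samples;      /* 32 border samples, all 100 */

  assert((1 << shift) == samples);                      /* 32 == 32    */
  assert(((sum + (1 << (shift - 1))) >> shift) == 100); /* rounded avg */
  return 0;
}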
-void vp9_build_intra_predictors_mby(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->predictor, 16,
-                                      xd->mode_info_context->mbmi.mode, 16,
-                                      xd->up_available, xd->left_available);
-}
-
-void vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->mode_info_context->mbmi.mode, 16,
-                                      xd->up_available, xd->left_available);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->mode_info_context->mbmi.mode, 32,
-                                      xd->up_available, xd->left_available);
-}
-#endif
-
-#if CONFIG_COMP_INTRA_PRED
-void vp9_build_comp_intra_predictors_mby(MACROBLOCKD *xd) {
-  unsigned char predictor[2][256];
-  int i;
-
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      predictor[0], 16,
-                                      xd->mode_info_context->mbmi.mode,
-                                      16, xd->up_available,
-                                      xd->left_available);
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      predictor[1], 16,
-                                      xd->mode_info_context->mbmi.second_mode,
-                                      16, xd->up_available,
-                                      xd->left_available);
-
-  for (i = 0; i < 256; i++) {
-    xd->predictor[i] = (predictor[0][i] + predictor[1][i] + 1) >> 1;
-  }
-}
-#endif
-
-void vp9_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd,
-                                              unsigned char *upred_ptr,
-                                              unsigned char *vpred_ptr,
-                                              int uv_stride,
-                                              int mode, int bsize) {
-  vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
-                                      upred_ptr, uv_stride, mode, bsize,
-                                      xd->up_available, xd->left_available);
-  vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,
-                                      vpred_ptr, uv_stride, mode, bsize,
-                                      xd->up_available, xd->left_available);
-}
-
-void vp9_build_intra_predictors_mbuv(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_mbuv_internal(xd, &xd->predictor[256],
-                                           &xd->predictor[320], 8,
-                                           xd->mode_info_context->mbmi.uv_mode,
-                                           8);
-}
-
-void vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
-                                           xd->dst.v_buffer,
-                                           xd->dst.uv_stride,
-                                           xd->mode_info_context->mbmi.uv_mode,
-                                           8);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
-                                           xd->dst.v_buffer, xd->dst.uv_stride,
-                                           xd->mode_info_context->mbmi.uv_mode,
-                                           16);
-}
-#endif
-
-#if CONFIG_COMP_INTRA_PRED
-void vp9_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) {
-  unsigned char predictor[2][2][64];
-  int i;
-
-  vp9_build_intra_predictors_mbuv_internal(
-    xd, predictor[0][0], predictor[1][0], 8,
-    xd->mode_info_context->mbmi.uv_mode, 8);
-  vp9_build_intra_predictors_mbuv_internal(
-    xd, predictor[0][1], predictor[1][1], 8,
-    xd->mode_info_context->mbmi.second_uv_mode, 8);
-  for (i = 0; i < 64; i++) {
-    xd->predictor[256 + i] = (predictor[0][0][i] + predictor[0][1][i] + 1) >> 1;
-    xd->predictor[256 + 64 + i] = (predictor[1][0][i] +
-                                   predictor[1][1][i] + 1) >> 1;
-  }
-}
-#endif
-
-void vp9_intra8x8_predict(BLOCKD *xd,
-                          int mode,
-                          unsigned char *predictor) {
-  vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
-                                      xd->dst_stride, predictor, 16,
-                                      mode, 8, 1, 1);
-}
-
-#if CONFIG_COMP_INTRA_PRED
-void vp9_comp_intra8x8_predict(BLOCKD *xd,
-                               int mode, int second_mode,
-                               unsigned char *out_predictor) {
-  unsigned char predictor[2][8 * 16];
-  int i, j;
-
-  vp9_intra8x8_predict(xd, mode, predictor[0]);
-  vp9_intra8x8_predict(xd, second_mode, predictor[1]);
-
-  for (i = 0; i < 8 * 16; i += 16) {
-    for (j = i; j < i + 8; j++) {
-      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
-    }
-  }
-}
-#endif
-
-void vp9_intra_uv4x4_predict(BLOCKD *xd,
-                             int mode,
-                             unsigned char *predictor) {
-  vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
-                                      xd->dst_stride, predictor, 8,
-                                      mode, 4, 1, 1);
-}
-
-#if CONFIG_COMP_INTRA_PRED
-void vp9_comp_intra_uv4x4_predict(BLOCKD *xd,
-                                  int mode, int mode2,
-                                  unsigned char *out_predictor) {
-  unsigned char predictor[2][8 * 4];
-  int i, j;
-
-  vp9_intra_uv4x4_predict(xd, mode, predictor[0]);
-  vp9_intra_uv4x4_predict(xd, mode2, predictor[1]);
-
-  for (i = 0; i < 4 * 8; i += 8) {
-    for (j = i; j < i + 4; j++) {
-      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
-    }
-  }
-}
-#endif
-
-/* TODO: try different ways of using the Y-UV mode correlation.
- * The current code assumes that a UV 4x4 block uses the same mode
- * as the corresponding Y 8x8 area.
- */
--- a/vp8/common/reconintra.h
+++ /dev/null
@@ -1,18 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_RECONINTRA_H
-#define __INC_RECONINTRA_H
-
-#include "blockd.h"
-
-extern void init_intra_left_above_pixels(MACROBLOCKD *xd);
-
-#endif  // __INC_RECONINTRA_H
--- a/vp8/common/reconintra4x4.c
+++ /dev/null
@@ -1,321 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_mem/vpx_mem.h"
-#include "reconintra.h"
-#include "vpx_rtcd.h"
-
-void vp9_intra4x4_predict_c(BLOCKD *x, int b_mode,
-                            unsigned char *predictor) {
-  int i, r, c;
-
-  unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride;
-  unsigned char Left[4];
-  unsigned char top_left = Above[-1];
-
-  Left[0] = (*(x->base_dst))[x->dst - 1];
-  Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
-  Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
-  Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];
-
-  switch (b_mode) {
-    case B_DC_PRED: {
-      int expected_dc = 0;
-
-      for (i = 0; i < 4; i++) {
-        expected_dc += Above[i];
-        expected_dc += Left[i];
-      }
-
-      expected_dc = (expected_dc + 4) >> 3;
-
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++) {
-          predictor[c] = expected_dc;
-        }
-
-        predictor += 16;
-      }
-    }
-    break;
-    case B_TM_PRED: {
-      /* prediction similar to true_motion prediction */
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++) {
-          int pred = Above[c] - top_left + Left[r];
-
-          if (pred < 0)
-            pred = 0;
-
-          if (pred > 255)
-            pred = 255;
-
-          predictor[c] = pred;
-        }
-
-        predictor += 16;
-      }
-    }
-    break;
-
-    case B_VE_PRED: {
-      unsigned int ap[4];
-
-      ap[0] = Above[0];
-      ap[1] = Above[1];
-      ap[2] = Above[2];
-      ap[3] = Above[3];
-
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++) {
-          predictor[c] = ap[c];
-        }
-
-        predictor += 16;
-      }
-    }
-    break;
-
-    case B_HE_PRED: {
-
-      unsigned int lp[4];
-      lp[0] = Left[0];
-      lp[1] = Left[1];
-      lp[2] = Left[2];
-      lp[3] = Left[3];
-
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++) {
-          predictor[c] = lp[r];
-        }
-
-        predictor += 16;
-      }
-    }
-    break;
-    case B_LD_PRED: {
-      unsigned char *ptr = Above;
-      predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
-      predictor[0 * 16 + 1] =
-        predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
-      predictor[0 * 16 + 2] =
-        predictor[1 * 16 + 1] =
-          predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
-      predictor[0 * 16 + 3] =
-        predictor[1 * 16 + 2] =
-          predictor[2 * 16 + 1] =
-            predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
-      predictor[1 * 16 + 3] =
-        predictor[2 * 16 + 2] =
-          predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
-      predictor[2 * 16 + 3] =
-        predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
-      predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
-
-    }
-    break;
-    case B_RD_PRED: {
-
-      unsigned char pp[9];
-
-      pp[0] = Left[3];
-      pp[1] = Left[2];
-      pp[2] = Left[1];
-      pp[3] = Left[0];
-      pp[4] = top_left;
-      pp[5] = Above[0];
-      pp[6] = Above[1];
-      pp[7] = Above[2];
-      pp[8] = Above[3];
-
-      predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-      predictor[3 * 16 + 1] =
-        predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[3 * 16 + 2] =
-        predictor[2 * 16 + 1] =
-          predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-      predictor[3 * 16 + 3] =
-        predictor[2 * 16 + 2] =
-          predictor[1 * 16 + 1] =
-            predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-      predictor[2 * 16 + 3] =
-        predictor[1 * 16 + 2] =
-          predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-      predictor[1 * 16 + 3] =
-        predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-      predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
-
-    }
-    break;
-    case B_VR_PRED: {
-
-      unsigned char pp[9];
-
-      pp[0] = Left[3];
-      pp[1] = Left[2];
-      pp[2] = Left[1];
-      pp[3] = Left[0];
-      pp[4] = top_left;
-      pp[5] = Above[0];
-      pp[6] = Above[1];
-      pp[7] = Above[2];
-      pp[8] = Above[3];
-
-      predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-      predictor[3 * 16 + 1] =
-        predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-      predictor[2 * 16 + 1] =
-        predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
-      predictor[3 * 16 + 2] =
-        predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-      predictor[2 * 16 + 2] =
-        predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
-      predictor[3 * 16 + 3] =
-        predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-      predictor[2 * 16 + 3] =
-        predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
-      predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
-      predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
-
-    }
-    break;
-    case B_VL_PRED: {
-
-      unsigned char *pp = Above;
-
-      predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
-      predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-      predictor[2 * 16 + 0] =
-        predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
-      predictor[1 * 16 + 1] =
-        predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[2 * 16 + 1] =
-        predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
-      predictor[3 * 16 + 1] =
-        predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-      predictor[0 * 16 + 3] =
-        predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
-      predictor[1 * 16 + 3] =
-        predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-      predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-      predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-    }
-    break;
-
-    case B_HD_PRED: {
-      unsigned char pp[9];
-      pp[0] = Left[3];
-      pp[1] = Left[2];
-      pp[2] = Left[1];
-      pp[3] = Left[0];
-      pp[4] = top_left;
-      pp[5] = Above[0];
-      pp[6] = Above[1];
-      pp[7] = Above[2];
-      pp[8] = Above[3];
-
-      predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
-      predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-      predictor[2 * 16 + 0] =
-        predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
-      predictor[2 * 16 + 1] =
-        predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[2 * 16 + 2] =
-        predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
-      predictor[2 * 16 + 3] =
-        predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-      predictor[1 * 16 + 2] =
-        predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
-      predictor[1 * 16 + 3] =
-        predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-      predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-      predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-    }
-    break;
-
-    case B_HU_PRED: {
-      unsigned char *pp = Left;
-      predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
-      predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-      predictor[0 * 16 + 2] =
-        predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
-      predictor[0 * 16 + 3] =
-        predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[1 * 16 + 2] =
-        predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
-      predictor[1 * 16 + 3] =
-        predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
-      predictor[2 * 16 + 2] =
-        predictor[2 * 16 + 3] =
-          predictor[3 * 16 + 0] =
-            predictor[3 * 16 + 1] =
-              predictor[3 * 16 + 2] =
-                predictor[3 * 16 + 3] = pp[3];
-    }
-    break;
-
-  }
-}
-
-#if CONFIG_COMP_INTRA_PRED
-void vp9_comp_intra4x4_predict_c(BLOCKD *x,
-                               int b_mode, int b_mode2,
-                               unsigned char *out_predictor) {
-  unsigned char predictor[2][4 * 16];
-  int i, j;
-
-  vp9_intra4x4_predict(x, b_mode, predictor[0]);
-  vp9_intra4x4_predict(x, b_mode2, predictor[1]);
-
-  for (i = 0; i < 16 * 4; i += 16) {
-    for (j = i; j < i + 4; j++) {
-      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
-    }
-  }
-}
-#endif
-
-/* Copy 4 bytes from above-right down the right edge so that the 4x4
- * prediction modes that use pixels above and to the right have valid
- * pixels to work from.
- */
-void vp9_intra_prediction_down_copy(MACROBLOCKD *xd) {
-  int extend_edge = (xd->mb_to_right_edge == 0 && xd->mb_index < 2);
-  unsigned char *above_right = *(xd->block[0].base_dst) + xd->block[0].dst -
-                               xd->block[0].dst_stride + 16;
-  unsigned int *src_ptr = (unsigned int *)
-      (above_right - (xd->mb_index == 3 ? 16 * xd->block[0].dst_stride : 0));
-
-  unsigned int *dst_ptr0 = (unsigned int *)above_right;
-  unsigned int *dst_ptr1 =
-    (unsigned int *)(above_right + 4 * xd->block[0].dst_stride);
-  unsigned int *dst_ptr2 =
-    (unsigned int *)(above_right + 8 * xd->block[0].dst_stride);
-  unsigned int *dst_ptr3 =
-    (unsigned int *)(above_right + 12 * xd->block[0].dst_stride);
-
-  if (extend_edge) {
-    *src_ptr = ((uint8_t *) src_ptr)[-1] * 0x01010101U;
-  }
-
-  *dst_ptr0 = *src_ptr;
-  *dst_ptr1 = *src_ptr;
-  *dst_ptr2 = *src_ptr;
-  *dst_ptr3 = *src_ptr;
-}
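The word-sized loads and stores above are just a compact way of replicating the same four above-right bytes down the right edge at 4-row intervals. A byte-wise equivalent, shown only to clarify what the unsigned int copies do (down_copy_bytewise is a hypothetical helper, not part of the source):

/* Illustrative scalar equivalent of vp9_intra_prediction_down_copy(). */
static void down_copy_bytewise(const unsigned char *src,
                               unsigned char *above_right, int stride) {
  int row, i;

  for (row = 0; row <= 12; row += 4)   /* rows 0, 4, 8 and 12 */
    for (i = 0; i < 4; i++)
      above_right[row * stride + i] = src[i];
}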
--- a/vp8/common/reconintra4x4.h
+++ /dev/null
@@ -1,17 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_RECONINTRA4x4_H
-#define __INC_RECONINTRA4x4_H
-
-extern void vp9_intra_prediction_down_copy(MACROBLOCKD *xd);
-
-#endif  // __INC_RECONINTRA4x4_H
--- a/vp8/common/rtcd.c
+++ /dev/null
@@ -1,105 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include "vpx_config.h"
-#define RTCD_C
-#include "vpx_rtcd.h"
-
-#if CONFIG_MULTITHREAD && defined(_WIN32)
-#include <windows.h>
-#include <stdlib.h>
-static void once(void (*func)(void))
-{
-    static CRITICAL_SECTION *lock;
-    static LONG waiters;
-    static int done;
-    void *lock_ptr = &lock;
-
-    /* If the initialization is complete, return early. This isn't just an
-     * optimization; it prevents races on the destruction of the global
-     * lock.
-     */
-    if(done)
-        return;
-
-    InterlockedIncrement(&waiters);
-
-    /* Get a lock. We create one and try to make it the one-true-lock,
-     * throwing it away if we lost the race.
-     */
-
-    {
-        /* Scope to protect access to new_lock */
-        CRITICAL_SECTION *new_lock = malloc(sizeof(CRITICAL_SECTION));
-        InitializeCriticalSection(new_lock);
-        if (InterlockedCompareExchangePointer(lock_ptr, new_lock, NULL) != NULL)
-        {
-            DeleteCriticalSection(new_lock);
-            free(new_lock);
-        }
-    }
-
-    /* At this point, we have a lock that can be synchronized on. We don't
-     * care which thread actually performed the allocation.
-     */
-
-    EnterCriticalSection(lock);
-
-    if (!done)
-    {
-        func();
-        done = 1;
-    }
-
-    LeaveCriticalSection(lock);
-
-    /* Last one out should free resources. The objects being destroyed are
-     * protected by the if(done) check above.
-     */
-    if(!InterlockedDecrement(&waiters))
-    {
-        DeleteCriticalSection(lock);
-        free(lock);
-        lock = NULL;
-    }
-}
-
-
-#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
-#include <pthread.h>
-static void once(void (*func)(void))
-{
-    static pthread_once_t lock = PTHREAD_ONCE_INIT;
-    pthread_once(&lock, func);
-}
-
-
-#else
-/* No-op version that performs no synchronization. vpx_rtcd() is idempotent,
- * so as long as your platform provides atomic loads/stores of pointers
- * no synchronization is strictly necessary.
- */
-
-static void once(void (*func)(void))
-{
-    static int done;
-
-    if(!done)
-    {
-        func();
-        done = 1;
-    }
-}
-#endif
-
-
-void vpx_rtcd(void)
-{
-    once(setup_rtcd_internal);
-}
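once() exists to make the lazy initialization in vpx_rtcd() thread-safe: setup_rtcd_internal() is generated by the build system from rtcd_defs.sh (the next file), turning each prototype line into a function pointer and each specialize line into a CPU-feature check. A minimal sketch of the generated shape, assuming the usual x86_simd_caps()/HAS_SSE2 helpers from vpx_ports/x86.h; the real generated header differs in detail:

/* Hypothetical excerpt of what the generator emits for one function. */
void vp9_copy_mem16x16_c(unsigned char *src, int src_pitch,
                         unsigned char *dst, int dst_pitch);
void vp9_copy_mem16x16_sse2(unsigned char *src, int src_pitch,
                            unsigned char *dst, int dst_pitch);

void (*vp9_copy_mem16x16)(unsigned char *src, int src_pitch,
                          unsigned char *dst, int dst_pitch);

static void setup_rtcd_internal(void) {
  int flags = x86_simd_caps();               /* runtime CPUID probe */

  vp9_copy_mem16x16 = vp9_copy_mem16x16_c;   /* portable default */
  if (flags & HAS_SSE2)
    vp9_copy_mem16x16 = vp9_copy_mem16x16_sse2;
}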
--- a/vp8/common/rtcd_defs.sh
+++ /dev/null
@@ -1,482 +1,0 @@
-common_forward_decls() {
-cat <<EOF
-
-struct loop_filter_info;
-struct blockd;
-struct macroblockd;
-
-/* Encoder forward decls */
-struct block;
-struct macroblock;
-struct variance_vtable;
-union int_mv;
-struct yv12_buffer_config;
-EOF
-}
-forward_decls common_forward_decls
-
-prototype void vp9_filter_block2d_4x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_8x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_8x8_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_16x16_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
-
-# At the very least, MSVC 2008 has a compiler bug exhibited by this code: the
-# code compiles warning-free, but a disassembly of the generated code shows
-# bugs. To be on the safe side, this is only enabled when compiling with gcc.
-if [ "$CONFIG_GCC" = "yes" ]; then
-    specialize vp9_filter_block2d_4x4_8 sse4_1 sse2
-fi
-specialize vp9_filter_block2d_8x4_8 ssse3 #sse4_1 sse2
-specialize vp9_filter_block2d_8x8_8 ssse3 #sse4_1 sse2
-specialize vp9_filter_block2d_16x16_8 ssse3 #sse4_1 sse2
-
-#
-# Dequant
-#
-prototype void vp9_dequantize_b "struct blockd *x"
-specialize vp9_dequantize_b mmx
-
-prototype void vp9_dequantize_b_2x2 "struct blockd *x"
-specialize vp9_dequantize_b_2x2
-
-prototype void vp9_dequant_dc_idct_add_y_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc, struct macroblockd *xd"
-specialize vp9_dequant_dc_idct_add_y_block_8x8
-
-prototype void vp9_dequant_idct_add_y_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, struct macroblockd *xd"
-specialize vp9_dequant_idct_add_y_block_8x8
-
-prototype void vp9_dequant_idct_add_uv_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, char *eobs, struct macroblockd *xd"
-specialize vp9_dequant_idct_add_uv_block_8x8
-
-prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
-specialize vp9_dequant_idct_add_16x16
-
-prototype void vp9_dequant_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
-specialize vp9_dequant_idct_add
-
-prototype void vp9_dequant_dc_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc"
-specialize vp9_dequant_dc_idct_add
-
-prototype void vp9_dequant_dc_idct_add_y_block "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc"
-specialize vp9_dequant_dc_idct_add_y_block mmx
-
-prototype void vp9_dequant_idct_add_y_block "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs"
-specialize vp9_dequant_idct_add_y_block mmx
-
-prototype void vp9_dequant_idct_add_uv_block "short *q, short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, char *eobs"
-specialize vp9_dequant_idct_add_uv_block mmx
-
-#
-# RECON
-#
-prototype void vp9_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp9_copy_mem16x16 mmx sse2 media neon dspr2
-vp9_copy_mem16x16_media=vp9_copy_mem16x16_v6
-
-prototype void vp9_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp9_copy_mem8x8 mmx media neon dspr2
-vp9_copy_mem8x8_media=vp9_copy_mem8x8_v6
-
-prototype void vp9_avg_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp9_avg_mem16x16
-
-prototype void vp9_avg_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp9_avg_mem8x8
-
-prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp9_copy_mem8x4 mmx media neon dspr2
-vp9_copy_mem8x4_media=vp9_copy_mem8x4_v6
-
-prototype void vp9_recon_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
-specialize vp9_recon_b
-
-prototype void vp9_recon_uv_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
-specialize vp9_recon_uv_b
-
-prototype void vp9_recon2b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
-specialize vp9_recon2b sse2
-
-prototype void vp9_recon4b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
-specialize vp9_recon4b sse2
-
-prototype void vp9_recon_mb "struct macroblockd *x"
-specialize vp9_recon_mb
-
-prototype void vp9_recon_mby "struct macroblockd *x"
-specialize vp9_recon_mby
-
-prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mby_s
-
-prototype void vp9_build_intra_predictors_sby_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_sby_s
-
-prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_sbuv_s
-
-prototype void vp9_build_intra_predictors_mby "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mby
-
-prototype void vp9_build_comp_intra_predictors_mby "struct macroblockd *x"
-specialize vp9_build_comp_intra_predictors_mby
-
-prototype void vp9_build_intra_predictors_mbuv "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mbuv
-
-prototype void vp9_build_intra_predictors_mbuv_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mbuv_s
-
-prototype void vp9_build_comp_intra_predictors_mbuv "struct macroblockd *x"
-specialize vp9_build_comp_intra_predictors_mbuv
-
-prototype void vp9_intra4x4_predict "struct blockd *x, int b_mode, unsigned char *predictor"
-specialize vp9_intra4x4_predict
-
-prototype void vp9_comp_intra4x4_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
-specialize vp9_comp_intra4x4_predict
-
-prototype void vp9_intra8x8_predict "struct blockd *x, int b_mode, unsigned char *predictor"
-specialize vp9_intra8x8_predict
-
-prototype void vp9_comp_intra8x8_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
-specialize vp9_comp_intra8x8_predict
-
-prototype void vp9_intra_uv4x4_predict "struct blockd *x, int b_mode, unsigned char *predictor"
-specialize vp9_intra_uv4x4_predict
-
-prototype void vp9_comp_intra_uv4x4_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
-specialize vp9_comp_intra_uv4x4_predict
-
-#
-# Loopfilter
-#
-prototype void vp9_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_mbv sse2
-
-prototype void vp9_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bv sse2
-
-prototype void vp9_loop_filter_bv8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bv8x8 sse2
-
-prototype void vp9_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_mbh sse2
-
-prototype void vp9_loop_filter_bh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bh sse2
-
-prototype void vp9_loop_filter_bh8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bh8x8 sse2
-
-prototype void vp9_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp9_loop_filter_simple_mbv mmx sse2 media neon
-vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
-vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
-vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2
-vp9_loop_filter_simple_mbv_media=vp9_loop_filter_simple_vertical_edge_armv6
-vp9_loop_filter_simple_mbv_neon=vp9_loop_filter_mbvs_neon
-
-prototype void vp9_loop_filter_simple_mbh "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp9_loop_filter_simple_mbh mmx sse2 media neon
-vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
-vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
-vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2
-vp9_loop_filter_simple_mbh_media=vp9_loop_filter_simple_horizontal_edge_armv6
-vp9_loop_filter_simple_mbh_neon=vp9_loop_filter_mbhs_neon
-
-prototype void vp9_loop_filter_simple_bv "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp9_loop_filter_simple_bv mmx sse2 media neon
-vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
-vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
-vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2
-vp9_loop_filter_simple_bv_media=vp9_loop_filter_bvs_armv6
-vp9_loop_filter_simple_bv_neon=vp9_loop_filter_bvs_neon
-
-prototype void vp9_loop_filter_simple_bh "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp9_loop_filter_simple_bh mmx sse2 media neon
-vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
-vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
-vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
-vp9_loop_filter_simple_bh_media=vp9_loop_filter_bhs_armv6
-vp9_loop_filter_simple_bh_neon=vp9_loop_filter_bhs_neon
-
-#
-# sad 16x3, 3x16
-#
-if [ "$CONFIG_NEWBESTREFMV" = "yes" ]; then
-prototype unsigned int vp9_sad16x3 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
-specialize vp9_sad16x3 sse2
-
-prototype unsigned int vp9_sad3x16 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
-specialize vp9_sad3x16 sse2
-fi
-
-#
-# Encoder functions below this point.
-#
-if [ "$CONFIG_VP8_ENCODER" = "yes" ]; then
-
-
-# variance
-[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
-
-prototype unsigned int vp9_variance32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance32x32
-
-prototype unsigned int vp9_variance16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance16x16 mmx sse2
-vp9_variance16x16_sse2=vp9_variance16x16_wmt
-
-prototype unsigned int vp9_variance16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance16x8 mmx sse2
-vp9_variance16x8_sse2=vp9_variance16x8_wmt
-
-prototype unsigned int vp9_variance8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance8x16 mmx sse2
-vp9_variance8x16_sse2=vp9_variance8x16_wmt
-
-prototype unsigned int vp9_variance8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance8x8 mmx sse2
-vp9_variance8x8_sse2=vp9_variance8x8_wmt
-
-prototype unsigned int vp9_variance4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance4x4 mmx sse2
-vp9_variance4x4_sse2=vp9_variance4x4_wmt
-
-prototype unsigned int vp9_sub_pixel_variance32x32 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x32
-
-prototype unsigned int vp9_sub_pixel_variance16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
-vp9_sub_pixel_variance16x16_sse2=vp9_sub_pixel_variance16x16_wmt
-
-prototype unsigned int vp9_sub_pixel_variance8x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x16 sse2 mmx
-vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
-
-prototype unsigned int vp9_sub_pixel_variance16x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
-vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
-
-prototype unsigned int vp9_sub_pixel_variance8x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x8 sse2 mmx
-vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
-
-prototype unsigned int vp9_sub_pixel_variance4x4 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_variance4x4 sse2 mmx
-vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
-
-prototype unsigned int vp9_sad32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad32x32
-
-prototype unsigned int vp9_sad16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad16x16 mmx sse2 sse3
-vp9_sad16x16_sse2=vp9_sad16x16_wmt
-
-prototype unsigned int vp9_sad16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad16x8 mmx sse2
-vp9_sad16x8_sse2=vp9_sad16x8_wmt
-
-prototype unsigned int vp9_sad8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad8x16 mmx sse2
-vp9_sad8x16_sse2=vp9_sad8x16_wmt
-
-prototype unsigned int vp9_sad8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad8x8 mmx sse2
-vp9_sad8x8_sse2=vp9_sad8x8_wmt
-
-prototype unsigned int vp9_sad4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad4x4 mmx sse2
-vp9_sad4x4_sse2=vp9_sad4x4_wmt
-
-prototype unsigned int vp9_variance_halfpixvar16x16_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_h mmx sse2
-vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
-
-prototype unsigned int vp9_variance_halfpixvar16x16_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_v mmx sse2
-vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt
-
-prototype unsigned int vp9_variance_halfpixvar16x16_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_hv mmx sse2
-vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt
-
-prototype unsigned int vp9_variance_halfpixvar32x32_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar32x32_h
-
-prototype unsigned int vp9_variance_halfpixvar32x32_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar32x32_v
-
-prototype unsigned int vp9_variance_halfpixvar32x32_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar32x32_hv
-
-prototype void vp9_sad32x32x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad32x32x3
-
-prototype void vp9_sad16x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x16x3 sse3 ssse3
-
-prototype void vp9_sad16x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x8x3 sse3 ssse3
-
-prototype void vp9_sad8x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x16x3 sse3
-
-prototype void vp9_sad8x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x8x3 sse3
-
-prototype void vp9_sad4x4x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad4x4x3 sse3
-
-prototype void vp9_sad32x32x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
-specialize vp9_sad32x32x8
-
-prototype void vp9_sad16x16x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
-specialize vp9_sad16x16x8 sse4
-
-prototype void vp9_sad16x8x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
-specialize vp9_sad16x8x8 sse4
-
-prototype void vp9_sad8x16x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
-specialize vp9_sad8x16x8 sse4
-
-prototype void vp9_sad8x8x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
-specialize vp9_sad8x8x8 sse4
-
-prototype void vp9_sad4x4x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
-specialize vp9_sad4x4x8 sse4
-
-prototype void vp9_sad32x32x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad32x32x4d
-
-prototype void vp9_sad16x16x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x16x4d sse3
-
-prototype void vp9_sad16x8x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x8x4d sse3
-
-prototype void vp9_sad8x16x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x16x4d sse3
-
-prototype void vp9_sad8x8x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x8x4d sse3
-
-prototype void vp9_sad4x4x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad4x4x4d sse3
-
-#
-# Block copy
-#
-case $arch in
-    x86*)
-    prototype void vp9_copy32xn "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n"
-    specialize vp9_copy32xn sse2 sse3
-    ;;
-esac
-
-prototype unsigned int vp9_sub_pixel_mse16x16 "const unsigned char  *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
-specialize vp9_sub_pixel_mse16x16 sse2 mmx
-vp9_sub_pixel_mse16x16_sse2=vp9_sub_pixel_mse16x16_wmt
-
-prototype unsigned int vp9_mse16x16 "const unsigned char *src_ptr, int  source_stride, const unsigned char *ref_ptr, int  recon_stride, unsigned int *sse"
-specialize vp9_mse16x16 mmx sse2
-vp9_mse16x16_sse2=vp9_mse16x16_wmt
-
-prototype unsigned int vp9_sub_pixel_mse32x32 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp9_sub_pixel_mse32x32
-
-prototype unsigned int vp9_get_mb_ss "const short *"
-specialize vp9_get_mb_ss mmx sse2
-
-# ENCODEMB INVOKE
-prototype int vp9_mbblock_error "struct macroblock *mb, int dc"
-specialize vp9_mbblock_error mmx sse2
-vp9_mbblock_error_sse2=vp9_mbblock_error_xmm
-
-prototype int vp9_block_error "short *coeff, short *dqcoeff, int block_size"
-specialize vp9_block_error mmx sse2
-vp9_block_error_sse2=vp9_block_error_xmm
-
-prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
-specialize vp9_subtract_b mmx sse2
-
-prototype int vp9_mbuverror "struct macroblock *mb"
-specialize vp9_mbuverror mmx sse2
-vp9_mbuverror_sse2=vp9_mbuverror_xmm
-
-prototype void vp9_subtract_mby "short *diff, unsigned char *src, unsigned char *pred, int stride"
-specialize vp9_subtract_mby mmx sse2
-
-prototype void vp9_subtract_mbuv "short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride"
-specialize vp9_subtract_mbuv mmx sse2
-
-#
-# Structured Similarity (SSIM)
-#
-if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
-    [ $arch = "x86_64" ] && sse2_on_x86_64=sse2
-
-    prototype void vp9_ssim_parms_8x8 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
-    specialize vp9_ssim_parms_8x8 $sse2_on_x86_64
-
-    prototype void vp9_ssim_parms_16x16 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
-    specialize vp9_ssim_parms_16x16 $sse2_on_x86_64
-fi
-
-# fdct functions
-prototype void vp9_fht "const short *input, int pitch, short *output, int tx_type, int tx_dim"
-specialize vp9_fht
-
-prototype void vp9_short_fdct8x8 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_fdct8x8
-
-prototype void vp9_short_fhaar2x2 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_fhaar2x2
-
-prototype void vp9_short_fdct4x4 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_fdct4x4
-
-prototype void vp9_short_fdct8x4 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_fdct8x4
-
-prototype void vp9_short_walsh4x4 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_walsh4x4
-
-prototype void vp9_short_fdct16x16 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_fdct16x16
-
-prototype void vp9_short_walsh4x4_lossless "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_walsh4x4_lossless
-
-prototype void vp9_short_walsh4x4_x8 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_walsh4x4_x8
-
-prototype void vp9_short_walsh8x4_x8 "short *InputData, short *OutputData, int pitch"
-specialize vp9_short_walsh8x4_x8
-
-fi
-# end encoder functions
--- a/vp8/common/sadmxn.h
+++ /dev/null
@@ -1,37 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_SAD_H
-#define __INC_SAD_H
-
-#include <stdlib.h>  /* abs() */
-
-static __inline
-unsigned int sad_mx_n_c(
-  const unsigned char *src_ptr,
-  int  src_stride,
-  const unsigned char *ref_ptr,
-  int  ref_stride,
-  int m,
-  int n) {
-  int r, c;
-  unsigned int sad = 0;
-
-  for (r = 0; r < n; r++) {
-    for (c = 0; c < m; c++) {
-      sad += abs(src_ptr[c] - ref_ptr[c]);
-    }
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  }
-
-  return sad;
-}
-
-#endif
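Every fixed-size SAD is then a thin wrapper over this generic kernel. As an illustration only — the shipped C implementations live elsewhere — the vp9_sad16x3 declared in rtcd_defs.sh could be expressed on top of it like this:

/* Hypothetical wrapper showing how sad_mx_n_c instantiates one size. */
unsigned int vp9_sad16x3_c(const unsigned char *src_ptr, int src_stride,
                           const unsigned char *ref_ptr, int ref_stride,
                           int max_sad) {
  (void) max_sad;  /* the plain C version ignores the early-out bound */
  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 3);
}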
--- a/vp8/common/seg_common.c
+++ /dev/null
@@ -1,103 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/seg_common.h"
-
-static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 0, 0, 0, 0 };
-static const int seg_feature_data_bits[SEG_LVL_MAX] =
-    { QINDEX_BITS, 6, 4, 4, 6, 2 };
-
-// These functions provide access to new segment level features.
-// Eventually these function may be "optimized out" but for the moment,
-// the coding mechanism is still subject to change so these provide a
-// convenient single point of change.
-
-int vp9_segfeature_active(const MACROBLOCKD *xd,
-                          int segment_id,
-                          SEG_LVL_FEATURES feature_id) {
-  // Return true if mask bit set and segmentation enabled.
-  return (xd->segmentation_enabled &&
-          (xd->segment_feature_mask[segment_id] &
-           (0x01 << feature_id)));
-}
-
-void vp9_clearall_segfeatures(MACROBLOCKD *xd) {
-  vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
-  vpx_memset(xd->segment_feature_mask, 0, sizeof(xd->segment_feature_mask));
-}
-
-void vp9_enable_segfeature(MACROBLOCKD *xd,
-                           int segment_id,
-                           SEG_LVL_FEATURES feature_id) {
-  xd->segment_feature_mask[segment_id] |= (0x01 << feature_id);
-}
-
-void vp9_disable_segfeature(MACROBLOCKD *xd,
-                            int segment_id,
-                            SEG_LVL_FEATURES feature_id) {
-  xd->segment_feature_mask[segment_id] &= ~(1 << feature_id);
-}
-
-int vp9_seg_feature_data_bits(SEG_LVL_FEATURES feature_id) {
-  return seg_feature_data_bits[feature_id];
-}
-
-int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
-  return seg_feature_data_signed[feature_id];
-}
-
-void vp9_clear_segdata(MACROBLOCKD *xd,
-                       int segment_id,
-                       SEG_LVL_FEATURES feature_id) {
-  xd->segment_feature_data[segment_id][feature_id] = 0;
-}
-
-void vp9_set_segdata(MACROBLOCKD *xd,
-                     int segment_id,
-                     SEG_LVL_FEATURES feature_id,
-                     int seg_data) {
-  xd->segment_feature_data[segment_id][feature_id] = seg_data;
-}
-
-int vp9_get_segdata(const MACROBLOCKD *xd,
-                    int segment_id,
-                    SEG_LVL_FEATURES feature_id) {
-  return xd->segment_feature_data[segment_id][feature_id];
-}
-
-void vp9_clear_segref(MACROBLOCKD *xd, int segment_id) {
-  xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] = 0;
-}
-
-void vp9_set_segref(MACROBLOCKD *xd,
-                    int segment_id,
-                    MV_REFERENCE_FRAME ref_frame) {
-  xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] |=
-    (1 << ref_frame);
-}
-
-int vp9_check_segref(const MACROBLOCKD *xd,
-                     int segment_id,
-                     MV_REFERENCE_FRAME ref_frame) {
-  return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] &
-          (1 << ref_frame)) ? 1 : 0;
-}
-
-int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id) {
-  return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] &
-          ~(1 << INTRA_FRAME)) ? 1 : 0;
-}
-
-int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id) {
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_TRANSFORM))
-    return vp9_get_segdata(xd, segment_id, SEG_LVL_TRANSFORM);
-  else
-    return TX_4X4;
-}
-// TBD? Functions to read and write segment data with range / validity checking
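Together these accessors implement a small bitmask-gated table: segment_feature_mask holds one enable bit per (segment, feature) pair, and segment_feature_data holds the value behind it. A hedged usage sketch — the SEG_LVL_ALT_Q feature id and the delta value are assumptions chosen for illustration:

void example_segment_setup(MACROBLOCKD *xd) {
  xd->segmentation_enabled = 1;
  vp9_clearall_segfeatures(xd);

  /* Give segment 1 an alternate-quantizer delta of -16 (illustrative). */
  vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
  vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, -16);

  /* Readers check the mask bit before trusting the data slot. */
  if (vp9_segfeature_active(xd, 1, SEG_LVL_ALT_Q)) {
    int qdelta = vp9_get_segdata(xd, 1, SEG_LVL_ALT_Q);  /* -16 */
    (void) qdelta;
  }
}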
--- a/vp8/common/seg_common.h
+++ /dev/null
@@ -1,64 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "type_aliases.h"
-#include "onyxc_int.h"
-#include "vp8/common/blockd.h"
-
-#ifndef __INC_SEG_COMMON_H__
-#define __INC_SEG_COMMON_H__ 1
-
-int vp9_segfeature_active(const MACROBLOCKD *xd,
-                          int segment_id,
-                          SEG_LVL_FEATURES feature_id);
-
-void vp9_clearall_segfeatures(MACROBLOCKD *xd);
-
-void vp9_enable_segfeature(MACROBLOCKD *xd,
-                           int segment_id,
-                           SEG_LVL_FEATURES feature_id);
-
-void vp9_disable_segfeature(MACROBLOCKD *xd,
-                            int segment_id,
-                            SEG_LVL_FEATURES feature_id);
-
-int vp9_seg_feature_data_bits(SEG_LVL_FEATURES feature_id);
-
-int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id);
-
-void vp9_clear_segdata(MACROBLOCKD *xd,
-                       int segment_id,
-                       SEG_LVL_FEATURES feature_id);
-
-void vp9_set_segdata(MACROBLOCKD *xd,
-                     int segment_id,
-                     SEG_LVL_FEATURES feature_id,
-                     int seg_data);
-
-int vp9_get_segdata(const MACROBLOCKD *xd,
-                    int segment_id,
-                    SEG_LVL_FEATURES feature_id);
-
-void vp9_clear_segref(MACROBLOCKD *xd, int segment_id);
-
-void vp9_set_segref(MACROBLOCKD *xd,
-                    int segment_id,
-                    MV_REFERENCE_FRAME ref_frame);
-
-int vp9_check_segref(const MACROBLOCKD *xd,
-                     int segment_id,
-                     MV_REFERENCE_FRAME ref_frame);
-
-int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id);
-
-int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id);
-
-#endif /* __INC_SEG_COMMON_H__ */
-
--- a/vp8/common/setupintrarecon.c
+++ /dev/null
@@ -1,31 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "setupintrarecon.h"
-#include "vpx_mem/vpx_mem.h"
-
-void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) {
-  int i;
-
-  /* set up the new frame for intra-coded blocks */
-  vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
-  for (i = 0; i < ybf->y_height; i++)
-    ybf->y_buffer[ybf->y_stride * i - 1] = (unsigned char) 129;
-
-  vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
-  for (i = 0; i < ybf->uv_height; i++)
-    ybf->u_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129;
-
-  vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
-  for (i = 0; i < ybf->uv_height; i++)
-    ybf->v_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129;
-}
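The constants follow VP8's convention for intra edges on the frame border: the row above the image is filled with 127 and the column to its left with 129, so blocks on the border always see well-defined neighbors. A sketch of the resulting top-left corner (illustrative comment only):

/*
 *  127 127 127 127 127 ...   <- row at y_buffer - y_stride - 1, value 127
 *  129   p   p   p   p ...   <- left column value 129, p = frame pixels
 *  129   p   p   p   p ...
 */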
--- a/vp8/common/setupintrarecon.h
+++ /dev/null
@@ -1,13 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_scale/yv12config.h"
-extern void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
--- a/vp8/common/subpixel.h
+++ /dev/null
@@ -1,204 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef SUBPIXEL_H
-#define SUBPIXEL_H
-
-#define prototype_subpixel_predict(sym) \
-  void sym(unsigned char *src, int src_pitch, int xofst, int yofst, \
-           unsigned char *dst, int dst_pitch)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/subpixel_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/subpixel_arm.h"
-#endif
-
-#ifndef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap16x16);
-
-#ifndef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap8x8);
-
-#ifndef vp9_subpix_sixtap_avg16x16
-#define vp9_subpix_sixtap_avg16x16 vp9_sixtap_predict_avg16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap_avg16x16);
-
-#ifndef vp9_subpix_sixtap_avg8x8
-#define vp9_subpix_sixtap_avg8x8 vp9_sixtap_predict_avg8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap_avg8x8);
-
-#ifndef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap8x4);
-
-#ifndef vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap4x4);
-
-#ifndef vp9_subpix_sixtap_avg4x4
-#define vp9_subpix_sixtap_avg4x4 vp9_sixtap_predict_avg_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap_avg4x4);
-
-#ifndef vp9_subpix_eighttap16x16
-#define vp9_subpix_eighttap16x16 vp9_eighttap_predict16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap16x16);
-
-#ifndef vp9_subpix_eighttap8x8
-#define vp9_subpix_eighttap8x8 vp9_eighttap_predict8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap8x8);
-
-#ifndef vp9_subpix_eighttap_avg16x16
-#define vp9_subpix_eighttap_avg16x16 vp9_eighttap_predict_avg16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg16x16);
-
-#ifndef vp9_subpix_eighttap_avg8x8
-#define vp9_subpix_eighttap_avg8x8 vp9_eighttap_predict_avg8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg8x8);
-
-#ifndef vp9_subpix_eighttap8x4
-#define vp9_subpix_eighttap8x4 vp9_eighttap_predict8x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap8x4);
-
-#ifndef vp9_subpix_eighttap4x4
-#define vp9_subpix_eighttap4x4 vp9_eighttap_predict_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap4x4);
-
-#ifndef vp9_subpix_eighttap_avg4x4
-#define vp9_subpix_eighttap_avg4x4 vp9_eighttap_predict_avg4x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg4x4);
-
-#ifndef vp9_subpix_eighttap16x16_sharp
-#define vp9_subpix_eighttap16x16_sharp vp9_eighttap_predict16x16_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap16x16_sharp);
-
-#ifndef vp9_subpix_eighttap8x8_sharp
-#define vp9_subpix_eighttap8x8_sharp vp9_eighttap_predict8x8_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap8x8_sharp);
-
-#ifndef vp9_subpix_eighttap_avg16x16_sharp
-#define vp9_subpix_eighttap_avg16x16_sharp vp9_eighttap_predict_avg16x16_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg16x16_sharp);
-
-#ifndef vp9_subpix_eighttap_avg8x8_sharp
-#define vp9_subpix_eighttap_avg8x8_sharp vp9_eighttap_predict_avg8x8_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg8x8_sharp);
-
-#ifndef vp9_subpix_eighttap8x4_sharp
-#define vp9_subpix_eighttap8x4_sharp vp9_eighttap_predict8x4_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap8x4_sharp);
-
-#ifndef vp9_subpix_eighttap4x4_sharp
-#define vp9_subpix_eighttap4x4_sharp vp9_eighttap_predict_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap4x4_sharp);
-
-#ifndef vp9_subpix_eighttap_avg4x4_sharp
-#define vp9_subpix_eighttap_avg4x4_sharp vp9_eighttap_predict_avg4x4_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg4x4_sharp);
-
-#ifndef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear16x16);
-
-#ifndef vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear8x8);
-
-#ifndef vp9_subpix_bilinear_avg16x16
-#define vp9_subpix_bilinear_avg16x16 vp9_bilinear_predict_avg16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear_avg16x16);
-
-#ifndef vp9_subpix_bilinear_avg8x8
-#define vp9_subpix_bilinear_avg8x8 vp9_bilinear_predict_avg8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear_avg8x8);
-
-#ifndef vp9_subpix_bilinear8x4
-#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear8x4);
-
-#ifndef vp9_subpix_bilinear4x4
-#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear4x4);
-
-#ifndef vp9_subpix_bilinear_avg4x4
-#define vp9_subpix_bilinear_avg4x4 vp9_bilinear_predict_avg4x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear_avg4x4);
-
-typedef prototype_subpixel_predict((*vp9_subpix_fn_t));
-typedef struct {
-  vp9_subpix_fn_t  eighttap16x16;
-  vp9_subpix_fn_t  eighttap8x8;
-  vp9_subpix_fn_t  eighttap_avg16x16;
-  vp9_subpix_fn_t  eighttap_avg8x8;
-  vp9_subpix_fn_t  eighttap_avg4x4;
-  vp9_subpix_fn_t  eighttap8x4;
-  vp9_subpix_fn_t  eighttap4x4;
-  vp9_subpix_fn_t  eighttap16x16_sharp;
-  vp9_subpix_fn_t  eighttap8x8_sharp;
-  vp9_subpix_fn_t  eighttap_avg16x16_sharp;
-  vp9_subpix_fn_t  eighttap_avg8x8_sharp;
-  vp9_subpix_fn_t  eighttap_avg4x4_sharp;
-  vp9_subpix_fn_t  eighttap8x4_sharp;
-  vp9_subpix_fn_t  eighttap4x4_sharp;
-  vp9_subpix_fn_t  sixtap16x16;
-  vp9_subpix_fn_t  sixtap8x8;
-  vp9_subpix_fn_t  sixtap_avg16x16;
-  vp9_subpix_fn_t  sixtap_avg8x8;
-  vp9_subpix_fn_t  sixtap8x4;
-  vp9_subpix_fn_t  sixtap4x4;
-  vp9_subpix_fn_t  sixtap_avg4x4;
-  vp9_subpix_fn_t  bilinear16x16;
-  vp9_subpix_fn_t  bilinear8x8;
-  vp9_subpix_fn_t  bilinear_avg16x16;
-  vp9_subpix_fn_t  bilinear_avg8x8;
-  vp9_subpix_fn_t  bilinear8x4;
-  vp9_subpix_fn_t  bilinear4x4;
-  vp9_subpix_fn_t  bilinear_avg4x4;
-} vp9_subpix_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define SUBPIX_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define SUBPIX_INVOKE(ctx,fn) vp9_subpix_##fn
-#endif
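-
-/* Illustrative sketch (hypothetical call site, not from this header):
-
-     vp9_subpix_rtcd_vtable_t *subpix = ...;
-     SUBPIX_INVOKE(subpix, sixtap8x8)(src, src_pitch, xofst, yofst,
-                                      dst, dst_pitch);
-
-   With runtime CPU detection this resolves to the sixtap8x8 function
-   pointer in the vtable; without it, it expands to the compile-time
-   vp9_subpix_sixtap8x8 mapping above. */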
-
-#endif
--- a/vp8/common/swapyv12buffer.c
+++ /dev/null
@@ -1,32 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "swapyv12buffer.h"
-
-void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
-                          YV12_BUFFER_CONFIG *last_frame) {
-  unsigned char *temp;
-
-  temp = last_frame->buffer_alloc;
-  last_frame->buffer_alloc = new_frame->buffer_alloc;
-  new_frame->buffer_alloc = temp;
-
-  temp = last_frame->y_buffer;
-  last_frame->y_buffer = new_frame->y_buffer;
-  new_frame->y_buffer = temp;
-
-  temp = last_frame->u_buffer;
-  last_frame->u_buffer = new_frame->u_buffer;
-  new_frame->u_buffer = temp;
-
-  temp = last_frame->v_buffer;
-  last_frame->v_buffer = new_frame->v_buffer;
-  new_frame->v_buffer = temp;
-}
--- a/vp8/common/swapyv12buffer.h
+++ /dev/null
@@ -1,19 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __SWAPYV12_BUFFER_H
-#define __SWAPYV12_BUFFER_H
-
-#include "vpx_scale/yv12config.h"
-
-void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
-                          YV12_BUFFER_CONFIG *last_frame);
-
-#endif  // __SWAPYV12_BUFFER_H
--- a/vp8/common/systemdependent.h
+++ /dev/null
@@ -1,21 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#if ARCH_X86 || ARCH_X86_64
-void vpx_reset_mmx_state(void);
-#define vp9_clear_system_state() vpx_reset_mmx_state()
-#else
-#define vp9_clear_system_state()
-#endif
-
-struct VP9Common;
-void vp9_machine_specific_config(struct VP9Common *);
--- a/vp8/common/tapify.py
+++ /dev/null
@@ -1,106 +1,0 @@
-"""
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
-"""
-#!/usr/bin/env python
-import sys,string,os,re,math,numpy
-scale = 2**16
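-# Note: dist() returns an inverse-distance weight rather than a distance --
-# 1.0 for coincident points, 1/euclidean distance otherwise.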
-def dist(p1,p2):
-  x1,y1 = p1
-  x2,y2 = p2
-  if x1==x2 and y1==y2 :
-    return 1.0 
-  return 1/ math.sqrt((x1-x2)*(x1-x2)+(y1-y2)*(y1-y2))
-
-def gettaps(p):
-  def l(b):
-    return int(math.floor(b))
-  def h(b):
-    return int(math.ceil(b))
-  def t(b,p,s):
-    return int((scale*dist(b,p)+s/2)/s)
-  r,c = p
-  ul=[l(r),l(c)]
-  ur=[l(r),h(c)]
-  ll=[h(r),l(c)]
-  lr=[h(r),h(c)]
-  sum = dist(ul,p)+dist(ur,p)+dist(ll,p)+dist(lr,p)
-  t4 = scale - t(ul,p,sum) - t(ur,p,sum) - t(ll,p,sum);
-  return [[ul,t(ul,p,sum)],[ur,t(ur,p,sum)],
-          [ll,t(ll,p,sum)],[lr,t4]]
-
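-# For every pixel of a blocksize x blocksize grid, rotate its position by
-# `angle` degrees about the block centre and print the four bilinear taps
-# (row, col, weight) that sample the rotated position; the four weights
-# sum to scale (2**16).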
-def print_mb_taps(angle,blocksize):
-  theta = angle / 57.2957795;
-  affine = [[math.cos(theta),-math.sin(theta)],
-            [math.sin(theta),math.cos(theta)]]
-  radius = (float(blocksize)-1)/2
-  print " // angle of",angle,"degrees"
-  for y in range(blocksize) :
-    for x in range(blocksize) :
-      r,c = numpy.dot(affine,[y-radius, x-radius])
-      tps = gettaps([r+radius,c+radius])
-      for t in tps :
-        p,t = t
-        tr,tc = p
-        print " %2d, %2d, %5d, " % (tr,tc,t,),
-      print " // %2d,%2d " % (y,x)
-
-i=float(sys.argv[1])
-while  i <= float(sys.argv[2]) :
-  print_mb_taps(i,float(sys.argv[4]))
-  i=i+float(sys.argv[3])
-"""
-
-taps = []
-pt=dict()
-ptr=dict()
-for y in range(16) :
-  for x in range(16) :
-    r,c = numpy.dot(affine,[y-7.5, x-7.5])
-    tps = gettaps([r+7.5,c+7.5])
-    j=0
-    for tp in tps : 
-      p,i = tp
-      r,c = p
-      pt[y,x,j]= [p,i]
-      try: 
-        ptr[r,j,c].append([y,x])
-      except:
-        ptr[r,j,c]=[[y,x]]
-      j = j+1 
-
-for key in sorted(pt.keys()) :
-  print key,pt[key]
-
-lr = -99
-lj = -99 
-lc = 0
-
-shuf=""
-mask=""
-for r,j,c in sorted(ptr.keys()) :
-  for y,x in ptr[r,j,c] :
-    if lr != r or lj != j :
-      print "shuf_"+str(lr)+"_"+str(lj)+"_"+shuf.ljust(16,"0"), lc
-      shuf=""
-      lc = 0
-    for i in range(lc,c-1) :
-      shuf = shuf +"0"
-    shuf = shuf + hex(x)[2]
-    lc =c
-    break
-  lr = r
-  lj = j
-#  print r,j,c,ptr[r,j,c]    
-#  print 
-
-for r,j,c in sorted(ptr.keys()) :
-  for y,x in ptr[r,j,c] :
-    print r,j,c,y,x 
-    break
-"""
--- a/vp8/common/textblit.c
+++ /dev/null
@@ -1,116 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-
-
-void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) {
-  int letter_bitmap;
-  unsigned char *output_pos = address;
-  int colpos;
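-  /* Each entry below packs one 7-column x 5-row glyph, 5 bits per column:
-     bit (fontcol * 5 + fontrow) lights the pixel at (fontrow, fontcol). */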
-  const int font[] = {
-    0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000,
-    0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110,
-    0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA,
-    0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20,
-    0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF,
-    0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F,
-    0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2,
-    0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731,
-    0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820
-  };
-  colpos = 0;
-
-  while (msg[colpos] != 0) {
-    char letter = msg[colpos];
-    int fontcol, fontrow;
-
-    if (letter <= 'Z' && letter >= ' ')
-      letter_bitmap = font[letter - ' '];
-    else if (letter <= 'z' && letter >= 'a')
-      letter_bitmap = font[letter - 'a' + 'A' - ' '];
-    else
-      letter_bitmap = font[0];
-
-    for (fontcol = 6; fontcol >= 0; fontcol--)
-      for (fontrow = 0; fontrow < 5; fontrow++)
-        output_pos[fontrow * pitch + fontcol] =
-          ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0);
-
-    output_pos += 7;
-    colpos++;
-  }
-}
-
-static void plot(const int x, const int y, unsigned char *image, const int pitch) {
-  image [x + y * pitch] ^= 255;
-}
-
-/* Bresenham line algorithm */
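-/* Coordinates are swapped for steep lines (|dy| > |dx|) so the loop always
-   steps along the major axis; plot() XORs pixels, so drawing the same line
-   twice erases it. */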
-void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch) {
-  int steep = abs(y1 - y0) > abs(x1 - x0);
-  int deltax, deltay;
-  int error, ystep, y, x;
-
-  if (steep) {
-    int t;
-    t = x0;
-    x0 = y0;
-    y0 = t;
-
-    t = x1;
-    x1 = y1;
-    y1 = t;
-  }
-
-  if (x0 > x1) {
-    int t;
-    t = x0;
-    x0 = x1;
-    x1 = t;
-
-    t = y0;
-    y0 = y1;
-    y1 = t;
-  }
-
-  deltax = x1 - x0;
-  deltay = abs(y1 - y0);
-  error  = deltax / 2;
-
-  y = y0;
-
-  if (y0 < y1)
-    ystep = 1;
-  else
-    ystep = -1;
-
-  if (steep) {
-    for (x = x0; x <= x1; x++) {
-      plot(y, x, image, pitch);
-
-      error = error - deltay;
-      if (error < 0) {
-        y = y + ystep;
-        error = error + deltax;
-      }
-    }
-  } else {
-    for (x = x0; x <= x1; x++) {
-      plot(x, y, image, pitch);
-
-      error = error - deltay;
-      if (error < 0) {
-        y = y + ystep;
-        error = error + deltax;
-      }
-    }
-  }
-}
--- a/vp8/common/treecoder.c
+++ /dev/null
@@ -1,138 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-
-#if defined(CONFIG_DEBUG) && CONFIG_DEBUG
-#include <assert.h>
-#endif
-#include <stdio.h>
-
-#include "treecoder.h"
-
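-/* Depth-first walk of the tree: v accumulates the bit path and L its
-   length; a non-positive index j terminates a branch and records the
-   (value, Len) pair for token -j. */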
-static void tree2tok(
-  struct vp9_token_struct *const p,
-  vp9_tree t,
-  int i,
-  int v,
-  int L
-) {
-  v += v;
-  ++L;
-
-  do {
-    const vp9_tree_index j = t[i++];
-
-    if (j <= 0) {
-      p[-j].value = v;
-      p[-j].Len = L;
-    } else
-      tree2tok(p, t, j, v, L);
-  } while (++v & 1);
-}
-
-void vp9_tokens_from_tree(struct vp9_token_struct *p, vp9_tree t) {
-  tree2tok(p, t, 0, 0, 0);
-}
-
-void vp9_tokens_from_tree_offset(struct vp9_token_struct *p, vp9_tree t,
-                                 int offset) {
-  tree2tok(p - offset, t, 0, 0, 0);
-}
-
-static void branch_counts(
-  int n,                      /* n = size of alphabet */
-  vp9_token tok               [ /* n */ ],
-  vp9_tree tree,
-  unsigned int branch_ct       [ /* n-1 */ ] [2],
-  const unsigned int num_events[ /* n */ ]
-) {
-  const int tree_len = n - 1;
-  int t = 0;
-
-#if CONFIG_DEBUG
-  assert(tree_len);
-#endif
-
-  do {
-    branch_ct[t][0] = branch_ct[t][1] = 0;
-  } while (++t < tree_len);
-
-  t = 0;
-
-  do {
-    int L = tok[t].Len;
-    const int enc = tok[t].value;
-    const unsigned int ct = num_events[t];
-
-    vp9_tree_index i = 0;
-
-    do {
-      const int b = (enc >> --L) & 1;
-      const int j = i >> 1;
-#if CONFIG_DEBUG
-      assert(j < tree_len  &&  0 <= L);
-#endif
-
-      branch_ct [j] [b] += ct;
-      i = tree[ i + b];
-    } while (i > 0);
-
-#if CONFIG_DEBUG
-    assert(!L);
-#endif
-  } while (++t < n);
-
-}
-
-
-void vp9_tree_probs_from_distribution(
-  int n,                      /* n = size of alphabet */
-  vp9_token tok               [ /* n */ ],
-  vp9_tree tree,
-  vp9_prob probs          [ /* n-1 */ ],
-  unsigned int branch_ct       [ /* n-1 */ ] [2],
-  const unsigned int num_events[ /* n */ ],
-  unsigned int Pfac,
-  int rd
-) {
-  const int tree_len = n - 1;
-  int t = 0;
-
-  branch_counts(n, tok, tree, branch_ct, num_events);
-
-  do {
-    const unsigned int *const c = branch_ct[t];
-    const unsigned int tot = c[0] + c[1];
-
-#if CONFIG_DEBUG
-    assert(tot < (1 << 24));        /* no overflow below */
-#endif
-
-    if (tot) {
-      const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot;
-      probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
-    } else
-      probs[t] = vp9_prob_half;
-  } while (++t < tree_len);
-}
-
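-/* e.g. counts = {3, 1}: prob = (3 * 255 + 2) / 4 = 191, the probability of
-   a zero bit on the usual 1..255 scale; an empty distribution falls back
-   to the neutral 128. */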
-vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]) {
-  int tot_count = counts[0] + counts[1];
-  vp9_prob prob;
-  if (tot_count) {
-    prob = (counts[0] * 255 + (tot_count >> 1)) / tot_count;
-    prob += !prob;
-  } else {
-    prob = 128;
-  }
-  return prob;
-}
--- a/vp8/common/treecoder.h
+++ /dev/null
@@ -1,75 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_TREECODER_H
-#define __INC_TREECODER_H
-
-typedef unsigned char vp9_prob;
-
-#define vp9_prob_half ( (vp9_prob) 128)
-
-typedef signed char vp9_tree_index;
-struct bool_coder_spec;
-
-typedef struct bool_coder_spec bool_coder_spec;
-typedef struct bool_writer bool_writer;
-typedef struct bool_reader bool_reader;
-
-typedef const bool_coder_spec c_bool_coder_spec;
-typedef const bool_writer c_bool_writer;
-typedef const bool_reader c_bool_reader;
-
-
-
-# define vp9_complement( x) (255 - x)
-
-
-/* We build coding trees compactly in arrays.
-   Each node of the tree is a pair of vp9_tree_indices.
-   Array index often references a corresponding probability table.
-   Index <= 0 means done encoding/decoding and value = -Index,
-   Index > 0 means need another bit, specification at index.
-   Nonnegative indices are always even;  processing begins at node 0. */
-
-typedef const vp9_tree_index vp9_tree[], *vp9_tree_p;
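-
-/* Illustrative sketch (not from the original source): a 3-symbol
-   alphabet {A=0, B=1, C=2} could be laid out as
-
-     static const vp9_tree_index example_tree[4] = {
-       -0, 2,     -- node 0: bit 0 ends with value 0, bit 1 jumps to node 2
-       -1, -2     -- node 2: bit 0 ends with value 1, bit 1 ends with value 2
-     };
-
-   giving the codes A = 0, B = 10, C = 11. */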
-
-
-typedef const struct vp9_token_struct {
-  int value;
-  int Len;
-} vp9_token;
-
-/* Construct encoding array from tree. */
-
-void vp9_tokens_from_tree(struct vp9_token_struct *, vp9_tree);
-void vp9_tokens_from_tree_offset(struct vp9_token_struct *, vp9_tree,
-                                 int offset);
-
-
-/* Convert array of token occurrence counts into a table of probabilities
-   for the associated binary encoding tree.  Also writes count of branches
-   taken for each node on the tree; this facilitates decisions as to
-   probability updates. */
-
-void vp9_tree_probs_from_distribution(
-  int n,                      /* n = size of alphabet */
-  vp9_token tok               [ /* n */ ],
-  vp9_tree tree,
-  vp9_prob probs          [ /* n-1 */ ],
-  unsigned int branch_ct       [ /* n-1 */ ] [2],
-  const unsigned int num_events[ /* n */ ],
-  unsigned int Pfactor,
-  int Round
-);
-
-vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]);
-
-#endif
--- a/vp8/common/type_aliases.h
+++ /dev/null
@@ -1,120 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     type_aliases.h
-*
-*   Description  :     Standard type aliases
-*
-****************************************************************************/
-#ifndef __INC_TYPE_ALIASES_H
-#define __INC_TYPE_ALIASES_H
-
-/****************************************************************************
-* Macros
-****************************************************************************/
-#define EXPORT
-#define IMPORT          extern      /* Used to declare imported data & routines */
-#define PRIVATE         static      /* Used to declare & define module-local data */
-#define LOCAL           static      /* Used to define all persistent routine-local data */
-#define STD_IN_PATH     0           /* Standard input path */
-#define STD_OUT_PATH    1           /* Standard output path */
-#define STD_ERR_PATH    2           /* Standard error path */
-#define STD_IN_FILE     stdin       /* Standard input file pointer */
-#define STD_OUT_FILE    stdout      /* Standard output file pointer */
-#define STD_ERR_FILE    stderr      /* Standard error file pointer */
-#define max_int         0x7FFFFFFF
-
-#define __export
-#define _export
-
-#define CCONV
-
-#ifndef NULL
-#ifdef __cplusplus
-#define NULL    0
-#else
-#define NULL    ((void *)0)
-#endif
-#endif
-
-#ifndef FALSE
-#define FALSE   0
-#endif
-
-#ifndef TRUE
-#define TRUE    1
-#endif
-
-/****************************************************************************
-* Typedefs
-****************************************************************************/
-#ifndef TYPE_INT8
-#define TYPE_INT8
-typedef signed char     INT8;
-#endif
-
-#ifndef TYPE_INT16
-/*#define TYPE_INT16*/
-typedef signed short    INT16;
-#endif
-
-#ifndef TYPE_INT32
-/*#define TYPE_INT32*/
-typedef signed int      INT32;
-#endif
-
-#ifndef TYPE_UINT8
-/*#define TYPE_UINT8*/
-typedef unsigned char   UINT8;
-#endif
-
-#ifndef TYPE_UINT32
-/*#define TYPE_UINT32*/
-typedef unsigned int    UINT32;
-#endif
-
-#ifndef TYPE_UINT16
-/*#define TYPE_UINT16*/
-typedef unsigned short  UINT16;
-#endif
-
-#ifndef TYPE_BOOL
-/*#define TYPE_BOOL*/
-typedef int             BOOL;
-#endif
-
-typedef unsigned char   BOOLEAN;
-
-#ifdef _MSC_VER
-typedef __int64 INT64;
-#ifndef INT64_MAX
-#define INT64_MAX LLONG_MAX
-#endif
-#else
-
-#ifndef TYPE_INT64
-#ifdef _TMS320C6X
-/* for now we only have 40bits */
-typedef long INT64;
-#else
-typedef long long INT64;
-#endif
-#endif
-
-#endif
-
-/* Floating point */
-typedef  double         FLOAT64;
-typedef  float          FLOAT32;
-
-#endif
--- a/vp8/common/x86/filter_sse2.c
+++ /dev/null
@@ -1,289 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h> // for alignment checks
-#include <emmintrin.h> // SSE2
-#include "vp8/common/filter.h"
-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
-#include "vpx_rtcd.h"
-
-// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
-//           just a quick partial snapshot so that others can already use
-//           some speedup.
-// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
-//           filtering.
-// TODO(cd): Add some comments, better variable naming.
-// TODO(cd): Maybe use _mm_maddubs_epi16 if filter coefficients are smaller
-//           (no sum of positive coefficients above 128), or have higher
-//           precision filter coefficients.
-
-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-};
-
-// Creating a macro to do more than four pixels at once to hide instruction
-// latency is actually slower :-(
-#define DO_FOUR_PIXELS(result, src_ptr, offset)                                \
-  {                                                                            \
-  /* Do shifted loads to achieve the required shuffles through unpacking */   \
-  const __m128i src0  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
-  const __m128i src1  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
-  const __m128i src2  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
-  const __m128i src3  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
-  const __m128i src01 = _mm_unpacklo_epi8(src0, src1);                         \
-  const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero);                     \
-  const __m128i src23 = _mm_unpacklo_epi8(src2, src3);                         \
-  const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero);                     \
-  /* Shift by 4 bytes through shuffle to get additional shifted loads */      \
-  const __m128i src4  = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1));      \
-  const __m128i src5  = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1));      \
-  const __m128i src6  = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1));      \
-  const __m128i src7  = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1));      \
-  const __m128i src45 = _mm_unpacklo_epi8(src4, src5);                         \
-  const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero);                     \
-  const __m128i src67 = _mm_unpacklo_epi8(src6, src7);                         \
-  const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero);                     \
-  /* multiply accumulate them */                                               \
-  const __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                       \
-  const __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                       \
-  const __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                       \
-  const __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                       \
-  const __m128i mad0123 = _mm_add_epi32(mad01, mad23);                         \
-  const __m128i mad4567 = _mm_add_epi32(mad45, mad67);                         \
-  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \
-  mad_all = _mm_add_epi32(mad_all, rounding);                                  \
-  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);                          \
-  }
-
-void vp9_filter_block2d_4x4_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  __m128i intermediateA, intermediateB, intermediateC;
-
-  const int kInterp_Extend = 4;
-
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
-
-  // check alignment
-  assert(0 == ((long)HFilter_aligned16)%16);
-  assert(0 == ((long)VFilter_aligned16)%16);
-
-  {
-    __m128i transpose3_0;
-    __m128i transpose3_1;
-    __m128i transpose3_2;
-    __m128i transpose3_3;
-
-    // Horizontal pass (src -> intermediate).
-    {
-      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
-      // get first two columns filter coefficients
-      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
-      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
-      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
-      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
-      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-
-      {
-        __m128i mad_all0;
-        __m128i mad_all1;
-        __m128i mad_all2;
-        __m128i mad_all3;
-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
-        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
-        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
-        // --
-        src_ptr += src_stride*4;
-        // --
-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
-        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
-        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
-        // --
-        src_ptr += src_stride*4;
-        // --
-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
-        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
-      }
-    }
-
-    // Transpose result (intermediate -> transpose3_x)
-    {
-      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
-      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
-      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
-      const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);
-      const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);
-      const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);
-      const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);
-      // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53
-      // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73
-      // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx
-      // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx
-      const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
-      const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
-      const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);
-      const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);
-      // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
-      // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
-      // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx
-      // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx
-      const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);
-      const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);
-      const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);
-      const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);
-      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-      // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx
-      // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx
-      transpose3_0 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
-                                           _mm_castsi128_ps(transpose2_2),
-                                           _MM_SHUFFLE(1, 0, 1, 0)));
-      transpose3_1 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
-                                           _mm_castsi128_ps(transpose2_2),
-                                           _MM_SHUFFLE(3, 2, 3, 2)));
-      transpose3_2 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
-                                           _mm_castsi128_ps(transpose2_3),
-                                           _MM_SHUFFLE(1, 0, 1, 0)));
-      transpose3_3 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
-                                           _mm_castsi128_ps(transpose2_3),
-                                           _MM_SHUFFLE(3, 2, 3, 2)));
-      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
-      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
-      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
-      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
-    }
-
-    // Vertical pass (transpose3_x -> dst).
-    {
-      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
-      // get first two columns filter coefficients
-      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
-      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
-      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
-      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
-      __m128i col0, col1, col2, col3;
-      DECLARE_ALIGNED(16, unsigned char, temp[32]);
-      {
-        _mm_store_si128((__m128i *)temp, transpose3_0);
-        DO_FOUR_PIXELS(col0, temp, 0);
-      }
-      {
-        _mm_store_si128((__m128i *)temp, transpose3_1);
-        DO_FOUR_PIXELS(col1, temp, 0);
-      }
-      {
-        _mm_store_si128((__m128i *)temp, transpose3_2);
-        DO_FOUR_PIXELS(col2, temp, 0);
-      }
-      {
-        _mm_store_si128((__m128i *)temp, transpose3_3);
-        DO_FOUR_PIXELS(col3, temp, 0);
-      }
-      // transpose
-      {
-        __m128i T0 = _mm_unpacklo_epi32(col0, col1);
-        __m128i T1 = _mm_unpacklo_epi32(col2, col3);
-        __m128i T2 = _mm_unpackhi_epi32(col0, col1);
-        __m128i T3 = _mm_unpackhi_epi32(col2, col3);
-        col0 = _mm_unpacklo_epi64(T0, T1);
-        col1 = _mm_unpackhi_epi64(T0, T1);
-        col2 = _mm_unpacklo_epi64(T2, T3);
-        col3 = _mm_unpackhi_epi64(T2, T3);
-      }
-      // saturate to 8 bit
-      {
-        col0 = _mm_packs_epi32(col0, col0);
-        col0 = _mm_packus_epi16(col0, col0);
-        col1 = _mm_packs_epi32(col1, col1);
-        col1 = _mm_packus_epi16(col1, col1);
-        col2 = _mm_packs_epi32(col2, col2);
-        col2 = _mm_packus_epi16(col2, col2);
-        col3 = _mm_packs_epi32(col3, col3);
-        col3 = _mm_packus_epi16(col3, col3);
-      }
-      // store
-      {
-        *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);
-        *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);
-        *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);
-        *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);
-      }
-    }
-  }
-}
-
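-// The larger block sizes below simply tile the 4x4 kernel; see the TODO
-// above about committing faster dedicated versions.
-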
-void vp9_filter_block2d_8x4_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int j;
-  for (j=0; j<8; j+=4) {
-    vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,
-                                  HFilter_aligned16, VFilter_aligned16,
-                                  dst_ptr + j, dst_stride);
-  }
-}
-
-void vp9_filter_block2d_8x8_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int i, j;
-  for (i=0; i<8; i+=4) {
-    for (j=0; j<8; j+=4) {
-      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
-                                    HFilter_aligned16, VFilter_aligned16,
-                                    dst_ptr + j + i*dst_stride, dst_stride);
-    }
-  }
-}
-
-void vp9_filter_block2d_16x16_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int i, j;
-  for (i=0; i<16; i+=4) {
-    for (j=0; j<16; j+=4) {
-      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
-                                    HFilter_aligned16, VFilter_aligned16,
-                                    dst_ptr + j + i*dst_stride, dst_stride);
-    }
-  }
-}
--- a/vp8/common/x86/filter_sse4.c
+++ /dev/null
@@ -1,362 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h> // for alignment checks
-#include <smmintrin.h> // SSE4.1
-#include "vp8/common/filter.h"
-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
-#include "vpx_rtcd.h"
-
-// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
-//           just a quick partial snapshot so that others can already use
-//           some speedup.
-// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
-//           filtering.
-// TODO(cd): Reduce source size by using macros instead of current code
-//           duplication.
-// TODO(cd): Add some comments, better variable naming.
-// TODO(cd): Maybe use _mm_maddubs_epi16 if filter coefficients are smaller
-//           (no sum of positive coefficients above 128), or have higher
-//           precision filter coefficients.
-
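-// The shuffle masks below let a single pshufb gather the overlapping source
-// byte pairs that the SSE2 version assembles with four shifted loads and
-// unpacks.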
-DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {
-  0x00, 0x01,
-  0x01, 0x02,
-  0x02, 0x03,
-  0x03, 0x04,
-  0x02, 0x03,
-  0x03, 0x04,
-  0x04, 0x05,
-  0x05, 0x06,
-};
-DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = {
-  0x04, 0x05,
-  0x05, 0x06,
-  0x06, 0x07,
-  0x07, 0x08,
-  0x06, 0x07,
-  0x07, 0x08,
-  0x08, 0x09,
-  0x09, 0x0A,
-};
-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-};
-DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = {
-  0, 4,  8, 12,
-  1, 5,  9, 13,
-  2, 6, 10, 14,
-  3, 7, 11, 15
-};
-
-// Creating a macro to do more than four pixels at once to hide instruction
-// latency is actually slower :-(
-#define DO_FOUR_PIXELS(result, offset)                                         \
-  {                                                                            \
-  /*load pixels*/                                                              \
-  __m128i src  = _mm_loadu_si128((const __m128i *)(src_ptr + offset));         \
-  /* extract the ones used for first column */                                 \
-  __m128i src0123 = _mm_shuffle_epi8(src, mask0123);                           \
-  __m128i src4567 = _mm_shuffle_epi8(src, mask4567);                           \
-  __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);                         \
-  __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);                         \
-  __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);                         \
-  __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);                         \
-  /* multiply accumulate them */                                               \
-  __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                             \
-  __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                             \
-  __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                             \
-  __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                             \
-  __m128i mad0123 = _mm_add_epi32(mad01, mad23);                               \
-  __m128i mad4567 = _mm_add_epi32(mad45, mad67);                               \
-  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \
-  mad_all = _mm_add_epi32(mad_all, rounding);                                  \
-  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);                          \
-  }
-
-void vp9_filter_block2d_4x4_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  __m128i intermediateA, intermediateB, intermediateC;
-
-  const int kInterp_Extend = 4;
-
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c);
-  const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c);
-  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
-  const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c);
-
-  // check alignment
-  assert(0 == ((long)HFilter_aligned16)%16);
-  assert(0 == ((long)VFilter_aligned16)%16);
-
-  {
-    __m128i transpose3_0;
-    __m128i transpose3_1;
-    __m128i transpose3_2;
-    __m128i transpose3_3;
-
-    // Horizontal pass (src -> intermediate).
-    {
-      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
-      // get first two columns filter coefficients
-      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
-      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
-      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
-      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
-      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-
-      {
-        __m128i mad_all0;
-        __m128i mad_all1;
-        __m128i mad_all2;
-        __m128i mad_all3;
-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
-        DO_FOUR_PIXELS(mad_all3, 3*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
-        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
-        // --
-        src_ptr += src_stride*4;
-        // --
-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
-        DO_FOUR_PIXELS(mad_all3, 3*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
-        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
-        // --
-        src_ptr += src_stride*4;
-        // --
-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
-        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
-      }
-    }
-
-    // Transpose result (intermediate -> transpose3_x)
-    {
-      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
-      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
-      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
-      const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose);
-      const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose);
-      const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose);
-      // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-      // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-      // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx
-      const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1);
-      const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1);
-      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-      transpose3_0 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
-                                           _mm_castsi128_ps(transpose1_2),
-                                           _MM_SHUFFLE(0, 0, 1, 0)));
-      transpose3_1 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
-                                           _mm_castsi128_ps(transpose1_2),
-                                           _MM_SHUFFLE(1, 1, 3, 2)));
-      transpose3_2 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
-                                           _mm_castsi128_ps(transpose1_2),
-                                           _MM_SHUFFLE(2, 2, 1, 0)));
-      transpose3_3 = _mm_castps_si128(
-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
-                                           _mm_castsi128_ps(transpose1_2),
-                                           _MM_SHUFFLE(3, 3, 3, 2)));
-      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
-      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
-      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
-      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
-    }
-
-    // Vertical pass (transpose3_x -> dst).
-    {
-      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
-      // get first two columns filter coefficients
-      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
-      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
-      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
-      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
-      __m128i col0, col1, col2, col3;
-      {
-        //load pixels
-        __m128i src  = transpose3_0;
-        // extract the ones used for first column
-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
-        // multiply accumulate them
-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
-        mad_all = _mm_add_epi32(mad_all, rounding);
-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
-        mad_all = _mm_packs_epi32(mad_all, mad_all);
-        col0 = _mm_packus_epi16(mad_all, mad_all);
-      }
-      {
-        //load pixels
-        __m128i src  = transpose3_1;
-        // extract the ones used for first column
-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
-        // multiply accumulate them
-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
-        mad_all = _mm_add_epi32(mad_all, rounding);
-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
-        mad_all = _mm_packs_epi32(mad_all, mad_all);
-        col1 = _mm_packus_epi16(mad_all, mad_all);
-      }
-      {
-        //load pixels
-        __m128i src  = transpose3_2;
-        // extract the ones used for first column
-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
-        // multiply accumulate them
-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
-        mad_all = _mm_add_epi32(mad_all, rounding);
-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
-        mad_all = _mm_packs_epi32(mad_all, mad_all);
-        col2 = _mm_packus_epi16(mad_all, mad_all);
-      }
-      {
-        //load pixels
-        __m128i src  = transpose3_3;
-        // extract the ones used for first column
-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
-        // multiply accumulate them
-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
-        mad_all = _mm_add_epi32(mad_all, rounding);
-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
-        mad_all = _mm_packs_epi32(mad_all, mad_all);
-        col3 = _mm_packus_epi16(mad_all, mad_all);
-      }
-      {
-        __m128i col01 = _mm_unpacklo_epi8(col0, col1);
-        __m128i col23 = _mm_unpacklo_epi8(col2, col3);
-        __m128i col0123 = _mm_unpacklo_epi16(col01, col23);
-        //TODO(cd): look into Ronald's comment:
-        //    Future suggestion: I believe here, too, you can merge the
-        //    packs_epi32() and packus_epi16() for the 4 cols above, so that
-        //    you get the data in a single register, and then use pshufb
-        //    (shuffle_epi8()) instead of the unpacks here. Should be
-        //    2+3+2 instructions faster.
-        *((unsigned int *)&dst_ptr[dst_stride * 0]) =
-            _mm_extract_epi32(col0123, 0);
-        *((unsigned int *)&dst_ptr[dst_stride * 1]) =
-            _mm_extract_epi32(col0123, 1);
-        *((unsigned int *)&dst_ptr[dst_stride * 2]) =
-            _mm_extract_epi32(col0123, 2);
-        *((unsigned int *)&dst_ptr[dst_stride * 3]) =
-            _mm_extract_epi32(col0123, 3);
-      }
-    }
-  }
-}
-
-void vp9_filter_block2d_8x4_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int j;
-  for (j=0; j<8; j+=4) {
-    vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride,
-                                    HFilter_aligned16, VFilter_aligned16,
-                                    dst_ptr + j, dst_stride);
-  }
-}
-
-void vp9_filter_block2d_8x8_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int i, j;
-  for (i=0; i<8; i+=4) {
-    for (j=0; j<8; j+=4) {
-      vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,
-                                      HFilter_aligned16, VFilter_aligned16,
-                                      dst_ptr + j + i*dst_stride, dst_stride);
-    }
-  }
-}
-
-void vp9_filter_block2d_16x16_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  int i, j;
-  for (i=0; i<16; i+=4) {
-    for (j=0; j<16; j+=4) {
-      vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,
-                                      HFilter_aligned16, VFilter_aligned16,
-                                      dst_ptr + j + i*dst_stride, dst_stride);
-    }
-  }
-}
--- a/vp8/common/x86/idct_x86.h
+++ /dev/null
@@ -1,64 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef IDCT_X86_H
-#define IDCT_X86_H
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_idct(vp9_short_idct4x4llm_1_mmx);
-extern prototype_idct(vp9_short_idct4x4llm_mmx);
-extern prototype_idct_scalar_add(vp9_dc_only_idct_add_mmx);
-
-extern prototype_second_order(vp9_short_inv_walsh4x4_mmx);
-extern prototype_second_order(vp9_short_inv_walsh4x4_1_mmx);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_idct_idct1
-#define vp9_idct_idct1 vp9_short_idct4x4llm_1_mmx
-
-#undef  vp9_idct_idct16
-#define vp9_idct_idct16 vp9_short_idct4x4llm_mmx
-
-#undef  vp9_idct_idct1_scalar_add
-#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_mmx
-
-#undef vp9_idct_iwalsh16
-#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_mmx
-
-#undef vp9_idct_iwalsh1
-#define vp9_idct_iwalsh1 vp9_short_inv_walsh4x4_1_mmx
-
-#endif
-#endif
-
-#if HAVE_SSE2
-
-extern prototype_second_order(vp9_short_inv_walsh4x4_sse2);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_idct_iwalsh16
-#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_sse2
-
-#endif
-
-#endif
-
-
-
-#endif
--- a/vp8/common/x86/idctllm_mmx.asm
+++ /dev/null
@@ -1,241 +1,0 @@
-;
-;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-align 16
-x_s1sqr2:      times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1: times 4 dw 0x4E7B
-align 16
-pw_16:         times 4 dw 16
-
-SECTION .text
-
-
-; /****************************************************************************
-; * Notes:
-; *
-; * This implementation makes use of 16 bit fixed point version of two multiply
-; * constants:
-; *        1.   sqrt(2) * cos (pi/8)
-; *        2.   sqrt(2) * sin (pi/8)
-; * Because the first constant is bigger than 1, to maintain the same 16 bit
-; * fixed point precision as the second one, we use a trick of
-; *        x * a = x + x*(a-1)
-; * so
-; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
-; *
-; * For the second constant, because its 16 bit version is 35468, which
-; * is bigger than 32768, it becomes a negative number in a signed 16 bit
-; * multiply.
-; *        (x * (unsigned)35468) >> 16  =  ((x * (signed)35468) >> 16) + x
-; *
-; **************************************************************************/
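-
-; A numeric check of the constants in the data section above (illustrative):
-;   sqrt(2) * cos(pi/8) - 1 ~= 0.30656,  round(0.30656 * 65536) = 20091 = 0x4E7B
-;   sqrt(2) * sin(pi/8)     ~= 0.54120,  round(0.54120 * 65536) = 35468 = 0x8A8C
-; i.e. exactly x_c1sqr2less1 and x_s1sqr2.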
-
-INIT_MMX
-
-;void short_idct4x4llm_mmx(short *input, short *output, int pitch)
-cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit
-    mova            m0,     [inpq +0]
-    mova            m1,     [inpq +8]
-
-    mova            m2,     [inpq+16]
-    mova            m3,     [inpq+24]
-
-    psubw           m0,      m2             ; b1= 0-2
-    paddw           m2,      m2             ;
-
-    mova            m5,      m1
-    paddw           m2,      m0             ; a1 =0+2
-
-    pmulhw          m5,     [x_s1sqr2]       ;
-    paddw           m5,      m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova            m7,      m3             ;
-    pmulhw          m7,     [x_c1sqr2less1]   ;
-
-    paddw           m7,      m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw           m7,      m5             ; c1
-
-    mova            m5,      m1
-    mova            m4,      m3
-
-    pmulhw          m5,     [x_c1sqr2less1]
-    paddw           m5,      m1
-
-    pmulhw          m3,     [x_s1sqr2]
-    paddw           m3,      m4
-
-    paddw           m3,      m5             ; d1
-    mova            m6,      m2             ; a1
-
-    mova            m4,      m0             ; b1
-    paddw           m2,      m3             ;0
-
-    paddw           m4,      m7             ;1
-    psubw           m0,      m7             ;2
-
-    psubw           m6,      m3             ;3
-
-    mova            m1,      m2             ; 03 02 01 00
-    mova            m3,      m4             ; 23 22 21 20
-
-    punpcklwd       m1,      m0             ; 11 01 10 00
-    punpckhwd       m2,      m0             ; 13 03 12 02
-
-    punpcklwd       m3,      m6             ; 31 21 30 20
-    punpckhwd       m4,      m6             ; 33 23 32 22
-
-    mova            m0,      m1             ; 11 01 10 00
-    mova            m5,      m2             ; 13 03 12 02
-
-    punpckldq       m0,      m3             ; 30 20 10 00
-    punpckhdq       m1,      m3             ; 31 21 11 01
-
-    punpckldq       m2,      m4             ; 32 22 12 02
-    punpckhdq       m5,      m4             ; 33 23 13 03
-
-    mova            m3,      m5             ; 33 23 13 03
-
-    psubw           m0,      m2             ; b1= 0-2
-    paddw           m2,      m2             ;
-
-    mova            m5,      m1
-    paddw           m2,      m0             ; a1 =0+2
-
-    pmulhw          m5,     [x_s1sqr2]        ;
-    paddw           m5,      m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova            m7,      m3             ;
-    pmulhw          m7,     [x_c1sqr2less1]   ;
-
-    paddw           m7,      m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw           m7,      m5             ; c1
-
-    mova            m5,      m1
-    mova            m4,      m3
-
-    pmulhw          m5,     [x_c1sqr2less1]
-    paddw           m5,      m1
-
-    pmulhw          m3,     [x_s1sqr2]
-    paddw           m3,      m4
-
-    paddw           m3,      m5             ; d1
-    paddw           m0,     [pw_16]
-
-    paddw           m2,     [pw_16]
-    mova            m6,      m2             ; a1
-
-    mova            m4,      m0             ; b1
-    paddw           m2,      m3             ;0
-
-    paddw           m4,      m7             ;1
-    psubw           m0,      m7             ;2
-
-    psubw           m6,      m3             ;3
-    psraw           m2,      5
-
-    psraw           m0,      5
-    psraw           m4,      5
-
-    psraw           m6,      5
-
-    mova            m1,      m2             ; 03 02 01 00
-    mova            m3,      m4             ; 23 22 21 20
-
-    punpcklwd       m1,      m0             ; 11 01 10 00
-    punpckhwd       m2,      m0             ; 13 03 12 02
-
-    punpcklwd       m3,      m6             ; 31 21 30 20
-    punpckhwd       m4,      m6             ; 33 23 32 22
-
-    mova            m0,      m1             ; 11 01 10 00
-    mova            m5,      m2             ; 13 03 12 02
-
-    punpckldq       m0,      m3             ; 30 20 10 00
-    punpckhdq       m1,      m3             ; 31 21 11 01
-
-    punpckldq       m2,      m4             ; 32 22 12 02
-    punpckhdq       m5,      m4             ; 33 23 13 03
-
-    mova        [outq],      m0
-
-    mova   [outq+pitq],      m1
-    mova [outq+pitq*2],      m2
-
-    add           outq,      pitq
-    mova [outq+pitq*2],      m5
-    RET
-
-;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
-cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit
-    movh            m0,     [inpq]
-    paddw           m0,     [pw_16]
-    psraw           m0,      5
-    punpcklwd       m0,      m0
-    punpckldq       m0,      m0
-
-    mova        [outq],      m0
-    mova   [outq+pitq],      m0
-
-    mova [outq+pitq*2],      m0
-    add           outq,    pitq
-
-    mova [outq+pitq*2],      m0
-    RET
-
-
-;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
-cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride
-%if ARCH_X86_64
-    movsxd         strideq,      dword stridem
-%else
-    mov            strideq,      stridem
-%endif
-    pxor                m0,      m0
-
-    movh                m5,      in_dcq ; dc
-    paddw               m5,     [pw_16]
-
-    psraw               m5,      5
-
-    punpcklwd           m5,      m5
-    punpckldq           m5,      m5
-
-    movh                m1,     [predq]
-    punpcklbw           m1,      m0
-    paddsw              m1,      m5
-    packuswb            m1,      m0              ; pack and unpack to saturate
-    movh            [dstq],      m1
-
-    movh                m2,     [predq+pitq]
-    punpcklbw           m2,      m0
-    paddsw              m2,      m5
-    packuswb            m2,      m0              ; pack and unpack to saturate
-    movh    [dstq+strideq],      m2
-
-    movh                m3,     [predq+2*pitq]
-    punpcklbw           m3,      m0
-    paddsw              m3,      m5
-    packuswb            m3,      m0              ; pack and unpack to saturate
-    movh  [dstq+2*strideq],      m3
-
-    add               dstq,      strideq
-    add              predq,      pitq
-    movh                m4,     [predq+2*pitq]
-    punpcklbw           m4,      m0
-    paddsw              m4,      m5
-    packuswb            m4,      m0              ; pack and unpack to saturate
-    movh  [dstq+2*strideq],      m4
-    RET
-
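The header comment at the top of idctllm_mmx.asm above describes the
fixed-point trick behind the two constants in the data section
(x_s1sqr2 = 0x8A8C = 35468 and x_c1sqr2less1 = 0x4E7B = 20091). A
minimal C sketch of that arithmetic, matching the pmulhw/paddw pairs in
the MMX code (function names here are illustrative, not from the patch):

    #include <stdint.h>

    /* round((sqrt(2)*cos(pi/8) - 1) * 65536) = 20091 = 0x4E7B */
    static int16_t mul_c1sqr2(int16_t x) {
        /* pmulhw by 0x4E7B gives x*(sqrt(2)*cos(pi/8) - 1); paddw adds
         * x back, so x*sqrt(2)*cos(pi/8) never overflows 16 bits. */
        return (int16_t)(((int32_t)x * 20091) >> 16) + x;
    }

    /* round(sqrt(2)*sin(pi/8) * 65536) = 35468 = 0x8A8C, > 32767 */
    static int16_t mul_s1sqr2(int16_t x) {
        /* pmulhw sees 35468 as the signed value 35468 - 65536, so it
         * computes (x * (35468 - 65536)) >> 16; adding x (x*65536 >> 16)
         * back recovers the intended (x * 35468) >> 16. */
        return (int16_t)(((int32_t)x * (35468 - 65536)) >> 16) + x;
    }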
--- a/vp8/common/x86/idctllm_sse2.asm
+++ /dev/null
@@ -1,712 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_idct_dequant_0_2x_sse2
-; (
-;   short *qcoeff       - 0
-;   short *dequant      - 1
-;   unsigned char *pre  - 2
-;   unsigned char *dst  - 3
-;   int dst_stride      - 4
-;   int blk_stride      - 5
-; )
-
-global sym(vp9_idct_dequant_0_2x_sse2)
-sym(vp9_idct_dequant_0_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    ; end prolog
-
-        mov         rdx,            arg(1) ; dequant
-        mov         rax,            arg(0) ; qcoeff
-
-        movd        xmm4,           [rax]
-        movd        xmm5,           [rdx]
-
-        pinsrw      xmm4,           [rax+32],   4
-        pinsrw      xmm5,           [rdx],      4
-
-        pmullw      xmm4,           xmm5
-
-    ; Zero out xmm5, for use in unpacking
-        pxor        xmm5,           xmm5
-
-    ; clear coeffs
-        movd        [rax],          xmm5
-        movd        [rax+32],       xmm5
-;pshufb
-        pshuflw     xmm4,           xmm4,       00000000b
-        pshufhw     xmm4,           xmm4,       00000000b
-
-        mov         rax,            arg(2) ; pre
-        paddw       xmm4,           [GLOBAL(fours)]
-
-        movsxd      rcx,            dword ptr arg(5) ; blk_stride
-        psraw       xmm4,           3
-
-        movq        xmm0,           [rax]
-        movq        xmm1,           [rax+rcx]
-        movq        xmm2,           [rax+2*rcx]
-        lea         rcx,            [3*rcx]
-        movq        xmm3,           [rax+rcx]
-
-        punpcklbw   xmm0,           xmm5
-        punpcklbw   xmm1,           xmm5
-        punpcklbw   xmm2,           xmm5
-        punpcklbw   xmm3,           xmm5
-
-        mov         rax,            arg(3) ; dst
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; Add to predict buffer
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm4
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm4
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm5
-        packuswb    xmm1,           xmm5
-        packuswb    xmm2,           xmm5
-        packuswb    xmm3,           xmm5
-
-    ; store blocks back out
-        movq        [rax],          xmm0
-        movq        [rax + rdx],    xmm1
-
-        lea         rax,            [rax + 2*rdx]
-
-        movq        [rax],          xmm2
-        movq        [rax + rdx],    xmm3
-
-    ; begin epilog
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vp9_idct_dequant_full_2x_sse2)
-sym(vp9_idct_dequant_full_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; full inverse transform for two blocks; the whole qcoeff buffer
-    ; is loaded and dequantized below
-        mov         rax,            arg(0) ; qcoeff
-        mov         rsi,            arg(2) ; pre
-        mov         rdi,            arg(3) ; dst
-        movsxd      rcx,            dword ptr arg(5) ; blk_stride
-
-    ; Zero out xmm7, for use in unpacking
-        pxor        xmm7,           xmm7
-
-        mov         rdx,            arg(1)  ; dequant
-
-    ; note the transpose of xmm1 and xmm2, necessary for the shuffle
-    ;   to produce sensible data
-        movdqa      xmm0,           [rax]
-        movdqa      xmm2,           [rax+16]
-        movdqa      xmm1,           [rax+32]
-        movdqa      xmm3,           [rax+48]
-
-    ; Clear out coeffs
-        movdqa      [rax],          xmm7
-        movdqa      [rax+16],       xmm7
-        movdqa      [rax+32],       xmm7
-        movdqa      [rax+48],       xmm7
-
-    ; dequantize qcoeff buffer
-        pmullw      xmm0,           [rdx]
-        pmullw      xmm2,           [rdx+16]
-        pmullw      xmm1,           [rdx]
-        pmullw      xmm3,           [rdx+16]
-
-    ; repack so block 0 row x and block 1 row x are together
-        movdqa      xmm4,           xmm0
-        punpckldq   xmm0,           xmm1
-        punpckhdq   xmm4,           xmm1
-
-        pshufd      xmm0,           xmm0,       11011000b
-        pshufd      xmm1,           xmm4,       11011000b
-
-        movdqa      xmm4,           xmm2
-        punpckldq   xmm2,           xmm3
-        punpckhdq   xmm4,           xmm3
-
-        pshufd      xmm2,           xmm2,       11011000b
-        pshufd      xmm3,           xmm4,       11011000b
-
-    ; first pass
-        psubw       xmm0,           xmm2        ; b1 = 0-2
-        paddw       xmm2,           xmm2        ;
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0        ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5        ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5        ; d1
-        movdqa      xmm6,           xmm2        ; a1
-
-        movdqa      xmm4,           xmm0        ; b1
-        paddw       xmm2,           xmm3        ;0
-
-        paddw       xmm4,           xmm7        ;1
-        psubw       xmm0,           xmm7        ;2
-
-        psubw       xmm6,           xmm3        ;3
-
-    ; transpose for the second pass
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-    ; second pass
-        psubw       xmm0,           xmm2            ; b1 = 0-2
-        paddw       xmm2,           xmm2
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0            ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5            ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5            ; d1
-        paddw       xmm0,           [GLOBAL(fours)]
-
-        paddw       xmm2,           [GLOBAL(fours)]
-        movdqa      xmm6,           xmm2            ; a1
-
-        movdqa      xmm4,           xmm0            ; b1
-        paddw       xmm2,           xmm3            ;0
-
-        paddw       xmm4,           xmm7            ;1
-        psubw       xmm0,           xmm7            ;2
-
-        psubw       xmm6,           xmm3            ;3
-        psraw       xmm2,           3
-
-        psraw       xmm0,           3
-        psraw       xmm4,           3
-
-        psraw       xmm6,           3
-
-    ; transpose to save
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-        pxor        xmm7,           xmm7
-
-    ; Load up predict blocks
-        movq        xmm4,           [rsi]
-        movq        xmm5,           [rsi+rcx]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm5
-
-        movq        xmm4,           [rsi+2*rcx]
-        lea         rcx,            [3*rcx]
-        movq        xmm5,           [rsi+rcx]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm5
-
-.finish:
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm7
-        packuswb    xmm1,           xmm7
-        packuswb    xmm2,           xmm7
-        packuswb    xmm3,           xmm7
-
-    ; Load destination stride before writing out,
-    ;   doesn't need to persist
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; store blocks back out
-        movq        [rdi],          xmm0
-        movq        [rdi + rdx],    xmm1
-
-        lea         rdi,            [rdi + 2*rdx]
-
-        movq        [rdi],          xmm2
-        movq        [rdi + rdx],    xmm3
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_idct_dequant_dc_0_2x_sse2
-; (
-;   short *qcoeff       - 0
-;   short *dequant      - 1
-;   unsigned char *pre  - 2
-;   unsigned char *dst  - 3
-;   int dst_stride      - 4
-;   short *dc           - 5
-; )
-global sym(vp9_idct_dequant_dc_0_2x_sse2)
-sym(vp9_idct_dequant_dc_0_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; special case when 2 blocks have 0 or 1 coeffs
-    ; dc is set as first coeff, so no need to load qcoeff
-        mov         rax,            arg(0) ; qcoeff
-        mov         rsi,            arg(2) ; pre
-        mov         rdi,            arg(3) ; dst
-        mov         rdx,            arg(5) ; dc
-
-    ; Zero out xmm5, for use in unpacking
-        pxor        xmm5,           xmm5
-
-    ; load up the 2 dc words here: 2*16 bits == one doubleword
-        movd        xmm4,           [rdx]
-
-    ; Load up predict blocks
-        movq        xmm0,           [rsi]
-        movq        xmm1,           [rsi+16]
-        movq        xmm2,           [rsi+32]
-        movq        xmm3,           [rsi+48]
-
-    ; Duplicate and expand dc across
-        punpcklwd   xmm4,           xmm4
-        punpckldq   xmm4,           xmm4
-
-    ; Rounding to dequant and downshift
-        paddw       xmm4,           [GLOBAL(fours)]
-        psraw       xmm4,           3
-
-    ; Predict buffer needs to be expanded from bytes to words
-        punpcklbw   xmm0,           xmm5
-        punpcklbw   xmm1,           xmm5
-        punpcklbw   xmm2,           xmm5
-        punpcklbw   xmm3,           xmm5
-
-    ; Add to predict buffer
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm4
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm4
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm5
-        packuswb    xmm1,           xmm5
-        packuswb    xmm2,           xmm5
-        packuswb    xmm3,           xmm5
-
-    ; Load destination stride before writing out,
-    ;   doesn't need to persist
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; store blocks back out
-        movq        [rdi],          xmm0
-        movq        [rdi + rdx],    xmm1
-
-        lea         rdi,            [rdi + 2*rdx]
-
-        movq        [rdi],          xmm2
-        movq        [rdi + rdx],    xmm3
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vp9_idct_dequant_dc_full_2x_sse2)
-sym(vp9_idct_dequant_dc_full_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; full inverse transform for two blocks; the dc for each block
-    ; arrives separately in arg(5) and is inserted as the first coeff
-        mov         rax,            arg(0) ; qcoeff
-        mov         rsi,            arg(2) ; pre
-        mov         rdi,            arg(3) ; dst
-
-    ; Zero out xmm7, for use in unpacking
-        pxor        xmm7,           xmm7
-
-        mov         rdx,            arg(1)  ; dequant
-
-    ; note the transpose of xmm1 and xmm2, necessary for the shuffle
-    ;   to produce sensible data
-        movdqa      xmm0,           [rax]
-        movdqa      xmm2,           [rax+16]
-        movdqa      xmm1,           [rax+32]
-        movdqa      xmm3,           [rax+48]
-
-    ; Clear out coeffs
-        movdqa      [rax],          xmm7
-        movdqa      [rax+16],       xmm7
-        movdqa      [rax+32],       xmm7
-        movdqa      [rax+48],       xmm7
-
-    ; dequantize qcoeff buffer
-        pmullw      xmm0,           [rdx]
-        pmullw      xmm2,           [rdx+16]
-        pmullw      xmm1,           [rdx]
-        pmullw      xmm3,           [rdx+16]
-
-    ; DC component
-        mov         rdx,            arg(5)
-
-    ; repack so block 0 row x and block 1 row x are together
-        movdqa      xmm4,           xmm0
-        punpckldq   xmm0,           xmm1
-        punpckhdq   xmm4,           xmm1
-
-        pshufd      xmm0,           xmm0,       11011000b
-        pshufd      xmm1,           xmm4,       11011000b
-
-        movdqa      xmm4,           xmm2
-        punpckldq   xmm2,           xmm3
-        punpckhdq   xmm4,           xmm3
-
-        pshufd      xmm2,           xmm2,       11011000b
-        pshufd      xmm3,           xmm4,       11011000b
-
-    ; insert DC component
-        pinsrw      xmm0,           [rdx],      0
-        pinsrw      xmm0,           [rdx+2],    4
-
-    ; first pass
-        psubw       xmm0,           xmm2        ; b1 = 0-2
-        paddw       xmm2,           xmm2        ;
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0        ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5        ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5        ; d1
-        movdqa      xmm6,           xmm2        ; a1
-
-        movdqa      xmm4,           xmm0        ; b1
-        paddw       xmm2,           xmm3        ;0
-
-        paddw       xmm4,           xmm7        ;1
-        psubw       xmm0,           xmm7        ;2
-
-        psubw       xmm6,           xmm3        ;3
-
-    ; transpose for the second pass
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-    ; second pass
-        psubw       xmm0,           xmm2            ; b1 = 0-2
-        paddw       xmm2,           xmm2
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0            ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5            ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5            ; d1
-        paddw       xmm0,           [GLOBAL(fours)]
-
-        paddw       xmm2,           [GLOBAL(fours)]
-        movdqa      xmm6,           xmm2            ; a1
-
-        movdqa      xmm4,           xmm0            ; b1
-        paddw       xmm2,           xmm3            ;0
-
-        paddw       xmm4,           xmm7            ;1
-        psubw       xmm0,           xmm7            ;2
-
-        psubw       xmm6,           xmm3            ;3
-        psraw       xmm2,           3
-
-        psraw       xmm0,           3
-        psraw       xmm4,           3
-
-        psraw       xmm6,           3
-
-    ; transpose to save
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-        pxor        xmm7,           xmm7
-
-    ; Load up predict blocks
-        movq        xmm4,           [rsi]
-        movq        xmm5,           [rsi+16]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm5
-
-        movq        xmm4,           [rsi+32]
-        movq        xmm5,           [rsi+48]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm5
-
-.finish:
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm7
-        packuswb    xmm1,           xmm7
-        packuswb    xmm2,           xmm7
-        packuswb    xmm3,           xmm7
-
-    ; Load destination stride before writing out,
-    ;   doesn't need to persist
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; store blocks back out
-        movq        [rdi],          xmm0
-        movq        [rdi + rdx],    xmm1
-
-        lea         rdi,            [rdi + 2*rdx]
-
-        movq        [rdi],          xmm2
-        movq        [rdi + rdx],    xmm3
-
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-fours:
-    times 8 dw 0x0004
-align 16
-x_s1sqr2:
-    times 8 dw 0x8A8C
-align 16
-x_c1sqr2less1:
-    times 8 dw 0x4E7B
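For reference, the DC-only path that vp9_idct_dequant_0_2x_sse2
vectorizes reduces, per 4x4 block, to one dequantized DC term that is
rounded, shifted and added to the prediction. A hedged scalar sketch of
one block (the SSE2 routine does two side by side; names are
illustrative, not from the patch):

    static unsigned char clamp_255(int v) {            /* packuswb */
        return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    static void idct_dequant_0_ref(short *qcoeff, const short *dequant,
                                   const unsigned char *pre,
                                   unsigned char *dst,
                                   int dst_stride, int blk_stride) {
        int dc = qcoeff[0] * dequant[0];  /* only the DC coeff is set */
        int add = (dc + 4) >> 3;          /* paddw fours; psraw 3 */
        qcoeff[0] = 0;                    /* coeffs cleared, as in the asm */
        for (int r = 0; r < 4; r++)
            for (int c = 0; c < 4; c++)
                dst[r * dst_stride + c] =
                    clamp_255(pre[r * blk_stride + c] + add);
    }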
--- a/vp8/common/x86/iwalsh_mmx.asm
+++ /dev/null
@@ -1,173 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_inv_walsh4x4_1_mmx(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_1_mmx)
-sym(vp9_short_inv_walsh4x4_1_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov     rsi, arg(0)
-    mov     rax, 3
-
-    mov     rdi, arg(1)
-    add     rax, [rsi]          ;input[0] + 3
-
-    movd    mm0, eax
-
-    punpcklwd mm0, mm0          ;x x val val
-
-    punpckldq mm0, mm0          ;val val val val
-
-    psraw   mm0, 3            ;(input[0] + 3) >> 3
-
-    movq  [rdi + 0], mm0
-    movq  [rdi + 8], mm0
-    movq  [rdi + 16], mm0
-    movq  [rdi + 24], mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_short_inv_walsh4x4_mmx(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_mmx)
-sym(vp9_short_inv_walsh4x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov     rax, 3
-    mov     rsi, arg(0)
-    mov     rdi, arg(1)
-    shl     rax, 16
-
-    movq    mm0, [rsi + 0]        ;ip[0]
-    movq    mm1, [rsi + 8]        ;ip[4]
-    or      rax, 3            ;00030003h
-
-    movq    mm2, [rsi + 16]       ;ip[8]
-    movq    mm3, [rsi + 24]       ;ip[12]
-
-    movq    mm7, rax
-    movq    mm4, mm0
-
-    punpcklwd mm7, mm7          ;0003000300030003h
-    movq    mm5, mm1
-
-    paddw   mm4, mm3          ;ip[0] + ip[12] aka a1
-    paddw   mm5, mm2          ;ip[4] + ip[8] aka b1
-
-    movq    mm6, mm4          ;temp a1
-
-    paddw   mm4, mm5          ;a1 + b1
-    psubw   mm6, mm5          ;a1 - b1
-
-    psubw   mm0, mm3          ;ip[0] - ip[12] aka d1
-    psubw   mm1, mm2          ;ip[4] - ip[8] aka c1
-
-    movq    mm5, mm0          ;temp d1
-
-    paddw   mm0, mm1          ;d1 + c1
-    psubw   mm5, mm1          ;d1 - c1
-
-    ; 03 02 01 00
-    ; 13 12 11 10
-    ; 23 22 21 20
-    ; 33 32 31 30
-
-    movq    mm3, mm4          ; 03 02 01 00
-    punpcklwd mm4, mm0          ; 11 01 10 00
-    punpckhwd mm3, mm0          ; 13 03 12 02
-
-    movq    mm1, mm6          ; 23 22 21 20
-    punpcklwd mm6, mm5          ; 31 21 30 20
-    punpckhwd mm1, mm5          ; 33 23 32 22
-
-    movq    mm0, mm4          ; 11 01 10 00
-    movq    mm2, mm3          ; 13 03 12 02
-
-    punpckldq mm0, mm6          ; 30 20 10 00 aka ip[0]
-    punpckhdq mm4, mm6          ; 31 21 11 01 aka ip[4]
-
-    punpckldq mm2, mm1          ; 32 22 12 02 aka ip[8]
-    punpckhdq mm3, mm1          ; 33 23 13 03 aka ip[12]
-;~~~~~~~~~~~~~~~~~~~~~
-    movq    mm1, mm0
-    movq    mm5, mm4
-
-    paddw   mm1, mm3          ;ip[0] + ip[12] aka a1
-    paddw   mm5, mm2          ;ip[4] + ip[8] aka b1
-
-    movq    mm6, mm1          ;temp a1
-
-    paddw   mm1, mm5          ;a1 + b1
-    psubw   mm6, mm5          ;a1 - b1
-
-    psubw   mm0, mm3          ;ip[0] - ip[12] aka d1
-    psubw   mm4, mm2          ;ip[4] - ip[8] aka c1
-
-    movq    mm5, mm0          ;temp d1
-
-    paddw   mm0, mm4          ;d1 + c1
-    psubw   mm5, mm4          ;d1 - c1
-;~~~~~~~~~~~~~~~~~~~~~
-    movq    mm3, mm1          ; 03 02 01 00
-    punpcklwd mm1, mm0          ; 11 01 10 00
-    punpckhwd mm3, mm0          ; 13 03 12 02
-
-    movq    mm4, mm6          ; 23 22 21 20
-    punpcklwd mm6, mm5          ; 31 21 30 20
-    punpckhwd mm4, mm5          ; 33 23 32 22
-
-    movq    mm0, mm1          ; 11 01 10 00
-    movq    mm2, mm3          ; 13 03 12 02
-
-    punpckldq mm0, mm6          ; 30 20 10 00 aka ip[0]
-    punpckhdq mm1, mm6          ; 31 21 11 01 aka ip[4]
-
-    punpckldq mm2, mm4          ; 32 22 12 02 aka ip[8]
-    punpckhdq mm3, mm4          ; 33 23 13 03 aka ip[12]
-
-    paddw   mm0, mm7
-    paddw   mm1, mm7
-    paddw   mm2, mm7
-    paddw   mm3, mm7
-
-    psraw   mm0, 3
-    psraw   mm1, 3
-    psraw   mm2, 3
-    psraw   mm3, 3
-
-    movq  [rdi + 0], mm0
-    movq  [rdi + 8], mm1
-    movq  [rdi + 16], mm2
-    movq  [rdi + 24], mm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
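The butterfly that the register comments in vp9_short_inv_walsh4x4_mmx
describe (a1 = ip[0]+ip[12], b1 = ip[4]+ip[8], c1 = ip[4]-ip[8],
d1 = ip[0]-ip[12], with (x + 3) >> 3 rounding after the second pass)
corresponds to the scalar sketch below, reconstructed from those
comments rather than taken from the patch:

    static void inv_walsh4x4_ref(const short *ip, short *op) {
        short tmp[16];
        /* first pass: columns, stride 4 */
        for (int i = 0; i < 4; i++) {
            int a1 = ip[i]     + ip[i + 12];
            int b1 = ip[i + 4] + ip[i + 8];
            int c1 = ip[i + 4] - ip[i + 8];
            int d1 = ip[i]     - ip[i + 12];
            tmp[i]      = (short)(a1 + b1);
            tmp[i + 4]  = (short)(d1 + c1);
            tmp[i + 8]  = (short)(a1 - b1);
            tmp[i + 12] = (short)(d1 - c1);
        }
        /* second pass: rows, then round */
        for (int i = 0; i < 4; i++) {
            int a1 = tmp[4 * i]     + tmp[4 * i + 3];
            int b1 = tmp[4 * i + 1] + tmp[4 * i + 2];
            int c1 = tmp[4 * i + 1] - tmp[4 * i + 2];
            int d1 = tmp[4 * i]     - tmp[4 * i + 3];
            op[4 * i]     = (short)((a1 + b1 + 3) >> 3);
            op[4 * i + 1] = (short)((d1 + c1 + 3) >> 3);
            op[4 * i + 2] = (short)((a1 - b1 + 3) >> 3);
            op[4 * i + 3] = (short)((d1 - c1 + 3) >> 3);
        }
    }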
--- a/vp8/common/x86/iwalsh_sse2.asm
+++ /dev/null
@@ -1,119 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_inv_walsh4x4_sse2(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_sse2)
-sym(vp9_short_inv_walsh4x4_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    SAVE_XMM 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov     rsi, arg(0)
-    mov     rdi, arg(1)
-    mov     rax, 3
-
-    movdqa    xmm0, [rsi + 0]       ;ip[4] ip[0]
-    movdqa    xmm1, [rsi + 16]      ;ip[12] ip[8]
-
-    shl     rax, 16
-    or      rax, 3            ;00030003h
-
-    pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12]
-    movdqa    xmm3, xmm0          ;ip[4] ip[0]
-
-    paddw   xmm0, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
-    psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
-    movdqa    xmm4, xmm0
-    punpcklqdq  xmm0, xmm3          ;d1 a1
-    punpckhqdq  xmm4, xmm3          ;c1 b1
-    movd    xmm6, eax
-
-    movdqa    xmm1, xmm4          ;c1 b1
-    paddw   xmm4, xmm0          ;d1+c1 a1+b1 aka op[4] op[0]
-    psubw   xmm0, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
-
-;;;temp output
-;;  movdqu  [rdi + 0], xmm4
-;;  movdqu  [rdi + 16], xmm3
-
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    ; 13 12 11 10 03 02 01 00
-    ;
-    ; 33 32 31 30 23 22 21 20
-    ;
-    movdqa    xmm3, xmm4          ; 13 12 11 10 03 02 01 00
-    punpcklwd xmm4, xmm0          ; 23 03 22 02 21 01 20 00
-    punpckhwd xmm3, xmm0          ; 33 13 32 12 31 11 30 10
-    movdqa    xmm1, xmm4          ; 23 03 22 02 21 01 20 00
-    punpcklwd xmm4, xmm3          ; 31 21 11 01 30 20 10 00
-    punpckhwd xmm1, xmm3          ; 33 23 13 03 32 22 12 02
-    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12]
-    movdqa    xmm3, xmm4          ;ip[4] ip[0]
-
-    pshufd    xmm6, xmm6, 0       ;03 03 03 03 03 03 03 03
-
-    paddw   xmm4, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
-    psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
-    movdqa    xmm5, xmm4
-    punpcklqdq  xmm4, xmm3          ;d1 a1
-    punpckhqdq  xmm5, xmm3          ;c1 b1
-
-    movdqa    xmm1, xmm5          ;c1 b1
-    paddw   xmm5, xmm4          ;d1+c1 a1+b1 aka op[4] op[0]
-    psubw   xmm4, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    ; 13 12 11 10 03 02 01 00
-    ;
-    ; 33 32 31 30 23 22 21 20
-    ;
-    movdqa    xmm0, xmm5          ; 13 12 11 10 03 02 01 00
-    punpcklwd xmm5, xmm4          ; 23 03 22 02 21 01 20 00
-    punpckhwd xmm0, xmm4          ; 33 13 32 12 31 11 30 10
-    movdqa    xmm1, xmm5          ; 23 03 22 02 21 01 20 00
-    punpcklwd xmm5, xmm0          ; 31 21 11 01 30 20 10 00
-    punpckhwd xmm1, xmm0          ; 33 23 13 03 32 22 12 02
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    paddw   xmm5, xmm6
-    paddw   xmm1, xmm6
-
-    psraw   xmm5, 3
-    psraw   xmm1, 3
-
-    movdqa  [rdi + 0], xmm5
-    movdqa  [rdi + 16], xmm1
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-x_s1sqr2:
-    times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1:
-    times 4 dw 0x4E7B
-align 16
-fours:
-    times 4 dw 0x0004
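The SSE2 variant gets both halves of the butterfly from one register
pair by swapping 64-bit halves: pshufd with immediate 4eh (01001110b)
reverses the two qwords, so ip[12..15] lines up under ip[0..3]. A small
intrinsics sketch of that idea (illustrative only, not from the patch):

    #include <emmintrin.h>

    static void walsh_butterfly(const short *ip, __m128i *sum, __m128i *dif) {
        __m128i lo = _mm_loadu_si128((const __m128i *)ip);       /* ip[4] ip[0]  */
        __m128i hi = _mm_loadu_si128((const __m128i *)(ip + 8)); /* ip[12] ip[8] */
        __m128i sw = _mm_shuffle_epi32(hi, 0x4e);                /* ip[8] ip[12] */
        *sum = _mm_add_epi16(lo, sw); /* ip[4]+ip[8] ip[0]+ip[12], aka b1 a1 */
        *dif = _mm_sub_epi16(lo, sw); /* ip[4]-ip[8] ip[0]-ip[12], aka c1 d1 */
    }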
--- a/vp8/common/x86/loopfilter_mmx.asm
+++ /dev/null
@@ -1,969 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-;void vp9_loop_filter_horizontal_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int src_pixel_step,
-;    const char *blimit,
-;    const char *limit,
-;    const char *thresh,
-;    int  count
-;)
-global sym(vp9_loop_filter_horizontal_edge_mmx)
-sym(vp9_loop_filter_horizontal_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 32                         ; reserve 32 bytes
-    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
-    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step (pitch of the frame being filtered)
-
-        movsxd      rcx, dword ptr arg(5) ;count
-.next8_h:
-        mov         rdx, arg(3) ;limit
-        movq        mm7, [rdx]
-        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
-
-        ; calculate breakout conditions
-        movq        mm2, [rdi+2*rax]      ; q3
-        movq        mm1, [rsi+2*rax]      ; q2
-        movq        mm6, mm1              ; q2
-        psubusb     mm1, mm2              ; q2-=q3
-        psubusb     mm2, mm6              ; q3-=q2
-        por         mm1, mm2              ; abs(q3-q2)
-        psubusb     mm1, mm7              ;
-
-
-        movq        mm4, [rsi+rax]        ; q1
-        movq        mm3, mm4              ; q1
-        psubusb     mm4, mm6              ; q1-=q2
-        psubusb     mm6, mm3              ; q2-=q1
-        por         mm4, mm6              ; abs(q2-q1)
-
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-        movq        mm4, [rsi]            ; q0
-        movq        mm0, mm4              ; q0
-        psubusb     mm4, mm3              ; q0-=q1
-        psubusb     mm3, mm0              ; q1-=q0
-        por         mm4, mm3              ; abs(q0-q1)
-        movq        t0, mm4               ; save to t0
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-
-        neg         rax                   ; negate pitch to deal with above border
-
-        movq        mm2, [rsi+4*rax]      ; p3
-        movq        mm4, [rdi+4*rax]      ; p2
-        movq        mm5, mm4              ; p2
-        psubusb     mm4, mm2              ; p2-=p3
-        psubusb     mm2, mm5              ; p3-=p2
-        por         mm4, mm2              ; abs(p3 - p2)
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-
-        movq        mm4, [rsi+2*rax]      ; p1
-        movq        mm3, mm4              ; p1
-        psubusb     mm4, mm5              ; p1-=p2
-        psubusb     mm5, mm3              ; p2-=p1
-        por         mm4, mm5              ; abs(p2 - p1)
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-        movq        mm2, mm3              ; p1
-
-        movq        mm4, [rsi+rax]        ; p0
-        movq        mm5, mm4              ; p0
-        psubusb     mm4, mm3              ; p0-=p1
-        psubusb     mm3, mm5              ; p1-=p0
-        por         mm4, mm3              ; abs(p1 - p0)
-        movq        t1, mm4               ; save to t1
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-        movq        mm3, [rdi]            ; q1
-        movq        mm4, mm3              ; q1
-        psubusb     mm3, mm2              ; q1-=p1
-        psubusb     mm2, mm4              ; p1-=q1
-        por         mm2, mm3              ; abs(p1-q1)
-        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
-        psrlw       mm2, 1                ; abs(p1-q1)/2
-
-        movq        mm6, mm5              ; p0
-        movq        mm3, [rsi]            ; q0
-        psubusb     mm5, mm3              ; p0-=q0
-        psubusb     mm3, mm6              ; q0-=p0
-        por         mm5, mm3              ; abs(p0 - q0)
-        paddusb     mm5, mm5              ; abs(p0-q0)*2
-        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        mov         rdx, arg(2) ;blimit           ; get blimit
-        movq        mm7, [rdx]            ; blimit
-
-        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        por         mm1,    mm5
-        pxor        mm5,    mm5
-        pcmpeqb     mm1,    mm5           ; mask mm1
-
-        ; calculate high edge variance
-        mov         rdx, arg(4) ;thresh           ; get thresh
-        movq        mm7, [rdx]            ;
-        movq        mm4, t0               ; get abs (q1 - q0)
-        psubusb     mm4, mm7
-        movq        mm3, t1               ; get abs (p1 - p0)
-        psubusb     mm3, mm7
-        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
-        pcmpeqb     mm4,        mm5
-
-        pcmpeqb     mm5,        mm5
-        pxor        mm4,        mm5
-
-
-        ; start work on filters
-        movq        mm2, [rsi+2*rax]      ; p1
-        movq        mm7, [rdi]            ; q1
-        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
-        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
-        psubsb      mm2, mm7              ; p1 - q1
-        pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
-        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
-        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
-        movq        mm3, mm0              ; q0
-        psubsb      mm0, mm6              ; q0 - p0
-        paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
-        paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
-        paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
-        pand        mm1, mm2                  ; mask filter values we don't care about
-        movq        mm2, mm1
-        paddsb      mm1, [GLOBAL(t4)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-        paddsb      mm2, [GLOBAL(t3)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
-        pxor        mm0, mm0             ;
-        pxor        mm5, mm5
-        punpcklbw   mm0, mm2            ;
-        punpckhbw   mm5, mm2            ;
-        psraw       mm0, 11             ;
-        psraw       mm5, 11
-        packsswb    mm0, mm5
-        movq        mm2, mm0            ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
-        pxor        mm0, mm0              ; 0
-        movq        mm5, mm1              ; abcdefgh
-        punpcklbw   mm0, mm1              ; e0f0g0h0
-        psraw       mm0, 11               ; sign extended shift right by 3
-        pxor        mm1, mm1              ; 0
-        punpckhbw   mm1, mm5              ; a0b0c0d0
-        psraw       mm1, 11               ; sign extended shift right by 3
-        movq        mm5, mm0              ; save results
-
-        packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
-        paddsw      mm5, [GLOBAL(ones)]
-        paddsw      mm1, [GLOBAL(ones)]
-        psraw       mm5, 1                ; partial shifted one more time for 2nd tap
-        psraw       mm1, 1                ; partial shifted one more time for 2nd tap
-        packsswb    mm5, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-        pandn       mm4, mm5              ; high edge variance additive
-
-        paddsb      mm6, mm2              ; p0+= p0 add
-        pxor        mm6, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi+rax], mm6        ; write back
-
-        movq        mm6, [rsi+2*rax]      ; p1
-        pxor        mm6, [GLOBAL(t80)]    ; reoffset
-        paddsb      mm6, mm4              ; p1+= p1 add
-        pxor        mm6, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi+2*rax], mm6      ; write back
-
-        psubsb      mm3, mm0              ; q0-= q0 add
-        pxor        mm3, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi], mm3            ; write back
-
-        psubsb      mm7, mm4              ; q1-= q1 add
-        pxor        mm7, [GLOBAL(t80)]    ; unoffset
-        movq        [rdi], mm7            ; write back
-
-        add         rsi,8
-        neg         rax
-        dec         rcx
-        jnz         .next8_h
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
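The breakout conditions computed above with saturating byte arithmetic
amount to the usual per-edge filter mask. A scalar sketch of the test
(names are illustrative, not from the patch); note that the tfe
constant (bytes of 0xfe) clears each byte's lsb so the word-wide psrlw
can halve all eight abs(p1-q1) bytes at once without bits leaking
between lanes:

    #include <stdlib.h>

    static int edge_needs_filtering(unsigned char blimit, unsigned char limit,
                                    unsigned char p3, unsigned char p2,
                                    unsigned char p1, unsigned char p0,
                                    unsigned char q0, unsigned char q1,
                                    unsigned char q2, unsigned char q3) {
        int skip = 0;                   /* one psubusb/por pair per test */
        skip |= abs(p3 - p2) > limit;
        skip |= abs(p2 - p1) > limit;
        skip |= abs(p1 - p0) > limit;
        skip |= abs(q1 - q0) > limit;
        skip |= abs(q2 - q1) > limit;
        skip |= abs(q3 - q2) > limit;
        skip |= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
        return !skip;  /* pcmpeqb with 0: lanes still zero get filtered */
    }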
-
-;void vp9_loop_filter_vertical_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit,
-;    const char *limit,
-;    const char *thresh,
-;    int count
-;)
-global sym(vp9_loop_filter_vertical_edge_mmx)
-sym(vp9_loop_filter_vertical_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub          rsp, 64      ; reserve 64 bytes
-    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
-    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
-    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];
-
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rax,        dword ptr arg(1) ;src_pixel_step (pitch of the frame being filtered)
-
-        lea         rsi,        [rsi + rax*4 - 4]
-
-        movsxd      rcx,        dword ptr arg(5) ;count
-.next8_v:
-        mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
-        add         rdi,        rax
-
-
-        ;transpose
-        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
-        movq        mm7,        mm6                         ; 77 76 75 74 73 72 71 70
-
-        punpckhbw   mm7,        [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
-        punpcklbw   mm6,        [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60
-
-        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
-        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
-
-        punpckhbw   mm5,        [rsi+rax]                   ; 57 47 56 46 55 45 54 44
-        punpcklbw   mm4,        [rsi+rax]                   ; 53 43 52 42 51 41 50 40
-
-        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
-        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
-
-        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
-        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
-
-        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
-        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
-
-        neg         rax
-        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
-
-        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
-        punpckhbw   mm6,        [rsi+rax]                   ; 37 27 36 26 35 25 34 24
-
-        punpcklbw   mm1,        [rsi+rax]                   ; 33 23 32 22 31 21 30 20
-        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
-
-        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
-        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
-
-        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
-        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
-
-        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
-        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
-
-        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
-
-        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
-        psubusb     mm5,        mm7                         ; q2-q3
-
-        psubusb     mm7,        mm6                         ; q3-q2
-        por         mm7,        mm5;                        ; mm7=abs (q3-q2)
-
-        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
-        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
-
-        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 14 04 = q0
-        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
-
-        psubusb     mm3,        mm6                         ; q1-q2
-        psubusb     mm6,        mm5                         ; q2-q1
-
-        por         mm6,        mm3                         ; mm6=abs(q2-q1)
-        lea         rdx,        srct
-
-        movq        [rdx+24],   mm5                         ; save q1
-        movq        [rdx+16],   mm0                         ; save q0
-
-        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
-        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
-
-        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
-        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
-
-        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
-        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
-
-        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
-        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
-
-        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
-        psubusb     mm2,        mm0                         ; p2-p3
-
-        psubusb     mm0,        mm1                         ; p3-p2
-        por         mm0,        mm2                         ; mm0=abs(p3-p2)
-
-        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
-        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
-
-        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
-        movq        [rdx+8],    mm3                         ; save p0
-
-        movq        [rdx],      mm2                         ; save p1
-        movq        mm5,        mm2                         ; mm5 = p1
-
-        psubusb     mm2,        mm1                         ; p1-p2
-        psubusb     mm1,        mm5                         ; p2-p1
-
-        por         mm1,        mm2                         ; mm1=abs(p2-p1)
-        mov         rdx,        arg(3) ;limit
-
-        movq        mm4,        [rdx]                       ; mm4 = limit
-        psubusb     mm7,        mm4
-
-        psubusb     mm0,        mm4
-        psubusb     mm1,        mm4
-
-        psubusb     mm6,        mm4
-        por         mm7,        mm6
-
-        por         mm0,        mm1
-        por         mm0,        mm7                         ;   abs(q3-q2) > limit || abs(p3-p2) > limit || abs(p2-p1) > limit || abs(q2-q1) > limit
-
-        movq        mm1,        mm5                         ; p1
-
-        movq        mm7,        mm3                         ; mm3=mm7=p0
-        psubusb     mm7,        mm5                         ; p0 - p1
-
-        psubusb     mm5,        mm3                         ; p1 - p0
-        por         mm5,        mm7                         ; abs(p1-p0)
-
-        movq        t0,         mm5                         ; save abs(p1-p0)
-        lea         rdx,        srct
-
-        psubusb     mm5,        mm4
-        por         mm0,        mm5                         ; mm0=mask
-
-        movq        mm5,        [rdx+16]                    ; mm5=q0
-        movq        mm7,        [rdx+24]                    ; mm7=q1
-
-        movq        mm6,        mm5                         ; mm6=q0
-        movq        mm2,        mm7                         ; q1
-        psubusb     mm5,        mm7                         ; q0-q1
-
-        psubusb     mm7,        mm6                         ; q1-q0
-        por         mm7,        mm5                         ; abs(q1-q0)
-
-        movq        t1,         mm7                         ; save abs(q1-q0)
-        psubusb     mm7,        mm4
-
-        por         mm0,        mm7                         ; mask
-
-        movq        mm5,        mm2                         ; q1
-        psubusb     mm5,        mm1                         ; q1-=p1
-        psubusb     mm1,        mm2                         ; p1-=q1
-        por         mm5,        mm1                         ; abs(p1-q1)
-        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
-        psrlw       mm5,        1                           ; abs(p1-q1)/2
-
-        mov         rdx,        arg(2) ;blimit                      ;
-
-        movq        mm4,        [rdx]                       ;blimit
-        movq        mm1,        mm3                         ; mm1=mm3=p0
-
-        movq        mm7,        mm6                         ; mm7=mm6=q0
-        psubusb     mm1,        mm7                         ; p0-q0
-
-        psubusb     mm7,        mm3                         ; q0-p0
-        por         mm1,        mm7                         ; abs(q0-p0)
-        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
-        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        por         mm1,        mm0;                        ; mask
-
-        pxor        mm0,        mm0
-        pcmpeqb     mm1,        mm0
-
-        ; calculate high edge variance
-        mov         rdx,        arg(4) ;thresh            ; get thresh
-        movq        mm7,        [rdx]
-        ;
-        movq        mm4,        t0              ; get abs (q1 - q0)
-        psubusb     mm4,        mm7
-
-        movq        mm3,        t1              ; get abs (p1 - p0)
-        psubusb     mm3,        mm7
-
-        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-        pcmpeqb     mm4,        mm0
-
-        pcmpeqb     mm0,        mm0
-        pxor        mm4,        mm0
-
-
-
-        ; start work on filters
-        lea         rdx,        srct
-
-        movq        mm2,        [rdx]           ; p1
-        movq        mm7,        [rdx+24]        ; q1
-
-        movq        mm6,        [rdx+8]         ; p0
-        movq        mm0,        [rdx+16]        ; q0
-
-        pxor        mm2,        [GLOBAL(t80)]   ; p1 offset to convert to signed values
-        pxor        mm7,        [GLOBAL(t80)]   ; q1 offset to convert to signed values
-
-        psubsb      mm2,        mm7             ; p1 - q1
-        pand        mm2,        mm4             ; high var mask (hvm)(p1 - q1)
-
-        pxor        mm6,        [GLOBAL(t80)]   ; offset to convert to signed values
-        pxor        mm0,        [GLOBAL(t80)]   ; offset to convert to signed values
-
-        movq        mm3,        mm0             ; q0
-        psubsb      mm0,        mm6             ; q0 - p0
-
-        paddsb      mm2,        mm0             ; 1 * (q0 - p0) + hvm(p1 - q1)
-        paddsb      mm2,        mm0             ; 2 * (q0 - p0) + hvm(p1 - q1)
-
-        paddsb      mm2,        mm0             ; 3 * (q0 - p0) + hvm(p1 - q1)
-        pand       mm1,        mm2              ; mask filter values we don't care about
-
-        movq        mm2,        mm1
-        paddsb      mm1,        [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-
-        paddsb      mm2,        [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-        pxor        mm0,        mm0          ;
-
-        pxor        mm5,        mm5
-        punpcklbw   mm0,        mm2         ;
-
-        punpckhbw   mm5,        mm2         ;
-        psraw       mm0,        11              ; sign extended shift right by 3
-
-        psraw       mm5,        11              ; sign extended shift right by 3
-        packsswb    mm0,        mm5
-
-        movq        mm2,        mm0         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
-        pxor        mm0,        mm0           ; 0
-        movq        mm5,        mm1           ; abcdefgh
-
-        punpcklbw   mm0,        mm1           ; e0f0g0h0
-        psraw       mm0,        11                ; sign extended shift right by 3
-
-        pxor        mm1,        mm1           ; 0
-        punpckhbw   mm1,        mm5           ; a0b0c0d0
-
-        psraw       mm1,        11                ; sign extended shift right by 3
-        movq        mm5,        mm0              ; save results
-
-        packsswb    mm0,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
-        paddsw      mm5,        [GLOBAL(ones)]
-
-        paddsw      mm1,        [GLOBAL(ones)]
-        psraw       mm5,        1                 ; partial shifted one more time for 2nd tap
-
-        psraw       mm1,        1                 ; partial shifted one more time for 2nd tap
-        packsswb    mm5,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-
-        pandn       mm4,        mm5             ; high edge variance additive
-
-        paddsb      mm6,        mm2             ; p0+= p0 add
-        pxor        mm6,        [GLOBAL(t80)]   ; unoffset
-
-        ; mm6=p0                               ;
-        movq        mm1,        [rdx]           ; p1
-        pxor        mm1,        [GLOBAL(t80)]   ; reoffset
-
-        paddsb      mm1,        mm4                 ; p1+= p1 add
-        pxor        mm1,        [GLOBAL(t80)]       ; unoffset
-        ; mm6 = p0 mm1 = p1
-
-        psubsb      mm3,        mm0                 ; q0-= q0 add
-        pxor        mm3,        [GLOBAL(t80)]       ; unoffset
-
-        ; mm3 = q0
-        psubsb      mm7,        mm4                 ; q1-= q1 add
-        pxor        mm7,        [GLOBAL(t80)]       ; unoffset
-        ; mm7 = q1
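-
-        ; Taken together, the tap arithmetic above matches this scalar
-        ; sketch (hypothetical C; clamp() is the saturation to [-128, 127]
-        ; that the signed-byte instructions perform):
-        ;
-        ;   f  = mask ? clamp(3 * (q0 - p0) + (hev ? clamp(p1 - q1) : 0)) : 0;
-        ;   q0 -= clamp(f + 4) >> 3;                  // Filter1
-        ;   p0 += clamp(f + 3) >> 3;                  // Filter2
-        ;   u  = hev ? 0 : ((clamp(f + 4) >> 3) + 1) >> 1;
-        ;   q1 -= u;  p1 += u;                        // outer taps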
-
-        ; transpose and write back
-        ; mm1 =    72 62 52 42 32 22 12 02
-        ; mm6 =    73 63 53 43 33 23 13 03
-        ; mm3 =    74 64 54 44 34 24 14 04
-        ; mm7 =    75 65 55 45 35 25 15 05
-
-        movq        mm2,        mm1             ; 72 62 52 42 32 22 12 02
-        punpcklbw   mm2,        mm6             ; 33 32 23 22 13 12 03 02
-
-        movq        mm4,        mm3             ; 74 64 54 44 34 24 14 04
-        punpckhbw   mm1,        mm6             ; 73 72 63 62 53 52 43 42
-
-        punpcklbw   mm4,        mm7             ; 35 34 25 24 15 14 05 04
-        punpckhbw   mm3,        mm7             ; 75 74 65 64 55 54 45 44
-
-        movq        mm6,        mm2             ; 33 32 23 22 13 12 03 02
-        punpcklwd   mm2,        mm4             ; 15 14 13 12 05 04 03 02
-
-        punpckhwd   mm6,        mm4             ; 35 34 33 32 25 24 23 22
-        movq        mm5,        mm1             ; 73 72 63 62 53 52 43 42
-
-        punpcklwd   mm1,        mm3             ; 55 54 53 52 45 44 43 42
-        punpckhwd   mm5,        mm3             ; 75 74 73 72 65 64 63 62
-
-
-        ; mm2 = 15 14 13 12 05 04 03 02
-        ; mm6 = 35 34 33 32 25 24 23 22
-        ; mm5 = 55 54 53 52 45 44 43 42
-        ; mm1 = 75 74 73 72 65 64 63 62
-
-
-
-        movd        [rsi+rax*4+2], mm2
-        psrlq       mm2,        32
-
-        movd        [rdi+rax*4+2], mm2
-        movd        [rsi+rax*2+2], mm6
-
-        psrlq       mm6,        32
-        movd        [rsi+rax+2],mm6
-
-        movd        [rsi+2],    mm1
-        psrlq       mm1,        32
-
-        movd        [rdi+2],    mm1
-        neg         rax
-
-        movd        [rdi+rax+2],mm5
-        psrlq       mm5,        32
-
-        movd        [rdi+rax*2+2], mm5
-
-        lea         rsi,        [rsi+rax*8]
-        dec         rcx
-        jnz         .next8_v
-
-    add rsp, 64
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_simple_horizontal_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_mmx)
-sym(vp9_loop_filter_simple_horizontal_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; source pitch
-
-        mov         rcx, 2                ; count
-.nexts8_h:
-        mov         rdx, arg(2) ;blimit           ; get blimit
-        movq        mm3, [rdx]            ;
-
-        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
-        neg         rax
-
-        ; calculate mask
-        movq        mm1, [rsi+2*rax]      ; p1
-        movq        mm0, [rdi]            ; q1
-        movq        mm2, mm1
-        movq        mm7, mm0
-        movq        mm4, mm0
-        psubusb     mm0, mm1              ; q1-=p1
-        psubusb     mm1, mm4              ; p1-=q1
-        por         mm1, mm0              ; abs(p1-q1)
-        pand        mm1, [GLOBAL(tfe)]    ; set lsb of each byte to zero
-        psrlw       mm1, 1                ; abs(p1-q1)/2
-
-        movq        mm5, [rsi+rax]        ; p0
-        movq        mm4, [rsi]            ; q0
-        movq        mm0, mm4              ; q0
-        movq        mm6, mm5              ; p0
-        psubusb     mm5, mm4              ; p0-=q0
-        psubusb     mm4, mm6              ; q0-=p0
-        por         mm5, mm4              ; abs(p0 - q0)
-        paddusb     mm5, mm5              ; abs(p0-q0)*2
-        paddusb     mm5, mm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        psubusb     mm5, mm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        mm3, mm3
-        pcmpeqb     mm5, mm3
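-
-        ; The "simple" filter mask computed above reduces to this scalar
-        ; sketch (hypothetical C, one byte lane):
-        ;
-        ;   mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) ? 0xff : 0;
-        ;
-        ; The pand with tfe clears each byte's low bit first, so the psrlw
-        ; halving cannot leak a bit across a byte-lane boundary.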
-
-        ; start work on filters
-        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
-        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
-        psubsb      mm2, mm7              ; p1 - q1
-
-        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
-        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
-        movq        mm3, mm0              ; q0
-        psubsb      mm0, mm6              ; q0 - p0
-        paddsb      mm2, mm0              ; p1 - q1 + 1 * (q0 - p0)
-        paddsb      mm2, mm0              ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      mm2, mm0              ; p1 - q1 + 3 * (q0 - p0)
-        pand        mm5, mm2              ; mask filter values we don't care about
-
-        ; do + 4 side
-        paddsb      mm5, [GLOBAL(t4)]     ; 3* (q0 - p0) + (p1 - q1) + 4
-
-        movq        mm0, mm5              ; get a copy of filters
-        psllw       mm0, 8                ; shift left 8
-        psraw       mm0, 3                ; arithmetic shift right 3
-        psrlw       mm0, 8
-        movq        mm1, mm5              ; get a copy of filters
-        psraw       mm1, 11               ; arithmetic shift right 11
-        psllw       mm1, 8                ; shift left 8 to put it back
-
-        por         mm0, mm1              ; put the two together to get result
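-
-        ; MMX has no per-byte arithmetic shift, so the two word-shift chains
-        ; above emulate one; per 16-bit lane holding bytes hi:lo, roughly
-        ; (hypothetical C):
-        ;
-        ;   lo2  = (uint8_t)(((int8_t)lo) >> 3);   // psllw 8, psraw 3, psrlw 8
-        ;   hi2  = (uint8_t)(((int8_t)hi) >> 3);   // psraw 11, psllw 8
-        ;   lane = (hi2 << 8) | lo2;               // por recombines the halves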
-
-        psubsb      mm3, mm0              ; q0-= q0 add
-        pxor        mm3, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi], mm3            ; write back
-
-
-        ; now do +3 side
-        psubsb      mm5, [GLOBAL(t1s)]     ; +3 instead of +4
-
-        movq        mm0, mm5              ; get a copy of filters
-        psllw       mm0, 8                ; shift left 8
-        psraw       mm0, 3                ; arithmetic shift right 3
-        psrlw       mm0, 8
-        psraw       mm5, 11               ; arithmetic shift right 11
-        psllw       mm5, 8                ; shift left 8 to put it back
-        por         mm0, mm5              ; put the two together to get result
-
-
-        paddsb      mm6, mm0              ; p0+= p0 add
-        pxor        mm6, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi+rax], mm6        ; write back
-
-        add         rsi,8
-        neg         rax
-        dec         rcx
-        jnz         .nexts8_h
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_mmx)
-sym(vp9_loop_filter_simple_vertical_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub          rsp, 32      ; reserve 32 bytes
-    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
-    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; source pitch
-
-        lea         rsi,        [rsi + rax*4 - 2]
-        mov         rcx, 2                                      ; count
-.nexts8_v:
-
-        lea         rdi,        [rsi + rax];
-        movd        mm0,        [rdi + rax * 2]                 ; xx xx xx xx 73 72 71 70
-
-        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 63 62 61 60
-        punpcklbw   mm6,        mm0                             ; 73 63 72 62 71 61 70 60
-
-        movd        mm0,        [rsi + rax]                     ; xx xx xx xx 53 52 51 50
-        movd        mm4,        [rsi]                           ; xx xx xx xx 43 42 41 40
-
-        punpcklbw   mm4,        mm0                             ; 53 43 52 42 51 41 50 40
-        movq        mm5,        mm4                             ; 53 43 52 42 51 41 50 40
-
-        punpcklwd   mm4,        mm6                             ; 71 61 51 41 70 60 50 40
-        punpckhwd   mm5,        mm6                             ; 73 63 53 43 72 62 52 42
-
-        neg         rax
-
-        movd        mm7,        [rsi + rax]                     ; xx xx xx xx 33 32 31 30
-        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 23 22 21 20
-
-        punpcklbw   mm6,        mm7                             ; 33 23 32 22 31 21 30 20
-        movd        mm1,        [rdi + rax * 4]                 ; xx xx xx xx 13 12 11 10
-
-        movd        mm0,        [rsi + rax * 4]                 ; xx xx xx xx 03 02 01 00
-        punpcklbw   mm0,        mm1                             ; 13 03 12 02 11 01 10 00
-
-        movq        mm2,        mm0                             ; 13 03 12 02 11 01 10 00
-        punpcklwd   mm0,        mm6                             ; 31 21 11 01 30 20 10 00
-
-        punpckhwd   mm2,        mm6                             ; 33 23 13 03 32 22 12 02
-        movq        mm1,        mm0                             ; 13 03 12 02 11 01 10 00
-
-        punpckldq   mm0,        mm4                             ; 70 60 50 40 30 20 10 00       = p1
-        movq        mm3,        mm2                             ; 33 23 13 03 32 22 12 02
-
-        punpckhdq   mm1,        mm4                             ; 71 61 51 41 31 21 11 01       = p0
-        punpckldq   mm2,        mm5                             ; 72 62 52 42 32 22 12 02       = q0
-
-        punpckhdq   mm3,        mm5                             ; 73 63 53 43 33 23 13 03       = q1
-
-
-        ; calculate mask
-        movq        mm6,        mm0                             ; p1
-        movq        mm7,        mm3                             ; q1
-        psubusb     mm7,        mm6                             ; q1-=p1
-        psubusb     mm6,        mm3                             ; p1-=q1
-        por         mm6,        mm7                             ; abs(p1-q1)
-        pand        mm6,        [GLOBAL(tfe)]                   ; set lsb of each byte to zero
-        psrlw       mm6,        1                               ; abs(p1-q1)/2
-
-        movq        mm5,        mm1                             ; p0
-        movq        mm4,        mm2                             ; q0
-
-        psubusb     mm5,        mm2                             ; p0-=q0
-        psubusb     mm4,        mm1                             ; q0-=p0
-
-        por         mm5,        mm4                             ; abs(p0 - q0)
-        paddusb     mm5,        mm5                             ; abs(p0-q0)*2
-        paddusb     mm5,        mm6                             ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        mov         rdx,        arg(2) ;blimit                          ; get blimit
-        movq        mm7,        [rdx]
-
-        psubusb     mm5,        mm7                             ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        mm7,        mm7
-        pcmpeqb     mm5,        mm7                             ; mm5 = mask
-
-        ; start work on filters
-        movq        t0,         mm0
-        movq        t1,         mm3
-
-        pxor        mm0,        [GLOBAL(t80)]                   ; p1 offset to convert to signed values
-        pxor        mm3,        [GLOBAL(t80)]                   ; q1 offset to convert to signed values
-
-        psubsb      mm0,        mm3                             ; p1 - q1
-        movq        mm6,        mm1                             ; p0
-
-        movq        mm7,        mm2                             ; q0
-        pxor        mm6,        [GLOBAL(t80)]                   ; offset to convert to signed values
-
-        pxor        mm7,        [GLOBAL(t80)]                   ; offset to convert to signed values
-        movq        mm3,        mm7                             ; offset q0 copy
-
-        psubsb      mm7,        mm6                             ; q0 - p0
-        paddsb      mm0,        mm7                             ; p1 - q1 + 1 * (q0 - p0)
-
-        paddsb      mm0,        mm7                             ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      mm0,        mm7                             ; p1 - q1 + 3 * (q0 - p0)
-
-        pand        mm5,        mm0                             ; mask filter values we don't care about
-
-        paddsb      mm5,        [GLOBAL(t4)]                    ;  3* (q0 - p0) + (p1 - q1) + 4
-
-        movq        mm0,        mm5                             ; get a copy of filters
-        psllw       mm0,        8                               ; shift left 8
-        psraw       mm0,        3                               ; arithmetic shift right 3
-        psrlw       mm0,        8
-
-        movq        mm7,        mm5                             ; get a copy of filters
-        psraw       mm7,        11                              ; arithmetic shift right 11
-        psllw       mm7,        8                               ; shift left 8 to put it back
-
-        por         mm0,        mm7                             ; put the two together to get result
-
-        psubsb      mm3,        mm0                             ; q0-= q0 add
-        pxor        mm3,        [GLOBAL(t80)]                   ; unoffset
-
-        ; now do +3 side
-        psubsb      mm5, [GLOBAL(t1s)]                          ; +3 instead of +4
-
-        movq        mm0, mm5                                    ; get a copy of filters
-        psllw       mm0, 8                                      ; shift left 8
-        psraw       mm0, 3                                      ; arithmetic shift right 3
-        psrlw       mm0, 8
-
-        psraw       mm5, 11                                     ; arithmetic shift right 11
-        psllw       mm5, 8                                      ; shift left 8 to put it back
-        por         mm0, mm5                                    ; put the two together to get result
-
-        paddsb      mm6, mm0                                    ; p0+= p0 add
-        pxor        mm6, [GLOBAL(t80)]                          ; unoffset
-
-
-        movq        mm0,        t0
-        movq        mm4,        t1
-
-        ; mm0 = 70 60 50 40 30 20 10 00
-        ; mm6 = 71 61 51 41 31 21 11 01
-        ; mm3 = 72 62 52 42 32 22 12 02
-        ; mm4 = 73 63 53 43 33 23 13 03
-        ; transpose back to write out
-
-        movq        mm1,        mm0                         ;
-        punpcklbw   mm0,        mm6                         ; 31 30 21 20 11 10 01 00
-
-        punpckhbw   mm1,        mm6                         ; 71 70 61 60 51 50 41 40
-        movq        mm2,        mm3                         ;
-
-        punpcklbw   mm2,        mm4                         ; 33 32 23 22 13 12 03 02
-        movq        mm5,        mm1                         ; 71 70 61 60 51 50 41 40
-
-        punpckhbw   mm3,        mm4                         ; 73 72 63 62 53 52 43 42
-        movq        mm6,        mm0                         ; 31 30 21 20 11 10 01 00
-
-        punpcklwd   mm0,        mm2                         ; 13 12 11 10 03 02 01 00
-        punpckhwd   mm6,        mm2                         ; 33 32 31 30 23 22 21 20
-
-        movd        [rsi+rax*4], mm0                        ; write 03 02 01 00
-        punpcklwd   mm1,        mm3                         ; 53 52 51 50 43 42 41 40
-
-        psrlq       mm0,        32                          ; xx xx xx xx 13 12 11 10
-        punpckhwd   mm5,        mm3                         ; 73 72 71 70 63 62 61 60
-
-        movd        [rdi+rax*4], mm0                        ; write 13 12 11 10
-        movd        [rsi+rax*2], mm6                        ; write 23 22 21 20
-
-        psrlq       mm6,        32                          ; 33 32 31 30
-        movd        [rsi],      mm1                         ; write 43 42 41 40
-
-        movd        [rsi + rax], mm6                        ; write 33 32 31 30
-        neg         rax
-
-        movd        [rsi + rax*2], mm5                      ; write 63 62 61 60
-        psrlq       mm1,        32                          ; 53 52 51 50
-
-        movd        [rdi],      mm1                         ; write out 53 52 51 50
-        psrlq       mm5,        32                          ; 73 72 71 70
-
-        movd        [rdi + rax*2], mm5                      ; write 73 72 71 70
-
-        lea         rsi,        [rsi+rax*8]                 ; next 8
-
-        dec         rcx
-        jnz         .nexts8_v
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
-;                  int y_stride,
-;                  loop_filter_info *lfi)
-;{
-;
-;
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;}
-
-SECTION_RODATA
-align 16
-tfe:
-    times 8 db 0xfe
-align 16
-t80:
-    times 8 db 0x80
-align 16
-t1s:
-    times 8 db 0x01
-align 16
-t3:
-    times 8 db 0x03
-align 16
-t4:
-    times 8 db 0x04
-align 16
-ones:
-    times 4 dw 0x0001
-align 16
-s27:
-    times 4 dw 0x1b00
-align 16
-s18:
-    times 4 dw 0x1200
-align 16
-s9:
-    times 4 dw 0x0900
-align 16
-s63:
-    times 4 dw 0x003f
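-
-; A note on the constant pool above, as the routines in this file use it:
-;   tfe               - 0xfe per byte: clears each byte's lsb so the psrlw
-;                       halving cannot leak a bit between byte lanes
-;   t80               - 0x80 per byte: xor bias mapping unsigned [0,255]
-;                       pixels onto signed [-128,127] and back
-;   t1s, t3, t4       - per-byte addends for the +3/+4 filter rounding
-;   ones              - word 1: rounding addend for the (Filter1 + 1) >> 1
-;                       outer tap
-;   s27, s18, s9, s63 - word constants (27, 18, 9 in the high byte, and 63),
-;                       presumably the taps of the wider macroblock filter
-;                       defined earlier in this file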
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ /dev/null
@@ -1,1238 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; Use of pmaxub instead of psubusb to compute the filter mask was seen
-; in ffvp8 (FFmpeg's VP8 decoder).
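-; Where psubusb-only code builds abs(a-b) as (a-b)|(b-a) with saturating
-; subtracts and compares each difference against the limit separately,
-; pmaxub folds all the differences into one running maximum first, so a
-; single subtract/compare decides the mask; roughly (hypothetical C, one
-; byte lane):
-;
-;   m    = max(abs(p3-p2), abs(p2-p1), abs(p1-p0),
-;              abs(q1-q0), abs(q2-q1), abs(q3-q2));
-;   mask = (m <= limit) && (abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit);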
-
-%macro LFH_FILTER_AND_HEV_MASK 1
-%if %1
-        movdqa      xmm2,                   [rdi+2*rax]       ; q3
-        movdqa      xmm1,                   [rsi+2*rax]       ; q2
-        movdqa      xmm4,                   [rsi+rax]         ; q1
-        movdqa      xmm5,                   [rsi]             ; q0
-        neg         rax                     ; negate pitch to deal with above border
-%else
-        movlps      xmm2,                   [rsi + rcx*2]     ; q3
-        movlps      xmm1,                   [rsi + rcx]       ; q2
-        movlps      xmm4,                   [rsi]             ; q1
-        movlps      xmm5,                   [rsi + rax]       ; q0
-
-        movhps      xmm2,                   [rdi + rcx*2]
-        movhps      xmm1,                   [rdi + rcx]
-        movhps      xmm4,                   [rdi]
-        movhps      xmm5,                   [rdi + rax]
-
-        lea         rsi,                    [rsi + rax*4]
-        lea         rdi,                    [rdi + rax*4]
-
-        movdqa      XMMWORD PTR [rsp],      xmm1              ; store q2
-        movdqa      XMMWORD PTR [rsp + 16], xmm4              ; store q1
-%endif
-
-        movdqa      xmm6,                   xmm1              ; q2
-        movdqa      xmm3,                   xmm4              ; q1
-
-        psubusb     xmm1,                   xmm2              ; q2-=q3
-        psubusb     xmm2,                   xmm6              ; q3-=q2
-
-        psubusb     xmm4,                   xmm6              ; q1-=q2
-        psubusb     xmm6,                   xmm3              ; q2-=q1
-
-        por         xmm4,                   xmm6              ; abs(q2-q1)
-        por         xmm1,                   xmm2              ; abs(q3-q2)
-
-        movdqa      xmm0,                   xmm5              ; q0
-        pmaxub      xmm1,                   xmm4
-
-        psubusb     xmm5,                   xmm3              ; q0-=q1
-        psubusb     xmm3,                   xmm0              ; q1-=q0
-
-        por         xmm5,                   xmm3              ; abs(q0-q1)
-        movdqa      t0,                     xmm5              ; save to t0
-
-        pmaxub      xmm1,                   xmm5
-
-%if %1
-        movdqa      xmm2,                   [rsi+4*rax]       ; p3
-        movdqa      xmm4,                   [rdi+4*rax]       ; p2
-        movdqa      xmm6,                   [rsi+2*rax]       ; p1
-%else
-        movlps      xmm2,                   [rsi + rax]       ; p3
-        movlps      xmm4,                   [rsi]             ; p2
-        movlps      xmm6,                   [rsi + rcx]       ; p1
-
-        movhps      xmm2,                   [rdi + rax]
-        movhps      xmm4,                   [rdi]
-        movhps      xmm6,                   [rdi + rcx]
-
-        movdqa      XMMWORD PTR [rsp + 32], xmm4              ; store p2
-        movdqa      XMMWORD PTR [rsp + 48], xmm6              ; store p1
-%endif
-
-        movdqa      xmm5,                   xmm4              ; p2
-        movdqa      xmm3,                   xmm6              ; p1
-
-        psubusb     xmm4,                   xmm2              ; p2-=p3
-        psubusb     xmm2,                   xmm5              ; p3-=p2
-
-        psubusb     xmm3,                   xmm5              ; p1-=p2
-        pmaxub      xmm1,                   xmm4              ; abs(p3 - p2)
-
-        psubusb     xmm5,                   xmm6              ; p2-=p1
-        pmaxub      xmm1,                   xmm2              ; abs(p3 - p2)
-
-        pmaxub      xmm1,                   xmm5              ; abs(p2 - p1)
-        movdqa      xmm2,                   xmm6              ; p1
-
-        pmaxub      xmm1,                   xmm3              ; abs(p2 - p1)
-%if %1
-        movdqa      xmm4,                   [rsi+rax]         ; p0
-        movdqa      xmm3,                   [rdi]             ; q1
-%else
-        movlps      xmm4,                   [rsi + rcx*2]     ; p0
-        movhps      xmm4,                   [rdi + rcx*2]
-        movdqa      xmm3,                   q1                ; q1
-%endif
-
-        movdqa      xmm5,                   xmm4              ; p0
-        psubusb     xmm4,                   xmm6              ; p0-=p1
-
-        psubusb     xmm6,                   xmm5              ; p1-=p0
-
-        por         xmm6,                   xmm4              ; abs(p1 - p0)
-        mov         rdx,                    arg(2)            ; get blimit
-
-        movdqa        t1,                   xmm6              ; save to t1
-
-        movdqa      xmm4,                   xmm3              ; q1
-        pmaxub      xmm1,                   xmm6
-
-        psubusb     xmm3,                   xmm2              ; q1-=p1
-        psubusb     xmm2,                   xmm4              ; p1-=q1
-
-        psubusb     xmm1,                   xmm7              ; max(abs diffs) > limit
-        por         xmm2,                   xmm3              ; abs(p1-q1)
-
-        movdqa      xmm7,                   XMMWORD PTR [rdx] ; blimit
-
-        movdqa      xmm3,                   xmm0              ; q0
-        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero
-
-        mov         rdx,                    arg(4)            ; hev get thresh
-
-        movdqa      xmm6,                   xmm5              ; p0
-        psrlw       xmm2,                   1                 ; abs(p1-q1)/2
-
-        psubusb     xmm5,                   xmm3              ; p0-=q0
-
-        psubusb     xmm3,                   xmm6              ; q0-=p0
-        por         xmm5,                   xmm3              ; abs(p0 - q0)
-
-        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2
-
-        movdqa      xmm4,                   t0                ; hev get abs (q1 - q0)
-
-        movdqa      xmm3,                   t1                ; get abs (p1 - p0)
-
-        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        movdqa      xmm2,                   XMMWORD PTR [rdx] ; hev
-
-        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        psubusb     xmm4,                   xmm2              ; hev
-
-        psubusb     xmm3,                   xmm2              ; hev
-        por         xmm1,                   xmm5
-
-        pxor        xmm7,                   xmm7
-        paddb       xmm4,                   xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
-        pcmpeqb     xmm4,                   xmm5              ; hev
-        pcmpeqb     xmm3,                   xmm3              ; hev
-
-        pcmpeqb     xmm1,                   xmm7              ; mask xmm1
-        pxor        xmm4,                   xmm3              ; hev
-%endmacro
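-
-; On exit from LFH_FILTER_AND_HEV_MASK, xmm1 holds the filter mask
-; (all-ones bytes where the combined test above passes) and xmm4 holds the
-; hev mask, valid wherever the filter mask passes; B_FILTER consumes both.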
-
-%macro B_FILTER 1
-%if %1 == 0
-        movdqa      xmm2,                   p1                ; p1
-        movdqa      xmm7,                   q1                ; q1
-%elif %1 == 1
-        movdqa      xmm2,                   [rsi+2*rax]       ; p1
-        movdqa      xmm7,                   [rdi]             ; q1
-%elif %1 == 2
-        lea         rdx,                    srct
-
-        movdqa      xmm2,                   [rdx]             ; p1
-        movdqa      xmm7,                   [rdx+48]          ; q1
-        movdqa      xmm6,                   [rdx+16]          ; p0
-        movdqa      xmm0,                   [rdx+32]          ; q0
-%endif
-
-        pxor        xmm2,                   [GLOBAL(t80)]     ; p1 offset to convert to signed values
-        pxor        xmm7,                   [GLOBAL(t80)]     ; q1 offset to convert to signed values
-
-        psubsb      xmm2,                   xmm7              ; p1 - q1
-        pxor        xmm6,                   [GLOBAL(t80)]     ; offset to convert to signed values
-
-        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
-        pxor        xmm0,                   [GLOBAL(t80)]     ; offset to convert to signed values
-
-        movdqa      xmm3,                   xmm0              ; q0
-        psubsb      xmm0,                   xmm6              ; q0 - p0
-
-        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
-
-        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
-
-        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
-
-        pand        xmm1,                   xmm2              ; mask filter values we don't care about
-
-        movdqa      xmm2,                   xmm1
-
-        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
-        punpckhbw   xmm5,                   xmm2              ; axbxcxdx
-        punpcklbw   xmm2,                   xmm2              ; exfxgxhx
-
-        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
-        psraw       xmm5,                   11                ; sign extended shift right by 3
-
-        punpckhbw   xmm1,                   xmm1              ; axbxcxdx
-        psraw       xmm2,                   11                ; sign extended shift right by 3
-
-        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-        psraw       xmm0,                   11                ; sign extended shift right by 3
-
-        psraw       xmm1,                   11                ; sign extended shift right by 3
-        movdqa      xmm5,                   xmm0              ; save results
-
-        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
-        paddsw      xmm5,                   [GLOBAL(ones)]
-
-        paddsw      xmm1,                   [GLOBAL(ones)]
-        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap
-
-        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap
-
-        paddsb      xmm6,                   xmm2              ; p0+= p0 add
-        packsswb    xmm5,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-
-%if %1 == 0
-        movdqa      xmm1,                   p1                ; p1
-%elif %1 == 1
-        movdqa      xmm1,                   [rsi+2*rax]       ; p1
-%elif %1 == 2
-        movdqa      xmm1,                   [rdx]             ; p1
-%endif
-        pandn       xmm4,                   xmm5              ; high edge variance additive
-        pxor        xmm6,                   [GLOBAL(t80)]     ; unoffset
-
-        pxor        xmm1,                   [GLOBAL(t80)]     ; reoffset
-        psubsb      xmm3,                   xmm0              ; q0-= q0 add
-
-        paddsb      xmm1,                   xmm4              ; p1+= p1 add
-        pxor        xmm3,                   [GLOBAL(t80)]     ; unoffset
-
-        pxor        xmm1,                   [GLOBAL(t80)]     ; unoffset
-        psubsb      xmm7,                   xmm4              ; q1-= q1 add
-
-        pxor        xmm7,                   [GLOBAL(t80)]     ; unoffset
-%if %1 == 0
-        lea         rsi,                    [rsi + rcx*2]
-        lea         rdi,                    [rdi + rcx*2]
-        movq        MMWORD PTR [rsi],       xmm6              ; p0
-        movhps      MMWORD PTR [rdi],       xmm6
-        movq        MMWORD PTR [rsi + rax], xmm1              ; p1
-        movhps      MMWORD PTR [rdi + rax], xmm1
-        movq        MMWORD PTR [rsi + rcx], xmm3              ; q0
-        movhps      MMWORD PTR [rdi + rcx], xmm3
-        movq        MMWORD PTR [rsi + rcx*2],xmm7             ; q1
-        movhps      MMWORD PTR [rdi + rcx*2],xmm7
-%elif %1 == 1
-        movdqa      [rsi+rax],              xmm6              ; write back
-        movdqa      [rsi+2*rax],            xmm1              ; write back
-        movdqa      [rsi],                  xmm3              ; write back
-        movdqa      [rdi],                  xmm7              ; write back
-%endif
-
-%endmacro
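-
-; B_FILTER applies the same +4/+3 tap arithmetic as the MMX filters above,
-; in three flavours selected by %1: 0 reads p1/q1 from the stack temporaries
-; and writes the rows back split across rsi/rdi (the u and v planes), 1
-; reads and writes the rows in place, and 2 filters the transposed columns
-; parked in srct, leaving the results in registers for BV_TRANSPOSE.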
-
-
-;void vp9_loop_filter_horizontal_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh,
-;    int            count
-;)
-global sym(vp9_loop_filter_horizontal_edge_sse2)
-sym(vp9_loop_filter_horizontal_edge_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 32     ; reserve 32 bytes
-    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];
-
-        mov         rsi,                    arg(0)           ;src_ptr
-        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step
-
-        mov         rdx,                    arg(3)           ;limit
-        movdqa      xmm7,                   XMMWORD PTR [rdx]
-
-        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing
-
-        ; calculate breakout conditions and high edge variance
-        LFH_FILTER_AND_HEV_MASK 1
-        ; filter and write back the result
-        B_FILTER 1
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_horizontal_edge_uv_sse2
-;(
-;    unsigned char *src_ptr,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh,
-;    int            count
-;)
-global sym(vp9_loop_filter_horizontal_edge_uv_sse2)
-sym(vp9_loop_filter_horizontal_edge_uv_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 96       ; reserve 96 bytes
-    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
-    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
-    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
-    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
-    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
-    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];
-
-        mov         rsi,                    arg(0)             ; u
-        mov         rdi,                    arg(5)             ; v
-        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
-        mov         rcx,                    rax
-        neg         rax                     ; negate pitch to deal with above border
-
-        mov         rdx,                    arg(3)             ;limit
-        movdqa      xmm7,                   XMMWORD PTR [rdx]
-
-        lea         rsi,                    [rsi + rcx]
-        lea         rdi,                    [rdi + rcx]
-
-        ; calculate breakout conditions and high edge variance
-        LFH_FILTER_AND_HEV_MASK 0
-        ; filter and write back the result
-        B_FILTER 0
-
-    add rsp, 96
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-%macro TRANSPOSE_16X8 2
-        movq        xmm4,               QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
-        movq        xmm1,               QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
-        movq        xmm0,               QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
-        movq        xmm7,               QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
-        movq        xmm5,               QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
-        movq        xmm2,               QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
-
-        punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
-
-        movq        xmm1,               QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
-
-        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
-        punpcklbw   xmm0,               xmm7            ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
-
-        movq        xmm7,               QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
-
-        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
-%if %1
-        lea         rsi,                [rsi+rax*8]
-%else
-        mov         rsi,                arg(5)          ; v_ptr
-%endif
-
-        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
-        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
-
-        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
-        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
-%if %1
-        lea         rdi,                [rdi+rax*8]
-%else
-        lea         rsi,                [rsi - 4]
-%endif
-
-        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-%if %1
-        lea         rdx,                srct
-%else
-        lea         rdi,                [rsi + rax]     ; rdi points to row +1 for indirect addressing
-%endif
-
-        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
-
-        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
-        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
-        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
-
-        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
-
-        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-
-        movdqa      t0,                 xmm2            ; save to free XMM2
-        movq        xmm2,               QWORD PTR [rsi]       ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
-        movq        xmm6,               QWORD PTR [rdi]       ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
-        movq        xmm0,               QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
-        movq        xmm5,               QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
-        movq        xmm1,               QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
-
-        punpcklbw   xmm2,               xmm6            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
-        movq        xmm6,               QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
-
-        punpcklbw   xmm0,               xmm5                  ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
-
-        movq        xmm5,               QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
-
-        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
-
-        movq        xmm6,               QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
-
-        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
-
-        movdqa      xmm6,               xmm1            ;
-        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
-
-        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
-        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
-        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
-
-        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
-        movdqa      xmm0,               xmm5
-        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-
-        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
-        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
-
-        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
-        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
-
-        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
-
-        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
-%if %2
-        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
-        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
-        movdqa      [rdx],              xmm2            ; save 2
-
-        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
-        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-
-        movdqa      [rdx+16],           xmm3            ; save 3
-
-        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
-
-        movdqa      [rdx+32],           xmm4            ; save 4
-        movdqa      [rdx+48],           xmm5            ; save 5
-        movdqa      xmm1,               t0              ; reload transposed rows 0 and 1 saved in t0
-
-        movdqa      xmm2,               xmm1            ;
-        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-
-        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-%else
-        movdqa      [rdx+112],          xmm7            ; save 7
-
-        movdqa      [rdx+96],           xmm6            ; save 6
-
-        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
-        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
-        movdqa      [rdx+32],           xmm2            ; save 2
-
-        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
-        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-
-        movdqa      [rdx+48],           xmm3            ; save 3
-
-        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
-
-        movdqa      [rdx+64],           xmm4            ; save 4
-        movdqa      [rdx+80],           xmm5            ; save 5
-        movdqa      xmm1,               t0              ; reload transposed rows 0 and 1 saved in t0
-
-        movdqa      xmm2,               xmm1
-        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-
-        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-
-        movdqa      [rdx+16],           xmm1
-
-        movdqa      [rdx],              xmm2
-%endif
-%endmacro
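-
-; A vertical edge runs across the SIMD lanes, so TRANSPOSE_16X8 first turns
-; a 16-pixel-tall, 8-pixel-wide strip into 8 rows of 16 pixels; the edge
-; then lies between rows and the horizontal-edge arithmetic is reused
-; unchanged.  %1 selects y (advance in place) versus u+v (switch to the v
-; plane) addressing; %2 selects whether only the four middle rows or all
-; eight are spilled to the srct scratch area.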
-
-%macro LFV_FILTER_MASK_HEV_MASK 1
-        movdqa      xmm0,               xmm6            ; q2
-        psubusb     xmm0,               xmm7            ; q2-q3
-
-        psubusb     xmm7,               xmm6            ; q3-q2
-        movdqa      xmm4,               xmm5            ; q1
-
-        por         xmm7,               xmm0            ; abs (q3-q2)
-        psubusb     xmm4,               xmm6            ; q1-q2
-
-        movdqa      xmm0,               xmm1
-        psubusb     xmm6,               xmm5            ; q2-q1
-
-        por         xmm6,               xmm4            ; abs (q2-q1)
-        psubusb     xmm0,               xmm2            ; p2 - p3;
-
-        psubusb     xmm2,               xmm1            ; p3 - p2;
-        por         xmm0,               xmm2            ; abs(p2-p3)
-%if %1
-        movdqa      xmm2,               [rdx]           ; p1
-%else
-        movdqa      xmm2,               [rdx+32]        ; p1
-%endif
-        movdqa      xmm5,               xmm2            ; p1
-        pmaxub      xmm0,               xmm7
-
-        psubusb     xmm5,               xmm1            ; p1-p2
-        psubusb     xmm1,               xmm2            ; p2-p1
-
-        movdqa      xmm7,               xmm3            ; p0
-        psubusb     xmm7,               xmm2            ; p0-p1
-
-        por         xmm1,               xmm5            ; abs(p2-p1)
-        pmaxub      xmm0,               xmm6
-
-        pmaxub      xmm0,               xmm1
-        movdqa      xmm1,               xmm2            ; p1
-
-        psubusb     xmm2,               xmm3            ; p1-p0
-        lea         rdx,                srct
-
-        por         xmm2,               xmm7            ; abs(p1-p0)
-
-        movdqa      t0,                 xmm2            ; save abs(p1-p0)
-
-        pmaxub      xmm0,               xmm2
-
-%if %1
-        movdqa      xmm5,               [rdx+32]        ; q0
-        movdqa      xmm7,               [rdx+48]        ; q1
-%else
-        movdqa      xmm5,               [rdx+64]        ; q0
-        movdqa      xmm7,               [rdx+80]        ; q1
-%endif
-        mov         rdx,                arg(3)          ; limit
-
-        movdqa      xmm6,               xmm5            ; q0
-        movdqa      xmm2,               xmm7            ; q1
-
-        psubusb     xmm5,               xmm7            ; q0-q1
-        psubusb     xmm7,               xmm6            ; q1-q0
-
-        por         xmm7,               xmm5            ; abs(q1-q0)
-
-        movdqa      t1,                 xmm7            ; save abs(q1-q0)
-
-        movdqa      xmm4,               XMMWORD PTR [rdx]; limit
-
-        pmaxub      xmm0,               xmm7
-        mov         rdx,                arg(2)          ; blimit
-
-        psubusb     xmm0,               xmm4
-        movdqa      xmm5,               xmm2            ; q1
-
-        psubusb     xmm5,               xmm1            ; q1-=p1
-        psubusb     xmm1,               xmm2            ; p1-=q1
-
-        por         xmm5,               xmm1            ; abs(p1-q1)
-        movdqa      xmm1,               xmm3            ; p0
-
-        pand        xmm5,               [GLOBAL(tfe)]   ; set lsb of each byte to zero
-        psubusb     xmm1,               xmm6            ; p0-q0
-
-        psrlw       xmm5,               1               ; abs(p1-q1)/2
-        psubusb     xmm6,               xmm3            ; q0-p0
-
-        movdqa      xmm4,               XMMWORD PTR [rdx]; blimit
-
-        mov         rdx,                arg(4)          ; get thresh
-
-        por         xmm1,               xmm6            ; abs(q0-p0)
-
-        movdqa      xmm6,               t0              ; get abs (p1 - p0)
-
-        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2
-
-        movdqa      xmm3,               t1              ; get abs (q1 - q0)
-
-        movdqa      xmm7,               XMMWORD PTR [rdx]
-
-        paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
-        psubusb     xmm6,               xmm7            ; abs(p1 - p0) > thresh
-
-        psubusb     xmm3,               xmm7            ; abs(q1 - q0) > thresh
-
-        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        por         xmm6,               xmm3            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
-        por         xmm1,               xmm0            ; mask
-        pcmpeqb     xmm6,               xmm0
-
-        pxor        xmm0,               xmm0
-        pcmpeqb     xmm4,               xmm4
-
-        pcmpeqb     xmm1,               xmm0
-        pxor        xmm4,               xmm6
-%endmacro
-
-%macro BV_TRANSPOSE 0
-        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
-        movdqa      xmm2,               xmm1            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        punpcklbw   xmm2,               xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-
-        movdqa      xmm4,               xmm3            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-        punpckhbw   xmm1,               xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
-        punpcklbw   xmm4,               xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
-
-        punpckhbw   xmm3,               xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
-
-        movdqa      xmm6,               xmm2            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-        punpcklwd   xmm2,               xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
-
-        punpckhwd   xmm6,               xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
-        movdqa      xmm5,               xmm1            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
-        punpcklwd   xmm1,               xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
-
-        punpckhwd   xmm5,               xmm3            ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
-        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
-        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
-        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
-        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
-%endmacro
-
-%macro BV_WRITEBACK 2
-        movd        [rsi+2],            %1
-        psrldq      %1,                 4
-
-        movd        [rdi+2],            %1
-        psrldq      %1,                 4
-
-        movd        [rsi+2*rax+2],      %1
-        psrldq      %1,                 4
-
-        movd        [rdi+2*rax+2],      %1
-
-        movd        [rsi+4*rax+2],      %2
-        psrldq      %2,                 4
-
-        movd        [rdi+4*rax+2],      %2
-        psrldq      %2,                 4
-
-        movd        [rsi+2*rcx+2],      %2
-        psrldq      %2,                 4
-
-        movd        [rdi+2*rcx+2],      %2
-%endmacro
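-
-; BV_WRITEBACK scatters the re-transposed results back as columns: each
-; movd stores four filtered pixels at x offset +2, i.e. only the p1 p0 q0 q1
-; columns of the 8-wide strip are rewritten; p3/p2 and q2/q3 are untouched.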
-
-
-;void vp9_loop_filter_vertical_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh,
-;    int            count
-;)
-global sym(vp9_loop_filter_vertical_edge_sse2)
-sym(vp9_loop_filter_vertical_edge_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub             rsp, 96      ; reserve 96 bytes
-    %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
-    %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];
-
-        mov         rsi,        arg(0)                  ; src_ptr
-        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
-
-        lea         rsi,        [rsi - 4]
-        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
-        lea         rcx,        [rax*2+rax]
-
-        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
-        TRANSPOSE_16X8 1, 1
-
-        ; calculate filter mask and high edge variance
-        LFV_FILTER_MASK_HEV_MASK 1
-
-        ; start work on filters
-        B_FILTER 2
-
-        ; transpose and write back - only works on q1, q0, p0, p1
-        BV_TRANSPOSE
-        ; store 16-line result
-
-        lea         rdx,        [rax]
-        neg         rdx
-
-        BV_WRITEBACK xmm1, xmm5
-
-        lea         rsi,        [rsi+rdx*8]
-        lea         rdi,        [rdi+rdx*8]
-        BV_WRITEBACK xmm2, xmm6
-
-    add rsp, 96
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_vertical_edge_uv_sse2
-;(
-;    unsigned char *u,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh,
-;    unsigned char *v
-;)
-global sym(vp9_loop_filter_vertical_edge_uv_sse2)
-sym(vp9_loop_filter_vertical_edge_uv_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub             rsp, 96      ; reserve 96 bytes
-    %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
-    %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];
-
-        mov         rsi,        arg(0)                  ; u_ptr
-        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
-
-        lea         rsi,        [rsi - 4]
-        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
-        lea         rcx,        [rax+2*rax]
-
-        lea         rdx,        srct
-
-        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
-        TRANSPOSE_16X8 0, 1
-
-        ; calculate filter mask and high edge variance
-        LFV_FILTER_MASK_HEV_MASK 1
-
-        ; start work on filters
-        B_FILTER 2
-
-        ; transpose and write back - only works on q1, q0, p0, p1
-        BV_TRANSPOSE
-
-        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
-
-        ; store 16-line result
-        BV_WRITEBACK xmm1, xmm5
-
-        mov         rsi,        arg(0)                  ; u_ptr
-        lea         rsi,        [rsi - 4]
-        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
-        BV_WRITEBACK xmm2, xmm6
-
-    add rsp, 96
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_loop_filter_simple_horizontal_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_sse2)
-sym(vp9_loop_filter_simple_horizontal_edge_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi, arg(0)             ;src_ptr
-        movsxd      rax, dword ptr arg(1)   ;src_pixel_step (source stride)
-        mov         rdx, arg(2)             ;blimit
-        movdqa      xmm3, XMMWORD PTR [rdx]
-
-        mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
-        neg         rax
-
-        ; calculate mask
-        movdqa      xmm1, [rsi+2*rax]       ; p1
-        movdqa      xmm0, [rdi]             ; q1
-        movdqa      xmm2, xmm1
-        movdqa      xmm7, xmm0
-        movdqa      xmm4, xmm0
-        psubusb     xmm0, xmm1              ; q1-=p1
-        psubusb     xmm1, xmm4              ; p1-=q1
-        por         xmm1, xmm0              ; abs(p1-q1)
-        pand        xmm1, [GLOBAL(tfe)]     ; set lsb of each byte to zero
-        psrlw       xmm1, 1                 ; abs(p1-q1)/2
-
-        movdqa      xmm5, [rsi+rax]         ; p0
-        movdqa      xmm4, [rsi]             ; q0
-        movdqa      xmm0, xmm4              ; q0
-        movdqa      xmm6, xmm5              ; p0
-        psubusb     xmm5, xmm4              ; p0-=q0
-        psubusb     xmm4, xmm6              ; q0-=p0
-        por         xmm5, xmm4              ; abs(p0 - q0)
-        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
-        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        xmm3, xmm3
-        pcmpeqb     xmm5, xmm3
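-        ; xmm5 is now 0xff where abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit
-        ; (filtering allowed) and 0x00 where the edge is too strong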
-
-        ; start work on filters
-        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
-        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
-        psubsb      xmm2, xmm7              ; p1 - q1
-
-        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
-        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values
-        movdqa      xmm3, xmm0              ; q0
-        psubsb      xmm0, xmm6              ; q0 - p0
-        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
-        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
-        pand        xmm5, xmm2              ; mask filter values we don't care about
-
-        ; do + 4 side
-        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4
-
-        movdqa      xmm0, xmm5              ; get a copy of filters
-        psllw       xmm0, 8                 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 3
-        psrlw       xmm0, 8
-        movdqa      xmm1, xmm5              ; get a copy of filters
-        psraw       xmm1, 11                ; arithmetic shift right 11
-        psllw       xmm1, 8                 ; shift left 8 to put it back
-
-        por         xmm0, xmm1              ; put the two together to get result
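-        ; note: SSE2 has no per-byte arithmetic shift, so the signed >> 3
-        ; above is emulated on 16-bit lanes: low bytes via psllw 8 / psraw 3 /
-        ; psrlw 8, high bytes via psraw 11 / psllw 8, then OR'd back together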
-
-        psubsb      xmm3, xmm0              ; q0-= q0 add
-        pxor        xmm3, [GLOBAL(t80)]     ; unoffset
-        movdqa      [rsi], xmm3             ; write back
-
-        ; now do +3 side
-        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4
-
-        movdqa      xmm0, xmm5              ; get a copy of filters
-        psllw       xmm0, 8                 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 3
-        psrlw       xmm0, 8
-        psraw       xmm5, 11                ; arithmetic shift right 11
-        psllw       xmm5, 8                 ; shift left 8 to put it back
-        por         xmm0, xmm5              ; put the two together to get result
-
-
-        paddsb      xmm6, xmm0              ; p0+= p0 add
-        pxor        xmm6, [GLOBAL(t80)]     ; unoffset
-        movdqa      [rsi+rax], xmm6         ; write back
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_sse2)
-sym(vp9_loop_filter_simple_vertical_edge_sse2):
-    push        rbp         ; save old base pointer value.
-    mov         rbp, rsp    ; set new base pointer value.
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 7
-    GET_GOT     rbx         ; save callee-saved reg
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 32                         ; reserve 32 bytes
-    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step (source stride)
-
-        lea         rsi,        [rsi - 2]
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
-        movd        xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
-        movd        xmm2,       [rdi]                   ; 13 12 11 10
-        movd        xmm3,       [rcx]                   ; 53 52 51 50
-        punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
-        punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10
-
-        movd        xmm4,       [rsi + rax*2]           ; 23 22 21 20
-        movd        xmm5,       [rdx + rax*2]           ; 63 62 61 60
-        movd        xmm6,       [rdi + rax*2]           ; 33 32 31 30
-        movd        xmm7,       [rcx + rax*2]           ; 73 72 71 70
-        punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
-        punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30
-
-        punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
-        punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
-
-        movdqa      xmm1,       xmm0
-        punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-        punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
-        movdqa      xmm2,       xmm0
-        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
-        movdqa      t0,         xmm0                    ; save to t0
-        movdqa      t1,         xmm2                    ; save to t1
-
-        lea         rsi,        [rsi + rax*8]
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        xmm4,       [rsi]                   ; 83 82 81 80
-        movd        xmm1,       [rdx]                   ; c3 c2 c1 c0
-        movd        xmm6,       [rdi]                   ; 93 92 91 90
-        movd        xmm3,       [rcx]                   ; d3 d2 d1 d0
-        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
-        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90
-
-        movd        xmm0,       [rsi + rax*2]           ; a3 a2 a1 a0
-        movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
-        movd        xmm2,       [rdi + rax*2]           ; b3 b2 b1 b0
-        movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
-        punpckldq   xmm0,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
-        punpckldq   xmm2,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
-
-        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
-        punpcklbw   xmm0,       xmm2                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
-
-        movdqa      xmm1,       xmm4
-        punpcklwd   xmm4,       xmm0                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
-        punpckhwd   xmm1,       xmm0                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
-
-        movdqa      xmm6,       xmm4
-        punpckldq   xmm4,       xmm1                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-        punpckhdq   xmm6,       xmm1                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-
-        movdqa      xmm0,       t0                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-        movdqa      xmm2,       t1                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-        movdqa      xmm1,       xmm0
-        movdqa      xmm3,       xmm2
-
-        punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-        punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
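-        ; after this transpose each register holds one tap position
-        ; (p1, p0, q0, q1) for all 16 rows, so the vertical edge is
-        ; filtered with the same column-wise math as a horizontal one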
-
-        ; calculate mask
-        movdqa      xmm6,       xmm0                            ; p1
-        movdqa      xmm7,       xmm3                            ; q1
-        psubusb     xmm7,       xmm0                            ; q1-=p1
-        psubusb     xmm6,       xmm3                            ; p1-=q1
-        por         xmm6,       xmm7                            ; abs(p1-q1)
-        pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
-        psrlw       xmm6,       1                               ; abs(p1-q1)/2
-
-        movdqa      xmm5,       xmm1                            ; p0
-        movdqa      xmm4,       xmm2                            ; q0
-        psubusb     xmm5,       xmm2                            ; p0-=q0
-        psubusb     xmm4,       xmm1                            ; q0-=p0
-        por         xmm5,       xmm4                            ; abs(p0 - q0)
-        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
-        paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        mov         rdx,        arg(2)                          ;blimit
-        movdqa      xmm7, XMMWORD PTR [rdx]
-
-        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        xmm7,        xmm7
-        pcmpeqb     xmm5,        xmm7                           ; xmm5 = mask
-
-        ; start work on filters
-        movdqa        t0,        xmm0
-        movdqa        t1,        xmm3
-
-        pxor        xmm0,        [GLOBAL(t80)]                  ; p1 offset to convert to signed values
-        pxor        xmm3,        [GLOBAL(t80)]                  ; q1 offset to convert to signed values
-
-        psubsb      xmm0,        xmm3                           ; p1 - q1
-        movdqa      xmm6,        xmm1                           ; p0
-
-        movdqa      xmm7,        xmm2                           ; q0
-        pxor        xmm6,        [GLOBAL(t80)]                  ; offset to convert to signed values
-
-        pxor        xmm7,        [GLOBAL(t80)]                  ; offset to convert to signed values
-        movdqa      xmm3,        xmm7                           ; signed-domain copy of q0
-
-        psubsb      xmm7,        xmm6                           ; q0 - p0
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 1 * (q0 - p0)
-
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 3 * (q0 - p0)
-
-        pand        xmm5,        xmm0                           ; mask filter values we don't care about
-
-
-        paddsb      xmm5,        [GLOBAL(t4)]                   ;  3* (q0 - p0) + (p1 - q1) + 4
-
-        movdqa      xmm0,        xmm5                           ; get a copy of filters
-        psllw       xmm0,        8                              ; shift left 8
-
-        psraw       xmm0,        3                              ; arithmetic shift right 3
-        psrlw       xmm0,        8
-
-        movdqa      xmm7,        xmm5                           ; get a copy of filters
-        psraw       xmm7,        11                             ; arithmetic shift right 11
-
-        psllw       xmm7,        8                              ; shift left 8 to put it back
-        por         xmm0,        xmm7                           ; put the two together to get result
-
-        psubsb      xmm3,        xmm0                           ; q0 -= filter value (signed domain)
-        pxor        xmm3,        [GLOBAL(t80)]                  ; unoffset   q0
-
-        ; now do +3 side
-        psubsb      xmm5,        [GLOBAL(t1s)]                  ; +3 instead of +4
-        movdqa      xmm0,        xmm5                           ; get a copy of filters
-
-        psllw       xmm0,        8                              ; shift left 8
-        psraw       xmm0,        3                              ; arithmetic shift right 3
-
-        psrlw       xmm0,        8
-        psraw       xmm5,        11                             ; arithmetic shift right 11
-
-        psllw       xmm5,        8                              ; shift left 8 to put it back
-        por         xmm0,        xmm5                           ; put the two together to get result
-
-        paddsb      xmm6,        xmm0                           ; p0+= p0 add
-        pxor        xmm6,        [GLOBAL(t80)]                  ; unoffset   p0
-
-        movdqa      xmm0,        t0                             ; p1
-        movdqa      xmm4,        t1                             ; q1
-
-        ; transpose back to write out
-        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-        movdqa      xmm1,       xmm0
-        punpcklbw   xmm0,       xmm6                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
-        punpckhbw   xmm1,       xmm6                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
-
-        movdqa      xmm5,       xmm3
-        punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-        punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
-        movdqa      xmm2,       xmm0
-        punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
-        punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
-
-        movdqa      xmm3,       xmm1
-        punpcklwd   xmm1,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
-        punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-
-        ; write out order: xmm0 xmm2 xmm1 xmm3
-        lea         rdx,        [rsi + rax*4]
-
-        movd        [rsi],      xmm1                               ; write the second 8-line result
-        psrldq      xmm1,       4
-        movd        [rdi],      xmm1
-        psrldq      xmm1,       4
-        movd        [rsi + rax*2], xmm1
-        psrldq      xmm1,       4
-        movd        [rdi + rax*2], xmm1
-
-        movd        [rdx],      xmm3
-        psrldq      xmm3,       4
-        movd        [rcx],      xmm3
-        psrldq      xmm3,       4
-        movd        [rdx + rax*2], xmm3
-        psrldq      xmm3,       4
-        movd        [rcx + rax*2], xmm3
-
-        neg         rax
-        lea         rsi,        [rsi + rax*8]
-        neg         rax
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        [rsi],      xmm0                                ; write the first 8-line result
-        psrldq      xmm0,       4
-        movd        [rdi],      xmm0
-        psrldq      xmm0,       4
-        movd        [rsi + rax*2], xmm0
-        psrldq      xmm0,       4
-        movd        [rdi + rax*2], xmm0
-
-        movd        [rdx],      xmm2
-        psrldq      xmm2,       4
-        movd        [rcx],      xmm2
-        psrldq      xmm2,       4
-        movd        [rdx + rax*2], xmm2
-        psrldq      xmm2,       4
-        movd        [rcx + rax*2], xmm2
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-tfe:
-    times 16 db 0xfe
-align 16
-t80:
-    times 16 db 0x80
-align 16
-t1s:
-    times 16 db 0x01
-align 16
-t3:
-    times 16 db 0x03
-align 16
-t4:
-    times 16 db 0x04
-align 16
-ones:
-    times 8 dw 0x0001
-align 16
-s9:
-    times 8 dw 0x0900
-align 16
-s63:
-    times 8 dw 0x003f
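
For reference, the "simple" loop filter implemented by the two simple routines
above reads more easily in scalar form. Below is a minimal C model of one
pixel (names are illustrative and not part of the renamed files; the asm
additionally saturates each intermediate add, which this sketch folds into a
single clamp):

    #include <stdlib.h>

    static signed char clamp_s8(int v) {
      return (signed char)(v < -128 ? -128 : v > 127 ? 127 : v);
    }

    /* p1,p0 | q0,q1 straddle the edge; blimit is the edge-strength limit. */
    static void simple_filter_scalar(unsigned char *p1, unsigned char *p0,
                                     unsigned char *q0, unsigned char *q1,
                                     int blimit) {
      if (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit)
        return;                           /* mask: edge too strong, skip it */
      {
        /* XOR with 0x80 biases the unsigned bytes into the signed domain */
        const signed char ps1 = (signed char)(*p1 ^ 0x80);
        const signed char ps0 = (signed char)(*p0 ^ 0x80);
        const signed char qs0 = (signed char)(*q0 ^ 0x80);
        const signed char qs1 = (signed char)(*q1 ^ 0x80);
        const int a = clamp_s8(clamp_s8(ps1 - qs1) + 3 * (qs0 - ps0));
        const signed char f1 = (signed char)(clamp_s8(a + 4) >> 3); /* +4 side */
        const signed char f2 = (signed char)(clamp_s8(a + 3) >> 3); /* +3 side */
        *q0 = (unsigned char)(clamp_s8(qs0 - f1) ^ 0x80);
        *p0 = (unsigned char)(clamp_s8(ps0 + f2) ^ 0x80);
      }
    }
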
--- a/vp8/common/x86/loopfilter_x86.c
+++ /dev/null
@@ -1,543 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h>  // SSE2
-#include "vpx_config.h"
-#include "vp8/common/loopfilter.h"
-
-prototype_loopfilter(vp9_loop_filter_vertical_edge_mmx);
-prototype_loopfilter(vp9_loop_filter_horizontal_edge_mmx);
-
-prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2);
-prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2);
-
-extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2;
-extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2;
-
-#if HAVE_MMX
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_mmx(unsigned char *y_ptr,
-                             unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_mmx(unsigned char *y_ptr,
-                             unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_mmx(unsigned char *y_ptr,
-                            unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride,
-                            struct loop_filter_info *lfi) {
-
-}
-
-void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
-                             const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride,
-                                             y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride,
-                                             y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride,
-                                             y_stride, blimit);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
-                            unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride,
-                            struct loop_filter_info *lfi) {
-  vp9_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
-                             const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
-}
-#endif
-
-#if HAVE_SSE2
-void vp9_mbloop_filter_horizontal_edge_c_sse2(unsigned char *s,
-                                              int p,
-                                              const unsigned char *_blimit,
-                                              const unsigned char *_limit,
-                                              const unsigned char *_thresh,
-                                              int count) {
-  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
-  __m128i mask, hev, flat;
-  __m128i thresh, limit, blimit;
-  const __m128i zero = _mm_set1_epi16(0);
-  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
-
-  thresh = _mm_shuffle_epi32(_mm_cvtsi32_si128(_thresh[0] * 0x01010101), 0);
-  limit = _mm_shuffle_epi32(_mm_cvtsi32_si128(_limit[0] * 0x01010101), 0);
-  blimit = _mm_shuffle_epi32(_mm_cvtsi32_si128(_blimit[0] * 0x01010101), 0);
-
-  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
-  {
-    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
-                                          _mm_subs_epu8(p0, p1));
-    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
-                                          _mm_subs_epu8(q0, q1));
-    const __m128i one = _mm_set1_epi8(1);
-    const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
-                                    _mm_subs_epu8(q0, p0));
-    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
-                                    _mm_subs_epu8(q1, p1));
-    __m128i work;
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(flat, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
-                                     _mm_subs_epu8(p1, p2)),
-                         _mm_or_si128(_mm_subs_epu8(p3, p2),
-                                      _mm_subs_epu8(p2, p3)));
-    mask = _mm_max_epu8(work, mask);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
-                                     _mm_subs_epu8(q1, q2)),
-                         _mm_or_si128(_mm_subs_epu8(q3, q2),
-                                      _mm_subs_epu8(q2, q3)));
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_subs_epu8(mask, limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
-                                     _mm_subs_epu8(p0, p2)),
-                         _mm_or_si128(_mm_subs_epu8(q2, q0),
-                                      _mm_subs_epu8(q0, q2)));
-    flat = _mm_max_epu8(work, flat);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
-                                     _mm_subs_epu8(p0, p3)),
-                         _mm_or_si128(_mm_subs_epu8(q3, q0),
-                                      _mm_subs_epu8(q0, q3)));
-    flat = _mm_max_epu8(work, flat);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
-                                     _mm_subs_epu8(p0, p4)),
-                         _mm_or_si128(_mm_subs_epu8(q4, q0),
-                                      _mm_subs_epu8(q0, q4)));
-    flat = _mm_max_epu8(work, flat);
-    flat = _mm_subs_epu8(flat, one);
-    flat = _mm_cmpeq_epi8(flat, zero);
-    flat = _mm_and_si128(flat, mask);
-  }
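-  // At this point: mask selects pixels where any filtering is allowed, hev
-  // flags high edge variance (|p1-p0| or |q1-q0| above thresh), and flat
-  // flags smooth neighborhoods (all of |p2..p4 - p0| and |q2..q4 - q0| <= 1)
-  // that will take the wide averaged outputs computed next.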
-  {
-    const __m128i four = _mm_set1_epi16(4);
-    unsigned char *src = s;
-    int i = 0;
-    do {
-      __m128i workp_a, workp_b, workp_shft;
-      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
-      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
-
-      workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));
-      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op2[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op1[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op0[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      src += 8;
-    } while (++i < count);
-  }
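-  // Each flat_* row above is a rounded weighted average of eight sample
-  // terms, (sum + 4) >> 3; the window slides from op2 to oq2 by subtracting
-  // the term that leaves and adding the one that enters (the workp_a /
-  // workp_b updates), instead of recomputing the full sum.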
-  // lp filter
-  {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
-    const __m128i t1f = _mm_set1_epi8(0x1f);
-    const __m128i t1 = _mm_set1_epi8(0x1);
-    const __m128i t7f = _mm_set1_epi8(0x7f);
-
-    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
-                                      t80);
-    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
-                                      t80);
-    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
-                                      t80);
-    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
-                                      t80);
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-
-    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, ps0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
-    filt = _mm_and_si128(filt, mask);
-
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    /* Filter1 >> 3 */
-    work_a = _mm_cmpgt_epi8(zero, filter1);
-    filter1 = _mm_srli_epi16(filter1, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter1 = _mm_and_si128(filter1, t1f);
-    filter1 = _mm_or_si128(filter1, work_a);
-
-    /* Filter2 >> 3 */
-    work_a = _mm_cmpgt_epi8(zero, filter2);
-    filter2 = _mm_srli_epi16(filter2, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter2 = _mm_and_si128(filter2, t1f);
-    filter2 = _mm_or_si128(filter2, work_a);
-
-    /* filt >> 1 */
-    filt = _mm_adds_epi8(filter1, t1);
-    work_a = _mm_cmpgt_epi8(zero, filt);
-    filt = _mm_srli_epi16(filt, 1);
-    work_a = _mm_and_si128(work_a, t80);
-    filt = _mm_and_si128(filt, t7f);
-    filt = _mm_or_si128(filt, work_a);
-
-    filt = _mm_andnot_si128(hev, filt);
-
-    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
-    q0 = _mm_load_si128((__m128i *)flat_oq0);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q0 = _mm_and_si128(flat, q0);
-    q0 = _mm_or_si128(work_a, q0);
-
-    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
-    q1 = _mm_load_si128((__m128i *)flat_oq1);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q1 = _mm_and_si128(flat, q1);
-    q1 = _mm_or_si128(work_a, q1);
-
-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
-    q2 = _mm_load_si128((__m128i *)flat_oq2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q2 = _mm_and_si128(flat, q2);
-    q2 = _mm_or_si128(work_a, q2);
-
-    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
-    p0 = _mm_load_si128((__m128i *)flat_op0);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p0 = _mm_and_si128(flat, p0);
-    p0 = _mm_or_si128(work_a, p0);
-
-    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-    p1 = _mm_load_si128((__m128i *)flat_op1);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p1 = _mm_and_si128(flat, p1);
-    p1 = _mm_or_si128(work_a, p1);
-
-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
-    p2 = _mm_load_si128((__m128i *)flat_op2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p2 = _mm_and_si128(flat, p2);
-    p2 = _mm_or_si128(work_a, p2);
-
-    if (count == 1) {
-      _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
-      _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
-      _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
-      _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
-      _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
-      _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-    } else {
-      _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
-      _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-      _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-      _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-      _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-      _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
-    }
-  }
-}
-
-static __inline void transpose(unsigned char *src[], int in_p,
-                               unsigned char *dst[], int out_p,
-                               int num_8x8_to_transpose) {
-  int idx8x8 = 0;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  do {
-    unsigned char *in = src[idx8x8];
-    unsigned char *out = dst[idx8x8];
-
-    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
-    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
-    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
-    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
-    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
-    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
-    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
-    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
-    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-    x0 = _mm_unpacklo_epi8(x0, x1);
-    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-    x1 = _mm_unpacklo_epi8(x2, x3);
-    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-    x2 = _mm_unpacklo_epi8(x4, x5);
-    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-    x3 = _mm_unpacklo_epi8(x6, x7);
-    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-    x4 = _mm_unpacklo_epi16(x0, x1);
-    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-    x5 = _mm_unpacklo_epi16(x2, x3);
-    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-    x6 = _mm_unpacklo_epi32(x4, x5);
-    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-    x7 = _mm_unpackhi_epi32(x4, x5);
-
-    _mm_storel_pd((double *)(out + 0*out_p),
-                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
-    _mm_storeh_pd((double *)(out + 1*out_p),
-                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
-    _mm_storel_pd((double *)(out + 2*out_p),
-                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
-    _mm_storeh_pd((double *)(out + 3*out_p),
-                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
-
-    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
-    x4 = _mm_unpackhi_epi16(x0, x1);
-    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
-    x5 = _mm_unpackhi_epi16(x2, x3);
-    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
-    x6 = _mm_unpacklo_epi32(x4, x5);
-    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
-    x7 = _mm_unpackhi_epi32(x4, x5);
-
-    _mm_storel_pd((double *)(out + 4*out_p),
-                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
-    _mm_storeh_pd((double *)(out + 5*out_p),
-                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
-    _mm_storel_pd((double *)(out + 6*out_p),
-                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
-    _mm_storeh_pd((double *)(out + 7*out_p),
-                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
-  } while (++idx8x8 < num_8x8_to_transpose);
-}
-
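-// Note: vertical edges reuse the horizontal kernel: the pixel neighborhood
-// is transposed into a scratch buffer with transpose() above, filtered as a
-// horizontal edge, then the modified rows are transposed back into place.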
-void vp9_mbloop_filter_vertical_edge_c_sse2(unsigned char *s,
-                                            int p,
-                                            const unsigned char *blimit,
-                                            const unsigned char *limit,
-                                            const unsigned char *thresh,
-                                            int count) {
-  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 16]);
-  unsigned char *src[4];
-  unsigned char *dst[4];
-
-  src[0] = s - 5;
-  src[1] = s - 5 + 8;
-  src[2] = s - 5 + p*8;
-  src[3] = s - 5 + p*8 + 8;
-
-  dst[0] = t_dst;
-  dst[1] = t_dst + 16*8;
-  dst[2] = t_dst + 8;
-  dst[3] = t_dst + 16*8 + 8;
-
-  // 16x16->16x16 or 16x8->8x16
-  transpose(src, p, dst, 16, (1 << count));
-
-  vp9_mbloop_filter_horizontal_edge_c_sse2(t_dst + 5*16, 16, blimit, limit,
-                                           thresh, count);
-
-  dst[0] = s - 5;
-  dst[1] = s - 5 + p*8;
-
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
-
-  // 16x8->8x16 or 8x8->8x8
-  transpose(src, 16, dst, p, (1 << (count - 1)));
-}
-
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_sse2(unsigned char *y_ptr,
-                              unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride,
-                              struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_horizontal_edge_c_sse2(y_ptr, y_stride, lfi->mblim,
-                                           lfi->lim, lfi->hev_thr, 2);
-
-  /* TODO: write sse2 version with u,v interleaved */
-  if (u_ptr)
-    vp9_mbloop_filter_horizontal_edge_c_sse2(u_ptr, uv_stride, lfi->mblim,
-                                             lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_horizontal_edge_c_sse2(v_ptr, uv_stride, lfi->mblim,
-                                             lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
-                             unsigned char *v_ptr, int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_horizontal_edge_c_sse2(
-    y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
-                              unsigned char *v_ptr, int y_stride, int uv_stride,
-                              struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_vertical_edge_c_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim,
-                                         lfi->hev_thr, 2);
-
-  /* TODO: write sse2 version with u,v interleaved */
-  if (u_ptr)
-    vp9_mbloop_filter_vertical_edge_c_sse2(u_ptr, uv_stride, lfi->mblim,
-                                           lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_vertical_edge_c_sse2(v_ptr, uv_stride, lfi->mblim,
-                                           lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
-                             unsigned char *v_ptr, int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_vertical_edge_c_sse2(
-    y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_sse2(unsigned char *y_ptr,
-                             unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride,
-                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride,
-                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride,
-                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
-                                            lfi->blim, lfi->lim, lfi->hev_thr,
-                                            v_ptr + 4 * uv_stride);
-}
-
-void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
-                              const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride,
-                                              y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride,
-                                              y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride,
-                                              y_stride, blimit);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
-                             unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-  vp9_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride,
-                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride,
-                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride,
-                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
-                                          lfi->blim, lfi->lim, lfi->hev_thr,
-                                          v_ptr + 4);
-}
-
-void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
-                              const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
-}
-
-#endif
--- a/vp8/common/x86/loopfilter_x86.h
+++ /dev/null
@@ -1,43 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef LOOPFILTER_X86_H
-#define LOOPFILTER_X86_H
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx);
-extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx);
-extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx);
-extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_mmx);
-#endif
-
-#if HAVE_SSE2
-extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2);
-extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2);
-extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2);
-extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_sse2);
-#endif
-
-#endif  // LOOPFILTER_X86_H
--- a/vp8/common/x86/mask_sse3.asm
+++ /dev/null
@@ -1,484 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp8_makemask_sse3(
-;    unsigned char *y,
-;    unsigned char *u,
-;    unsigned char *v,
-;    unsigned char *ym,
-;    unsigned char *uvm,
-;    int yp,
-;    int uvp,
-;    int ys,
-;    int us,
-;    int vs,
-;    int yt,
-;    int ut,
-;    int vt)
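-;
-; Note: this appears to build a per-pixel color-key mask: an output byte is
-; 0xff where the source pixel is within the given tolerances of the center
-; color (|Y - ys| < yt, |U - us| < ut, |V - vs| < vt) and 0x00 elsewhere,
-; with each u/v sample gating a 2x2 block of y pixels (4:2:0).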
-global sym(vp8_makemask_sse3)
-sym(vp8_makemask_sse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 14
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;y
-        mov             rdi,        arg(1) ;u
-        mov             rcx,        arg(2) ;v
-        mov             rax,        arg(3) ;ym
-        movsxd          rbx,        dword arg(4) ;yp
-        movsxd          rdx,        dword arg(5) ;uvp
-
-        pxor            xmm0,xmm0
-
-        ;make 16 copies of the center y value
-        movd            xmm1, arg(6)
-        pshufb          xmm1, xmm0
-
-        ; make 16 copies of the center u value
-        movd            xmm2, arg(7)
-        pshufb          xmm2, xmm0
-
-        ; make 16 copies of the center v value
-        movd            xmm3, arg(8)
-        pshufb          xmm3, xmm0
-        unpcklpd        xmm2, xmm3
-
-        ;make 16 copies of the y tolerance
-        movd            xmm3, arg(9)
-        pshufb          xmm3, xmm0
-
-        ;make 16 copies of the u tolerance
-        movd            xmm4, arg(10)
-        pshufb          xmm4, xmm0
-
-        ;make 16 copies of the v tolerance
-        movd            xmm5, arg(11)
-        pshufb          xmm5, xmm0
-        unpckhpd        xmm4, xmm5
-
-        mov             r8,8
-
-NextPairOfRows:
-
-        ;grab the y source values
-        movdqu          xmm0, [rsi]
-
-        ;compute abs difference between source and y target
-        movdqa          xmm6, xmm1
-        movdqa          xmm7, xmm0
-        psubusb         xmm0, xmm1
-        psubusb         xmm6, xmm7
-        por             xmm0, xmm6
-
-        ;check whether the abs difference is < the y tolerance
-        movdqa          xmm6, xmm3
-        pcmpgtb         xmm6, xmm0
-
-        ;grab the y source values
-        add             rsi, rbx
-        movdqu          xmm0, [rsi]
-
-        ;compute abs difference between source and y target
-        movdqa          xmm11, xmm1
-        movdqa          xmm7, xmm0
-        psubusb         xmm0, xmm1
-        psubusb         xmm11, xmm7
-        por             xmm0, xmm11
-
-        ;check whether the abs difference is < the y tolerance
-        movdqa          xmm11, xmm3
-        pcmpgtb         xmm11, xmm0
-
-
-        ;grab the u and v source values
-        movdqu          xmm7, [rdi]
-        movdqu          xmm8, [rcx]
-        unpcklpd        xmm7, xmm8
-
-        ;compute abs difference between source and uv targets
-        movdqa          xmm9, xmm2
-        movdqa          xmm10, xmm7
-        psubusb         xmm7, xmm2
-        psubusb         xmm9, xmm10
-        por             xmm7, xmm9
-
-        ;check whether the number is < tolerance
-        movdqa          xmm0, xmm4
-        pcmpgtb         xmm0, xmm7
-
-        ;double  u and v masks
-        movdqa          xmm8, xmm0
-        punpckhbw       xmm0, xmm0
-        punpcklbw       xmm8, xmm8
-
-        ;mask row 0 and output
-        pand            xmm6, xmm8
-        pand            xmm6, xmm0
-        movdqa          [rax],xmm6
-
-        ;mask row 1 and output
-        pand            xmm11, xmm8
-        pand            xmm11, xmm0
-        movdqa          [rax+16],xmm11
-
-
-        ; to the next row or set of rows
-        add             rsi, rbx
-        add             rdi, rdx
-        add             rcx, rdx
-        add             rax,32
-        dec r8
-        jnz NextPairOfRows
-
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;GROW_HORIZ (register for result, source register or mem local)
-; takes source and shifts left and ors with source
-; then shifts right and ors with source
-%macro GROW_HORIZ 2
-    movdqa          %1, %2
-    movdqa          xmm14, %1
-    movdqa          xmm15, %1
-    pslldq          xmm14, 1
-    psrldq          xmm15, 1
-    por             %1,xmm14
-    por             %1,xmm15
-%endmacro
-;GROW_VERT (result, center row, above row, below row)
-%macro GROW_VERT 4
-    movdqa          %1,%2
-    por             %1,%3
-    por             %1,%4
-%endmacro
-
-;GROW_NEXTLINE (new line to grow, new source, line to write)
-%macro GROW_NEXTLINE 3
-    GROW_HORIZ %1, %2
-    GROW_VERT xmm3, xmm0, xmm1, xmm2
-    movdqa %3,xmm3
-%endmacro
-
-
-;void vp8_growmaskmb_sse3(
-;    unsigned char *om,
-;    unsigned char *nm)
-global sym(vp8_growmaskmb_sse3)
-sym(vp8_growmaskmb_sse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;src
-    mov             rdi,        arg(1) ;dst
-
-    GROW_HORIZ xmm0, [rsi]
-    GROW_HORIZ xmm1, [rsi+16]
-    GROW_HORIZ xmm2, [rsi+32]
-
-    GROW_VERT xmm3, xmm0, xmm1, xmm2
-    por xmm0,xmm1
-    movdqa [rdi], xmm0
-    movdqa [rdi+16],xmm3
-
-    GROW_NEXTLINE xmm0,[rsi+48],[rdi+32]
-    GROW_NEXTLINE xmm1,[rsi+64],[rdi+48]
-    GROW_NEXTLINE xmm2,[rsi+80],[rdi+64]
-    GROW_NEXTLINE xmm0,[rsi+96],[rdi+80]
-    GROW_NEXTLINE xmm1,[rsi+112],[rdi+96]
-    GROW_NEXTLINE xmm2,[rsi+128],[rdi+112]
-    GROW_NEXTLINE xmm0,[rsi+144],[rdi+128]
-    GROW_NEXTLINE xmm1,[rsi+160],[rdi+144]
-    GROW_NEXTLINE xmm2,[rsi+176],[rdi+160]
-    GROW_NEXTLINE xmm0,[rsi+192],[rdi+176]
-    GROW_NEXTLINE xmm1,[rsi+208],[rdi+192]
-    GROW_NEXTLINE xmm2,[rsi+224],[rdi+208]
-    GROW_NEXTLINE xmm0,[rsi+240],[rdi+224]
-
-    por xmm0,xmm2
-    movdqa [rdi+240], xmm0
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;unsigned int vp8_sad16x16_masked_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned char *mask)
-global sym(vp8_sad16x16_masked_wmt)
-sym(vp8_sad16x16_masked_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-    mov             rsi,        arg(0) ;src_ptr
-    mov             rdi,        arg(2) ;ref_ptr
-
-    mov             rbx,        arg(4) ;mask
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    mov             rcx,        16
-
-    pxor            xmm3,       xmm3
-
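-    ; AND-ing src and ref with the 0x00/0xff mask zeroes pixels outside the
-    ; mask, so psadbw accumulates the SAD over masked-in pixels only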
-NextSadRow:
-    movdqu          xmm0,       [rsi]
-    movdqu          xmm1,       [rdi]
-    movdqu          xmm2,       [rbx]
-    pand            xmm0,       xmm2
-    pand            xmm1,       xmm2
-
-    psadbw          xmm0,       xmm1
-    paddw           xmm3,       xmm0
-
-    add             rsi, rax
-    add             rdi, rdx
-    add             rbx,  16
-
-    dec rcx
-    jnz NextSadRow
-
-    movdqa          xmm4,       xmm3
-    psrldq          xmm4,       8
-    paddw           xmm3,       xmm4
-    movq            rax,       xmm3
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_sad16x16_unmasked_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned char *mask)
-global sym(vp8_sad16x16_unmasked_wmt)
-sym(vp8_sad16x16_unmasked_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-    mov             rsi,        arg(0) ;src_ptr
-    mov             rdi,        arg(2) ;ref_ptr
-
-    mov             rbx,        arg(4) ;mask
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    mov             rcx,        16
-
-    pxor            xmm3,       xmm3
-
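-    ; OR-ing src and ref with the mask forces masked-in pixels to 0xff in
-    ; both rows, so they cancel and psadbw sums only pixels outside the mask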
-next_vp8_sad16x16_unmasked_wmt:
-    movdqu          xmm0,       [rsi]
-    movdqu          xmm1,       [rdi]
-    movdqu          xmm2,       [rbx]
-    por             xmm0,       xmm2
-    por             xmm1,       xmm2
-
-    psadbw          xmm0,       xmm1
-    paddw           xmm3,       xmm0
-
-    add             rsi, rax
-    add             rdi, rdx
-    add             rbx,  16
-
-    dec rcx
-    jnz next_vp8_sad16x16_unmasked_wmt
-
-    movdqa          xmm4,       xmm3
-    psrldq          xmm4,       8
-    paddw           xmm3,       xmm4
-    movq            rax,        xmm3
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_masked_predictor_wmt(
-;    unsigned char *masked,
-;    unsigned char *unmasked,
-;    int  src_stride,
-;    unsigned char *dst_ptr,
-;    int  dst_stride,
-;    unsigned char *mask)
-global sym(vp8_masked_predictor_wmt)
-sym(vp8_masked_predictor_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-    mov             rsi,        arg(0) ;masked
-    mov             rdi,        arg(1) ;unmasked
-
-    mov             rbx,        arg(5) ;mask
-    movsxd          rax,        dword ptr arg(2) ;src_stride
-    mov             r11,        arg(3) ; destination
-    movsxd          rdx,        dword ptr arg(4) ;dst_stride
-
-    mov             rcx,        16
-
-    pxor            xmm3,       xmm3
-
-next_vp8_masked_predictor_wmt:
-    movdqu          xmm0,       [rsi]
-    movdqu          xmm1,       [rdi]
-    movdqu          xmm2,       [rbx]
-
-    pand            xmm0,       xmm2
-    pandn           xmm2,       xmm1
-    por             xmm0,       xmm2
-    movdqu          [r11],      xmm0
-
-    add             r11, rdx
-    add             rsi, rax
-    add             rdi, rax                ; unmasked advances by src stride (as in the uv variant)
-    add             rbx,  16
-
-    dec rcx
-    jnz next_vp8_masked_predictor_wmt
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;unsigned int vp8_masked_predictor_uv_wmt(
-;    unsigned char *masked,
-;    unsigned char *unmasked,
-;    int  src_stride,
-;    unsigned char *dst_ptr,
-;    int  dst_stride,
-;    unsigned char *mask)
-global sym(vp8_masked_predictor_uv_wmt)
-sym(vp8_masked_predictor_uv_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-    mov             rsi,        arg(0) ;masked
-    mov             rdi,        arg(1) ;unmasked
-
-    mov             rbx,        arg(5) ;mask
-    movsxd          rax,        dword ptr arg(2) ;src_stride
-    mov             r11,        arg(3) ; destination
-    movsxd          rdx,        dword ptr arg(4) ;dst_stride
-
-    mov             rcx,        8
-
-    pxor            xmm3,       xmm3
-
-next_vp8_masked_predictor_uv_wmt:
-    movq            xmm0,       [rsi]
-    movq            xmm1,       [rdi]
-    movq            xmm2,       [rbx]
-
-    pand            xmm0,       xmm2
-    pandn           xmm2,       xmm1
-    por             xmm0,       xmm2
-    movq            [r11],      xmm0
-
-    add             r11, rdx
-    add             rsi, rax
-    add             rdi, rax
-    add             rbx,  8
-
-    dec rcx
-    jnz next_vp8_masked_predictor_uv_wmt
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_uv_from_y_mask(
-;    unsigned char *ymask,
-;    unsigned char *uvmask)
-global sym(vp8_uv_from_y_mask)
-sym(vp8_uv_from_y_mask):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-    mov             rsi,        arg(0) ;src_ptr
-    mov             rdi,        arg(1) ;dst_ptr
-
-
-    mov             rcx,        8
-
-    pxor            xmm3,       xmm3
-
-next_vp8_uv_from_y_mask:
-    movdqu          xmm0,       [rsi]
-    pshufb          xmm0,       [shuf1b] ;[GLOBAL(shuf1b)]
-    movq            [rdi],      xmm0
-    add             rdi,        8
-    add             rsi,        32
-
-    dec rcx
-    jnz next_vp8_uv_from_y_mask
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
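Editor's note: a hedged C sketch of the mask subsampling above (reference only). The pshufb with shuf1b keeps bytes 0, 2, 4, ..., 14 of each 16-byte luma mask row, and the source advances 32 bytes per iteration, i.e. every second row, so the 16x16 luma mask is 2x2-subsampled into an 8x8 chroma mask:

    static void uv_from_y_mask_c(const unsigned char *ymask, unsigned char *uvmask) {
        for (int r = 0; r < 8; r++) {
            for (int c = 0; c < 8; c++)
                uvmask[c] = ymask[2 * c];   /* every second byte of the row */
            uvmask += 8;
            ymask += 32;                    /* skip a whole 16-byte mask row */
        }
    }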
-SECTION_RODATA
-align 16
-shuf1b:
-    db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
-
--- a/vp8/common/x86/postproc_mmx.asm
+++ /dev/null
@@ -1,534 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT  7
-
-;void vp9_post_proc_down_and_across_mmx
-;(
-;    unsigned char *src_ptr,
-;    unsigned char *dst_ptr,
-;    int src_pixels_per_line,
-;    int dst_pixels_per_line,
-;    int rows,
-;    int cols,
-;    int flimit
-;)
-global sym(vp9_post_proc_down_and_across_mmx)
-sym(vp9_post_proc_down_and_across_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-    ; move the global rd onto the stack, since we don't have enough registers
-    ; to do PIC addressing
-    movq        mm0, [GLOBAL(rd)]
-    sub         rsp, 8
-    movq        [rsp], mm0
-%define RD [rsp]
-%else
-%define RD [GLOBAL(rd)]
-%endif
-
-        push        rbx
-        lea         rbx, [GLOBAL(Blur)]
-        movd        mm2, dword ptr arg(6) ;flimit
-        punpcklwd   mm2, mm2
-        punpckldq   mm2, mm2
-
-        mov         rsi,        arg(0) ;src_ptr
-        mov         rdi,        arg(1) ;dst_ptr
-
-        movsxd      rcx, DWORD PTR arg(4) ;rows
-        movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line
-        pxor        mm0, mm0              ; mm0 = 00000000
-
-.nextrow:
-
-        xor         rdx,        rdx       ; clear out rdx for use as loop counter
-.nextcol:
-
-        pxor        mm7, mm7              ; mm7 = 00000000
-        movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps
-        movq        mm3, [rsi]            ; mm3 = r0 p0..p7
-        punpcklbw   mm3, mm0              ; mm3 = p0..p3
-        movq        mm1, mm3              ; mm1 = p0..p3
-        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
-
-        movq        mm6, [rbx + 48]       ; mm6 = kernel 3 taps
-        movq        mm5, [rsi + rax]      ; mm5 = r1 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r1 p0..p3
-        pmullw      mm6, mm5              ; mm6 *= p0..p3 * kernel 3 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm7, mm1              ; mm7 = r0 p0..p3
-        psubusw     mm7, mm5              ; mm7 = r0 p0..p3 - r1 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r1 p0..p3 - r0 p0..p3
-        paddusw     mm7, mm5              ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
-        pcmpgtw     mm7, mm2
-
-        movq        mm6, [rbx + 64 ]      ; mm6 = kernel 4 modifiers
-        movq        mm5, [rsi + 2*rax]    ; mm5 = r2 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r2 p0..p3
-        pmullw      mm6, mm5              ; mm6 *= kernel 4 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm5              ; mm6 = r0 p0..p3 - r2 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r2 p0..p3 - r0 p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        neg         rax
-        movq        mm6, [rbx ]           ; kernel 0 taps
-        movq        mm5, [rsi+2*rax]      ; mm5 = r-2 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r-2 p0..p3
-        pmullw      mm6, mm5              ; mm6 *= kernel 0 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - r-2 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r-2 p0..p3 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        movq        mm6, [rbx + 16]       ; kernel 1 taps
-        movq        mm4, [rsi+rax]        ; mm4 = r-1 p0..p7
-        punpcklbw   mm4, mm0              ; mm4 = r-1 p0..p3
-        pmullw      mm6, mm4              ; mm6 *= kernel 1 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm4              ; mm6 = p0..p3 - r-1 p0..p3
-        psubusw     mm4, mm1              ; mm4 = r-1 p0..p3 - p0..p3
-        paddusw     mm6, mm4              ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        paddusw     mm3, RD               ; mm3 += round value
-        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
-
-        pand        mm1, mm7              ; mm1 select vals > thresh from source
-        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
-        paddusw     mm1, mm7              ; combination
-
-        packuswb    mm1, mm0              ; pack to bytes
-
-        movd        [rdi], mm1            ;
-        neg         rax                   ; pitch is positive
-
-
-        add         rsi, 4
-        add         rdi, 4
-        add         rdx, 4
-
-        cmp         edx, dword ptr arg(5) ;cols
-        jl          .nextcol
-        ; done with all the cols, start the across filtering in place
-        sub         rsi, rdx
-        sub         rdi, rdx
-
-
-        push        rax
-        xor         rdx,    rdx
-        mov         rax,    [rdi-4];
-
-.acrossnextcol:
-        pxor        mm7, mm7              ; mm7 = 00000000
-        movq        mm6, [rbx + 32 ]      ;
-        movq        mm4, [rdi+rdx]        ; mm4 = p0..p7
-        movq        mm3, mm4              ; mm3 = p0..p7
-        punpcklbw   mm3, mm0              ; mm3 = p0..p3
-        movq        mm1, mm3              ; mm1 = p0..p3
-        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
-
-        movq        mm6, [rbx + 48]
-        psrlq       mm4, 8                ; mm4 = p1..p7
-        movq        mm5, mm4              ; mm5 = p1..p7
-        punpcklbw   mm5, mm0              ; mm5 = p1..p4
-        pmullw      mm6, mm5              ; mm6 *= p1..p4 * kernel 3 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm7, mm1              ; mm7 = p0..p3
-        psubusw     mm7, mm5              ; mm7 = p0..p3 - p1..p4
-        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     mm7, mm5              ; mm7 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm7, mm2
-
-        movq        mm6, [rbx + 64 ]
-        psrlq       mm4, 8                ; mm4 = p2..p7
-        movq        mm5, mm4              ; mm5 = p2..p7
-        punpcklbw   mm5, mm0              ; mm5 = p2..p5
-        pmullw      mm6, mm5              ; mm6 *= kernel 4 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - p2..p5
-        psubusw     mm5, mm1              ; mm5 = p2..p5 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p2..p5)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        movq        mm6, [rbx ]
-        movq        mm4, [rdi+rdx-2]      ; mm4 = p-2..p5
-        movq        mm5, mm4              ; mm5 = p-2..p5
-        punpcklbw   mm5, mm0              ; mm5 = p-2..p1
-        pmullw      mm6, mm5              ; mm6 *= kernel 0 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - p-2..p1
-        psubusw     mm5, mm1              ; mm5 = p-2..p1 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p-2..p1)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        movq        mm6, [rbx + 16]
-        psrlq       mm4, 8                ; mm4 = p-1..p5
-        punpcklbw   mm4, mm0              ; mm4 = p-1..p2
-        pmullw      mm6, mm4              ; mm6 *= kernel 1 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm4              ; mm6 = p0..p3 - p-1..p2
-        psubusw     mm4, mm1              ; mm4 = p-1..p2 - p0..p3
-        paddusw     mm6, mm4              ; mm6 = abs(p0..p3 - p-1..p2)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        paddusw     mm3, RD               ; mm3 += round value
-        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
-
-        pand        mm1, mm7              ; mm1 select vals > thresh from source
-        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
-        paddusw     mm1, mm7              ; combination
-
-        packuswb    mm1, mm0              ; pack to bytes
-        mov         DWORD PTR [rdi+rdx-4],  eax   ; store previous four bytes
-        movd        eax,    mm1
-
-        add         rdx, 4
-        cmp         edx, dword ptr arg(5) ;cols
-        jl          .acrossnextcol;
-
-        mov         DWORD PTR [rdi+rdx-4],  eax
-        pop         rax
-
-        ; done with this row
-        add         rsi,rax               ; next line
-        movsxd      rax, dword ptr arg(3) ;dst_pixels_per_line
-        add         rdi,rax               ; next destination
-        movsxd      rax, dword ptr arg(2) ;src_pixels_per_line
-
-        dec         rcx                   ; decrement count
-        jnz         .nextrow               ; next row
-        pop         rbx
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef RD
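Editor's note: the Blur table at the end of this file gives the down pass the taps [16, 16, 64, 16, 16], which sum to VP9_FILTER_WEIGHT (128). A hedged scalar model of one output pixel (my reference code, not from the tree; needs <stdlib.h> for abs):

    static unsigned char filter_down_c(const unsigned char *p, int pitch, int flimit) {
        int blur = 16 * (p[-2 * pitch] + p[-pitch] + p[pitch] + p[2 * pitch])
                 + 64 * p[0] + 64;                  /* 'rd' rounding constant */
        blur >>= 7;                                  /* VP9_FILTER_SHIFT */
        for (int k = -2; k <= 2; k++)
            if (k != 0 && abs(p[0] - p[k * pitch]) > flimit)
                return p[0];                         /* any edge: keep source */
        return (unsigned char)blur;
    }

The across pass applies the same kernel and threshold horizontally, writing in place with a short store delay (the eax shuffle above) so the window always reads unfiltered pixels.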
-
-
-;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
-;                             int pitch, int rows, int cols,int flimit)
-extern sym(vp9_rv)
-global sym(vp9_mbpost_proc_down_mmx)
-sym(vp9_mbpost_proc_down_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 136
-
-    ; unsigned char d[16][8] at [rsp]
-    ; create flimit2 at [rsp+128]
-    mov         eax, dword ptr arg(4) ;flimit
-    mov         [rsp+128], eax
-    mov         [rsp+128+4], eax
-%define flimit2 [rsp+128]
-
-%if ABI_IS_32BIT=0
-    lea         r8,       [GLOBAL(sym(vp9_rv))]
-%endif
-
-    ;rows +=8;
-    add         dword ptr arg(2), 8
-
-    ;for(c=0; c<cols; c+=4)
-.loop_col:
-            mov         rsi,        arg(0)  ;s
-            pxor        mm0,        mm0     ;
-
-            movsxd      rax,        dword ptr arg(1) ;pitch       ;
-            neg         rax                                     ; rax = -pitch
-
-            lea         rsi,        [rsi + rax*8]           ; rsi = s[-pitch*8]
-            neg         rax
-
-
-            pxor        mm5,        mm5
-            pxor        mm6,        mm6     ;
-
-            pxor        mm7,        mm7     ;
-            mov         rdi,        rsi
-
-            mov         rcx,        15          ;
-
-.loop_initvar:
-            movd        mm1,        DWORD PTR [rdi];
-            punpcklbw   mm1,        mm0     ;
-
-            paddw       mm5,        mm1     ;
-            pmullw      mm1,        mm1     ;
-
-            movq        mm2,        mm1     ;
-            punpcklwd   mm1,        mm0     ;
-
-            punpckhwd   mm2,        mm0     ;
-            paddd       mm6,        mm1     ;
-
-            paddd       mm7,        mm2     ;
-            lea         rdi,        [rdi+rax]   ;
-
-            dec         rcx
-            jne         .loop_initvar
-            ;save the var and sum
-            xor         rdx,        rdx
-.loop_row:
-            movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]
-            movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]
-
-            punpcklbw   mm1,        mm0
-            punpcklbw   mm2,        mm0
-
-            paddw       mm5,        mm2
-            psubw       mm5,        mm1
-
-            pmullw      mm2,        mm2
-            movq        mm4,        mm2
-
-            punpcklwd   mm2,        mm0
-            punpckhwd   mm4,        mm0
-
-            paddd       mm6,        mm2
-            paddd       mm7,        mm4
-
-            pmullw      mm1,        mm1
-            movq        mm2,        mm1
-
-            punpcklwd   mm1,        mm0
-            psubd       mm6,        mm1
-
-            punpckhwd   mm2,        mm0
-            psubd       mm7,        mm2
-
-
-            movq        mm3,        mm6
-            pslld       mm3,        4
-
-            psubd       mm3,        mm6
-            movq        mm1,        mm5
-
-            movq        mm4,        mm5
-            pmullw      mm1,        mm1
-
-            pmulhw      mm4,        mm4
-            movq        mm2,        mm1
-
-            punpcklwd   mm1,        mm4
-            punpckhwd   mm2,        mm4
-
-            movq        mm4,        mm7
-            pslld       mm4,        4
-
-            psubd       mm4,        mm7
-
-            psubd       mm3,        mm1
-            psubd       mm4,        mm2
-
-            psubd       mm3,        flimit2
-            psubd       mm4,        flimit2
-
-            psrad       mm3,        31
-            psrad       mm4,        31
-
-            packssdw    mm3,        mm4
-            packsswb    mm3,        mm0
-
-            movd        mm1,        DWORD PTR [rsi+rax*8]
-
-            movq        mm2,        mm1
-            punpcklbw   mm1,        mm0
-
-            paddw       mm1,        mm5
-            mov         rcx,        rdx
-
-            and         rcx,        127
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-            push        rax
-            lea         rax,        [GLOBAL(sym(vp9_rv))]
-            movq        mm4,        [rax + rcx*2] ;vp9_rv[rcx*2]
-            pop         rax
-%elif ABI_IS_32BIT=0
-            movq        mm4,        [r8 + rcx*2] ;vp9_rv[rcx*2]
-%else
-            movq        mm4,        [sym(vp9_rv) + rcx*2]
-%endif
-            paddw       mm1,        mm4
-            ;paddw     xmm1,       eight8s
-            psraw       mm1,        4
-
-            packuswb    mm1,        mm0
-            pand        mm1,        mm3
-
-            pandn       mm3,        mm2
-            por         mm1,        mm3
-
-            and         rcx,        15
-            movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[rcx*4]
-
-            mov         rcx,        rdx
-            sub         rcx,        8
-
-            and         rcx,        15
-            movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4]
-
-            movd        [rsi],      mm1
-            lea         rsi,        [rsi+rax]
-
-            lea         rdi,        [rdi+rax]
-            add         rdx,        1
-
-            cmp         edx,        dword arg(2) ;rows
-            jl          .loop_row
-
-
-        add         dword arg(0), 4 ; s += 4
-        sub         dword arg(3), 4 ; cols -= 4
-        cmp         dword arg(3), 0
-        jg          .loop_col
-
-    add         rsp, 136
-    pop         rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef flimit2
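Editor's note: a hedged scalar model of the per-pixel decision in the loop above (reference only; the running-window bookkeeping is simplified away). sum and sumsq accumulate a 16-row vertical window around the pixel, and vp9_rv supplies a small dither value:

    static unsigned char mb_filter_down_c(int sum, int sumsq, unsigned char pixel,
                                          int flimit, int dither /* vp9_rv[] */) {
        if (sumsq * 15 - sum * sum < flimit)         /* low local variance */
            return (unsigned char)((sum + pixel + dither) >> 4);
        return pixel;                                 /* otherwise leave as-is */
    }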
-
-
-;void vp9_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int Width, unsigned int Height, int Pitch)
-extern sym(rand)
-global sym(vp9_plane_add_noise_mmx)
-sym(vp9_plane_add_noise_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(rand) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movq        mm1,[rsi+rax]         ; get the source
-
-            psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     mm1, [rdx+32] ;bothclamp
-            psubusb     mm1, [rdx+16] ;whiteclamp
-
-            movq        mm2,[rdi+rax]         ; get the noise for this line
-            paddb       mm1,mm2              ; add it in
-            movq        [rsi+rax],mm1         ; store the result
-
-            add         rax,8                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
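Editor's note: a hedged C model of the per-byte work in .addnoise_nextset above (reference only). The three saturating steps squeeze the pixel into [black, 255 - white] so the final wrap-around add of the noise byte cannot overflow visibly; bothclamp is assumed to hold blackclamp + whiteclamp, as the contiguous black/white/both layout suggests:

    static unsigned char clamp_and_add_noise_c(unsigned char v, unsigned char black,
                                               unsigned char white, unsigned char both,
                                               unsigned char noise) {
        int t = v - black; if (t < 0)   t = 0;       /* psubusb blackclamp */
        t += both;         if (t > 255) t = 255;     /* paddusb bothclamp  */
        t -= white;        if (t < 0)   t = 0;       /* psubusb whiteclamp */
        return (unsigned char)(t + noise);           /* paddb wraps mod 256 */
    }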
-SECTION_RODATA
-align 16
-Blur:
-    times 16 dw 16
-    times  8 dw 64
-    times 16 dw 16
-    times  8 dw  0
-
-rd:
-    times 4 dw 0x40
--- a/vp8/common/x86/postproc_sse2.asm
+++ /dev/null
@@ -1,695 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_post_proc_down_and_across_xmm
-;(
-;    unsigned char *src_ptr,
-;    unsigned char *dst_ptr,
-;    int src_pixels_per_line,
-;    int dst_pixels_per_line,
-;    int rows,
-;    int cols,
-;    int flimit
-;)
-global sym(vp9_post_proc_down_and_across_xmm)
-sym(vp9_post_proc_down_and_across_xmm):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-    ALIGN_STACK 16, rax
-    ; move the global rd onto the stack, since we don't have enough registers
-    ; to do PIC addressing
-    movdqa      xmm0, [GLOBAL(rd42)]
-    sub         rsp, 16
-    movdqa      [rsp], xmm0
-%define RD42 [rsp]
-%else
-%define RD42 [GLOBAL(rd42)]
-%endif
-
-
-        movd        xmm2,       dword ptr arg(6) ;flimit
-        punpcklwd   xmm2,       xmm2
-        punpckldq   xmm2,       xmm2
-        punpcklqdq  xmm2,       xmm2
-
-        mov         rsi,        arg(0) ;src_ptr
-        mov         rdi,        arg(1) ;dst_ptr
-
-        movsxd      rcx,        DWORD PTR arg(4) ;rows
-        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
-        pxor        xmm0,       xmm0              ; xmm0 = 00000000
-
-.nextrow:
-
-        xor         rdx,        rdx       ; clear out rdx for use as loop counter
-.nextcol:
-        movq        xmm3,       QWORD PTR [rsi]         ; xmm3 = r0 p0..p7
-        punpcklbw   xmm3,       xmm0                    ; xmm3 = r0 p0..p3
-        movdqa      xmm1,       xmm3                    ; xmm1 = r0 p0..p3
-        psllw       xmm3,       2                       ;
-
-        movq        xmm5,       QWORD PTR [rsi + rax]   ; xmm5 = r1 p0..p7
-        punpcklbw   xmm5,       xmm0                    ; xmm5 = r1 p0..p3
-        paddusw     xmm3,       xmm5                    ; xmm3 += xmm5
-
-        ; thresholding
-        movdqa      xmm7,       xmm1                    ; mm7 = r0 p0..p3
-        psubusw     xmm7,       xmm5                    ; mm7 = r0 p0..p3 - r1 p0..p3
-        psubusw     xmm5,       xmm1                    ; mm5 = r1 p0..p3 - r0 p0..p3
-        paddusw     xmm7,       xmm5                    ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
-        pcmpgtw     xmm7,       xmm2
-
-        movq        xmm5,       QWORD PTR [rsi + 2*rax] ; xmm5 = r2 p0..p7
-        punpcklbw   xmm5,       xmm0                    ; xmm5 = r2 p0..p3
-        paddusw     xmm3,       xmm5                    ; xmm3 += xmm5
-
-        ; thresholding
-        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
-        psubusw     xmm6,       xmm5                    ; mm6 = r0 p0..p3 - r2 p0..p3
-        psubusw     xmm5,       xmm1                    ; xmm5 = r2 p0..p3 - r0 p0..p3
-        paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6                    ; accumulate thresholds
-
-
-        neg         rax
-        movq        xmm5,       QWORD PTR [rsi+2*rax]   ; xmm5 = r-2 p0..p7
-        punpcklbw   xmm5,       xmm0                    ; xmm5 = r-2 p0..p3
-        paddusw     xmm3,       xmm5                    ; xmm3 += xmm5
-
-        ; thresholding
-        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
-        psubusw     xmm6,       xmm5                    ; mm6 = p0..p3 - r-2 p0..p3
-        psubusw     xmm5,       xmm1                    ; mm5 = r-2 p0..p3 - p0..p3
-        paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6                    ; accumulate thresholds
-
-        movq        xmm4,       QWORD PTR [rsi+rax]     ; xmm4 = r-1 p0..p7
-        punpcklbw   xmm4,       xmm0                    ; xmm4 = r-1 p0..p3
-        paddusw     xmm3,       xmm4                    ; xmm3 += xmm4
-
-        ; thresholding
-        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
-        psubusw     xmm6,       xmm4                    ; xmm6 = p0..p3 - r-1 p0..p3
-        psubusw     xmm4,       xmm1                    ; xmm4 = r-1 p0..p3 - p0..p3
-        paddusw     xmm6,       xmm4                    ; xmm6 = abs(r0 p0..p3 - r-1 p0..p3)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6                    ; accumulate thresholds
-
-
-        paddusw     xmm3,       RD42                    ; mm3 += round value
-        psraw       xmm3,       3                       ; mm3 /= 8
-
-        pand        xmm1,       xmm7                    ; mm1 select vals > thresh from source
-        pandn       xmm7,       xmm3                    ; mm7 select vals < thresh from blurred result
-        paddusw     xmm1,       xmm7                    ; combination
-
-        packuswb    xmm1,       xmm0                    ; pack to bytes
-        movq        QWORD PTR [rdi], xmm1             ;
-
-        neg         rax                   ; pitch is positive
-        add         rsi,        8
-        add         rdi,        8
-
-        add         rdx,        8
-        cmp         edx,        dword arg(5) ;cols
-
-        jl          .nextcol
-
-        ; done with all the cols, start the across filtering in place
-        sub         rsi,        rdx
-        sub         rdi,        rdx
-
-        xor         rdx,        rdx
-        movq        mm0,        QWORD PTR [rdi-8];
-
-.acrossnextcol:
-        movq        xmm7,       QWORD PTR [rdi +rdx -2]
-        movd        xmm4,       DWORD PTR [rdi +rdx +6]
-
-        pslldq      xmm4,       8
-        por         xmm4,       xmm7
-
-        movdqa      xmm3,       xmm4
-        psrldq      xmm3,       2
-        punpcklbw   xmm3,       xmm0              ; mm3 = p0..p3
-        movdqa      xmm1,       xmm3              ; mm1 = p0..p3
-        psllw       xmm3,       2
-
-
-        movdqa      xmm5,       xmm4
-        psrldq      xmm5,       3
-        punpcklbw   xmm5,       xmm0              ; mm5 = p1..p4
-        paddusw     xmm3,       xmm5              ; xmm3 += xmm5
-
-        ; thresholding
-        movdqa      xmm7,       xmm1              ; mm7 = p0..p3
-        psubusw     xmm7,       xmm5              ; mm7 = p0..p3 - p1..p4
-        psubusw     xmm5,       xmm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     xmm7,       xmm5              ; mm7 = abs(p0..p3 - p1..p4)
-        pcmpgtw     xmm7,       xmm2
-
-        movdqa      xmm5,       xmm4
-        psrldq      xmm5,       4
-        punpcklbw   xmm5,       xmm0              ; mm5 = p2..p5
-        paddusw     xmm3,       xmm5              ; mm3 += mm5
-
-        ; thresholding
-        movdqa      xmm6,       xmm1              ; xmm6 = p0..p3
-        psubusw     xmm6,       xmm5              ; xmm6 = p0..p3 - p2..p5
-        psubusw     xmm5,       xmm1              ; xmm5 = p2..p5 - p0..p3
-        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p3 - p2..p5)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6              ; accumulate thresholds
-
-
-        movdqa      xmm5,       xmm4              ; mm5 = p-2..p5
-        punpcklbw   xmm5,       xmm0              ; mm5 = p-2..p1
-        paddusw     xmm3,       xmm5              ; mm3 += mm5
-
-        ; thresholding
-        movdqa      xmm6,       xmm1              ; xmm6 = p0..p3
-        psubusw     xmm6,       xmm5              ; xmm6 = p0..p3 - p-2..p1
-        psubusw     xmm5,       xmm1              ; xmm5 = p-2..p1 - p0..p3
-        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p3 - p-2..p1)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6              ; accumulate thresholds
-
-        psrldq      xmm4,       1                   ; mm4 = p-1..p5
-        punpcklbw   xmm4,       xmm0              ; mm4 = p-1..p2
-        paddusw     xmm3,       xmm4              ; xmm3 += xmm4
-
-        ; thresholding
-        movdqa      xmm6,       xmm1              ; xmm6 = p0..p3
-        psubusw     xmm6,       xmm4              ; xmm6 = p0..p3 - p-1..p2
-        psubusw     xmm4,       xmm1              ; xmm4 = p-1..p2 - p0..p3
-        paddusw     xmm6,       xmm4              ; xmm6 = abs(p0..p3 - p-1..p2)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6              ; accumulate thresholds
-
-        paddusw     xmm3,       RD42              ; mm3 += round value
-        psraw       xmm3,       3                 ; mm3 /= 8
-
-        pand        xmm1,       xmm7              ; mm1 select vals > thresh from source
-        pandn       xmm7,       xmm3              ; mm7 select vals < thresh from blurred result
-        paddusw     xmm1,       xmm7              ; combination
-
-        packuswb    xmm1,       xmm0              ; pack to bytes
-        movq        QWORD PTR [rdi+rdx-8],  mm0   ; store previous eight bytes
-        movdq2q     mm0,        xmm1
-
-        add         rdx,        8
-        cmp         edx,        dword arg(5) ;cols
-        jl          .acrossnextcol;
-
-        ; last 8 pixels
-        movq        QWORD PTR [rdi+rdx-8],  mm0
-
-        ; done with this row
-        add         rsi,rax               ; next line
-        mov         eax, dword arg(3) ;dst_pixels_per_line
-        add         rdi,rax               ; next destination
-        mov         eax, dword arg(2) ;src_pixels_per_line
-
-        dec         rcx                   ; decrement count
-        jnz         .nextrow              ; next row
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-    add rsp,16
-    pop rsp
-%endif
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef RD42
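Editor's note: unlike the MMX path, the xmm version above hard-codes its kernel: "psllw xmm3, 2" builds the centre tap and rd42 supplies the +4 rounding, i.e. per pixel

    /* blurred = (p[-2] + p[-1] + 4*p[0] + p[1] + p[2] + 4) >> 3 */

which is the same [1, 1, 4, 1, 1]/8 weighting the MMX Blur table encodes as [16, 16, 64, 16, 16]/128.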
-
-
-;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
-;                            int pitch, int rows, int cols,int flimit)
-extern sym(vp9_rv)
-global sym(vp9_mbpost_proc_down_xmm)
-sym(vp9_mbpost_proc_down_xmm):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 128+16
-
-    ; unsigned char d[16][8] at [rsp]
-    ; create flimit2 at [rsp+128]
-    mov         eax, dword ptr arg(4) ;flimit
-    mov         [rsp+128], eax
-    mov         [rsp+128+4], eax
-    mov         [rsp+128+8], eax
-    mov         [rsp+128+12], eax
-%define flimit4 [rsp+128]
-
-%if ABI_IS_32BIT=0
-    lea         r8,       [GLOBAL(sym(vp9_rv))]
-%endif
-
-    ;rows +=8;
-    add         dword arg(2), 8
-
-    ;for(c=0; c<cols; c+=8)
-.loop_col:
-            mov         rsi,        arg(0) ; s
-            pxor        xmm0,       xmm0        ;
-
-            movsxd      rax,        dword ptr arg(1) ;pitch       ;
-            neg         rax                                     ; rax = -pitch
-
-            lea         rsi,        [rsi + rax*8]           ; rsi = s[-pitch*8]
-            neg         rax
-
-
-            pxor        xmm5,       xmm5
-            pxor        xmm6,       xmm6        ;
-
-            pxor        xmm7,       xmm7        ;
-            mov         rdi,        rsi
-
-            mov         rcx,        15          ;
-
-.loop_initvar:
-            movq        xmm1,       QWORD PTR [rdi];
-            punpcklbw   xmm1,       xmm0        ;
-
-            paddw       xmm5,       xmm1        ;
-            pmullw      xmm1,       xmm1        ;
-
-            movdqa      xmm2,       xmm1        ;
-            punpcklwd   xmm1,       xmm0        ;
-
-            punpckhwd   xmm2,       xmm0        ;
-            paddd       xmm6,       xmm1        ;
-
-            paddd       xmm7,       xmm2        ;
-            lea         rdi,        [rdi+rax]   ;
-
-            dec         rcx
-            jne         .loop_initvar
-            ;save the var and sum
-            xor         rdx,        rdx
-.loop_row:
-            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
-            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
-
-            punpcklbw   xmm1,       xmm0
-            punpcklbw   xmm2,       xmm0
-
-            paddw       xmm5,       xmm2
-            psubw       xmm5,       xmm1
-
-            pmullw      xmm2,       xmm2
-            movdqa      xmm4,       xmm2
-
-            punpcklwd   xmm2,       xmm0
-            punpckhwd   xmm4,       xmm0
-
-            paddd       xmm6,       xmm2
-            paddd       xmm7,       xmm4
-
-            pmullw      xmm1,       xmm1
-            movdqa      xmm2,       xmm1
-
-            punpcklwd   xmm1,       xmm0
-            psubd       xmm6,       xmm1
-
-            punpckhwd   xmm2,       xmm0
-            psubd       xmm7,       xmm2
-
-
-            movdqa      xmm3,       xmm6
-            pslld       xmm3,       4
-
-            psubd       xmm3,       xmm6
-            movdqa      xmm1,       xmm5
-
-            movdqa      xmm4,       xmm5
-            pmullw      xmm1,       xmm1
-
-            pmulhw      xmm4,       xmm4
-            movdqa      xmm2,       xmm1
-
-            punpcklwd   xmm1,       xmm4
-            punpckhwd   xmm2,       xmm4
-
-            movdqa      xmm4,       xmm7
-            pslld       xmm4,       4
-
-            psubd       xmm4,       xmm7
-
-            psubd       xmm3,       xmm1
-            psubd       xmm4,       xmm2
-
-            psubd       xmm3,       flimit4
-            psubd       xmm4,       flimit4
-
-            psrad       xmm3,       31
-            psrad       xmm4,       31
-
-            packssdw    xmm3,       xmm4
-            packsswb    xmm3,       xmm0
-
-            movq        xmm1,       QWORD PTR [rsi+rax*8]
-
-            movq        xmm2,       xmm1
-            punpcklbw   xmm1,       xmm0
-
-            paddw       xmm1,       xmm5
-            mov         rcx,        rdx
-
-            and         rcx,        127
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-            push        rax
-            lea         rax,        [GLOBAL(sym(vp9_rv))]
-            movdqu      xmm4,       [rax + rcx*2] ;vp9_rv[rcx*2]
-            pop         rax
-%elif ABI_IS_32BIT=0
-            movdqu      xmm4,       [r8 + rcx*2] ;vp9_rv[rcx*2]
-%else
-            movdqu      xmm4,       [sym(vp9_rv) + rcx*2]
-%endif
-
-            paddw       xmm1,       xmm4
-            ;paddw     xmm1,       eight8s
-            psraw       xmm1,       4
-
-            packuswb    xmm1,       xmm0
-            pand        xmm1,       xmm3
-
-            pandn       xmm3,       xmm2
-            por         xmm1,       xmm3
-
-            and         rcx,        15
-            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
-
-            mov         rcx,        rdx
-            sub         rcx,        8
-
-            and         rcx,        15
-            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
-
-            movq        [rsi],      mm0
-            lea         rsi,        [rsi+rax]
-
-            lea         rdi,        [rdi+rax]
-            add         rdx,        1
-
-            cmp         edx,        dword arg(2) ;rows
-            jl          .loop_row
-
-        add         dword arg(0), 8 ; s += 8
-        sub         dword arg(3), 8 ; cols -= 8
-        cmp         dword arg(3), 0
-        jg          .loop_col
-
-    add         rsp, 128+16
-    pop         rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef flimit4
-
-
-;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
-;                                int pitch, int rows, int cols,int flimit)
-global sym(vp9_mbpost_proc_across_ip_xmm)
-sym(vp9_mbpost_proc_across_ip_xmm):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16
-
-    ; create flimit4 at [rsp]
-    mov         eax, dword ptr arg(4) ;flimit
-    mov         [rsp], eax
-    mov         [rsp+4], eax
-    mov         [rsp+8], eax
-    mov         [rsp+12], eax
-%define flimit4 [rsp]
-
-
-    ;for(r=0;r<rows;r++)
-.ip_row_loop:
-
-        xor         rdx,    rdx ;sumsq=0;
-        xor         rcx,    rcx ;sum=0;
-        mov         rsi,    arg(0); s
-        mov         rdi,    -8
-.ip_var_loop:
-        ;for(i=-8;i<=6;i++)
-        ;{
-        ;    sumsq += s[i]*s[i];
-        ;    sum   += s[i];
-        ;}
-        movzx       eax, byte [rsi+rdi]
-        add         ecx, eax
-        mul         al
-        add         edx, eax
-        add         rdi, 1
-        cmp         rdi, 6
-        jle         .ip_var_loop
-
-
-            ;mov         rax,    sumsq
-            ;movd        xmm7,   rax
-            movd        xmm7,   edx
-
-            ;mov         rax,    sum
-            ;movd        xmm6,   rax
-            movd        xmm6,   ecx
-
-            mov         rsi,    arg(0) ;s
-            xor         rcx,    rcx
-
-            movsxd      rdx,    dword arg(3) ;cols
-            add         rdx,    8
-            pxor        mm0,    mm0
-            pxor        mm1,    mm1
-
-            pxor        xmm0,   xmm0
-.nextcol4:
-
-            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
-            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
-
-            punpcklbw   xmm1,   xmm0                    ; expanding
-            punpcklbw   xmm2,   xmm0                    ; expanding
-
-            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
-            punpcklwd   xmm2,   xmm0                    ; expanding to dwords
-
-            psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
-            paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
-
-            paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
-            pmaddwd     xmm1,   xmm2                    ; squared of 7+-8   8+-7   9+-6 10+-5
-
-            paddd       xmm6,   xmm2
-            paddd       xmm7,   xmm1
-
-            pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
-            pshufd      xmm7,   xmm7,   0               ; duplicate the last ones
-
-            psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
-            psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
-
-            pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
-            pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7 squared
-
-            paddd       xmm6,   xmm4
-            paddd       xmm7,   xmm3
-
-            pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
-            pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6 squared
-
-            paddd       xmm7,   xmm3
-            paddd       xmm6,   xmm4
-
-            pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   8--7   8--7 squared
-            pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   8--7   8--7 squared
-
-            paddd       xmm7,   xmm3
-            paddd       xmm6,   xmm4
-
-            movdqa      xmm3,   xmm6
-            pmaddwd     xmm3,   xmm3
-
-            movdqa      xmm5,   xmm7
-            pslld       xmm5,   4
-
-            psubd       xmm5,   xmm7
-            psubd       xmm5,   xmm3
-
-            psubd       xmm5,   flimit4
-            psrad       xmm5,   31
-
-            packssdw    xmm5,   xmm0
-            packsswb    xmm5,   xmm0
-
-            movd        xmm1,   DWORD PTR [rsi+rcx]
-            movq        xmm2,   xmm1
-
-            punpcklbw   xmm1,   xmm0
-            punpcklwd   xmm1,   xmm0
-
-            paddd       xmm1,   xmm6
-            paddd       xmm1,   [GLOBAL(four8s)]
-
-            psrad       xmm1,   4
-            packssdw    xmm1,   xmm0
-
-            packuswb    xmm1,   xmm0
-            pand        xmm1,   xmm5
-
-            pandn       xmm5,   xmm2
-            por         xmm5,   xmm1
-
-            movd        [rsi+rcx-8],  mm0
-            movq        mm0,    mm1
-
-            movdq2q     mm1,    xmm5
-            psrldq      xmm7,   12
-
-            psrldq      xmm6,   12
-            add         rcx,    4
-
-            cmp         rcx,    rdx
-            jl          .nextcol4
-
-        ;s+=pitch;
-        movsxd rax, dword arg(1)
-        add    arg(0), rax
-
-        sub dword arg(2), 1 ;rows-=1
-        cmp dword arg(2), 0
-        jg .ip_row_loop
-
-    add         rsp, 16
-    pop         rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef flimit4
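Editor's note: a hedged scalar model of the across pass above (reference only; border handling is omitted). A 16-pixel horizontal window slides one pixel at a time with incremental sum/sumsq updates; the asm additionally delays each store by eight pixels (the mm0/mm1 shuffle) so the window always reads unfiltered data, a detail the sketch skips:

    static void mb_filter_across_c(unsigned char *s, int cols, int flimit) {
        int sum = 0, sumsq = 0;
        for (int i = -8; i <= 6; i++) {        /* matches .ip_var_loop */
            sum   += s[i];
            sumsq += s[i] * s[i];
        }
        for (int c = 0; c < cols; c++) {
            sum   += s[c + 7] - s[c - 8];      /* slide the window */
            sumsq += s[c + 7] * s[c + 7] - s[c - 8] * s[c - 8];
            if (sumsq * 15 - sum * sum < flimit)
                s[c] = (unsigned char)((sum + s[c] + 8) >> 4);   /* four8s = 8 */
        }
    }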
-
-
-;void vp9_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int Width, unsigned int Height, int Pitch)
-extern sym(rand)
-global sym(vp9_plane_add_noise_wmt)
-sym(vp9_plane_add_noise_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(rand) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movdqu      xmm1,[rsi+rax]         ; get the source
-
-            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     xmm1, [rdx+32] ;bothclamp
-            psubusb     xmm1, [rdx+16] ;whiteclamp
-
-            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
-            paddb       xmm1,xmm2              ; add it in
-            movdqu      [rsi+rax],xmm1         ; store the result
-
-            add         rax,16                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-align 16
-rd42:
-    times 8 dw 0x04
-four8s:
-    times 4 dd 8
--- a/vp8/common/x86/postproc_x86.h
+++ /dev/null
@@ -1,64 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef POSTPROC_X86_H
-#define POSTPROC_X86_H
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_postproc_inplace(vp9_mbpost_proc_down_mmx);
-extern prototype_postproc(vp9_post_proc_down_and_across_mmx);
-extern prototype_postproc_addnoise(vp9_plane_add_noise_mmx);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_postproc_down
-#define vp9_postproc_down vp9_mbpost_proc_down_mmx
-
-#undef  vp9_postproc_downacross
-#define vp9_postproc_downacross vp9_post_proc_down_and_across_mmx
-
-#undef  vp9_postproc_addnoise
-#define vp9_postproc_addnoise vp9_plane_add_noise_mmx
-
-#endif
-#endif
-
-
-#if HAVE_SSE2
-extern prototype_postproc_inplace(vp9_mbpost_proc_down_xmm);
-extern prototype_postproc_inplace(vp9_mbpost_proc_across_ip_xmm);
-extern prototype_postproc(vp9_post_proc_down_and_across_xmm);
-extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_postproc_down
-#define vp9_postproc_down vp9_mbpost_proc_down_xmm
-
-#undef  vp9_postproc_across
-#define vp9_postproc_across vp9_mbpost_proc_across_ip_xmm
-
-#undef  vp9_postproc_downacross
-#define vp9_postproc_downacross vp9_post_proc_down_and_across_xmm
-
-#undef  vp9_postproc_addnoise
-#define vp9_postproc_addnoise vp9_plane_add_noise_wmt
-
-
-#endif
-#endif
-
-#endif
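Editor's note: with CONFIG_RUNTIME_CPU_DETECT disabled, the #undef/#define pairs in this header statically rebound the generic vp9_postproc_* hooks to the best compiled-in variant (the SSE2 block winning over MMX when both were built), so calls resolved at compile time with no function-pointer indirection; with runtime detection enabled, the same mappings had to be kept in sync with the function-pointer initialization code, as the note at the top of the file warns.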
--- a/vp8/common/x86/recon_mmx.asm
+++ /dev/null
@@ -1,321 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-;void vp9_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon_b_mmx)
-sym(vp9_recon_b_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov       rsi, arg(0) ;s
-        mov       rdi, arg(2) ;d
-        mov       rdx, arg(1) ;q
-        movsxd    rax, dword ptr arg(3) ;stride
-        pxor      mm0, mm0
-
-        movd      mm1, [rsi]
-        punpcklbw mm1, mm0
-        paddsw    mm1, [rdx]
-        packuswb  mm1,  mm0              ; pack and unpack to saturate
-        movd      [rdi], mm1
-
-        movd      mm2, [rsi+16]
-        punpcklbw mm2, mm0
-        paddsw    mm2, [rdx+32]
-        packuswb  mm2, mm0              ; pack and unpack to saturate
-        movd      [rdi+rax], mm2
-
-        movd      mm3, [rsi+32]
-        punpcklbw mm3, mm0
-        paddsw    mm3, [rdx+64]
-        packuswb  mm3,  mm0              ; pack and unpack to saturate
-        movd      [rdi+2*rax], mm3
-
-        add       rdi, rax
-        movd      mm4, [rsi+48]
-        punpcklbw mm4, mm0
-        paddsw    mm4, [rdx+96]
-        packuswb  mm4, mm0              ; pack and unpack to saturate
-        movd      [rdi+2*rax], mm4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
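Editor's note: a hedged C sketch of the 4x4 reconstruction above (reference only). The residual q is added to the predictor s with signed saturation, then packed back to bytes; per the offsets above, predictor rows sit 16 bytes apart and residual rows 16 shorts (32 bytes) apart:

    static void recon_b_c(const unsigned char *s, const short *q,
                          unsigned char *d, int stride) {
        for (int r = 0; r < 4; r++) {
            for (int c = 0; c < 4; c++) {
                int v = s[c] + q[c];
                d[c] = (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
            }
            s += 16;     /* predictor pitch in bytes */
            q += 16;     /* residual pitch in shorts */
            d += stride;
        }
    }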
-;void vp9_copy_mem8x8_mmx(
-;    unsigned char *src,
-;    int src_stride,
-;    unsigned char *dst,
-;    int dst_stride
-;    )
-global sym(vp9_copy_mem8x8_mmx)
-sym(vp9_copy_mem8x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src;
-        movq        mm0,        [rsi]
-
-        movsxd      rax,        dword ptr arg(1) ;src_stride;
-        mov         rdi,        arg(2) ;dst;
-
-        movq        mm1,        [rsi+rax]
-        movq        mm2,        [rsi+rax*2]
-
-        movsxd      rcx,        dword ptr arg(3) ;dst_stride
-        lea         rsi,        [rsi+rax*2]
-
-        movq        [rdi],      mm0
-        add         rsi,        rax
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx*2],    mm2
-
-
-        lea         rdi,        [rdi+rcx*2]
-        movq        mm3,        [rsi]
-
-        add         rdi,        rcx
-        movq        mm4,        [rsi+rax]
-
-        movq        mm5,        [rsi+rax*2]
-        movq        [rdi],      mm3
-
-        lea         rsi,        [rsi+rax*2]
-        movq        [rdi+rcx],  mm4
-
-        movq        [rdi+rcx*2],    mm5
-        lea         rdi,        [rdi+rcx*2]
-
-        movq        mm0,        [rsi+rax]
-        movq        mm1,        [rsi+rax*2]
-
-        movq        [rdi+rcx],  mm0
-        movq        [rdi+rcx*2],mm1
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
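Editor's note: the copy_mem routines here (8x8 above, 8x4 and 16x16 below) are plain strided block copies; the asm merely unrolls them and keeps several rows in flight per iteration. A hedged C equivalent (reference only):

    #include <string.h>

    static void copy_mem_c(const unsigned char *src, int src_stride,
                           unsigned char *dst, int dst_stride, int w, int h) {
        for (int r = 0; r < h; r++) {
            memcpy(dst, src, w);    /* w = 8 or 16; h = 4, 8 or 16 */
            src += src_stride;
            dst += dst_stride;
        }
    }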
-;void vp9_copy_mem8x4_mmx(
-;    unsigned char *src,
-;    int src_stride,
-;    unsigned char *dst,
-;    int dst_stride
-;    )
-global sym(vp9_copy_mem8x4_mmx)
-sym(vp9_copy_mem8x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src;
-        movq        mm0,        [rsi]
-
-        movsxd      rax,        dword ptr arg(1) ;src_stride;
-        mov         rdi,        arg(2) ;dst;
-
-        movq        mm1,        [rsi+rax]
-        movq        mm2,        [rsi+rax*2]
-
-        movsxd      rcx,        dword ptr arg(3) ;dst_stride
-        lea         rsi,        [rsi+rax*2]
-
-        movq        [rdi],      mm0
-        movq        [rdi+rcx],      mm1
-
-        movq        [rdi+rcx*2],    mm2
-        lea         rdi,        [rdi+rcx*2]
-
-        movq        mm3,        [rsi+rax]
-        movq        [rdi+rcx],      mm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_copy_mem16x16_mmx(
-;    unsigned char *src,
-;    int src_stride,
-;    unsigned char *dst,
-;    int dst_stride
-;    )
-global sym(vp9_copy_mem16x16_mmx)
-sym(vp9_copy_mem16x16_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src;
-        movsxd      rax,        dword ptr arg(1) ;src_stride;
-
-        mov         rdi,        arg(2) ;dst;
-        movsxd      rcx,        dword ptr arg(3) ;dst_stride
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        mm1,            [rsi+rax]
-        movq        mm4,            [rsi+rax+8]
-
-        movq        mm2,            [rsi+rax*2]
-        movq        mm5,            [rsi+rax*2+8]
-
-        lea         rsi,            [rsi+rax*2]
-        add         rsi,            rax
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx+8],    mm4
-
-        movq        [rdi+rcx*2],    mm2
-        movq        [rdi+rcx*2+8],  mm5
-
-        lea         rdi,            [rdi+rcx*2]
-        add         rdi,            rcx
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        mm1,            [rsi+rax]
-        movq        mm4,            [rsi+rax+8]
-
-        movq        mm2,            [rsi+rax*2]
-        movq        mm5,            [rsi+rax*2+8]
-
-        lea         rsi,            [rsi+rax*2]
-        add         rsi,            rax
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx+8],    mm4
-
-        movq        [rdi+rcx*2],    mm2
-        movq        [rdi+rcx*2+8],  mm5
-
-        lea         rdi,            [rdi+rcx*2]
-        add         rdi,            rcx
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        mm1,            [rsi+rax]
-        movq        mm4,            [rsi+rax+8]
-
-        movq        mm2,            [rsi+rax*2]
-        movq        mm5,            [rsi+rax*2+8]
-
-        lea         rsi,            [rsi+rax*2]
-        add         rsi,            rax
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx+8],    mm4
-
-        movq        [rdi+rcx*2],    mm2
-        movq        [rdi+rcx*2+8],  mm5
-
-        lea         rdi,            [rdi+rcx*2]
-        add         rdi,            rcx
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        mm1,            [rsi+rax]
-        movq        mm4,            [rsi+rax+8]
-
-        movq        mm2,            [rsi+rax*2]
-        movq        mm5,            [rsi+rax*2+8]
-
-        lea         rsi,            [rsi+rax*2]
-        add         rsi,            rax
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx+8],    mm4
-
-        movq        [rdi+rcx*2],    mm2
-        movq        [rdi+rcx*2+8],  mm5
-
-        lea         rdi,            [rdi+rcx*2]
-        add         rdi,            rcx
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        mm1,            [rsi+rax]
-        movq        mm4,            [rsi+rax+8]
-
-        movq        mm2,            [rsi+rax*2]
-        movq        mm5,            [rsi+rax*2+8]
-
-        lea         rsi,            [rsi+rax*2]
-        add         rsi,            rax
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx+8],    mm4
-
-        movq        [rdi+rcx*2],    mm2
-        movq        [rdi+rcx*2+8],  mm5
-
-        lea         rdi,            [rdi+rcx*2]
-        add         rdi,            rcx
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
--- a/vp8/common/x86/recon_sse2.asm
+++ /dev/null
@@ -1,688 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-;void vp9_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon2b_sse2)
-sym(vp9_recon2b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;s
-        mov         rdi,        arg(2) ;d
-        mov         rdx,        arg(1) ;q
-        movsxd      rax,        dword ptr arg(3) ;stride
-        pxor        xmm0,       xmm0
-
-        movq        xmm1,       MMWORD PTR [rsi]
-        punpcklbw   xmm1,       xmm0
-        paddsw      xmm1,       XMMWORD PTR [rdx]
-        packuswb    xmm1,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi],   xmm1
-
-
-        movq        xmm2,       MMWORD PTR [rsi+8]
-        punpcklbw   xmm2,       xmm0
-        paddsw      xmm2,       XMMWORD PTR [rdx+16]
-        packuswb    xmm2,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi+rax],   xmm2
-
-
-        movq        xmm3,       MMWORD PTR [rsi+16]
-        punpcklbw   xmm3,       xmm0
-        paddsw      xmm3,       XMMWORD PTR [rdx+32]
-        packuswb    xmm3,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi+rax*2], xmm3
-
-        add         rdi, rax
-        movq        xmm4,       MMWORD PTR [rsi+24]
-        punpcklbw   xmm4,       xmm0
-        paddsw      xmm4,       XMMWORD PTR [rdx+48]
-        packuswb    xmm4,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi+rax*2], xmm4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon4b_sse2)
-sym(vp9_recon4b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;s
-        mov         rdi,        arg(2) ;d
-        mov         rdx,        arg(1) ;q
-        movsxd      rax,        dword ptr arg(3) ;stride
-        pxor        xmm0,       xmm0
-
-        movdqa      xmm1,       XMMWORD PTR [rsi]
-        movdqa      xmm5,       xmm1
-        punpcklbw   xmm1,       xmm0
-        punpckhbw   xmm5,       xmm0
-        paddsw      xmm1,       XMMWORD PTR [rdx]
-        paddsw      xmm5,       XMMWORD PTR [rdx+16]
-        packuswb    xmm1,       xmm5              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi],  xmm1
-
-
-        movdqa      xmm2,       XMMWORD PTR [rsi+16]
-        movdqa      xmm6,       xmm2
-        punpcklbw   xmm2,       xmm0
-        punpckhbw   xmm6,       xmm0
-        paddsw      xmm2,       XMMWORD PTR [rdx+32]
-        paddsw      xmm6,       XMMWORD PTR [rdx+48]
-        packuswb    xmm2,       xmm6              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi+rax],  xmm2
-
-
-        movdqa      xmm3,       XMMWORD PTR [rsi+32]
-        movdqa      xmm7,       xmm3
-        punpcklbw   xmm3,       xmm0
-        punpckhbw   xmm7,       xmm0
-        paddsw      xmm3,       XMMWORD PTR [rdx+64]
-        paddsw      xmm7,       XMMWORD PTR [rdx+80]
-        packuswb    xmm3,       xmm7              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi+rax*2],    xmm3
-
-        add       rdi, rax
-        movdqa      xmm4,       XMMWORD PTR [rsi+48]
-        movdqa      xmm5,       xmm4
-        punpcklbw   xmm4,       xmm0
-        punpckhbw   xmm5,       xmm0
-        paddsw      xmm4,       XMMWORD PTR [rdx+96]
-        paddsw      xmm5,       XMMWORD PTR [rdx+112]
-        packuswb    xmm4,       xmm5              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi+rax*2],    xmm4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_copy_mem16x16_sse2(
-;    unsigned char *src,
-;    int src_stride,
-;    unsigned char *dst,
-;    int dst_stride
-;    )
-global sym(vp9_copy_mem16x16_sse2)
-sym(vp9_copy_mem16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src;
-        movdqu      xmm0,       [rsi]
-
-        movsxd      rax,        dword ptr arg(1) ;src_stride;
-        mov         rdi,        arg(2) ;dst;
-
-        movdqu      xmm1,       [rsi+rax]
-        movdqu      xmm2,       [rsi+rax*2]
-
-        movsxd      rcx,        dword ptr arg(3) ;dst_stride
-        lea         rsi,        [rsi+rax*2]
-
-        movdqa      [rdi],      xmm0
-        add         rsi,        rax
-
-        movdqa      [rdi+rcx],  xmm1
-        movdqa      [rdi+rcx*2],xmm2
-
-        lea         rdi,        [rdi+rcx*2]
-        movdqu      xmm3,       [rsi]
-
-        add         rdi,        rcx
-        movdqu      xmm4,       [rsi+rax]
-
-        movdqu      xmm5,       [rsi+rax*2]
-        lea         rsi,        [rsi+rax*2]
-
-        movdqa      [rdi],  xmm3
-        add         rsi,        rax
-
-        movdqa      [rdi+rcx],  xmm4
-        movdqa      [rdi+rcx*2],xmm5
-
-        lea         rdi,        [rdi+rcx*2]
-        movdqu      xmm0,       [rsi]
-
-        add         rdi,        rcx
-        movdqu      xmm1,       [rsi+rax]
-
-        movdqu      xmm2,       [rsi+rax*2]
-        lea         rsi,        [rsi+rax*2]
-
-        movdqa      [rdi],      xmm0
-        add         rsi,        rax
-
-        movdqa      [rdi+rcx],  xmm1
-
-        movdqa      [rdi+rcx*2],    xmm2
-        movdqu      xmm3,       [rsi]
-
-        movdqu      xmm4,       [rsi+rax]
-        lea         rdi,        [rdi+rcx*2]
-
-        add         rdi,        rcx
-        movdqu      xmm5,       [rsi+rax*2]
-
-        lea         rsi,        [rsi+rax*2]
-        movdqa      [rdi],  xmm3
-
-        add         rsi,        rax
-        movdqa      [rdi+rcx],  xmm4
-
-        movdqa      [rdi+rcx*2],xmm5
-        movdqu      xmm0,       [rsi]
-
-        lea         rdi,        [rdi+rcx*2]
-        movdqu      xmm1,       [rsi+rax]
-
-        add         rdi,        rcx
-        movdqu      xmm2,       [rsi+rax*2]
-
-        lea         rsi,        [rsi+rax*2]
-        movdqa      [rdi],      xmm0
-
-        movdqa      [rdi+rcx],  xmm1
-        movdqa      [rdi+rcx*2],xmm2
-
-        movdqu      xmm3,       [rsi+rax]
-        lea         rdi,        [rdi+rcx*2]
-
-        movdqa      [rdi+rcx],  xmm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
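The 16x16 copy above is sixteen rows of one unaligned 16-byte load (movdqu) and
one aligned store (movdqa), software-pipelined across registers. A scalar
equivalent, assuming only that dst is 16-byte aligned as the stores require:

    #include <string.h>

    static void copy_mem16x16_c(const unsigned char *src, int src_stride,
                                unsigned char *dst, int dst_stride) {
      int r;
      for (r = 0; r < 16; r++) {
        memcpy(dst, src, 16);  /* movdqu load + movdqa store per row */
        src += src_stride;
        dst += dst_stride;
      }
    }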
-;void vp9_intra_pred_uv_dc_mmx2(
-;    unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-;    )
-global sym(vp9_intra_pred_uv_dc_mmx2)
-sym(vp9_intra_pred_uv_dc_mmx2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; from top
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    sub         rsi,        rax
-    pxor        mm0,        mm0
-    movq        mm1,        [rsi]
-    psadbw      mm1,        mm0
-
-    ; from left
-    dec         rsi
-    lea         rdi,        [rax*3]
-    movzx       ecx,        byte [rsi+rax]
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    add         ecx,        edx
-    lea         rsi,        [rsi+rax*4]
-    movzx       edx,        byte [rsi]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*4]
-    add         ecx,        edx
-
-    ; add up
-    pextrw      edx,        mm1, 0x0
-    lea         edx,        [edx+ecx+8]
-    sar         edx,        4
-    movd        mm1,        edx
-    pshufw      mm1,        mm1, 0x0
-    packuswb    mm1,        mm1
-
-    ; write out
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-    lea         rax,        [rcx*3]
-
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
-    lea         rdi,        [rdi+rcx*4]
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
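The DC variants here differ only in which neighbors are available: the full
version above averages the 8 top and 8 left pixels as (sum + 8) >> 4, dctop and
dcleft below average a single side as (sum + 4) >> 3, and dc128 falls back to
the constant 128. A scalar sketch of the full case, under the same convention
that src points at the top-left pixel of the block:

    #include <string.h>

    static void intra_pred_uv_dc_c(unsigned char *dst, int dst_stride,
                                   const unsigned char *src, int src_stride) {
      const unsigned char *top = src - src_stride;  /* row above the block */
      int i, sum = 0;
      for (i = 0; i < 8; i++)
        sum += top[i] + src[i * src_stride - 1];    /* top row + left column */
      for (i = 0; i < 8; i++)                       /* fill the 8x8 block */
        memset(dst + i * dst_stride, (sum + 8) >> 4, 8);
    }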
-;void vp9_intra_pred_uv_dctop_mmx2(
-;    unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-;    )
-global sym(vp9_intra_pred_uv_dctop_mmx2)
-sym(vp9_intra_pred_uv_dctop_mmx2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; from top
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    sub         rsi,        rax
-    pxor        mm0,        mm0
-    movq        mm1,        [rsi]
-    psadbw      mm1,        mm0
-
-    ; add up
-    paddw       mm1,        [GLOBAL(dc_4)]
-    psraw       mm1,        3
-    pshufw      mm1,        mm1, 0x0
-    packuswb    mm1,        mm1
-
-    ; write out
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-    lea         rax,        [rcx*3]
-
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
-    lea         rdi,        [rdi+rcx*4]
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_intra_pred_uv_dcleft_mmx2(
-;    unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-;    )
-global sym(vp9_intra_pred_uv_dcleft_mmx2)
-sym(vp9_intra_pred_uv_dcleft_mmx2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; from left
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    dec         rsi
-    lea         rdi,        [rax*3]
-    movzx       ecx,        byte [rsi]
-    movzx       edx,        byte [rsi+rax]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    add         ecx,        edx
-    lea         rsi,        [rsi+rax*4]
-    movzx       edx,        byte [rsi]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*2]
-    add         ecx,        edx
-    movzx       edx,        byte [rsi+rdi]
-    lea         edx,        [ecx+edx+4]
-
-    ; add up
-    shr         edx,        3
-    movd        mm1,        edx
-    pshufw      mm1,        mm1, 0x0
-    packuswb    mm1,        mm1
-
-    ; write out
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-    lea         rax,        [rcx*3]
-
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
-    lea         rdi,        [rdi+rcx*4]
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_intra_pred_uv_dc128_mmx(
-;    unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-;    )
-global sym(vp9_intra_pred_uv_dc128_mmx)
-sym(vp9_intra_pred_uv_dc128_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    GET_GOT     rbx
-    ; end prolog
-
-    ; write out
-    movq        mm1,        [GLOBAL(dc_128)]
-    mov         rax,        arg(0) ;dst;
-    movsxd      rdx,        dword ptr arg(1) ;dst_stride
-    lea         rcx,        [rdx*3]
-
-    movq [rax      ],       mm1
-    movq [rax+rdx  ],       mm1
-    movq [rax+rdx*2],       mm1
-    movq [rax+rcx  ],       mm1
-    lea         rax,        [rax+rdx*4]
-    movq [rax      ],       mm1
-    movq [rax+rdx  ],       mm1
-    movq [rax+rdx*2],       mm1
-    movq [rax+rcx  ],       mm1
-
-    ; begin epilog
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_intra_pred_uv_tm_sse2(
-;    unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-;    )
-%macro vp9_intra_pred_uv_tm 1
-global sym(vp9_intra_pred_uv_tm_%1)
-sym(vp9_intra_pred_uv_tm_%1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; read top row
-    mov         edx,        4
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    sub         rsi,        rax
-    pxor        xmm0,       xmm0
-%ifidn %1, ssse3
-    movdqa      xmm2,       [GLOBAL(dc_1024)]
-%endif
-    movq        xmm1,       [rsi]
-    punpcklbw   xmm1,       xmm0
-
-    ; set up left ptrs and subtract topleft
-    movd        xmm3,       [rsi-1]
-    lea         rsi,        [rsi+rax-1]
-%ifidn %1, sse2
-    punpcklbw   xmm3,       xmm0
-    pshuflw     xmm3,       xmm3, 0x0
-    punpcklqdq  xmm3,       xmm3
-%else
-    pshufb      xmm3,       xmm2
-%endif
-    psubw       xmm1,       xmm3
-
-    ; set up dest ptrs
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-
-.vp9_intra_pred_uv_tm_%1_loop:
-    movd        xmm3,       [rsi]
-    movd        xmm5,       [rsi+rax]
-%ifidn %1, sse2
-    punpcklbw   xmm3,       xmm0
-    punpcklbw   xmm5,       xmm0
-    pshuflw     xmm3,       xmm3, 0x0
-    pshuflw     xmm5,       xmm5, 0x0
-    punpcklqdq  xmm3,       xmm3
-    punpcklqdq  xmm5,       xmm5
-%else
-    pshufb      xmm3,       xmm2
-    pshufb      xmm5,       xmm2
-%endif
-    paddw       xmm3,       xmm1
-    paddw       xmm5,       xmm1
-    packuswb    xmm3,       xmm5
-    movq  [rdi    ],        xmm3
-    movhps[rdi+rcx],        xmm3
-    lea         rsi,        [rsi+rax*2]
-    lea         rdi,        [rdi+rcx*2]
-    dec         edx
-    jnz .vp9_intra_pred_uv_tm_%1_loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%endmacro
-
-vp9_intra_pred_uv_tm sse2
-vp9_intra_pred_uv_tm ssse3
-
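TM ("true motion") prediction computes dst[y][x] = clip(left[y] + top[x] -
topleft). The macro above broadcasts (top[x] - topleft) once into xmm1, then
adds each row's left pixel and lets packuswb do the clipping. A scalar sketch
with illustrative names:

    static void intra_pred_uv_tm_c(unsigned char *dst, int dst_stride,
                                   const unsigned char *src, int src_stride) {
      const unsigned char *top = src - src_stride;
      int x, y;
      for (y = 0; y < 8; y++) {
        int left = src[y * src_stride - 1];
        for (x = 0; x < 8; x++) {
          int v = left + top[x] - top[-1];  /* top[-1] is the top-left pixel */
          dst[y * dst_stride + x] =
              (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
        }
      }
    }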
-;void vp9_intra_pred_uv_ve_mmx(
-;    unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-;    )
-global sym(vp9_intra_pred_uv_ve_mmx)
-sym(vp9_intra_pred_uv_ve_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    ; end prolog
-
-    ; read from top
-    mov         rax,        arg(2) ;src;
-    movsxd      rdx,        dword ptr arg(3) ;src_stride;
-    sub         rax,        rdx
-    movq        mm1,        [rax]
-
-    ; write out
-    mov         rax,        arg(0) ;dst;
-    movsxd      rdx,        dword ptr arg(1) ;dst_stride
-    lea         rcx,        [rdx*3]
-
-    movq [rax      ],       mm1
-    movq [rax+rdx  ],       mm1
-    movq [rax+rdx*2],       mm1
-    movq [rax+rcx  ],       mm1
-    lea         rax,        [rax+rdx*4]
-    movq [rax      ],       mm1
-    movq [rax+rdx  ],       mm1
-    movq [rax+rdx*2],       mm1
-    movq [rax+rcx  ],       mm1
-
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_intra_pred_uv_ho_mmx2(
-;    unsigned char *dst,
-;    int dst_stride,
-;    unsigned char *src,
-;    int src_stride
-;    )
-%macro vp9_intra_pred_uv_ho 1
-global sym(vp9_intra_pred_uv_ho_%1)
-sym(vp9_intra_pred_uv_ho_%1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-%ifidn %1, ssse3
-%ifndef GET_GOT_SAVE_ARG
-    push        rbx
-%endif
-    GET_GOT     rbx
-%endif
-    ; end prolog
-
-    ; read from left and write out
-%ifidn %1, mmx2
-    mov         edx,        4
-%endif
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
-%ifidn %1, ssse3
-    lea         rdx,        [rcx*3]
-    movdqa      xmm2,       [GLOBAL(dc_00001111)]
-    lea         rbx,        [rax*3]
-%endif
-    dec         rsi
-%ifidn %1, mmx2
-.vp9_intra_pred_uv_ho_%1_loop:
-    movd        mm0,        [rsi]
-    movd        mm1,        [rsi+rax]
-    punpcklbw   mm0,        mm0
-    punpcklbw   mm1,        mm1
-    pshufw      mm0,        mm0, 0x0
-    pshufw      mm1,        mm1, 0x0
-    movq  [rdi    ],        mm0
-    movq  [rdi+rcx],        mm1
-    lea         rsi,        [rsi+rax*2]
-    lea         rdi,        [rdi+rcx*2]
-    dec         edx
-    jnz .vp9_intra_pred_uv_ho_%1_loop
-%else
-    movd        xmm0,       [rsi]
-    movd        xmm3,       [rsi+rax]
-    movd        xmm1,       [rsi+rax*2]
-    movd        xmm4,       [rsi+rbx]
-    punpcklbw   xmm0,       xmm3
-    punpcklbw   xmm1,       xmm4
-    pshufb      xmm0,       xmm2
-    pshufb      xmm1,       xmm2
-    movq   [rdi    ],       xmm0
-    movhps [rdi+rcx],       xmm0
-    movq [rdi+rcx*2],       xmm1
-    movhps [rdi+rdx],       xmm1
-    lea         rsi,        [rsi+rax*4]
-    lea         rdi,        [rdi+rcx*4]
-    movd        xmm0,       [rsi]
-    movd        xmm3,       [rsi+rax]
-    movd        xmm1,       [rsi+rax*2]
-    movd        xmm4,       [rsi+rbx]
-    punpcklbw   xmm0,       xmm3
-    punpcklbw   xmm1,       xmm4
-    pshufb      xmm0,       xmm2
-    pshufb      xmm1,       xmm2
-    movq   [rdi    ],       xmm0
-    movhps [rdi+rcx],       xmm0
-    movq [rdi+rcx*2],       xmm1
-    movhps [rdi+rdx],       xmm1
-%endif
-
-    ; begin epilog
-%ifidn %1, ssse3
-    RESTORE_GOT
-%ifndef GET_GOT_SAVE_ARG
-    pop         rbx
-%endif
-%endif
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%endmacro
-
-vp9_intra_pred_uv_ho mmx2
-vp9_intra_pred_uv_ho ssse3
-
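Horizontal prediction just replicates each left-neighbor pixel across its row;
the mmx2 path broadcasts it with punpcklbw + pshufw, the ssse3 path with a
single pshufb against dc_00001111. In scalar form (memset standing in for the
broadcast):

    #include <string.h>

    static void intra_pred_uv_ho_c(unsigned char *dst, int dst_stride,
                                   const unsigned char *src, int src_stride) {
      int y;
      for (y = 0; y < 8; y++)
        memset(dst + y * dst_stride, src[y * src_stride - 1], 8);
    }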
-SECTION_RODATA
-dc_128:
-    times 8 db 128
-dc_4:
-    times 4 dw 4
-align 16
-dc_1024:
-    times 8 dw 0x400
-align 16
-dc_00001111:
-    times 8 db 0
-    times 8 db 1
--- a/vp8/common/x86/recon_wrapper_sse2.c
+++ /dev/null
@@ -1,101 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/blockd.h"
-
-#define build_intra_predictors_mbuv_prototype(sym) \
-  void sym(unsigned char *dst, int dst_stride, \
-           const unsigned char *src, int src_stride)
-typedef build_intra_predictors_mbuv_prototype((*build_intra_pred_mbuv_fn_t));
-
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dctop_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dcleft_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc128_mmx);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_mmx2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_ssse3);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ve_mmx);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_sse2);
-extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_ssse3);
-
-static void build_intra_predictors_mbuv_x86(MACROBLOCKD *xd,
-                                            unsigned char *dst_u,
-                                            unsigned char *dst_v,
-                                            int dst_stride,
-                                            build_intra_pred_mbuv_fn_t tm_fn,
-                                            build_intra_pred_mbuv_fn_t ho_fn) {
-  int mode = xd->mode_info_context->mbmi.uv_mode;
-  build_intra_pred_mbuv_fn_t fn;
-  int src_stride = xd->dst.uv_stride;
-
-  switch (mode) {
-    case  V_PRED:
-      fn = vp9_intra_pred_uv_ve_mmx;
-      break;
-    case  H_PRED:
-      fn = ho_fn;
-      break;
-    case TM_PRED:
-      fn = tm_fn;
-      break;
-    case DC_PRED:
-      if (xd->up_available) {
-        fn = xd->left_available ? vp9_intra_pred_uv_dc_mmx2
-                                : vp9_intra_pred_uv_dctop_mmx2;
-      } else {
-        fn = xd->left_available ? vp9_intra_pred_uv_dcleft_mmx2
-                                : vp9_intra_pred_uv_dc128_mmx;
-      }
-      break;
-    default:
-      return;
-  }
-
-  fn(dst_u, dst_stride, xd->dst.u_buffer, src_stride);
-  fn(dst_v, dst_stride, xd->dst.v_buffer, src_stride);
-}
-
-void vp9_build_intra_predictors_mbuv_sse2(MACROBLOCKD *xd) {
-  build_intra_predictors_mbuv_x86(xd, &xd->predictor[256],
-                                  &xd->predictor[320], 8,
-                                  vp9_intra_pred_uv_tm_sse2,
-                                  vp9_intra_pred_uv_ho_mmx2);
-}
-
-void vp9_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *xd) {
-  build_intra_predictors_mbuv_x86(xd, &xd->predictor[256],
-                                  &xd->predictor[320], 8,
-                                  vp9_intra_pred_uv_tm_ssse3,
-                                  vp9_intra_pred_uv_ho_ssse3);
-}
-
-void vp9_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *xd) {
-  build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer,
-                                  xd->dst.v_buffer, xd->dst.uv_stride,
-                                  vp9_intra_pred_uv_tm_sse2,
-                                  vp9_intra_pred_uv_ho_mmx2);
-}
-
-void vp9_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *xd) {
-  build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer,
-                                  xd->dst.v_buffer, xd->dst.uv_stride,
-                                  vp9_intra_pred_uv_tm_ssse3,
-                                  vp9_intra_pred_uv_ho_ssse3);
-}
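For orientation, the predictor offsets used by the first pair of wrappers
correspond to the packed macroblock prediction buffer, while the "_s" pair
predicts in place into the frame buffers at uv_stride. A sketch of the layout
these offsets imply (an assumption drawn from the offsets, not the struct
definition itself):

    /*
     *  xd->predictor[  0..255]  16x16 Y prediction block
     *  xd->predictor[256..319]   8x8  U prediction block, stride 8
     *  xd->predictor[320..383]   8x8  V prediction block, stride 8
     */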
--- a/vp8/common/x86/sadmxn_x86.c
+++ /dev/null
@@ -1,92 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h>  // SSE2
-#include "./vpx_config.h"
-#include "./vpx_rtcd.h"
-
-
-#if CONFIG_NEWBESTREFMV
-
-
-#if HAVE_SSE2
-unsigned int vp9_sad16x3_sse2(
-  const unsigned char *src_ptr,
-  int  src_stride,
-  const unsigned char *ref_ptr,
-  int  ref_stride,
-  int max_sad) {
-  __m128i s0, s1, s2;
-  __m128i r0, r1, r2;
-  __m128i sad;
-
-  (void)max_sad;
-
-  s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride));
-  s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride));
-  s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride));
-
-  r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * ref_stride));
-  r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * ref_stride));
-  r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride));
-
-  sad = _mm_sad_epu8(s0, r0);
-  sad = _mm_add_epi16(sad,  _mm_sad_epu8(s1, r1));
-  sad = _mm_add_epi16(sad,  _mm_sad_epu8(s2, r2));
-  sad = _mm_add_epi16(sad,  _mm_srli_si128(sad, 8));
-
-  return _mm_cvtsi128_si32(sad);
-}
-
-unsigned int vp9_sad3x16_sse2(
-  const unsigned char *src_ptr,
-  int  src_stride,
-  const unsigned char *ref_ptr,
-  int  ref_stride,
-  int max_sad) {
-  int r;
-  __m128i s0, s1, s2, s3;
-  __m128i r0, r1, r2, r3;
-  __m128i sad = _mm_set1_epi16(0);
-  for (r = 0; r < 16; r += 4) {
-    s0 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 0 * src_stride));
-    s1 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride));
-    s2 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride));
-    s3 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride));
-    r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * ref_stride));
-    r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * ref_stride));
-    r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * ref_stride));
-    r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * ref_stride));
-
-    s0 = _mm_unpacklo_epi8(s0, s1);
-    r0 = _mm_unpacklo_epi8(r0, r1);
-    s2 = _mm_unpacklo_epi8(s2, s3);
-    r2 = _mm_unpacklo_epi8(r2, r3);
-    s0 = _mm_unpacklo_epi64(s0, s2);
-    r0 = _mm_unpacklo_epi64(r0, r2);
-
-    // throw out byte 3
-    s0 = _mm_slli_epi64(s0, 16);
-    r0 = _mm_slli_epi64(r0, 16);
-
-    sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0));
-
-    src_ptr += src_stride*4;
-    ref_ptr += ref_stride*4;
-  }
-
-  sad = _mm_add_epi16(sad,  _mm_srli_si128(sad, 8));
-  return _mm_cvtsi128_si32(sad);
-}
-
-#endif
-
-
-#endif  // CONFIG_NEWBESTREFMV
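Both kernels compute a plain sum of absolute differences; the 3x16 version
masks off the fourth column (the "throw out byte 3" shift) so only three pixels
per row contribute, and max_sad is accepted but unused. A scalar reference
sketch:

    #include <stdlib.h>

    static unsigned int sad_c(const unsigned char *src, int src_stride,
                              const unsigned char *ref, int ref_stride,
                              int width, int height) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < height; r++)
        for (c = 0; c < width; c++)
          sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
      return sad;  /* width=16,height=3 and width=3,height=16 above */
    }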
--- a/vp8/common/x86/subpixel_8t_ssse3.asm
+++ /dev/null
@@ -1,550 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;/************************************************************************************
-; Notes: filter_block1d_h8 applies an 8 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This function handles 8 pixels in the
-; horizontal direction, calculating ONE row each iteration to take advantage of the
-; 128 bit operations.
-;
-; This is an implementation of some of the SSE optimizations first seen in ffvp8.
-;
-;*************************************************************************************/
-
-;void vp9_filter_block1d8_v8_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(vp9_filter_block1d8_v8_ssse3)
-sym(vp9_filter_block1d8_v8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-    movdqa      krd, xmm5
-
-    movsxd      rdx, DWORD PTR arg(1)       ;src_pitch
-
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
-%endif
-    mov         rax, rsi
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-    add         rax, rdx
-
-    lea         rbx, [rdx + rdx*4]
-    add         rbx, rdx                    ;pitch * 6
-
-.vp9_filter_block1d8_v8_ssse3_loop:
-    movq        xmm0, [rsi]                 ;A
-    movq        xmm1, [rsi + rdx]           ;B
-    movq        xmm2, [rsi + rdx * 2]       ;C
-    movq        xmm3, [rax + rdx * 2]       ;D
-    movq        xmm4, [rsi + rdx * 4]       ;E
-    movq        xmm5, [rax + rdx * 4]       ;F
-
-    punpcklbw   xmm0, xmm1                  ;A B
-    punpcklbw   xmm2, xmm3                  ;C D
-    punpcklbw   xmm4, xmm5                  ;E F
-
-    movq        xmm6, [rsi + rbx]           ;G
-    movq        xmm7, [rax + rbx]           ;H
-
-    pmaddubsw   xmm0, k0k1
-    pmaddubsw   xmm2, k2k3
-    punpcklbw   xmm6, xmm7                  ;G H
-    pmaddubsw   xmm4, k4k5
-    pmaddubsw   xmm6, k6k7
-
-    paddsw      xmm0, xmm2
-    paddsw      xmm0, krd
-    paddsw      xmm4, xmm6
-    paddsw      xmm0, xmm4
-
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
-
-    add         rsi,  rdx
-    add         rax,  rdx
-
-    movq        [rdi], xmm0
-
-%if ABI_IS_32BIT
-    add         rdi, DWORD PTR arg(3)       ;out_pitch
-%else
-    add         rdi, r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d8_v8_ssse3_loop
-
-    add rsp, 16*5
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
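Each output pixel above is an 8-tap convolution down a column: the taps are
packed into the k0k1..k6k7 pairs so pmaddubsw can multiply-accumulate two
neighboring rows at once, krd holds the rounding constant 64, and the sum is
shifted right by 7 (the taps sum to 128). A scalar sketch of one output pixel,
assuming src already points at the first tap's row:

    static unsigned char filter8_v_c(const unsigned char *src, int pitch,
                                     const short *filter) {
      int k, sum = 64;                  /* krd: the 0x40 rounding constant */
      for (k = 0; k < 8; k++)
        sum += src[k * pitch] * filter[k];
      sum >>= 7;
      return (unsigned char)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
    }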
-;void vp9_filter_block1d16_v8_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(vp9_filter_block1d16_v8_ssse3)
-sym(vp9_filter_block1d16_v8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-    movdqa      krd, xmm5
-
-    movsxd      rdx, DWORD PTR arg(1)       ;src_pitch
-
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
-%endif
-    mov         rax, rsi
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-    add         rax, rdx
-
-    lea         rbx, [rdx + rdx*4]
-    add         rbx, rdx                    ;pitch * 6
-
-.vp9_filter_block1d16_v8_ssse3_loop:
-    movq        xmm0, [rsi]                 ;A
-    movq        xmm1, [rsi + rdx]           ;B
-    movq        xmm2, [rsi + rdx * 2]       ;C
-    movq        xmm3, [rax + rdx * 2]       ;D
-    movq        xmm4, [rsi + rdx * 4]       ;E
-    movq        xmm5, [rax + rdx * 4]       ;F
-
-    punpcklbw   xmm0, xmm1                  ;A B
-    punpcklbw   xmm2, xmm3                  ;C D
-    punpcklbw   xmm4, xmm5                  ;E F
-
-    movq        xmm6, [rsi + rbx]           ;G
-    movq        xmm7, [rax + rbx]           ;H
-
-    pmaddubsw   xmm0, k0k1
-    pmaddubsw   xmm2, k2k3
-    punpcklbw   xmm6, xmm7                  ;G H
-    pmaddubsw   xmm4, k4k5
-    pmaddubsw   xmm6, k6k7
-
-    paddsw      xmm0, xmm2
-    paddsw      xmm0, krd
-    paddsw      xmm4, xmm6
-    paddsw      xmm0, xmm4
-
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
-
-    movq        [rdi], xmm0
-
-    movq        xmm0, [rsi + 8]             ;A
-    movq        xmm1, [rsi + rdx + 8]       ;B
-    movq        xmm2, [rsi + rdx * 2 + 8]   ;C
-    movq        xmm3, [rax + rdx * 2 + 8]   ;D
-    movq        xmm4, [rsi + rdx * 4 + 8]   ;E
-    movq        xmm5, [rax + rdx * 4 + 8]   ;F
-
-    punpcklbw   xmm0, xmm1                  ;A B
-    punpcklbw   xmm2, xmm3                  ;C D
-    punpcklbw   xmm4, xmm5                  ;E F
-
-
-    movq        xmm6, [rsi + rbx + 8]       ;G
-    movq        xmm7, [rax + rbx + 8]       ;H
-    punpcklbw   xmm6, xmm7                  ;G H
-
-
-    pmaddubsw   xmm0, k0k1
-    pmaddubsw   xmm2, k2k3
-    pmaddubsw   xmm4, k4k5
-    pmaddubsw   xmm6, k6k7
-
-    paddsw      xmm0, xmm2
-    paddsw      xmm4, xmm6
-    paddsw      xmm0, krd
-    paddsw      xmm0, xmm4
-
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
-
-    add         rsi,  rdx
-    add         rax,  rdx
-
-    movq        [rdi+8], xmm0
-
-%if ABI_IS_32BIT
-    add         rdi, DWORD PTR arg(3)       ;out_pitch
-%else
-    add         rdi, r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d16_v8_ssse3_loop
-
-    add rsp, 16*5
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block1d8_h8_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(vp9_filter_block1d8_h8_ssse3)
-sym(vp9_filter_block1d8_h8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-;    movdqa      krd, xmm5                  ; not spilled: krd stays live in xmm5 below
-
-    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
-    movsxd      rdx, dword ptr arg(3)       ;output_pitch
-    movsxd      rcx, dword ptr arg(4)       ;output_height
-
-.filter_block1d8_h8_rowloop_ssse3:
-    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
-
-;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11
-    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
-;note: if we create a k0_k7 filter, we can save a pshufb
-;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
-    punpcklqdq  xmm0,   xmm3
-
-    movdqa      xmm1,   xmm0
-    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
-    pmaddubsw   xmm0,   k0k1
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
-    pmaddubsw   xmm1,   k2k3
-
-    movdqa      xmm4,   xmm2
-    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
-    pmaddubsw   xmm2,   k4k5
-
-    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
-    pmaddubsw   xmm4,   k6k7
-
-    paddsw      xmm0,   xmm1
-    paddsw      xmm0,   xmm2
-    paddsw      xmm0,   xmm5
-    paddsw      xmm0,   xmm4
-    psraw       xmm0,   7
-    packuswb    xmm0,   xmm0
-
-    lea         rsi,    [rsi + rax]
-    movq        [rdi],  xmm0
-
-    lea         rdi,    [rdi + rdx]
-    dec         rcx
-    jnz         .filter_block1d8_h8_rowloop_ssse3
-
-    add rsp, 16*5
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block1d16_h8_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(vp9_filter_block1d16_h8_ssse3)
-sym(vp9_filter_block1d16_h8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16*5
-    %define k0k1 [rsp + 16*0]
-    %define k2k3 [rsp + 16*1]
-    %define k4k5 [rsp + 16*2]
-    %define k6k7 [rsp + 16*3]
-    %define krd [rsp + 16*4]
-
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm4, [rdx]                 ;load filters
-    movd        xmm5, rcx
-    packsswb    xmm4, xmm4
-    pshuflw     xmm0, xmm4, 0b              ;k0_k1
-    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
-    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
-    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
-
-    punpcklqdq  xmm0, xmm0
-    punpcklqdq  xmm1, xmm1
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm1
-    pshufd      xmm5, xmm5, 0
-    movdqa      k4k5, xmm2
-    movdqa      k6k7, xmm3
-    movdqa      krd, xmm5
-
-    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
-    movsxd      rdx, dword ptr arg(3)       ;output_pitch
-    movsxd      rcx, dword ptr arg(4)       ;output_height
-
-.filter_block1d16_h8_rowloop_ssse3:
-    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
-
-;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11
-    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
-;note: if we create a k0_k7 filter, we can save a pshufb
-;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
-    punpcklqdq  xmm0,   xmm3
-
-    movdqa      xmm1,   xmm0
-    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
-    pmaddubsw   xmm0,   k0k1
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
-    pmaddubsw   xmm1,   k2k3
-
-    movdqa      xmm4,   xmm2
-    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
-    pmaddubsw   xmm2,   k4k5
-
-    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
-    pmaddubsw   xmm4,   k6k7
-
-    paddsw      xmm0,   xmm1
-    paddsw      xmm0,   xmm4
-    paddsw      xmm0,   xmm2
-    paddsw      xmm0,   krd
-    psraw       xmm0,   7
-    packuswb    xmm0,   xmm0
-
-
-    movq        xmm3,   [rsi +  5]
-;    movq        xmm7,   [rsi + 12]
-    movq        xmm7,   [rsi + 13]
-;note: same as above
-;    punpcklbw   xmm3,   xmm7
-    punpcklqdq  xmm3,   xmm7
-
-    movdqa      xmm1,   xmm3
-    pshufb      xmm3,   [GLOBAL(shuf_t0t1)]
-    pmaddubsw   xmm3,   k0k1
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
-    pmaddubsw   xmm1,   k2k3
-
-    movdqa      xmm4,   xmm2
-    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
-    pmaddubsw   xmm2,   k4k5
-
-    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
-    pmaddubsw   xmm4,   k6k7
-
-    paddsw      xmm3,   xmm1
-    paddsw      xmm3,   xmm2
-    paddsw      xmm3,   krd
-    paddsw      xmm3,   xmm4
-    psraw       xmm3,   7
-    packuswb    xmm3,   xmm3
-    punpcklqdq  xmm0,   xmm3
-
-    lea         rsi,    [rsi + rax]
-    movdqa      [rdi],  xmm0
-
-    lea         rdi,    [rdi + rdx]
-    dec         rcx
-    jnz         .filter_block1d16_h8_rowloop_ssse3
-
-    add rsp, 16*5
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-align 16
-shuf_t0t1:
-    db  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-align 16
-shuf_t2t3:
-    db  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-align 16
-shuf_t4t5:
-    db  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-align 16
-shuf_t6t7:
-    db  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
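The shuf_tNtM tables pair each source byte with its right-hand neighbor
(0,1, 1,2, 2,3, ...) so that one pshufb lines the input up with the duplicated
(k_n, k_n+1) coefficient words that pmaddubsw expects. What one such
shuffle-plus-multiply contributes to output pixel x, in scalar terms:

    /* contribution of taps n and n+1 to output pixel x (sketch) */
    static int tap_pair(const unsigned char *src, const short *filter,
                        int x, int n) {
      return src[x + n] * filter[n] + src[x + n + 1] * filter[n + 1];
    }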
--- a/vp8/common/x86/subpixel_mmx.asm
+++ /dev/null
@@ -1,727 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define vp9_filter_weight 128
-%define VP9_FILTER_SHIFT  7
-
-
-;void vp9_filter_block1d_h6_mmx
-;(
-;    unsigned char   *src_ptr,
-;    unsigned short  *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    pixel_step,
-;    unsigned int    output_height,
-;    unsigned int    output_width,
-;    short           * vp9_filter
-;)
-global sym(vp9_filter_block1d_h6_mmx)
-sym(vp9_filter_block1d_h6_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,    arg(6) ;vp9_filter
-
-        movq        mm1,    [rdx + 16]             ; do both the negative taps first!!!
-        movq        mm2,    [rdx + 32]         ;
-        movq        mm6,    [rdx + 48]        ;
-        movq        mm7,    [rdx + 64]        ;
-
-        mov         rdi,    arg(1) ;output_ptr
-        mov         rsi,    arg(0) ;src_ptr
-        movsxd      rcx,    dword ptr arg(4) ;output_height
-        movsxd      rax,    dword ptr arg(5) ;output_width      ; destination pitch?
-        pxor        mm0,    mm0              ; mm0 = 00000000
-
-.nextrow:
-        movq        mm3,    [rsi-2]          ; mm3 = p-2..p5
-        movq        mm4,    mm3              ; mm4 = p-2..p5
-        psrlq       mm3,    8                ; mm3 = p-1..p5
-        punpcklbw   mm3,    mm0              ; mm3 = p-1..p2
-        pmullw      mm3,    mm1              ; mm3 *= kernel 1 modifiers.
-
-        movq        mm5,    mm4              ; mm5 = p-2..p5
-        punpckhbw   mm4,    mm0              ; mm4 = p2..p5
-        pmullw      mm4,    mm7              ; mm4 *= kernel 4 modifiers
-        paddsw      mm3,    mm4              ; mm3 += mm4
-
-        movq        mm4,    mm5              ; mm4 = p-2..p5;
-        psrlq       mm5,    16               ; mm5 = p0..p5;
-        punpcklbw   mm5,    mm0              ; mm5 = p0..p3
-        pmullw      mm5,    mm2              ; mm5 *= kernel 2 modifiers
-        paddsw      mm3,    mm5              ; mm3 += mm5
-
-        movq        mm5,    mm4              ; mm5 = p-2..p5
-        psrlq       mm4,    24               ; mm4 = p1..p5
-        punpcklbw   mm4,    mm0              ; mm4 = p1..p4
-        pmullw      mm4,    mm6              ; mm4 *= kernel 3 modifiers
-        paddsw      mm3,    mm4              ; mm3 += mm4
-
-        ; do outer positive taps
-        movd        mm4,    [rsi+3]
-        punpcklbw   mm4,    mm0              ; mm4 = p3..p6
-        pmullw      mm4,    [rdx+80]         ; mm4 *= kernel 5 modifiers
-        paddsw      mm3,    mm4              ; mm3 += mm4
-
-        punpcklbw   mm5,    mm0              ; mm5 = p-2..p1
-        pmullw      mm5,    [rdx]            ; mm5 *= kernel 0 modifiers
-        paddsw      mm3,    mm5              ; mm3 += mm5
-
-        paddsw      mm3,    [GLOBAL(rd)]              ; mm3 += round value
-        psraw       mm3,    VP9_FILTER_SHIFT     ; mm3 /= 128
-        packuswb    mm3,    mm0              ; pack and unpack to saturate
-        punpcklbw   mm3,    mm0              ;
-
-        movq        [rdi],  mm3              ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line
-        add         rdi,    rax;
-%else
-        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
-        add         rdi,    rax;
-
-        add         rsi,    r8               ; next line
-%endif
-
-        dec         rcx                      ; decrement count
-        jnz         .nextrow                 ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1dc_v6_mmx
-;(
-;   short *src_ptr,
-;   unsigned char *output_ptr,
-;    int output_pitch,
-;   unsigned int pixels_per_line,
-;   unsigned int pixel_step,
-;   unsigned int output_height,
-;   unsigned int output_width,
-;   short * vp9_filter
-;)
-global sym(vp9_filter_block1dc_v6_mmx)
-sym(vp9_filter_block1dc_v6_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        movq      mm5, [GLOBAL(rd)]
-        push        rbx
-        mov         rbx, arg(7) ;vp9_filter
-        movq      mm1, [rbx + 16]             ; do both the negative taps first!!!
-        movq      mm2, [rbx + 32]         ;
-        movq      mm6, [rbx + 48]        ;
-        movq      mm7, [rbx + 64]        ;
-
-        movsxd      rdx, dword ptr arg(3) ;pixels_per_line
-        mov         rdi, arg(1) ;output_ptr
-        mov         rsi, arg(0) ;src_ptr
-        sub         rsi, rdx
-        sub         rsi, rdx
-        movsxd      rcx, DWORD PTR arg(5) ;output_height
-        movsxd      rax, DWORD PTR arg(2) ;output_pitch      ; destination pitch?
-        pxor        mm0, mm0              ; mm0 = 00000000
-
-
-.nextrow_cv:
-        movq        mm3, [rsi+rdx]        ; mm3 = p0..p3  = row -1
-        pmullw      mm3, mm1              ; mm3 *= kernel 1 modifiers.
-
-
-        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 2
-        pmullw      mm4, mm7              ; mm4 *= kernel 4 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi + 2*rdx]           ; mm4 = p0..p3  = row 0
-        pmullw      mm4, mm2              ; mm4 *= kernel 2 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi]            ; mm4 = p0..p3  = row -2
-        pmullw      mm4, [rbx]            ; mm4 *= kernel 0 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-
-        add         rsi, rdx              ; move source forward 1 line to avoid 3 * pitch
-        movq        mm4, [rsi + 2*rdx]     ; mm4 = p0..p3  = row 1
-        pmullw      mm4, mm6              ; mm4 *= kernel 3 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 3
-        pmullw      mm4, [rbx +80]        ; mm4 *= kernel 5 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-
-        paddsw      mm3, mm5               ; mm3 += round value
-        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
-        packuswb    mm3, mm0              ; pack and saturate
-
-        movd        [rdi],mm3             ; store the results in the destination
-        ; the subsequent iterations repeat 3 out of 4 of these reads.  Since the
-        ; recon block should be in cache, this shouldn't cost much.  It's obviously
-        ; avoidable!
-        lea         rdi,  [rdi+rax] ;
-        dec         rcx                   ; decrement count
-        jnz         .nextrow_cv           ; next row
-
-        pop         rbx
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
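Together these form the classic two-pass 6-tap scheme: the h6 pass filters
bytes horizontally into a 16-bit intermediate buffer (note it writes shorts),
and the v6 pass (note its short *src_ptr) filters that buffer vertically back
down to bytes, each pass rounding by 64 and shifting by VP9_FILTER_SHIFT. A
self-contained scalar sketch of the composition (hypothetical names; sized for
blocks up to 16 wide):

    static unsigned char clamp255(int v) {
      return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    static void sixtap_2pass_c(const unsigned char *src, int src_pitch,
                               unsigned char *dst, int dst_pitch,
                               int w, int h,
                               const short *hf, const short *vf) {
      short tmp[21 * 16];                /* (h + 5) rows of w intermediates */
      int r, c, k;
      src -= 2 * src_pitch;              /* two rows of context above the block */
      for (r = 0; r < h + 5; r++)
        for (c = 0; c < w; c++) {
          int sum = 64;                  /* GLOBAL(rd) rounding value */
          for (k = 0; k < 6; k++)
            sum += src[r * src_pitch + c + k - 2] * hf[k];
          tmp[r * w + c] = clamp255(sum >> 7);  /* h6 saturates to byte range */
        }
      for (r = 0; r < h; r++)
        for (c = 0; c < w; c++) {
          int sum = 64;
          for (k = 0; k < 6; k++)
            sum += tmp[(r + k) * w + c] * vf[k];
          dst[r * dst_pitch + c] = clamp255(sum >> 7);
        }
    }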
-;void vp9_bilinear_predict8x8_mmx
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;   unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp9_bilinear_predict8x8_mmx)
-sym(vp9_bilinear_predict8x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = bilinear_filters_mmx[xoffset];
-    ;const short *VFilter = bilinear_filters_mmx[yoffset];
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        mov         rdi,        arg(4) ;dst_ptr           ;
-
-        shl         rax,        5 ; offset * 32
-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
-
-        add         rax,        rcx ; HFilter
-        mov         rsi,        arg(0) ;src_ptr              ;
-
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-        movq        mm1,        [rax]               ;
-
-        movq        mm2,        [rax+16]            ;
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        pxor        mm0,        mm0                 ;
-
-        shl         rax,        5 ; offset*32
-        add         rax,        rcx ; VFilter
-
-        lea         rcx,        [rdi+rdx*8]          ;
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
-
-
-
-        ; get the first horizontal line done       ;
-        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movq        mm4,        mm3                 ; make a copy of current line
-
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   mm4,        mm0                 ;
-
-        pmullw      mm3,        mm1                 ;
-        pmullw      mm4,        mm1                 ;
-
-        movq        mm5,        [rsi+1]             ;
-        movq        mm6,        mm5                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0                 ;
-
-        pmullw      mm5,        mm2                 ;
-        pmullw      mm6,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm4                 ;
-
-        add         rsi,        rdx                 ; next line
-.next_row_8x8:
-        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movq        mm4,        mm3                 ; make a copy of current line
-
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   mm4,        mm0                 ;
-
-        pmullw      mm3,        mm1                 ;
-        pmullw      mm4,        mm1                 ;
-
-        movq        mm5,        [rsi+1]             ;
-        movq        mm6,        mm5                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0                 ;
-
-        pmullw      mm5,        mm2                 ;
-        pmullw      mm6,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-        movq        mm5,        mm7                 ;
-        movq        mm6,        mm7                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0
-
-        pmullw      mm5,        [rax]               ;
-        pmullw      mm6,        [rax]               ;
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm4                 ;
-
-
-        pmullw      mm3,        [rax+16]            ;
-        pmullw      mm4,        [rax+16]            ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        packuswb    mm3,        mm4
-
-        movq        [rdi],      mm3                 ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,        rdx                 ; next line
-        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
-%else
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch
-        add         rsi,        rdx                 ; next line
-        add         rdi,        r8                  ;dst_pitch
-%endif
-        cmp         rdi,        rcx                 ;
-        jne         .next_row_8x8
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
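Bilinear prediction is the 2-tap analogue: a horizontal blend of each pixel
with its right neighbor, then a vertical blend of each filtered row with the
previous one (which the loop above keeps packed in mm7), each stage rounding
by 64 and shifting by VP9_FILTER_SHIFT. A scalar sketch with hypothetical
names, where hf/vf are the two taps selected by xoffset/yoffset:

    static void bilinear_predict_c(const unsigned char *src, int src_pitch,
                                   const short *hf, const short *vf,
                                   unsigned char *dst, int dst_pitch,
                                   int w, int h) {
      unsigned short tmp[17 * 16];       /* (h + 1) filtered rows, up to 16x16 */
      int r, c;
      for (r = 0; r < h + 1; r++)        /* first pass: horizontal blend */
        for (c = 0; c < w; c++)
          tmp[r * w + c] = (unsigned short)
              ((src[r * src_pitch + c] * hf[0] +
                src[r * src_pitch + c + 1] * hf[1] + 64) >> 7);
      for (r = 0; r < h; r++)            /* second pass: vertical blend */
        for (c = 0; c < w; c++) {
          int v = (tmp[r * w + c] * vf[0] +
                   tmp[(r + 1) * w + c] * vf[1] + 64) >> 7;
          dst[r * dst_pitch + c] = (unsigned char)(v > 255 ? 255 : v);
        }
    }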
-
-;void vp9_bilinear_predict8x4_mmx
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp9_bilinear_predict8x4_mmx)
-sym(vp9_bilinear_predict8x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = bilinear_filters_mmx[xoffset];
-    ;const short *VFilter = bilinear_filters_mmx[yoffset];
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        mov         rdi,        arg(4) ;dst_ptr           ;
-
-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
-        shl         rax,        5
-
-        mov         rsi,        arg(0) ;src_ptr              ;
-        add         rax,        rcx
-
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-        movq        mm1,        [rax]               ;
-
-        movq        mm2,        [rax+16]            ;
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        pxor        mm0,        mm0                 ;
-        shl         rax,        5
-
-        add         rax,        rcx
-        lea         rcx,        [rdi+rdx*4]          ;
-
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
-
-        ; get the first horizontal line done       ;
-        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movq        mm4,        mm3                 ; make a copy of current line
-
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   mm4,        mm0                 ;
-
-        pmullw      mm3,        mm1                 ;
-        pmullw      mm4,        mm1                 ;
-
-        movq        mm5,        [rsi+1]             ;
-        movq        mm6,        mm5                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0                 ;
-
-        pmullw      mm5,        mm2                 ;
-        pmullw      mm6,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm4                 ;
-
-        add         rsi,        rdx                 ; next line
-.next_row_8x4:
-        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movq        mm4,        mm3                 ; make a copy of current line
-
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   mm4,        mm0                 ;
-
-        pmullw      mm3,        mm1                 ;
-        pmullw      mm4,        mm1                 ;
-
-        movq        mm5,        [rsi+1]             ;
-        movq        mm6,        mm5                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0                 ;
-
-        pmullw      mm5,        mm2                 ;
-        pmullw      mm6,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-        movq        mm5,        mm7                 ;
-        movq        mm6,        mm7                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0
-
-        pmullw      mm5,        [rax]               ;
-        pmullw      mm6,        [rax]               ;
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm4                 ;
-
-
-        pmullw      mm3,        [rax+16]            ;
-        pmullw      mm4,        [rax+16]            ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        packuswb    mm3,        mm4
-
-        movq        [rdi],      mm3                 ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,        rdx                 ; next line
-        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
-%else
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch
-        add         rsi,        rdx                 ; next line
-        add         rdi,        r8
-%endif
-        cmp         rdi,        rcx                 ;
-        jne         .next_row_8x4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_bilinear_predict4x4_mmx
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp9_bilinear_predict4x4_mmx)
-sym(vp9_bilinear_predict4x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = bilinear_filters_mmx[xoffset];
-    ;const short *VFilter = bilinear_filters_mmx[yoffset];
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        mov         rdi,        arg(4) ;dst_ptr           ;
-
-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
-        shl         rax,        5
-
-        add         rax,        rcx ; HFilter
-        mov         rsi,        arg(0) ;src_ptr              ;
-
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-        movq        mm1,        [rax]               ;
-
-        movq        mm2,        [rax+16]            ;
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        pxor        mm0,        mm0                 ;
-        shl         rax,        5
-
-        add         rax,        rcx
-        lea         rcx,        [rdi+rdx*4]          ;
-
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
-
-        ; get the first horizontal line done       ;
-        movd        mm3,        [rsi]               ; 00 01 02 03
-        punpcklbw   mm3,        mm0                 ; 00 01 02 03
-
-        pmullw      mm3,        mm1                 ;
-        movd        mm5,        [rsi+1]             ;
-
-        punpcklbw   mm5,        mm0                 ;
-        pmullw      mm5,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm0                 ;
-
-        add         rsi,        rdx                 ; next line
-.next_row_4x4:
-        movd        mm3,        [rsi]               ; 00 01 02 03
-        punpcklbw   mm3,        mm0                 ; 00 01 02 03
-
-        pmullw      mm3,        mm1                 ;
-        movd        mm5,        [rsi+1]             ;
-
-        punpcklbw   mm5,        mm0                 ;
-        pmullw      mm5,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-
-        movq        mm5,        mm7                 ;
-        punpcklbw   mm5,        mm0                 ;
-
-        pmullw      mm5,        [rax]               ;
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-        movq        mm7,        mm3                 ;
-
-        packuswb    mm7,        mm0                 ;
-
-        pmullw      mm3,        [rax+16]            ;
-        paddw       mm3,        mm5                 ;
-
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
-
-        packuswb    mm3,        mm0
-        movd        [rdi],      mm3                 ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,        rdx                 ; next line
-        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
-%else
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch                   ;
-        add         rsi,        rdx                 ; next line
-        add         rdi,        r8
-%endif
-
-        cmp         rdi,        rcx                 ;
-        jne         .next_row_4x4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-SECTION_RODATA
-align 16
-rd:
-    times 4 dw 0x40
-
-align 16
-global HIDDEN_DATA(sym(vp9_six_tap_mmx))
-sym(vp9_six_tap_mmx):
-    times 8 dw 0
-    times 8 dw 0
-    times 8 dw 128
-    times 8 dw 0
-    times 8 dw 0
-    times 8 dw 0
-
-    times 8 dw 0
-    times 8 dw -6
-    times 8 dw 123
-    times 8 dw 12
-    times 8 dw -1
-    times 8 dw 0
-
-    times 8 dw 2
-    times 8 dw -11
-    times 8 dw 108
-    times 8 dw 36
-    times 8 dw -8
-    times 8 dw 1
-
-    times 8 dw 0
-    times 8 dw -9
-    times 8 dw 93
-    times 8 dw 50
-    times 8 dw -6
-    times 8 dw 0
-
-    times 8 dw 3
-    times 8 dw -16
-    times 8 dw 77
-    times 8 dw 77
-    times 8 dw -16
-    times 8 dw 3
-
-    times 8 dw 0
-    times 8 dw -6
-    times 8 dw 50
-    times 8 dw 93
-    times 8 dw -9
-    times 8 dw 0
-
-    times 8 dw 1
-    times 8 dw -8
-    times 8 dw 36
-    times 8 dw 108
-    times 8 dw -11
-    times 8 dw 2
-
-    times 8 dw 0
-    times 8 dw -1
-    times 8 dw 12
-    times 8 dw 123
-    times 8 dw -6
-    times 8 dw 0
-
-
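Each six-row group of "times 8 dw" entries above is one 6-tap filter, with every coefficient replicated eight times so a single pmullw can apply it across a whole register; the taps of every filter sum to VP9_FILTER_WEIGHT (128). As a hedged reference, here is a minimal C sketch of applying one such filter row, assuming the replicated layout is flattened to one coefficient per tap (the helper name is illustrative, not from this file):

    #include <stdint.h>

    /* taps: one 6-tap row, e.g. {2, -11, 108, 36, -8, 1}; each row sums to 128 */
    static void six_tap_ref(const uint8_t *src, uint8_t *dst, int width,
                            const int16_t *taps) {
      for (int x = 0; x < width; x++) {
        int sum = 0;
        for (int k = 0; k < 6; k++)
          sum += src[x + k - 2] * taps[k];   /* taps cover x-2 .. x+3 */
        int v = (sum + 64) >> 7;             /* rd rounding, then VP9_FILTER_SHIFT */
        dst[x] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
      }
    }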
-align 16
-global HIDDEN_DATA(sym(vp9_bilinear_filters_8x_mmx))
-sym(vp9_bilinear_filters_8x_mmx):
-    times 8 dw 128
-    times 8 dw 0
-
-    times 8 dw 112
-    times 8 dw 16
-
-    times 8 dw 96
-    times 8 dw 32
-
-    times 8 dw 80
-    times 8 dw 48
-
-    times 8 dw 64
-    times 8 dw 64
-
-    times 8 dw 48
-    times 8 dw 80
-
-    times 8 dw 32
-    times 8 dw 96
-
-    times 8 dw 16
-    times 8 dw 112
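The row pairs above are the eight 1/8-pel bilinear kernels; the two weights of every kernel sum to 128, so the rounded shift can never overflow a byte. A hedged sketch of the per-pixel arithmetic the bilinear predict routines perform (names illustrative):

    #include <stdint.h>

    static const int16_t bilinear_filters[8][2] = {
      {128, 0}, {112, 16}, {96, 32}, {80, 48},
      {64, 64}, {48, 80}, {32, 96}, {16, 112}
    };

    /* (a*h0 + b*h1 + 64) >> 7 stays in 0..255 because h0 + h1 == 128 */
    static uint8_t bilinear_blend(uint8_t a, uint8_t b, const int16_t *h) {
      return (uint8_t)((a * h[0] + b * h[1] + 64) >> 7);
    }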
--- a/vp8/common/x86/subpixel_sse2.asm
+++ /dev/null
@@ -1,1372 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT  7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
-; row per iteration to take advantage of the 128-bit operations.
-;*************************************************************************************/
-;void vp9_filter_block1d8_h6_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned short *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    pixel_step,
-;    unsigned int    output_height,
-;    unsigned int    output_width,
-;    short           *vp9_filter
-;)
-global sym(vp9_filter_block1d8_h6_sse2)
-sym(vp9_filter_block1d8_h6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,        arg(6) ;vp9_filter
-        mov         rsi,        arg(0) ;src_ptr
-
-        mov         rdi,        arg(1) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5) ;output_width
-%endif
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-
-.filter_block1d8_h6_rowloop:
-        movq        xmm3,       MMWORD PTR [rsi - 2]
-        movq        xmm1,       MMWORD PTR [rsi + 6]
-
-        prefetcht2  [rsi+rax-2]
-
-        pslldq      xmm1,       8
-        por         xmm1,       xmm3
-
-        movdqa      xmm4,       xmm1
-        movdqa      xmm5,       xmm1
-
-        movdqa      xmm6,       xmm1
-        movdqa      xmm7,       xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm1
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0
-        punpcklbw   xmm4,       xmm0
-
-        movdqa      XMMWORD Ptr [rdi],         xmm4
-        lea         rsi,        [rsi + rax]
-
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(5) ;[output_width]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx
-
-        jnz         .filter_block1d8_h6_rowloop                ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
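As a hedged C model of the routine above (semantics inferred from the prototype comment; the saturating paddsw chain is modeled with plain int arithmetic, and output_width is assumed to be the element pitch of the 16-bit intermediate buffer):

    #include <stdint.h>

    static void filter_block1d8_h6_ref(const uint8_t *src_ptr,
                                       uint16_t *output_ptr,
                                       unsigned int src_pixels_per_line,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const int16_t *taps) {
      for (unsigned int y = 0; y < output_height; y++) {
        for (int x = 0; x < 8; x++) {
          int sum = 0;
          for (int k = 0; k < 6; k++)
            sum += src_ptr[x + k - 2] * taps[k];
          int v = (sum + 64) >> 7;                       /* + rd, >> 7 */
          output_ptr[x] = v < 0 ? 0 : v > 255 ? 255 : v; /* packuswb, punpcklbw */
        }
        src_ptr += src_pixels_per_line;
        output_ptr += output_width;
      }
    }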
-
-;void vp9_filter_block1d16_h6_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned short *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    pixel_step,
-;    unsigned int    output_height,
-;    unsigned int    output_width,
-;    short           *vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 16 pixels in the horizontal direction, calculating ONE
-; row per iteration to take advantage of the 128-bit operations.
-;*************************************************************************************/
-global sym(vp9_filter_block1d16_h6_sse2)
-sym(vp9_filter_block1d16_h6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,        arg(6) ;vp9_filter
-        mov         rsi,        arg(0) ;src_ptr
-
-        mov         rdi,        arg(1) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5) ;output_width
-%endif
-
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-
-.filter_block1d16_h6_sse2_rowloop:
-        movq        xmm3,       MMWORD PTR [rsi - 2]
-        movq        xmm1,       MMWORD PTR [rsi + 6]
-
-        movq        xmm2,       MMWORD PTR [rsi +14]
-        pslldq      xmm2,       8
-
-        por         xmm2,       xmm1
-        prefetcht2  [rsi+rax-2]
-
-        pslldq      xmm1,       8
-        por         xmm1,       xmm3
-
-        movdqa      xmm4,       xmm1
-        movdqa      xmm5,       xmm1
-
-        movdqa      xmm6,       xmm1
-        movdqa      xmm7,       xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm1
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0
-        punpcklbw   xmm4,       xmm0
-
-        movdqa      XMMWORD Ptr [rdi],         xmm4
-
-        movdqa      xmm3,       xmm2
-        movdqa      xmm4,       xmm2
-
-        movdqa      xmm5,       xmm2
-        movdqa      xmm6,       xmm2
-
-        movdqa      xmm7,       xmm2
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm2
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0
-        punpcklbw   xmm4,       xmm0
-
-        movdqa      XMMWORD Ptr [rdi+16],      xmm4
-
-        lea         rsi,        [rsi + rax]
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(5) ;[output_width]
-%else
-        add         rdi,        r8
-%endif
-
-        dec         rcx
-        jnz         .filter_block1d16_h6_sse2_rowloop                ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d8_v6_sse2
-;(
-;    short *src_ptr,
-;    unsigned char *output_ptr,
-;    int dst_pitch,
-;    unsigned int pixels_per_line,
-;    unsigned int pixel_step,
-;    unsigned int output_height,
-;    unsigned int output_width,
-;    short * vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d8_v6 applies a 6-tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp9_filter_block1d8_v6_sse2)
-sym(vp9_filter_block1d8_v6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rax,        arg(7) ;vp9_filter
-        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
-
-        mov         rdi,        arg(1) ;output_ptr
-        mov         rsi,        arg(0) ;src_ptr
-
-        sub         rsi,        rdx
-        sub         rsi,        rdx
-
-        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
-        pxor        xmm0,       xmm0                        ; clear xmm0
-
-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(2) ; dst_pitch
-%endif
-
-.vp9_filter_block1d8_v6_sse2_loop:
-        movdqa      xmm1,       XMMWORD PTR [rsi]
-        pmullw      xmm1,       [rax]
-
-        movdqa      xmm2,       XMMWORD PTR [rsi + rdx]
-        pmullw      xmm2,       [rax + 16]
-
-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]
-        pmullw      xmm3,       [rax + 32]
-
-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]
-        pmullw      xmm5,       [rax + 64]
-
-        add         rsi,        rdx
-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2]
-
-        pmullw      xmm4,       [rax + 48]
-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4]
-
-        pmullw      xmm6,       [rax + 80]
-
-        paddsw      xmm2,       xmm5
-        paddsw      xmm2,       xmm3
-
-        paddsw      xmm2,       xmm1
-        paddsw      xmm2,       xmm4
-
-        paddsw      xmm2,       xmm6
-        paddsw      xmm2,       xmm7
-
-        psraw       xmm2,       7
-        packuswb    xmm2,       xmm0              ; pack and saturate
-
-        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx         ; decrement count
-        jnz         .vp9_filter_block1d8_v6_sse2_loop               ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
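And the matching second pass as a hedged C model: it backs up two rows (the two "sub rsi, rdx" instructions), weights six consecutive rows of the 16-bit intermediate, then rounds, shifts and clamps back to bytes (pixels_per_line is assumed to count int16 elements here):

    #include <stdint.h>

    static void filter_block1d8_v6_ref(const int16_t *src_ptr,
                                       uint8_t *output_ptr,
                                       int dst_pitch,
                                       unsigned int pixels_per_line,
                                       unsigned int output_height,
                                       const int16_t *taps) {
      src_ptr -= 2 * pixels_per_line;            /* start two rows above */
      for (unsigned int y = 0; y < output_height; y++) {
        for (int x = 0; x < 8; x++) {
          int sum = 0;
          for (int k = 0; k < 6; k++)
            sum += src_ptr[k * pixels_per_line + x] * taps[k];
          int v = (sum + 64) >> 7;
          output_ptr[x] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
        }
        src_ptr += pixels_per_line;
        output_ptr += dst_pitch;
      }
    }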
-
-;void vp9_filter_block1d16_v6_sse2
-;(
-;    unsigned short *src_ptr,
-;    unsigned char *output_ptr,
-;    int dst_pitch,
-;    unsigned int pixels_per_line,
-;    unsigned int pixel_step,
-;    unsigned int output_height,
-;    unsigned int output_width,
-;    const short    *vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d16_v6 applies a 6-tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp9_filter_block1d16_v6_sse2)
-sym(vp9_filter_block1d16_v6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rax,        arg(7) ;vp9_filter
-        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
-
-        mov         rdi,        arg(1) ;output_ptr
-        mov         rsi,        arg(0) ;src_ptr
-
-        sub         rsi,        rdx
-        sub         rsi,        rdx
-
-        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(2) ; dst_pitch
-%endif
-
-.vp9_filter_block1d16_v6_sse2_loop:
-; The 6-tap partial sums are accumulated in the order 2 5 3 1 4 6; read the data in that order.
-        movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
-        movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
-        pmullw      xmm1,       [rax + 16]
-        pmullw      xmm2,       [rax + 16]
-
-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
-        pmullw      xmm3,       [rax + 64]
-        pmullw      xmm4,       [rax + 64]
-
-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
-        pmullw      xmm5,       [rax + 32]
-        pmullw      xmm6,       [rax + 32]
-
-        movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
-        movdqa      xmm0,       XMMWORD PTR [rsi + 16]
-        pmullw      xmm7,       [rax]
-        pmullw      xmm0,       [rax]
-
-        paddsw      xmm1,       xmm3
-        paddsw      xmm2,       xmm4
-        paddsw      xmm1,       xmm5
-        paddsw      xmm2,       xmm6
-        paddsw      xmm1,       xmm7
-        paddsw      xmm2,       xmm0
-
-        add         rsi,        rdx
-
-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
-        pmullw      xmm3,       [rax + 48]
-        pmullw      xmm4,       [rax + 48]
-
-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
-        pmullw      xmm5,       [rax + 80]
-        pmullw      xmm6,       [rax + 80]
-
-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
-        pxor        xmm0,       xmm0                        ; clear xmm0
-
-        paddsw      xmm1,       xmm3
-        paddsw      xmm2,       xmm4
-        paddsw      xmm1,       xmm5
-        paddsw      xmm2,       xmm6
-
-        paddsw      xmm1,       xmm7
-        paddsw      xmm2,       xmm7
-
-        psraw       xmm1,       7
-        psraw       xmm2,       7
-
-        packuswb    xmm1,       xmm2              ; pack and saturate
-        movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx         ; decrement count
-        jnz         .vp9_filter_block1d16_v6_sse2_loop              ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d8_h6_only_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    int dst_pitch,
-;    unsigned int    output_height,
-;    const short    *vp9_filter
-;)
-; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d8_h6_only_sse2)
-sym(vp9_filter_block1d8_h6_only_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,        arg(5) ;vp9_filter
-        mov         rsi,        arg(0) ;src_ptr
-
-        mov         rdi,        arg(2) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(3) ;dst_pitch
-%endif
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-
-.filter_block1d8_h6_only_rowloop:
-        movq        xmm3,       MMWORD PTR [rsi - 2]
-        movq        xmm1,       MMWORD PTR [rsi + 6]
-
-        prefetcht2  [rsi+rax-2]
-
-        pslldq      xmm1,       8
-        por         xmm1,       xmm3
-
-        movdqa      xmm4,       xmm1
-        movdqa      xmm5,       xmm1
-
-        movdqa      xmm6,       xmm1
-        movdqa      xmm7,       xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm1
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0
-
-        movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
-        lea         rsi,        [rsi + rax]
-
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx
-
-        jnz         .filter_block1d8_h6_only_rowloop               ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d16_h6_only_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    int dst_pitch,
-;    unsigned int    output_height,
-;    const short    *vp9_filter
-;)
-; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d16_h6_only_sse2)
-sym(vp9_filter_block1d16_h6_only_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rdx,        arg(5) ;vp9_filter
-        mov         rsi,        arg(0) ;src_ptr
-
-        mov         rdi,        arg(2) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(3) ;dst_pitch
-%endif
-
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-
-.filter_block1d16_h6_only_sse2_rowloop:
-        movq        xmm3,       MMWORD PTR [rsi - 2]
-        movq        xmm1,       MMWORD PTR [rsi + 6]
-
-        movq        xmm2,       MMWORD PTR [rsi +14]
-        pslldq      xmm2,       8
-
-        por         xmm2,       xmm1
-        prefetcht2  [rsi+rax-2]
-
-        pslldq      xmm1,       8
-        por         xmm1,       xmm3
-
-        movdqa      xmm4,       xmm1
-        movdqa      xmm5,       xmm1
-
-        movdqa      xmm6,       xmm1
-        movdqa      xmm7,       xmm1
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm1
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0                        ; lower 8 bytes
-
-        movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination
-
-        movdqa      xmm3,       xmm2
-        movdqa      xmm4,       xmm2
-
-        movdqa      xmm5,       xmm2
-        movdqa      xmm6,       xmm2
-
-        movdqa      xmm7,       xmm2
-
-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
-
-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
-
-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
-
-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
-        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
-
-        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
-        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
-
-        paddsw      xmm4,       xmm7
-        paddsw      xmm4,       xmm5
-
-        paddsw      xmm4,       xmm3
-        paddsw      xmm4,       xmm6
-
-        paddsw      xmm4,       xmm2
-        paddsw      xmm4,       [GLOBAL(rd)]
-
-        psraw       xmm4,       7
-
-        packuswb    xmm4,       xmm0                        ; higher 8 bytes
-
-        movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination
-
-        lea         rsi,        [rsi + rax]
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
-%else
-        add         rdi,        r8
-%endif
-
-        dec         rcx
-        jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_filter_block1d8_v6_only_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char *output_ptr,
-;    int dst_pitch,
-;    unsigned int output_height,
-;    const short    *vp9_filter
-;)
-; Second-pass filter only when xoffset==0
-global sym(vp9_filter_block1d8_v6_only_sse2)
-sym(vp9_filter_block1d8_v6_only_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src_ptr
-        mov         rdi,        arg(2) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(4) ;output_height
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-        mov         rax,        arg(5) ;vp9_filter
-
-        pxor        xmm0,       xmm0                        ; clear xmm0
-
-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(3) ; dst_pitch
-%endif
-
-.vp9_filter_block1d8_v6_only_sse2_loop:
-        movq        xmm1,       MMWORD PTR [rsi]
-        movq        xmm2,       MMWORD PTR [rsi + rdx]
-        movq        xmm3,       MMWORD PTR [rsi + rdx * 2]
-        movq        xmm5,       MMWORD PTR [rsi + rdx * 4]
-        add         rsi,        rdx
-        movq        xmm4,       MMWORD PTR [rsi + rdx * 2]
-        movq        xmm6,       MMWORD PTR [rsi + rdx * 4]
-
-        punpcklbw   xmm1,       xmm0
-        pmullw      xmm1,       [rax]
-
-        punpcklbw   xmm2,       xmm0
-        pmullw      xmm2,       [rax + 16]
-
-        punpcklbw   xmm3,       xmm0
-        pmullw      xmm3,       [rax + 32]
-
-        punpcklbw   xmm5,       xmm0
-        pmullw      xmm5,       [rax + 64]
-
-        punpcklbw   xmm4,       xmm0
-        pmullw      xmm4,       [rax + 48]
-
-        punpcklbw   xmm6,       xmm0
-        pmullw      xmm6,       [rax + 80]
-
-        paddsw      xmm2,       xmm5
-        paddsw      xmm2,       xmm3
-
-        paddsw      xmm2,       xmm1
-        paddsw      xmm2,       xmm4
-
-        paddsw      xmm2,       xmm6
-        paddsw      xmm2,       xmm7
-
-        psraw       xmm2,       7
-        packuswb    xmm2,       xmm0              ; pack and saturate
-
-        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(3) ;[dst_pitch]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx         ; decrement count
-        jnz         .vp9_filter_block1d8_v6_only_sse2_loop              ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_unpack_block1d16_h6_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned short *output_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned int    output_height,
-;    unsigned int    output_width
-;)
-global sym(vp9_unpack_block1d16_h6_sse2)
-sym(vp9_unpack_block1d16_h6_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src_ptr
-        mov         rdi,        arg(1) ;output_ptr
-
-        movsxd      rcx,        dword ptr arg(3) ;output_height
-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
-
-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(4) ;output_width            ; Pitch for Source
-%endif
-
-.unpack_block1d16_h6_sse2_rowloop:
-        movq        xmm1,       MMWORD PTR [rsi]            ; 07 06 05 04 03 02 01 00
-        movq        xmm3,       MMWORD PTR [rsi+8]          ; 0f 0e 0d 0c 0b 0a 09 08
-
-        punpcklbw   xmm3,       xmm0                        ; xx0f xx0e xx0d xx0c xx0b xx0a xx09 xx08
-        punpcklbw   xmm1,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
-
-        movdqa      XMMWORD Ptr [rdi],         xmm1
-        movdqa      XMMWORD Ptr [rdi + 16],    xmm3
-
-        lea         rsi,        [rsi + rax]
-%if ABI_IS_32BIT
-        add         rdi,        DWORD Ptr arg(4) ;[output_width]
-%else
-        add         rdi,        r8
-%endif
-        dec         rcx
-        jnz         .unpack_block1d16_h6_sse2_rowloop               ; next row
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
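A hedged sketch of the unpack helper above: it is the degenerate first pass used when no horizontal filtering is wanted, simply widening source bytes into the 16-bit intermediate layout (the punpcklbw against zero in the loop) so the generic vertical pass can consume them:

    #include <stdint.h>

    static void unpack_block1d16_ref(const uint8_t *src_ptr, uint16_t *output_ptr,
                                     unsigned int src_pixels_per_line,
                                     unsigned int output_height,
                                     unsigned int output_width) {
      for (unsigned int y = 0; y < output_height; y++) {
        for (int x = 0; x < 16; x++)
          output_ptr[x] = src_ptr[x];   /* zero-extend byte to word */
        src_ptr += src_pixels_per_line;
        output_ptr += output_width;     /* assumed element pitch */
      }
    }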
-
-;void vp9_bilinear_predict16x16_sse2
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict16x16_sse2)
-sym(vp9_bilinear_predict16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = bilinear_filters_mmx[xoffset]
-    ;const short *VFilter = bilinear_filters_mmx[yoffset]
-
-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_mmx))]
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-
-        cmp         rax,        0      ;skip first_pass filter if xoffset=0
-        je          .b16x16_sp_only
-
-        shl         rax,        5
-        add         rax,        rcx    ;HFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        cmp         rax,        0      ;skip second_pass filter if yoffset=0
-        je          .b16x16_fp_only
-
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-        pxor        xmm0,       xmm0
-
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch
-%endif
-        ; get the first horizontal line done
-        movdqu      xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpckhbw   xmm4,       xmm0                 ; 08 09 10 11 12 13 14 15
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm4
-
-        add         rsi,        rdx                 ; next line
-.next_row:
-        movdqu      xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpckhbw   xmm4,       xmm0                 ; 08 09 10 11 12 13 14 15
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        movdqa      xmm5,       xmm7
-        movdqa      xmm6,       xmm7
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       [rax]
-        pmullw      xmm6,       [rax]
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm4
-
-        pmullw      xmm3,       [rax+16]
-        pmullw      xmm4,       [rax+16]
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rdx                 ; next line
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(5) ;dst_pitch
-%else
-        add         rdi,        r8
-%endif
-
-        cmp         rdi,        rcx
-        jne         .next_row
-
-        jmp         .done
-
-.b16x16_sp_only:
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
-
-        pxor        xmm0,       xmm0
-
-        ; get the first horizontal line done
-        movdqu      xmm7,       [rsi]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-        add         rsi,        rax                 ; next line
-.next_row_spo:
-        movdqu      xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-        movdqa      xmm5,       xmm7
-        movdqa      xmm6,       xmm7
-
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        movdqa      xmm7,       xmm3
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm5,       xmm1
-        pmullw      xmm6,       xmm1
-        pmullw      xmm3,       xmm2
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rax                 ; next line
-        add         rdi,        rdx                 ;dst_pitch
-        cmp         rdi,        rcx
-        jne         .next_row_spo
-
-        jmp         .done
-
-.b16x16_fp_only:
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
-        pxor        xmm0,       xmm0
-
-.next_row_fpo:
-        movdqu      xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpckhbw   xmm4,       xmm0                 ; 08 09 10 11 12 13 14 15
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rax                 ; next line
-        add         rdi,        rdx                 ; dst_pitch
-        cmp         rdi,        rcx
-        jne         .next_row_fpo
-
-.done:
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
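A hedged C outline of the control flow above: the full two-pass blend only runs when both offsets are non-zero; xoffset==0 or yoffset==0 falls through to a single-pass variant (the .b16x16_sp_only and .b16x16_fp_only labels). Everything below is illustrative, including the local copy of the filter table:

    #include <stdint.h>

    static const int16_t bf[8][2] = { {128, 0}, {112, 16}, {96, 32}, {80, 48},
                                      {64, 64}, {48, 80}, {32, 96}, {16, 112} };

    static void bilinear_predict16x16_ref(const uint8_t *src, int src_stride,
                                          int xoffset, int yoffset,
                                          uint8_t *dst, int dst_pitch) {
      uint8_t tmp[17 * 16];
      int rows = yoffset ? 17 : 16;     /* the vertical tap needs one extra row */
      /* first pass: horizontal blend into tmp (a copy when xoffset == 0) */
      for (int y = 0; y < rows; y++)
        for (int x = 0; x < 16; x++)
          tmp[y * 16 + x] = xoffset
              ? (uint8_t)((src[y * src_stride + x]     * bf[xoffset][0] +
                           src[y * src_stride + x + 1] * bf[xoffset][1] + 64) >> 7)
              : src[y * src_stride + x];
      /* second pass: vertical blend into dst (a copy when yoffset == 0) */
      for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
          dst[y * dst_pitch + x] = yoffset
              ? (uint8_t)((tmp[y * 16 + x]       * bf[yoffset][0] +
                           tmp[(y + 1) * 16 + x] * bf[yoffset][1] + 64) >> 7)
              : tmp[y * 16 + x];
    }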
-
-;void vp9_bilinear_predict8x8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict8x8_sse2)
-sym(vp9_bilinear_predict8x8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 144                         ; reserve 144 bytes
-
-    ;const short *HFilter = bilinear_filters_mmx[xoffset]
-    ;const short *VFilter = bilinear_filters_mmx[yoffset]
-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_mmx))]
-
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-    ;Read the 9 rows of unaligned source data in and put them on the stack.
-    ;This gives a big performance boost.
-        movdqu      xmm0,       [rsi]
-        lea         rax,        [rdx + rdx*2]
-        movdqu      xmm1,       [rsi+rdx]
-        movdqu      xmm2,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm3,       [rsi]
-        movdqu      xmm4,       [rsi+rdx]
-        movdqu      xmm5,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm6,       [rsi]
-        movdqu      xmm7,       [rsi+rdx]
-
-        movdqa      XMMWORD PTR [rsp],            xmm0
-
-        movdqu      xmm0,       [rsi+rdx*2]
-
-        movdqa      XMMWORD PTR [rsp+16],         xmm1
-        movdqa      XMMWORD PTR [rsp+32],         xmm2
-        movdqa      XMMWORD PTR [rsp+48],         xmm3
-        movdqa      XMMWORD PTR [rsp+64],         xmm4
-        movdqa      XMMWORD PTR [rsp+80],         xmm5
-        movdqa      XMMWORD PTR [rsp+96],         xmm6
-        movdqa      XMMWORD PTR [rsp+112],        xmm7
-        movdqa      XMMWORD PTR [rsp+128],        xmm0
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        shl         rax,        5
-        add         rax,        rcx    ;HFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-
-        movdqa      xmm5,       [rax]
-        movdqa      xmm6,       [rax+16]
-
-        pxor        xmm0,       xmm0
-
-        ; get the first horizontal line done
-        movdqa      xmm3,       XMMWORD PTR [rsp]
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        psrldq      xmm4,       1
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm4
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        movdqa      xmm7,       xmm3
-        add         rsp,        16                 ; next line
-.next_row8x8:
-        movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        psrldq      xmm4,       1
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm4
-        pmullw      xmm7,       xmm5
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        movdqa      xmm4,       xmm3
-
-        pmullw      xmm3,       xmm6
-        paddw       xmm3,       xmm7
-
-        movdqa      xmm7,       xmm4
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        packuswb    xmm3,       xmm0
-        movq        [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsp,        16                 ; next line
-        add         rdi,        rdx
-
-        cmp         rdi,        rcx
-        jne         .next_row8x8
-
-    ;add rsp, 144
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
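A note on the 8x8 variant above: the run of movdqu loads at the top copies all nine source rows onto an aligned stack buffer once, so every read inside the filter loop is an aligned movdqa (the comment in the code credits this staging with the speedup). A C analogue of that staging step, with a hypothetical helper name:

    #include <stdint.h>
    #include <string.h>

    /* eight two-pass output rows need nine input rows */
    static void stage_rows_8x8(const uint8_t *src, int src_stride,
                               uint8_t staged[9][16]) {
      for (int y = 0; y < 9; y++)
        memcpy(staged[y], src + y * src_stride, 16);  /* one unaligned read per row */
    }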
-
-SECTION_RODATA
-align 16
-rd:
-    times 8 dw 0x40
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ /dev/null
@@ -1,1515 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT  7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
-; row per iteration to take advantage of the 128-bit operations.
-;
-; This is an implementation of some of the SSE optimizations first seen in ffvp8
-;
-;*************************************************************************************/
-;void vp9_filter_block1d8_h6_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    unsigned int    vp9_filter_index
-;)
-global sym(vp9_filter_block1d8_h6_ssse3)
-sym(vp9_filter_block1d8_h6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4
-
-    movdqa      xmm7, [GLOBAL(rd)]
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-    mov         rdi, arg(2)             ;output_ptr
-
-    cmp         esi, DWORD PTR [rax]
-    je          vp9_filter_block1d8_h4_ssse3
-
-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-    sub         rdi, rdx
-;xmm3 free
-.filter_block1d8_h6_rowloop_ssse3:
-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
-
-    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
-
-    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
-
-    movdqa      xmm1,   xmm0
-    pmaddubsw   xmm0,   xmm4
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
-
-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
-    pmaddubsw   xmm1,   xmm5
-
-    lea         rdi,    [rdi + rdx]
-    pmaddubsw   xmm2,   xmm6
-
-    lea         rsi,    [rsi + rax]
-    dec         rcx
-
-    paddsw      xmm0,   xmm1
-    paddsw      xmm2,   xmm7
-
-    paddsw      xmm0,   xmm2
-
-    psraw       xmm0,   7
-
-    packuswb    xmm0,   xmm0
-
-    movq        MMWORD PTR [rdi], xmm0
-    jnz         .filter_block1d8_h6_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-vp9_filter_block1d8_h4_ssse3:
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
-    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-    sub         rdi, rdx
-
-.filter_block1d8_h4_rowloop_ssse3:
-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
-
-    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
-
-    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
-
-    movdqa      xmm2,   xmm0
-    pshufb      xmm0,   xmm3
-
-    pshufb      xmm2,   xmm4
-    pmaddubsw   xmm0,   xmm5
-
-    lea         rdi,    [rdi + rdx]
-    pmaddubsw   xmm2,   xmm6
-
-    lea         rsi,    [rsi + rax]
-    dec         rcx
-
-    paddsw      xmm0,   xmm7
-
-    paddsw      xmm0,   xmm2
-
-    psraw       xmm0,   7
-
-    packuswb    xmm0,   xmm0
-
-    movq        MMWORD PTR [rdi], xmm0
-
-    jnz         .filter_block1d8_h4_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-;void vp9_filter_block1d16_h6_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    unsigned int    vp9_filter_index
-;)
-global sym(vp9_filter_block1d16_h6_ssse3)
-sym(vp9_filter_block1d16_h6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)           ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-
-    mov         rdi, arg(2)                     ;output_ptr
-
-    mov         rsi, arg(0)                     ;src_ptr
-
-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)           ;output_height
-    movsxd      rdx, dword ptr arg(3)           ;output_pitch
-
-.filter_block1d16_h6_rowloop_ssse3:
-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
-
-    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
-
-    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
-
-    movdqa      xmm1,   xmm0
-    pmaddubsw   xmm0,   xmm4
-
-    movdqa      xmm2,   xmm1
-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
-
-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
-    movq        xmm3,   MMWORD PTR [rsi +  6]
-
-    pmaddubsw   xmm1,   xmm5
-    movq        xmm7,   MMWORD PTR [rsi + 11]
-
-    pmaddubsw   xmm2,   xmm6
-    punpcklbw   xmm3,   xmm7
-
-    paddsw      xmm0,   xmm1
-    movdqa      xmm1,   xmm3
-
-    pmaddubsw   xmm3,   xmm4
-    paddsw      xmm0,   xmm2
-
-    movdqa      xmm2,   xmm1
-    paddsw      xmm0,   [GLOBAL(rd)]
-
-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
-
-    psraw       xmm0,   7
-    pmaddubsw   xmm1,   xmm5
-
-    pmaddubsw   xmm2,   xmm6
-    packuswb    xmm0,   xmm0
-
-    lea         rsi,    [rsi + rax]
-    paddsw      xmm3,   xmm1
-
-    paddsw      xmm3,   xmm2
-
-    paddsw      xmm3,   [GLOBAL(rd)]
-
-    psraw       xmm3,   7
-
-    packuswb    xmm3,   xmm3
-
-    punpcklqdq  xmm0,   xmm3
-
-    movdqa      XMMWORD PTR [rdi], xmm0
-
-    lea         rdi,    [rdi + rdx]
-    dec         rcx
-    jnz         .filter_block1d16_h6_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block1d4_h6_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    unsigned int    vp9_filter_index
-;)
-global sym(vp9_filter_block1d4_h6_ssse3)
-sym(vp9_filter_block1d4_h6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-    movdqa      xmm7, [GLOBAL(rd)]
-
-    cmp         esi, DWORD PTR [rax]
-    je          .vp9_filter_block1d4_h4_ssse3
-
-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    mov         rdi, arg(2)             ;output_ptr
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-;xmm3 free
-.filter_block1d4_h6_rowloop_ssse3:
-    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
-
-    movdqa      xmm1, xmm0
-    pshufb      xmm0, [GLOBAL(shuf1b)]
-
-    movdqa      xmm2, xmm1
-    pshufb      xmm1, [GLOBAL(shuf2b)]
-    pmaddubsw   xmm0, xmm4
-    pshufb      xmm2, [GLOBAL(shuf3b)]
-    pmaddubsw   xmm1, xmm5
-
-;--
-    pmaddubsw   xmm2, xmm6
-
-    lea         rsi,    [rsi + rax]
-;--
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm7
-    pxor        xmm1, xmm1
-    paddsw      xmm0, xmm2
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
-
-    movd        DWORD PTR [rdi], xmm0
-
-    add         rdi, rdx
-    dec         rcx
-    jnz         .filter_block1d4_h6_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-.vp9_filter_block1d4_h4_ssse3:
-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
-    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
-    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
-
-    mov         rsi, arg(0)             ;src_ptr
-    mov         rdi, arg(2)             ;output_ptr
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
-
-.filter_block1d4_h4_rowloop_ssse3:
-    movdqu      xmm1,   XMMWORD PTR [rsi - 2]
-
-    movdqa      xmm2, xmm1
-    pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]
-    pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]
-    pmaddubsw   xmm1, xmm5
-
-;--
-    pmaddubsw   xmm2, xmm6
-
-    lea         rsi,    [rsi + rax]
-;--
-    paddsw      xmm1, xmm7
-    paddsw      xmm1, xmm2
-    psraw       xmm1, 7
-    packuswb    xmm1, xmm1
-
-    movd        DWORD PTR [rdi], xmm1
-
-    add         rdi, rdx
-    dec         rcx
-    jnz         .filter_block1d4_h4_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;void vp9_filter_block1d16_v6_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    unsigned int   vp9_filter_index
-;)
-global sym(vp9_filter_block1d16_v6_ssse3)
-sym(vp9_filter_block1d16_v6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-
-    cmp         esi, DWORD PTR [rax]
-    je          .vp9_filter_block1d16_v4_ssse3
-
-    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
-    mov         rdi, arg(2)             ;output_ptr
-
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
-%endif
-    mov         rax, rsi
-    movsxd      rcx, DWORD PTR arg(4)   ;output_height
-    add         rax, rdx
-
-
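-; Each iteration loads six source rows (A..F), interleaves them into the
-; byte pairs A|F, B|D and C|E, and multiplies each pair against the
-; matching coefficient pair with pmaddubsw, so the full 6-tap vertical
-; sum costs three multiply-adds per 8 pixels.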
-.vp9_filter_block1d16_v6_ssse3_loop:
-    movq        xmm1, MMWORD PTR [rsi]                  ;A
-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
-
-    pmaddubsw   xmm3, xmm6
-    punpcklbw   xmm1, xmm0                  ;A F
-    pmaddubsw   xmm2, xmm7
-    pmaddubsw   xmm1, xmm5
-
-    paddsw      xmm2, xmm3
-    paddsw      xmm2, xmm1
-    paddsw      xmm2, [GLOBAL(rd)]
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    movq        MMWORD PTR [rdi], xmm2          ;store the results
-
-    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
-    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
-    pmaddubsw   xmm3, xmm6
-    punpcklbw   xmm1, xmm0                  ;A F
-    pmaddubsw   xmm2, xmm7
-    pmaddubsw   xmm1, xmm5
-
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      xmm2, xmm3
-    paddsw      xmm2, xmm1
-    paddsw      xmm2, [GLOBAL(rd)]
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    movq        MMWORD PTR [rdi+8], xmm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;out_pitch
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d16_v6_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-.vp9_filter_block1d16_v4_ssse3:
-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
-    mov         rdi, arg(2)             ;output_ptr
-
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
-%endif
-    mov         rax, rsi
-    movsxd      rcx, DWORD PTR arg(4)   ;output_height
-    add         rax, rdx
-
-.vp9_filter_block1d16_v4_ssse3_loop:
-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    pmaddubsw   xmm3, xmm6
-    pmaddubsw   xmm2, xmm7
-    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
-    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
-
-    paddsw      xmm2, [GLOBAL(rd)]
-    paddsw      xmm2, xmm3
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    punpcklbw   xmm5, xmm4                  ;B D
-    punpcklbw   xmm1, xmm0                  ;C E
-
-    pmaddubsw   xmm1, xmm6
-    pmaddubsw   xmm5, xmm7
-
-    movdqa      xmm4, [GLOBAL(rd)]
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      xmm5, xmm1
-    paddsw      xmm5, xmm4
-    psraw       xmm5, 7
-    packuswb    xmm5, xmm5
-
-    punpcklqdq  xmm2, xmm5
-
-    movdqa       XMMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;out_pitch
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d16_v4_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block1d8_v6_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    unsigned int   vp9_filter_index
-;)
-global sym(vp9_filter_block1d8_v6_ssse3)
-sym(vp9_filter_block1d8_v6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-
-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
-    mov         rdi, arg(2)             ;output_ptr
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
-%endif
-    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
-
-    cmp         esi, DWORD PTR [rax]
-    je          .vp9_filter_block1d8_v4_ssse3
-
-    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    mov         rax, rsi
-    add         rax, rdx
-
-.vp9_filter_block1d8_v6_ssse3_loop:
-    movq        xmm1, MMWORD PTR [rsi]                  ;A
-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
-    movdqa      xmm4, [GLOBAL(rd)]
-
-    pmaddubsw   xmm3, xmm6
-    punpcklbw   xmm1, xmm0                  ;A F
-    pmaddubsw   xmm2, xmm7
-    pmaddubsw   xmm1, xmm5
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      xmm2, xmm3
-    paddsw      xmm2, xmm1
-    paddsw      xmm2, xmm4
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    movq        MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d8_v6_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-.vp9_filter_block1d8_v4_ssse3:
-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
-    movdqa      xmm5, [GLOBAL(rd)]
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    mov         rax, rsi
-    add         rax, rdx
-
-.vp9_filter_block1d8_v4_ssse3_loop:
-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   xmm2, xmm4                  ;B D
-    punpcklbw   xmm3, xmm0                  ;C E
-
-    pmaddubsw   xmm3, xmm6
-    pmaddubsw   xmm2, xmm7
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      xmm2, xmm3
-    paddsw      xmm2, xmm5
-    psraw       xmm2, 7
-    packuswb    xmm2, xmm2
-
-    movq        MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d8_v4_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-;void vp9_filter_block1d4_v6_ssse3
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    unsigned int   vp9_filter_index
-;)
-global sym(vp9_filter_block1d4_v6_ssse3)
-sym(vp9_filter_block1d4_v6_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
-    xor         rsi, rsi
-    shl         rdx, 4      ;
-
-    lea         rax, [GLOBAL(k0_k5)]
-    add         rax, rdx
-
-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
-    mov         rdi, arg(2)             ;output_ptr
-%if ABI_IS_32BIT=0
-    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
-%endif
-    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
-
-    cmp         esi, DWORD PTR [rax]
-    je          .vp9_filter_block1d4_v4_ssse3
-
-    movq        mm5, MMWORD PTR [rax]         ;k0_k5
-    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
-    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    mov         rax, rsi
-    add         rax, rdx
-
-.vp9_filter_block1d4_v6_ssse3_loop:
-    movd        mm1, DWORD PTR [rsi]                  ;A
-    movd        mm2, DWORD PTR [rsi + rdx]            ;B
-    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
-    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
-    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   mm2, mm4                  ;B D
-    punpcklbw   mm3, mm0                  ;C E
-
-    movd        mm0, DWORD PTR [rax + rdx * 4]        ;F
-
-    movq        mm4, [GLOBAL(rd)]
-
-    pmaddubsw   mm3, mm6
-    punpcklbw   mm1, mm0                  ;A F
-    pmaddubsw   mm2, mm7
-    pmaddubsw   mm1, mm5
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      mm2, mm3
-    paddsw      mm2, mm1
-    paddsw      mm2, mm4
-    psraw       mm2, 7
-    packuswb    mm2, mm2
-
-    movd        DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d4_v6_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-.vp9_filter_block1d4_v4_ssse3:
-    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
-    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
-    movq        mm5, MMWORD PTR [GLOBAL(rd)]
-
-    mov         rsi, arg(0)             ;src_ptr
-
-    mov         rax, rsi
-    add         rax, rdx
-
-.vp9_filter_block1d4_v4_ssse3_loop:
-    movd        mm2, DWORD PTR [rsi + rdx]            ;B
-    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
-    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
-    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
-
-    punpcklbw   mm2, mm4                  ;B D
-    punpcklbw   mm3, mm0                  ;C E
-
-    pmaddubsw   mm3, mm6
-    pmaddubsw   mm2, mm7
-    add         rsi,  rdx
-    add         rax,  rdx
-;--
-;--
-    paddsw      mm2, mm3
-    paddsw      mm2, mm5
-    psraw       mm2, 7
-    packuswb    mm2, mm2
-
-    movd        DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
-%else
-    add         rdi,        r8
-%endif
-    dec         rcx
-    jnz         .vp9_filter_block1d4_v4_ssse3_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_bilinear_predict16x16_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp9_bilinear_predict16x16_ssse3)
-sym(vp9_bilinear_predict16x16_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
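-        ; Three paths follow: with both offsets nonzero the horizontal
-        ; pass feeds the vertical pass; with xoffset == 0 only the
-        ; vertical filter runs (.b16x16_sp_only); with yoffset == 0 only
-        ; the horizontal filter runs (.b16x16_fp_only).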
-
-        lea         rcx,        [GLOBAL(bilinear_filters_ssse3)]
-        movsxd      rax,        dword ptr arg(2)    ; xoffset
-
-        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
-        je          .b16x16_sp_only
-
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; HFilter
-
-        mov         rdi,        arg(4)              ; dst_ptr
-        mov         rsi,        arg(0)              ; src_ptr
-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
-
-        movdqa      xmm1,       [rax]
-
-        movsxd      rax,        dword ptr arg(3)    ; yoffset
-
-        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
-        je          .b16x16_fp_only
-
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line
-
-        movdqa      xmm2,       [rax]
-
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5)    ; dst_pitch
-%endif
-        movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
-        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
-
-        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
-        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
-
-        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
-
-        lea         rsi,        [rsi + rdx]         ; next line
-
-        pmaddubsw   xmm3,       xmm1                ; 00 02 04 06 08 10 12 14
-
-        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
-        pmaddubsw   xmm4,       xmm1                ; 01 03 05 07 09 11 13 15
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT    ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
-        psraw       xmm4,       VP9_FILTER_SHIFT    ; xmm4 /= 128
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-.next_row:
-        movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
-        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
-
-        punpcklbw   xmm6,       xmm5
-        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
-
-        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
-        lea         rsi,        [rsi + rdx]         ; next line
-
-        pmaddubsw   xmm6,       xmm1
-
-        punpcklbw   xmm4,       xmm5
-        pmaddubsw   xmm4,       xmm1
-
-        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
-        psraw       xmm6,       VP9_FILTER_SHIFT    ; xmm6 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
-        psraw       xmm4,       VP9_FILTER_SHIFT    ; xmm4 /= 128
-
-        packuswb    xmm6,       xmm4
-        movdqa      xmm5,       xmm7
-
-        punpcklbw   xmm5,       xmm6
-        pmaddubsw   xmm5,       xmm2
-
-        punpckhbw   xmm7,       xmm6
-        pmaddubsw   xmm7,       xmm2
-
-        paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
-        psraw       xmm5,       VP9_FILTER_SHIFT    ; xmm5 /= 128
-
-        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
-        psraw       xmm7,       VP9_FILTER_SHIFT    ; xmm7 /= 128
-
-        packuswb    xmm5,       xmm7
-        movdqa      xmm7,       xmm6
-
-        movdqa      [rdi],      xmm5                ; store the results in the destination
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(5)    ; dst_pitch
-%else
-        add         rdi,        r8
-%endif
-
-        cmp         rdi,        rcx
-        jne         .next_row
-
-        jmp         .done
-
-.b16x16_sp_only:
-        movsxd      rax,        dword ptr arg(3)    ; yoffset
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; VFilter
-
-        mov         rdi,        arg(4)              ; dst_ptr
-        mov         rsi,        arg(0)              ; src_ptr
-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
-
-        movdqa      xmm1,       [rax]               ; VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
-
-        ; get the first horizontal line done
-        movq        xmm4,       [rsi]               ; load row 0
-        movq        xmm2,       [rsi + 8]           ; load row 0
-
-        lea         rsi,        [rsi + rax]         ; next line
-.next_row_sp:
-        movq        xmm3,       [rsi]               ; load row + 1
-        movq        xmm5,       [rsi + 8]           ; load row + 1
-
-        punpcklbw   xmm4,       xmm3
-        punpcklbw   xmm2,       xmm5
-
-        pmaddubsw   xmm4,       xmm1
-        movq        xmm7,       [rsi + rax]         ; load row + 2
-
-        pmaddubsw   xmm2,       xmm1
-        movq        xmm6,       [rsi + rax + 8]     ; load row + 2
-
-        punpcklbw   xmm3,       xmm7
-        punpcklbw   xmm5,       xmm6
-
-        pmaddubsw   xmm3,       xmm1
-        paddw       xmm4,       [GLOBAL(rd)]
-
-        pmaddubsw   xmm5,       xmm1
-        paddw       xmm2,       [GLOBAL(rd)]
-
-        psraw       xmm4,       VP9_FILTER_SHIFT
-        psraw       xmm2,       VP9_FILTER_SHIFT
-
-        packuswb    xmm4,       xmm2
-        paddw       xmm3,       [GLOBAL(rd)]
-
-        movdqa      [rdi],      xmm4                ; store row 0
-        paddw       xmm5,       [GLOBAL(rd)]
-
-        psraw       xmm3,       VP9_FILTER_SHIFT
-        psraw       xmm5,       VP9_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm5
-        movdqa      xmm4,       xmm7
-
-        movdqa      [rdi + rdx],xmm3                ; store row 1
-        lea         rsi,        [rsi + 2*rax]
-
-        movdqa      xmm2,       xmm6
-        lea         rdi,        [rdi + 2*rdx]
-
-        cmp         rdi,        rcx
-        jne         .next_row_sp
-
-        jmp         .done
-
-.b16x16_fp_only:
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
-
-.next_row_fp:
-        movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
-        movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
-
-        punpcklbw   xmm2,       xmm4
-        movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15
-
-        pmaddubsw   xmm2,       xmm1
-        movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16
-
-        lea         rsi,        [rsi + rax]         ; next line
-        punpcklbw   xmm3,       xmm4
-
-        pmaddubsw   xmm3,       xmm1
-        movq        xmm5,       [rsi]
-
-        paddw       xmm2,       [GLOBAL(rd)]
-        movq        xmm7,       [rsi+1]
-
-        movq        xmm6,       [rsi+8]
-        psraw       xmm2,       VP9_FILTER_SHIFT
-
-        punpcklbw   xmm5,       xmm7
-        movq        xmm7,       [rsi+9]
-
-        paddw       xmm3,       [GLOBAL(rd)]
-        pmaddubsw   xmm5,       xmm1
-
-        psraw       xmm3,       VP9_FILTER_SHIFT
-        punpcklbw   xmm6,       xmm7
-
-        packuswb    xmm2,       xmm3
-        pmaddubsw   xmm6,       xmm1
-
-        movdqa      [rdi],      xmm2                ; store the results in the destination
-        paddw       xmm5,       [GLOBAL(rd)]
-
-        lea         rdi,        [rdi + rdx]         ; dst_pitch
-        psraw       xmm5,       VP9_FILTER_SHIFT
-
-        paddw       xmm6,       [GLOBAL(rd)]
-        psraw       xmm6,       VP9_FILTER_SHIFT
-
-        packuswb    xmm5,       xmm6
-        lea         rsi,        [rsi + rax]         ; next line
-
-        movdqa      [rdi],      xmm5                ; store the results in the destination
-        lea         rdi,        [rdi + rdx]         ; dst_pitch
-
-        cmp         rdi,        rcx
-
-        jne         .next_row_fp
-
-.done:
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_bilinear_predict8x8_ssse3
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp9_bilinear_predict8x8_ssse3)
-sym(vp9_bilinear_predict8x8_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 144                         ; reserve 144 bytes
-
-        lea         rcx,        [GLOBAL(bilinear_filters_ssse3)]
-
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-    ;Read 9 lines of unaligned data in and put them on the stack. This
-    ;gives a big performance boost.
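-    ;The movdqu loads below are done once into this aligned scratch area
-    ;so that both filter passes can re-read the rows with aligned movdqa,
-    ;stepping through them by advancing rsp.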
-        movdqu      xmm0,       [rsi]
-        lea         rax,        [rdx + rdx*2]
-        movdqu      xmm1,       [rsi+rdx]
-        movdqu      xmm2,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm3,       [rsi]
-        movdqu      xmm4,       [rsi+rdx]
-        movdqu      xmm5,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm6,       [rsi]
-        movdqu      xmm7,       [rsi+rdx]
-
-        movdqa      XMMWORD PTR [rsp],            xmm0
-
-        movdqu      xmm0,       [rsi+rdx*2]
-
-        movdqa      XMMWORD PTR [rsp+16],         xmm1
-        movdqa      XMMWORD PTR [rsp+32],         xmm2
-        movdqa      XMMWORD PTR [rsp+48],         xmm3
-        movdqa      XMMWORD PTR [rsp+64],         xmm4
-        movdqa      XMMWORD PTR [rsp+80],         xmm5
-        movdqa      XMMWORD PTR [rsp+96],         xmm6
-        movdqa      XMMWORD PTR [rsp+112],        xmm7
-        movdqa      XMMWORD PTR [rsp+128],        xmm0
-
-        movsxd      rax,        dword ptr arg(2)    ; xoffset
-        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
-        je          .b8x8_sp_only
-
-        shl         rax,        4
-        add         rax,        rcx                 ; HFilter
-
-        mov         rdi,        arg(4)              ; dst_ptr
-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
-
-        movdqa      xmm0,       [rax]
-
-        movsxd      rax,        dword ptr arg(3)    ; yoffset
-        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
-        je          .b8x8_fp_only
-
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-
-        movdqa      xmm1,       [rax]
-
-        ; get the first horizontal line done
-        movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        movdqa      xmm5,       xmm3
-
-        psrldq      xmm5,       1                   ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
-        lea         rsp,        [rsp + 16]          ; next line
-
-        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
-        pmaddubsw   xmm3,       xmm0                ; 00 02 04 06 08 10 12 14
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP9_FILTER_SHIFT    ; xmm3 /= 128
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm7                ; 00 01 02 03 04 05 06 07, duplicated into both halves
-
-.next_row:
-        movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        lea         rsp,        [rsp + 16]          ; next line
-
-        movdqa      xmm5,       xmm6
-
-        psrldq      xmm5,       1
-
-        punpcklbw   xmm6,       xmm5
-        pmaddubsw   xmm6,       xmm0
-
-        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
-        psraw       xmm6,       VP9_FILTER_SHIFT    ; xmm6 /= 128
-
-        packuswb    xmm6,       xmm6
-
-        punpcklbw   xmm7,       xmm6
-        pmaddubsw   xmm7,       xmm1
-
-        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
-        psraw       xmm7,       VP9_FILTER_SHIFT    ; xmm7 /= 128
-
-        packuswb    xmm7,       xmm7
-
-        movq        [rdi],      xmm7                ; store the results in the destination
-        lea         rdi,        [rdi + rdx]
-
-        movdqa      xmm7,       xmm6
-
-        cmp         rdi,        rcx
-        jne         .next_row
-
-        jmp         .done8x8
-
-.b8x8_sp_only:
-        movsxd      rax,        dword ptr arg(3)    ; yoffset
-        shl         rax,        4
-        lea         rax,        [rax + rcx]         ; VFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
-
-        movdqa      xmm0,       [rax]               ; VFilter
-
-        movq        xmm1,       XMMWORD PTR [rsp]
-        movq        xmm2,       XMMWORD PTR [rsp+16]
-
-        movq        xmm3,       XMMWORD PTR [rsp+32]
-        punpcklbw   xmm1,       xmm2
-
-        movq        xmm4,       XMMWORD PTR [rsp+48]
-        punpcklbw   xmm2,       xmm3
-
-        movq        xmm5,       XMMWORD PTR [rsp+64]
-        punpcklbw   xmm3,       xmm4
-
-        movq        xmm6,       XMMWORD PTR [rsp+80]
-        punpcklbw   xmm4,       xmm5
-
-        movq        xmm7,       XMMWORD PTR [rsp+96]
-        punpcklbw   xmm5,       xmm6
-
-        pmaddubsw   xmm1,       xmm0
-        pmaddubsw   xmm2,       xmm0
-
-        pmaddubsw   xmm3,       xmm0
-        pmaddubsw   xmm4,       xmm0
-
-        pmaddubsw   xmm5,       xmm0
-        punpcklbw   xmm6,       xmm7
-
-        pmaddubsw   xmm6,       xmm0
-        paddw       xmm1,       [GLOBAL(rd)]
-
-        paddw       xmm2,       [GLOBAL(rd)]
-        psraw       xmm1,       VP9_FILTER_SHIFT
-
-        paddw       xmm3,       [GLOBAL(rd)]
-        psraw       xmm2,       VP9_FILTER_SHIFT
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm3,       VP9_FILTER_SHIFT
-
-        paddw       xmm5,       [GLOBAL(rd)]
-        psraw       xmm4,       VP9_FILTER_SHIFT
-
-        paddw       xmm6,       [GLOBAL(rd)]
-        psraw       xmm5,       VP9_FILTER_SHIFT
-
-        psraw       xmm6,       VP9_FILTER_SHIFT
-        packuswb    xmm1,       xmm1
-
-        packuswb    xmm2,       xmm2
-        movq        [rdi],      xmm1
-
-        packuswb    xmm3,       xmm3
-        movq        [rdi+rdx],  xmm2
-
-        packuswb    xmm4,       xmm4
-        movq        xmm1,       XMMWORD PTR [rsp+112]
-
-        lea         rdi,        [rdi + 2*rdx]
-        movq        xmm2,       XMMWORD PTR [rsp+128]
-
-        packuswb    xmm5,       xmm5
-        movq        [rdi],      xmm3
-
-        packuswb    xmm6,       xmm6
-        movq        [rdi+rdx],  xmm4
-
-        lea         rdi,        [rdi + 2*rdx]
-        punpcklbw   xmm7,       xmm1
-
-        movq        [rdi],      xmm5
-        pmaddubsw   xmm7,       xmm0
-
-        movq        [rdi+rdx],  xmm6
-        punpcklbw   xmm1,       xmm2
-
-        pmaddubsw   xmm1,       xmm0
-        paddw       xmm7,       [GLOBAL(rd)]
-
-        psraw       xmm7,       VP9_FILTER_SHIFT
-        paddw       xmm1,       [GLOBAL(rd)]
-
-        psraw       xmm1,       VP9_FILTER_SHIFT
-        packuswb    xmm7,       xmm7
-
-        packuswb    xmm1,       xmm1
-        lea         rdi,        [rdi + 2*rdx]
-
-        movq        [rdi],      xmm7
-
-        movq        [rdi+rdx],  xmm1
-        lea         rsp,        [rsp + 144]
-
-        jmp         .done8x8
-
-.b8x8_fp_only:
-        lea         rcx,        [rdi+rdx*8]
-
-.next_row_fp:
-        movdqa      xmm1,       XMMWORD PTR [rsp]
-        movdqa      xmm3,       XMMWORD PTR [rsp+16]
-
-        movdqa      xmm2,       xmm1
-        movdqa      xmm5,       XMMWORD PTR [rsp+32]
-
-        psrldq      xmm2,       1
-        movdqa      xmm7,       XMMWORD PTR [rsp+48]
-
-        movdqa      xmm4,       xmm3
-        psrldq      xmm4,       1
-
-        movdqa      xmm6,       xmm5
-        psrldq      xmm6,       1
-
-        punpcklbw   xmm1,       xmm2
-        pmaddubsw   xmm1,       xmm0
-
-        punpcklbw   xmm3,       xmm4
-        pmaddubsw   xmm3,       xmm0
-
-        punpcklbw   xmm5,       xmm6
-        pmaddubsw   xmm5,       xmm0
-
-        movdqa      xmm2,       xmm7
-        psrldq      xmm2,       1
-
-        punpcklbw   xmm7,       xmm2
-        pmaddubsw   xmm7,       xmm0
-
-        paddw       xmm1,       [GLOBAL(rd)]
-        psraw       xmm1,       VP9_FILTER_SHIFT
-
-        paddw       xmm3,       [GLOBAL(rd)]
-        psraw       xmm3,       VP9_FILTER_SHIFT
-
-        paddw       xmm5,       [GLOBAL(rd)]
-        psraw       xmm5,       VP9_FILTER_SHIFT
-
-        paddw       xmm7,       [GLOBAL(rd)]
-        psraw       xmm7,       VP9_FILTER_SHIFT
-
-        packuswb    xmm1,       xmm1
-        packuswb    xmm3,       xmm3
-
-        packuswb    xmm5,       xmm5
-        movq        [rdi],      xmm1
-
-        packuswb    xmm7,       xmm7
-        movq        [rdi+rdx],  xmm3
-
-        lea         rdi,        [rdi + 2*rdx]
-        movq        [rdi],      xmm5
-
-        lea         rsp,        [rsp + 4*16]
-        movq        [rdi+rdx],  xmm7
-
-        lea         rdi,        [rdi + 2*rdx]
-        cmp         rdi,        rcx
-
-        jne         .next_row_fp
-
-        lea         rsp,        [rsp + 16]
-
-.done8x8:
-    ;add rsp, 144
-    pop         rsp
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-shuf1b:
-    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
-shuf2b:
-    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
-shuf3b:
-    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
-
-align 16
-shuf2bfrom1:
-    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
-align 16
-shuf3bfrom1:
-    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
-
-align 16
-rd:
-    times 8 dw 0x40
-
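-; The six-tap kernels below are stored as interleaved byte pairs (k0,k5),
-; (k1,k3) and (k2,k4) so that pmaddubsw on pixels interleaved as A|F, B|D
-; and C|E yields the three partial sums directly. Reassembled from the
-; data (an inference, not a comment carried over from the source), row 2
-; is {2, -11, 108, 36, -8, 1} and row 4 is {3, -16, 77, 77, -16, 3}; rows
-; whose k0_k5 pair is zero take the 4-tap fast paths selected by the
-; "cmp esi, DWORD PTR [rax] / je" checks in the code above.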
-align 16
-k0_k5:
-    times 8 db 0, 0             ;placeholder
-    times 8 db 0, 0
-    times 8 db 2, 1
-    times 8 db 0, 0
-    times 8 db 3, 3
-    times 8 db 0, 0
-    times 8 db 1, 2
-    times 8 db 0, 0
-k1_k3:
-    times 8 db  0,    0         ;placeholder
-    times 8 db  -6,  12
-    times 8 db -11,  36
-    times 8 db  -9,  50
-    times 8 db -16,  77
-    times 8 db  -6,  93
-    times 8 db  -8, 108
-    times 8 db  -1, 123
-k2_k4:
-    times 8 db 128,    0        ;placeholder
-    times 8 db 123,   -1
-    times 8 db 108,   -8
-    times 8 db  93,   -6
-    times 8 db  77,  -16
-    times 8 db  50,   -9
-    times 8 db  36,  -11
-    times 8 db  12,   -6
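-; Bilinear weight pairs below are (128 - 8*i, 8*i) for i = 0..15; each
-; pair sums to VP9_FILTER_WEIGHT. For example, offset 4 uses (96, 32),
-; giving out = (96*p0 + 32*p1 + 64) >> 7.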
-align 16
-bilinear_filters_ssse3:
-    times 8 db 128, 0
-    times 8 db 120, 8
-    times 8 db 112, 16
-    times 8 db 104, 24
-    times 8 db 96,  32
-    times 8 db 88,  40
-    times 8 db 80,  48
-    times 8 db 72,  56
-    times 8 db 64,  64
-    times 8 db 56,  72
-    times 8 db 48,  80
-    times 8 db 40,  88
-    times 8 db 32,  96
-    times 8 db 24,  104
-    times 8 db 16,  112
-    times 8 db 8,   120
-
--- a/vp8/common/x86/subpixel_x86.h
+++ /dev/null
@@ -1,122 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef SUBPIXEL_X86_H
-#define SUBPIXEL_X86_H
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code.
- */
-
-#if HAVE_MMX
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x4_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict4x4_mmx);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx
-
-#undef  vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx
-
-#undef  vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx
-
-#undef  vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx
-
-#undef  vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx
-
-#undef  vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_mmx
-
-#undef  vp9_subpix_bilinear8x4
-#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_mmx
-
-#undef  vp9_subpix_bilinear4x4
-#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_mmx
-
-#endif
-#endif
-
-
-#if HAVE_SSE2
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2
-
-#undef  vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2
-
-#undef  vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2
-
-#undef  vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2
-
-#undef  vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2
-
-#endif
-#endif
-
-#if HAVE_SSSE3
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3
-
-#undef  vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3
-
-#undef  vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3
-
-#undef  vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3
-
-
-#undef  vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3
-
-#undef  vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3
-
-#endif
-#endif
-
-
-
-#endif
--- a/vp8/common/x86/vp8_asm_stubs.c
+++ /dev/null
@@ -1,602 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_ports/mem.h"
-#include "vp8/common/subpixel.h"
-
-extern const short vp9_six_tap_mmx[16][6 * 8];
-
-extern const short vp9_bilinear_filters_8x_mmx[16][2 * 8];
-
-extern void vp9_filter_block1d_h6_mmx(unsigned char   *src_ptr,
-                                      unsigned short  *output_ptr,
-                                      unsigned int     src_pixels_per_line,
-                                      unsigned int     pixel_step,
-                                      unsigned int     output_height,
-                                      unsigned int     output_width,
-                                      const short     *vp9_filter);
-
-extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr,
-                                       unsigned char  *output_ptr,
-                                       int             output_pitch,
-                                       unsigned int    pixels_per_line,
-                                       unsigned int    pixel_step,
-                                       unsigned int    output_height,
-                                       unsigned int    output_width,
-                                       const short    *vp9_filter);
-
-extern void vp9_filter_block1d8_h6_sse2(unsigned char  *src_ptr,
-                                        unsigned short *output_ptr,
-                                        unsigned int    src_pixels_per_line,
-                                        unsigned int    pixel_step,
-                                        unsigned int    output_height,
-                                        unsigned int    output_width,
-                                        const short    *vp9_filter);
-
-extern void vp9_filter_block1d16_h6_sse2(unsigned char  *src_ptr,
-                                         unsigned short *output_ptr,
-                                         unsigned int    src_pixels_per_line,
-                                         unsigned int    pixel_step,
-                                         unsigned int    output_height,
-                                         unsigned int    output_width,
-                                         const short    *vp9_filter);
-
-extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr,
-                                        unsigned char *output_ptr,
-                                        int dst_pitch,
-                                        unsigned int pixels_per_line,
-                                        unsigned int pixel_step,
-                                        unsigned int output_height,
-                                        unsigned int output_width,
-                                        const short    *vp9_filter);
-
-extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr,
-                                         unsigned char *output_ptr,
-                                         int dst_pitch,
-                                         unsigned int pixels_per_line,
-                                         unsigned int pixel_step,
-                                         unsigned int output_height,
-                                         unsigned int output_width,
-                                         const short    *vp9_filter);
-
-extern void vp9_unpack_block1d16_h6_sse2(unsigned char  *src_ptr,
-                                         unsigned short *output_ptr,
-                                         unsigned int    src_pixels_per_line,
-                                         unsigned int    output_height,
-                                         unsigned int    output_width);
-
-extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
-                                             unsigned int   src_pixels_per_line,
-                                             unsigned char *output_ptr,
-                                             int            dst_pitch,
-                                             unsigned int   output_height,
-                                             const short   *vp9_filter);
-
-extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
-                                              unsigned int   src_pixels_per_line,
-                                              unsigned char *output_ptr,
-                                              int            dst_pitch,
-                                              unsigned int   output_height,
-                                              const short   *vp9_filter);
-
-extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
-                                             unsigned int   src_pixels_per_line,
-                                             unsigned char *output_ptr,
-                                             int            dst_pitch,
-                                             unsigned int   output_height,
-                                             const short   *vp9_filter);
-
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
-
-#if HAVE_MMX
-void vp9_sixtap_predict4x4_mmx(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict4x4_mmx\n");
-#endif
-  /* Temp data buffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16);
-  const short *hfilter, *vfilter;
-  hfilter = vp9_six_tap_mmx[xoffset];
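-  /* First pass (a reading of the calls below, not an original comment):
-   * nine rows, 4 output rows plus 5 rows of context for the 6-tap
-   * vertical pass, are filtered horizontally into the temp buffer with a
-   * pitch of 8, starting two lines above the block; the vertical pass
-   * then reads from temp row 1 (fdata2 + 8). */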
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2,
-                            src_pixels_per_line, 1, 9, 8, hfilter);
-  vfilter = vp9_six_tap_mmx[yoffset];
-  vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch,
-                             8, 4, 4, 4, vfilter);
-}
-
-void vp9_sixtap_predict16x16_mmx(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict16x16_mmx\n");
-#endif
-  /* Temp data buffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
-  const short *hfilter, *vfilter;
-
-  hfilter = vp9_six_tap_mmx[xoffset];
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
-                            fdata2,   src_pixels_per_line, 1, 21, 32,
-                            hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
-                            fdata2 + 4, src_pixels_per_line, 1, 21, 32,
-                            hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,
-                            fdata2 + 8, src_pixels_per_line, 1, 21, 32,
-                            hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,
-                            fdata2 + 12, src_pixels_per_line, 1, 21, 32,
-                            hfilter);
-
-  vfilter = vp9_six_tap_mmx[yoffset];
-  vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr,      dst_pitch,
-                             32, 16, 16, 16, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4,  dst_pitch,
-                             32, 16, 16, 16, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8,  dst_pitch,
-                             32, 16, 16, 16, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch,
-                             32, 16, 16, 16, vfilter);
-}
-
-void vp9_sixtap_predict8x8_mmx(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x8_mmx\n");
-#endif
-  /* Temp data buffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
-  const short *hfilter, *vfilter;
-
-  hfilter = vp9_six_tap_mmx[xoffset];
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
-                            fdata2,   src_pixels_per_line, 1, 13, 16,
-                            hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
-                            fdata2 + 4, src_pixels_per_line, 1, 13, 16,
-                            hfilter);
-
-  vfilter = vp9_six_tap_mmx[yoffset];
-  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,
-                             16, 8, 8, 8, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
-                             16, 8, 8, 8, vfilter);
-}
-
-void vp9_sixtap_predict8x4_mmx(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x4_mmx\n");
-#endif
-  /* Temp data buffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
-  const short *hfilter, *vfilter;
-
-  hfilter = vp9_six_tap_mmx[xoffset];
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
-                            fdata2,   src_pixels_per_line, 1, 9, 16, hfilter);
-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
-                            fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter);
-
-  vfilter = vp9_six_tap_mmx[yoffset];
-  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,
-                             16, 8, 4, 8, vfilter);
-  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
-                             16, 8, 4, 8, vfilter);
-}
-
-void vp9_bilinear_predict16x16_mmx(unsigned char  *src_ptr,
-                                   int  src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
-                                   int  dst_pitch) {
-  vp9_bilinear_predict8x8_mmx(src_ptr,
-                              src_pixels_per_line, xoffset, yoffset,
-                              dst_ptr, dst_pitch);
-  vp9_bilinear_predict8x8_mmx(src_ptr + 8,
-                              src_pixels_per_line, xoffset, yoffset,
-                              dst_ptr + 8, dst_pitch);
-  vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line,
-                              src_pixels_per_line, xoffset, yoffset,
-                              dst_ptr + dst_pitch * 8, dst_pitch);
-  vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8,
-                              src_pixels_per_line, xoffset, yoffset,
-                              dst_ptr + dst_pitch * 8 + 8, dst_pitch);
-}
-#endif
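The 16x16 bilinear predictor above simply tiles the 8x8 MMX kernel over the four quadrants, offsetting source and destination by 8 columns and 8 rows. A sketch of the general pattern, assuming any predictor with the same argument order:

static void tile_16x16_from_8x8(unsigned char *src, int src_stride,
                                int xoff, int yoff,
                                unsigned char *dst, int dst_stride,
                                void (*p8x8)(unsigned char *, int, int, int,
                                             unsigned char *, int)) {
  int qx, qy;
  for (qy = 0; qy < 2; qy++)
    for (qx = 0; qx < 2; qx++)
      p8x8(src + qy * 8 * src_stride + qx * 8, src_stride, xoff, yoff,
           dst + qy * 8 * dst_stride + qx * 8, dst_stride);
}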
-
-#if HAVE_SSE2
-void vp9_sixtap_predict16x16_sse2(unsigned char  *src_ptr,
-                                  int  src_pixels_per_line,
-                                  int  xoffset,
-                                  int  yoffset,
-                                  unsigned char *dst_ptr,
-                                  int  dst_pitch) {
-  /* Temp data buffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
-  const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict16x16_sse2\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
-                                   src_pixels_per_line, 1, 21, 32, hfilter);
-      vfilter = vp9_six_tap_mmx[yoffset];
-      vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
-                                   32, 16, 16, dst_pitch, vfilter);
-    } else {
-      /* First-pass only */
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line,
-                                        dst_ptr, dst_pitch, 16, hfilter);
-    }
-  } else {
-    /* Second-pass only */
-    vfilter = vp9_six_tap_mmx[yoffset];
-    vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
-                                 src_pixels_per_line, 21, 32);
-    vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
-                                 32, 16, 16, dst_pitch, vfilter);
-  }
-}
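The SSE2 entry points above all share one dispatch: a zero offset selects an identity filter in that direction, so that pass is skipped, and the intermediate buffer only comes into play when both offsets are nonzero. A scalar model of the full two-pass case, illustrative only (libvpx's first pass keeps higher-precision intermediates, while this sketch rounds both passes):

static void sixtap_two_pass_sketch(const unsigned char *src, int src_stride,
                                   unsigned char *dst, int dst_stride,
                                   int w, int h,
                                   const short *hf, const short *vf) {
  int tmp[(16 + 5) * 16];  /* w, h <= 16; h + 5 rows of context */
  int r, c, k;

  /* first pass: horizontal six-tap, starting two rows above the block */
  for (r = 0; r < h + 5; r++)
    for (c = 0; c < w; c++) {
      int sum = 0;
      for (k = 0; k < 6; k++)
        sum += hf[k] * src[(r - 2) * src_stride + (c + k - 2)];
      tmp[r * w + c] = (sum + 64) >> 7;  /* Q7 filters sum to 128 */
    }

  /* second pass: vertical six-tap over the intermediate rows */
  for (r = 0; r < h; r++)
    for (c = 0; c < w; c++) {
      int sum = 0;
      for (k = 0; k < 6; k++)
        sum += vf[k] * tmp[(r + k) * w + c];
      sum = (sum + 64) >> 7;
      dst[r * dst_stride + c] = sum < 0 ? 0 : sum > 255 ? 255 : sum;
    }
}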
-
-void vp9_sixtap_predict8x8_sse2(unsigned char  *src_ptr,
-                                int  src_pixels_per_line,
-                                int  xoffset,
-                                int  yoffset,
-                                unsigned char *dst_ptr,
-                                int  dst_pitch) {
-  /* Temp data buffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
-  const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x8_sse2\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
-                                  src_pixels_per_line, 1, 13, 16, hfilter);
-      vfilter = vp9_six_tap_mmx[yoffset];
-      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
-                                  16, 8, 8, dst_pitch, vfilter);
-    } else {
-      /* First-pass only */
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
-                                       dst_ptr, dst_pitch, 8, hfilter);
-    }
-  } else {
-    /* Second-pass only */
-    vfilter = vp9_six_tap_mmx[yoffset];
-    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
-                                     src_pixels_per_line,
-                                     dst_ptr, dst_pitch, 8, vfilter);
-  }
-}
-
-void vp9_sixtap_predict8x4_sse2(unsigned char  *src_ptr,
-                                int  src_pixels_per_line,
-                                int  xoffset,
-                                int  yoffset,
-                                unsigned char *dst_ptr,
-                                int  dst_pitch) {
-  /* Temp data buffer used in filtering */
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
-  const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x4_sse2\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
-                                  src_pixels_per_line, 1, 9, 16, hfilter);
-      vfilter = vp9_six_tap_mmx[yoffset];
-      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
-                                  16, 8, 4, dst_pitch, vfilter);
-    } else {
-      /* First-pass only */
-      hfilter = vp9_six_tap_mmx[xoffset];
-      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
-                                       dst_ptr, dst_pitch, 4, hfilter);
-    }
-  } else {
-    /* Second-pass only */
-    vfilter = vp9_six_tap_mmx[yoffset];
-    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
-                                     src_pixels_per_line,
-                                     dst_ptr, dst_pitch, 4, vfilter);
-  }
-}
-#endif
-
-#if HAVE_SSSE3
-extern void vp9_filter_block1d8_h6_ssse3(unsigned char  *src_ptr,
-                                         unsigned int    src_pixels_per_line,
-                                         unsigned char  *output_ptr,
-                                         unsigned int    output_pitch,
-                                         unsigned int    output_height,
-                                         unsigned int    vp9_filter_index);
-
-extern void vp9_filter_block1d16_h6_ssse3(unsigned char  *src_ptr,
-                                          unsigned int    src_pixels_per_line,
-                                          unsigned char  *output_ptr,
-                                          unsigned int    output_pitch,
-                                          unsigned int    output_height,
-                                          unsigned int    vp9_filter_index);
-
-extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
-                                          unsigned int   src_pitch,
-                                          unsigned char *output_ptr,
-                                          unsigned int   out_pitch,
-                                          unsigned int   output_height,
-                                          unsigned int   vp9_filter_index);
-
-extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
-                                         unsigned int   src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int   out_pitch,
-                                         unsigned int   output_height,
-                                         unsigned int   vp9_filter_index);
-
-extern void vp9_filter_block1d4_h6_ssse3(unsigned char  *src_ptr,
-                                         unsigned int    src_pixels_per_line,
-                                         unsigned char  *output_ptr,
-                                         unsigned int    output_pitch,
-                                         unsigned int    output_height,
-                                         unsigned int    vp9_filter_index);
-
-extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
-                                         unsigned int   src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int   out_pitch,
-                                         unsigned int   output_height,
-                                         unsigned int   vp9_filter_index);
-
-void vp9_sixtap_predict16x16_ssse3(unsigned char  *src_ptr,
-                                   int  src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
-                                   int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24);
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict16x16_ssse3\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                    src_pixels_per_line,
-                                    fdata2, 16, 21, xoffset);
-      vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch,
-                                    16, yoffset);
-    } else {
-      /* First-pass only */
-      vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
-                                    dst_ptr, dst_pitch, 16, xoffset);
-    }
-  } else {
-    /* Second-pass only */
-    vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                  src_pixels_per_line,
-                                  dst_ptr, dst_pitch, 16, yoffset);
-  }
-}
-
-void vp9_sixtap_predict8x8_ssse3(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x8_ssse3\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                   src_pixels_per_line, fdata2, 8, 13, xoffset);
-      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset);
-    } else {
-      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
-                                   dst_ptr, dst_pitch, 8, xoffset);
-    }
-  } else {
-    /* Second-pass only */
-    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                 src_pixels_per_line,
-                                 dst_ptr, dst_pitch, 8, yoffset);
-  }
-}
-
-void vp9_sixtap_predict8x4_ssse3(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict8x4_ssse3\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                   src_pixels_per_line, fdata2, 8, 9, xoffset);
-      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset);
-    } else {
-      /* First-pass only */
-      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
-                                   dst_ptr, dst_pitch, 4, xoffset);
-    }
-  } else {
-    /* Second-pass only */
-    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                 src_pixels_per_line,
-                                 dst_ptr, dst_pitch, 4, yoffset);
-  }
-}
-
-void vp9_sixtap_predict4x4_ssse3(unsigned char  *src_ptr,
-                                 int   src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9);
-#ifdef ANNOUNCE_FUNCTION
-  printf("vp9_sixtap_predict4x4_ssse3\n");
-#endif
-
-  if (xoffset) {
-    if (yoffset) {
-      vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                   src_pixels_per_line, fdata2, 4, 9, xoffset);
-      vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset);
-    } else {
-      vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
-                                   dst_ptr, dst_pitch, 4, xoffset);
-    }
-  } else {
-    vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
-                                 src_pixels_per_line,
-                                 dst_ptr, dst_pitch, 4, yoffset);
-  }
-}
-
-void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
-                                   const unsigned int src_pitch,
-                                   unsigned char *output_ptr,
-                                   unsigned int out_pitch,
-                                   unsigned int output_height,
-                                   const short *filter);
-
-void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
-                                   const unsigned int src_pitch,
-                                   unsigned char *output_ptr,
-                                   unsigned int out_pitch,
-                                   unsigned int output_height,
-                                   const short *filter);
-
-void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr,
-                                      const unsigned int src_stride,
-                                      const short *hfilter_aligned16,
-                                      const short *vfilter_aligned16,
-                                      unsigned char *dst_ptr,
-                                      unsigned int dst_stride) {
-  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
-    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
-    vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                  fdata2, 16, 23, hfilter_aligned16);
-    vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16,
-                                  vfilter_aligned16);
-  } else {
-    if (hfilter_aligned16[3] != 128) {
-      vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride,
-                                    16, hfilter_aligned16);
-    } else {
-      vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                    dst_ptr, dst_stride, 16, vfilter_aligned16);
-    }
-  }
-}
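The hfilter_aligned16[3] != 128 tests above are identity checks: the eight-tap filters are Q7 fixed point and sum to 128, so a center tap of 128 forces every other tap to zero and makes that pass a plain copy, which is why it can be skipped. A one-line scalar equivalent, assuming (as the branches imply) that the center coefficient sits at index 3:

static int is_identity_8tap(const short *filter) {
  return filter[3] == 128;  /* 1.0 in Q7; remaining taps must be zero */
}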
-
-void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
-                                  const unsigned int src_pitch,
-                                  unsigned char *output_ptr,
-                                  unsigned int out_pitch,
-                                  unsigned int output_height,
-                                  const short *filter);
-
-void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
-                                  const unsigned int src_pitch,
-                                  unsigned char *output_ptr,
-                                  unsigned int out_pitch,
-                                  unsigned int output_height,
-                                  const short *filter);
-
-void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr,
-                                    const unsigned int src_stride,
-                                    const short *hfilter_aligned16,
-                                    const short *vfilter_aligned16,
-                                    unsigned char *dst_ptr,
-                                    unsigned int dst_stride) {
-  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
-    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
-    vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                 fdata2, 16, 15, hfilter_aligned16);
-    vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8,
-                                 vfilter_aligned16);
-  } else {
-    if (hfilter_aligned16[3] != 128) {
-      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8,
-                                   hfilter_aligned16);
-    } else {
-      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                   dst_ptr, dst_stride, 8, vfilter_aligned16);
-    }
-  }
-}
-
-void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr,
-                                    const unsigned int src_stride,
-                                    const short *hfilter_aligned16,
-                                    const short *vfilter_aligned16,
-                                    unsigned char *dst_ptr,
-                                    unsigned int dst_stride) {
-  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
-    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
-    vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                 fdata2, 16, 11, hfilter_aligned16);
-    vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4,
-                                 vfilter_aligned16);
-  } else {
-    if (hfilter_aligned16[3] != 128) {
-      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4,
-                                   hfilter_aligned16);
-    } else {
-      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                   dst_ptr, dst_stride, 4, vfilter_aligned16);
-    }
-  }
-}
-#endif
--- a/vp8/common/x86/x86_systemdependent.c
+++ /dev/null
@@ -1,108 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vpx_ports/x86.h"
-#include "vp8/common/subpixel.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/pragmas.h"
-#include "vp8/common/onyxc_int.h"
-
-void vp9_arch_x86_common_init(VP9_COMMON *ctx) {
-#if CONFIG_RUNTIME_CPU_DETECT
-  VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
-  int flags = x86_simd_caps();
-
-  /* Note:
-   *
-   * This platform can be built without runtime CPU detection as well. If
-   * you modify any of the function mappings present in this file, be sure
-   * to also update them in the static mappings (<arch>/filename_<arch>.h)
-   */
-
-  /* Override default functions with fastest ones for this CPU. */
-#if HAVE_MMX
-// The commented-out functions need to be rewritten for vpx.
-  if (flags & HAS_MMX) {
-    rtcd->idct.idct1        = vp9_short_idct4x4llm_1_mmx;
-    rtcd->idct.idct16       = vp9_short_idct4x4llm_mmx;
-    rtcd->idct.idct1_scalar_add = vp9_dc_only_idct_add_mmx;
-    // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_mmx;
-    // rtcd->idct.iwalsh1     = vp9_short_inv_walsh4x4_1_mmx;
-
-    /* Disabled due to unsupported enhanced interpolation/high_prec mv
-    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_mmx;
-    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_mmx;
-    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_mmx;
-    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict4x4_mmx;
-    */
-    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_mmx;
-    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_mmx;
-    rtcd->subpix.bilinear8x4   = vp9_bilinear_predict8x4_mmx;
-    rtcd->subpix.bilinear4x4   = vp9_bilinear_predict4x4_mmx;
-
-#if CONFIG_POSTPROC
-    rtcd->postproc.down        = vp9_mbpost_proc_down_mmx;
-    /*rtcd->postproc.across      = vp9_mbpost_proc_across_ip_c;*/
-    rtcd->postproc.downacross  = vp9_post_proc_down_and_across_mmx;
-    rtcd->postproc.addnoise    = vp9_plane_add_noise_mmx;
-#endif
-  }
-
-#endif
-#if HAVE_SSE2
-
-  if (flags & HAS_SSE2) {
-
-
-    // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_sse2;
-
-    /* Disabled due to unsupported enhanced interpolation/high_prec mv
-    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_sse2;
-    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_sse2;
-    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_sse2;
-    */
-    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_sse2;
-    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_sse2;
-
-#if CONFIG_POSTPROC
-    rtcd->postproc.down        = vp9_mbpost_proc_down_xmm;
-    rtcd->postproc.across      = vp9_mbpost_proc_across_ip_xmm;
-    rtcd->postproc.downacross  = vp9_post_proc_down_and_across_xmm;
-    rtcd->postproc.addnoise    = vp9_plane_add_noise_wmt;
-#endif
-  }
-
-#endif
-
-#if HAVE_SSSE3
-
-  if (flags & HAS_SSSE3) {
-    /* Disabled due to unsupported enhanced interpolation/high_prec mv
-    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_ssse3;
-    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_ssse3;
-    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_ssse3;
-    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict4x4_ssse3;
-    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_ssse3;
-    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_ssse3;
-    */
-
-    /* these are disabled because of unsupported diagonal pred modes
-    rtcd->recon.build_intra_predictors_mbuv =
-      vp9_build_intra_predictors_mbuv_ssse3;
-    rtcd->recon.build_intra_predictors_mbuv_s =
-      vp9_build_intra_predictors_mbuv_s_ssse3;
-      */
-  }
-#endif
-
-#endif
-}
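The file above is the runtime half of the RTCD scheme: portable C defaults are wired up elsewhere, and this init routine overwrites individual function pointers once x86_simd_caps() has been probed. The pattern in miniature, with hypothetical names throughout:

typedef void (*predict_fn)(unsigned char *, int, unsigned char *, int);

/* stand-ins for a C fallback and an MMX specialization */
extern void bilinear8x8_c(unsigned char *, int, unsigned char *, int);
extern void bilinear8x8_mmx(unsigned char *, int, unsigned char *, int);

struct subpix_table { predict_fn bilinear8x8; };

#define SKETCH_HAS_MMX 0x01  /* illustrative; not the real HAS_MMX flag */

static void arch_init_sketch(struct subpix_table *t, int flags) {
  t->bilinear8x8 = bilinear8x8_c;        /* safe portable default */
  if (flags & SKETCH_HAS_MMX)
    t->bilinear8x8 = bilinear8x8_mmx;    /* fastest supported override */
}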
--- a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
+++ /dev/null
@@ -1,218 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT |vp8_dequant_dc_idct_add_v6|
-
-    AREA |.text|, CODE, READONLY
-
-;void vp8_dequant_dc_idct_add_v6(short *input, short *dq, unsigned char *pred,
-; unsigned char *dest, int pitch, int stride, int Dc)
-; r0 = input
-; r1 = dq
-; r2 = pred
-; r3 = dest
-; sp + 36 = pitch  ; +4 = 40
-; sp + 40 = stride  ; +4 = 44
-; sp + 44 = Dc  ; +4 = 48
-
-
-|vp8_dequant_dc_idct_add_v6| PROC
-    stmdb   sp!, {r4-r11, lr}
-
-    ldr     r6, [sp, #44]
-
-    ldr     r4, [r0]                ;input
-    ldr     r5, [r1], #4            ;dq
-
-    sub     sp, sp, #4
-    str     r3, [sp]
-
-    smultt  r7, r4, r5
-
-    ldr     r4, [r0, #4]            ;input
-    ldr     r5, [r1], #4            ;dq
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    ldr     r4, [r0, #4]            ;input
-    ldr     r5, [r1], #4            ;dq
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    mov     r12, #3
-
-vp8_dequant_dc_add_loop
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    ldr     r4, [r0, #4]            ;input
-    ldr     r5, [r1], #4            ;dq
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    subs    r12, r12, #1
-
-    ldrne   r4, [r0, #4]
-    ldrne   r5, [r1], #4
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    bne     vp8_dequant_dc_add_loop
-
-    sub     r0, r0, #32
-    mov     r1, r0
-
-; short_idct4x4llm_v6_dual
-    ldr     r3, cospi8sqrt2minus1
-    ldr     r4, sinpi8sqrt2
-    ldr     r6, [r0, #8]
-    mov     r5, #2
-vp8_dequant_dc_idct_loop1_v6
-    ldr     r12, [r0, #24]
-    ldr     r14, [r0, #16]
-    smulwt  r9, r3, r6
-    smulwb  r7, r3, r6
-    smulwt  r10, r4, r6
-    smulwb  r8, r4, r6
-    pkhbt   r7, r7, r9, lsl #16
-    smulwt  r11, r3, r12
-    pkhbt   r8, r8, r10, lsl #16
-    uadd16  r6, r6, r7
-    smulwt  r7, r4, r12
-    smulwb  r9, r3, r12
-    smulwb  r10, r4, r12
-    subs    r5, r5, #1
-    pkhbt   r9, r9, r11, lsl #16
-    ldr     r11, [r0], #4
-    pkhbt   r10, r10, r7, lsl #16
-    uadd16  r7, r12, r9
-    usub16  r7, r8, r7
-    uadd16  r6, r6, r10
-    uadd16  r10, r11, r14
-    usub16  r8, r11, r14
-    uadd16  r9, r10, r6
-    usub16  r10, r10, r6
-    uadd16  r6, r8, r7
-    usub16  r7, r8, r7
-    str     r6, [r1, #8]
-    ldrne   r6, [r0, #8]
-    str     r7, [r1, #16]
-    str     r10, [r1, #24]
-    str     r9, [r1], #4
-    bne     vp8_dequant_dc_idct_loop1_v6
-
-    mov     r5, #2
-    sub     r0, r1, #8
-vp8_dequant_dc_idct_loop2_v6
-    ldr     r6, [r0], #4
-    ldr     r7, [r0], #4
-    ldr     r8, [r0], #4
-    ldr     r9, [r0], #4
-    smulwt  r1, r3, r6
-    smulwt  r12, r4, r6
-    smulwt  lr, r3, r8
-    smulwt  r10, r4, r8
-    pkhbt   r11, r8, r6, lsl #16
-    pkhbt   r1, lr, r1, lsl #16
-    pkhbt   r12, r10, r12, lsl #16
-    pkhtb   r6, r6, r8, asr #16
-    uadd16  r6, r1, r6
-    pkhbt   lr, r9, r7, lsl #16
-    uadd16  r10, r11, lr
-    usub16  lr, r11, lr
-    pkhtb   r8, r7, r9, asr #16
-    subs    r5, r5, #1
-    smulwt  r1, r3, r8
-    smulwb  r7, r3, r8
-    smulwt  r11, r4, r8
-    smulwb  r9, r4, r8
-    pkhbt   r1, r7, r1, lsl #16
-    uadd16  r8, r1, r8
-    pkhbt   r11, r9, r11, lsl #16
-    usub16  r1, r12, r8
-    uadd16  r8, r11, r6
-    ldr     r9, c0x00040004
-    ldr     r12, [sp, #40]
-    uadd16  r6, r10, r8
-    usub16  r7, r10, r8
-    uadd16  r7, r7, r9
-    uadd16  r6, r6, r9
-    uadd16  r10, r14, r1
-    usub16  r1, r14, r1
-    uadd16  r10, r10, r9
-    uadd16  r1, r1, r9
-    ldr     r11, [r2], r12
-    mov     r8, r7, asr #3
-    pkhtb   r9, r8, r10, asr #19
-    mov     r8, r1, asr #3
-    pkhtb   r8, r8, r6, asr #19
-    uxtb16  lr, r11, ror #8
-    qadd16  r9, r9, lr
-    uxtb16  lr, r11
-    qadd16  r8, r8, lr
-    usat16  r9, #8, r9
-    usat16  r8, #8, r8
-    orr     r9, r8, r9, lsl #8
-    ldr     r11, [r2], r12
-    ldr     lr, [sp]
-    ldr     r12, [sp, #44]
-    mov     r7, r7, lsl #16
-    mov     r1, r1, lsl #16
-    mov     r10, r10, lsl #16
-    mov     r6, r6, lsl #16
-    mov     r7, r7, asr #3
-    pkhtb   r7, r7, r10, asr #19
-    mov     r1, r1, asr #3
-    pkhtb   r1, r1, r6, asr #19
-    uxtb16  r8, r11, ror #8
-    qadd16  r7, r7, r8
-    uxtb16  r8, r11
-    qadd16  r1, r1, r8
-    usat16  r7, #8, r7
-    usat16  r1, #8, r1
-    orr     r1, r1, r7, lsl #8
-    str     r9, [lr], r12
-    str     r1, [lr], r12
-    str     lr, [sp]
-    bne     vp8_dequant_dc_idct_loop2_v6
-
-; vpx_memset
-    sub     r0, r0, #32
-    add     sp, sp, #4
-
-    mov     r12, #0
-    str     r12, [r0]
-    str     r12, [r0, #4]
-    str     r12, [r0, #8]
-    str     r12, [r0, #12]
-    str     r12, [r0, #16]
-    str     r12, [r0, #20]
-    str     r12, [r0, #24]
-    str     r12, [r0, #28]
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ; |vp8_dequant_dc_idct_add_v6|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x00004E7B
-sinpi8sqrt2       DCD 0x00008A8C
-c0x00040004       DCD 0x00040004
-
-    END
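The constant pool values are Q16 encodings of the 4x4 IDCT rotation factors, sqrt(2)cos(pi/8) - 1 and sqrt(2)sin(pi/8). A quick standalone check of the hex values (illustrative only; assumes a POSIX math.h for M_PI):

#include <math.h>
#include <stdio.h>

int main(void) {
  printf("%#x\n", (int)floor((sqrt(2.0) * cos(M_PI / 8) - 1) * 65536 + 0.5));
  printf("%#x\n", (int)floor(sqrt(2.0) * sin(M_PI / 8) * 65536 + 0.5));
  return 0;  /* prints 0x4e7b and 0x8a8c */
}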
--- a/vp8/decoder/arm/armv6/dequant_idct_v6.asm
+++ /dev/null
@@ -1,196 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-    EXPORT |vp8_dequant_idct_add_v6|
-
-    AREA |.text|, CODE, READONLY
-;void vp8_dequant_idct_add_v6(short *input, short *dq, unsigned char *pred,
-; unsigned char *dest, int pitch, int stride)
-; r0 = input
-; r1 = dq
-; r2 = pred
-; r3 = dest
-; sp + 36 = pitch  ; +4 = 40
-; sp + 40 = stride  ; +4 = 44
-
-
-|vp8_dequant_idct_add_v6| PROC
-    stmdb   sp!, {r4-r11, lr}
-
-    ldr     r4, [r0]                ;input
-    ldr     r5, [r1], #4            ;dq
-
-    sub     sp, sp, #4
-    str     r3, [sp]
-
-    mov     r12, #4
-
-vp8_dequant_add_loop
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    ldr     r4, [r0, #4]            ;input
-    ldr     r5, [r1], #4            ;dq
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    smulbb  r6, r4, r5
-    smultt  r7, r4, r5
-
-    subs    r12, r12, #1
-
-    ldrne   r4, [r0, #4]
-    ldrne   r5, [r1], #4
-
-    strh    r6, [r0], #2
-    strh    r7, [r0], #2
-
-    bne     vp8_dequant_add_loop
-
-    sub     r0, r0, #32
-    mov     r1, r0
-
-; short_idct4x4llm_v6_dual
-    ldr     r3, cospi8sqrt2minus1
-    ldr     r4, sinpi8sqrt2
-    ldr     r6, [r0, #8]
-    mov     r5, #2
-vp8_dequant_idct_loop1_v6
-    ldr     r12, [r0, #24]
-    ldr     r14, [r0, #16]
-    smulwt  r9, r3, r6
-    smulwb  r7, r3, r6
-    smulwt  r10, r4, r6
-    smulwb  r8, r4, r6
-    pkhbt   r7, r7, r9, lsl #16
-    smulwt  r11, r3, r12
-    pkhbt   r8, r8, r10, lsl #16
-    uadd16  r6, r6, r7
-    smulwt  r7, r4, r12
-    smulwb  r9, r3, r12
-    smulwb  r10, r4, r12
-    subs    r5, r5, #1
-    pkhbt   r9, r9, r11, lsl #16
-    ldr     r11, [r0], #4
-    pkhbt   r10, r10, r7, lsl #16
-    uadd16  r7, r12, r9
-    usub16  r7, r8, r7
-    uadd16  r6, r6, r10
-    uadd16  r10, r11, r14
-    usub16  r8, r11, r14
-    uadd16  r9, r10, r6
-    usub16  r10, r10, r6
-    uadd16  r6, r8, r7
-    usub16  r7, r8, r7
-    str     r6, [r1, #8]
-    ldrne   r6, [r0, #8]
-    str     r7, [r1, #16]
-    str     r10, [r1, #24]
-    str     r9, [r1], #4
-    bne     vp8_dequant_idct_loop1_v6
-
-    mov     r5, #2
-    sub     r0, r1, #8
-vp8_dequant_idct_loop2_v6
-    ldr     r6, [r0], #4
-    ldr     r7, [r0], #4
-    ldr     r8, [r0], #4
-    ldr     r9, [r0], #4
-    smulwt  r1, r3, r6
-    smulwt  r12, r4, r6
-    smulwt  lr, r3, r8
-    smulwt  r10, r4, r8
-    pkhbt   r11, r8, r6, lsl #16
-    pkhbt   r1, lr, r1, lsl #16
-    pkhbt   r12, r10, r12, lsl #16
-    pkhtb   r6, r6, r8, asr #16
-    uadd16  r6, r1, r6
-    pkhbt   lr, r9, r7, lsl #16
-    uadd16  r10, r11, lr
-    usub16  lr, r11, lr
-    pkhtb   r8, r7, r9, asr #16
-    subs    r5, r5, #1
-    smulwt  r1, r3, r8
-    smulwb  r7, r3, r8
-    smulwt  r11, r4, r8
-    smulwb  r9, r4, r8
-    pkhbt   r1, r7, r1, lsl #16
-    uadd16  r8, r1, r8
-    pkhbt   r11, r9, r11, lsl #16
-    usub16  r1, r12, r8
-    uadd16  r8, r11, r6
-    ldr     r9, c0x00040004
-    ldr     r12, [sp, #40]
-    uadd16  r6, r10, r8
-    usub16  r7, r10, r8
-    uadd16  r7, r7, r9
-    uadd16  r6, r6, r9
-    uadd16  r10, r14, r1
-    usub16  r1, r14, r1
-    uadd16  r10, r10, r9
-    uadd16  r1, r1, r9
-    ldr     r11, [r2], r12
-    mov     r8, r7, asr #3
-    pkhtb   r9, r8, r10, asr #19
-    mov     r8, r1, asr #3
-    pkhtb   r8, r8, r6, asr #19
-    uxtb16  lr, r11, ror #8
-    qadd16  r9, r9, lr
-    uxtb16  lr, r11
-    qadd16  r8, r8, lr
-    usat16  r9, #8, r9
-    usat16  r8, #8, r8
-    orr     r9, r8, r9, lsl #8
-    ldr     r11, [r2], r12
-    ldr     lr, [sp]
-    ldr     r12, [sp, #44]
-    mov     r7, r7, lsl #16
-    mov     r1, r1, lsl #16
-    mov     r10, r10, lsl #16
-    mov     r6, r6, lsl #16
-    mov     r7, r7, asr #3
-    pkhtb   r7, r7, r10, asr #19
-    mov     r1, r1, asr #3
-    pkhtb   r1, r1, r6, asr #19
-    uxtb16  r8, r11, ror #8
-    qadd16  r7, r7, r8
-    uxtb16  r8, r11
-    qadd16  r1, r1, r8
-    usat16  r7, #8, r7
-    usat16  r1, #8, r1
-    orr     r1, r1, r7, lsl #8
-    str     r9, [lr], r12
-    str     r1, [lr], r12
-    str     lr, [sp]
-    bne     vp8_dequant_idct_loop2_v6
-
-; vpx_memset
-    sub     r0, r0, #32
-    add     sp, sp, #4
-
-    mov     r12, #0
-    str     r12, [r0]
-    str     r12, [r0, #4]
-    str     r12, [r0, #8]
-    str     r12, [r0, #12]
-    str     r12, [r0, #16]
-    str     r12, [r0, #20]
-    str     r12, [r0, #24]
-    str     r12, [r0, #28]
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ; |vp8_dequant_idct_add_v6|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x00004E7B
-sinpi8sqrt2       DCD 0x00008A8C
-c0x00040004       DCD 0x00040004
-
-    END
--- a/vp8/decoder/arm/armv6/dequantize_v6.asm
+++ /dev/null
@@ -1,69 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dequantize_b_loop_v6|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-;-------------------------------
-;void   vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
-; r0    short *Q,
-; r1    short *DQC
-; r2    short *DQ
-|vp8_dequantize_b_loop_v6| PROC
-    stmdb   sp!, {r4-r9, lr}
-
-    ldr     r3, [r0]                ;load Q
-    ldr     r4, [r1]                ;load DQC
-    ldr     r5, [r0, #4]
-    ldr     r6, [r1, #4]
-
-    mov     r12, #2                 ;loop counter
-
-dequant_loop
-    smulbb  r7, r3, r4              ;multiply
-    smultt  r8, r3, r4
-    smulbb  r9, r5, r6
-    smultt  lr, r5, r6
-
-    ldr     r3, [r0, #8]
-    ldr     r4, [r1, #8]
-    ldr     r5, [r0, #12]
-    ldr     r6, [r1, #12]
-
-    strh    r7, [r2], #2            ;store result
-    smulbb  r7, r3, r4              ;multiply
-    strh    r8, [r2], #2
-    smultt  r8, r3, r4
-    strh    r9, [r2], #2
-    smulbb  r9, r5, r6
-    strh    lr, [r2], #2
-    smultt  lr, r5, r6
-
-    subs    r12, r12, #1
-
-    add     r0, r0, #16
-    add     r1, r1, #16
-
-    ldrne       r3, [r0]
-    strh    r7, [r2], #2            ;store result
-    ldrne       r4, [r1]
-    strh    r8, [r2], #2
-    ldrne       r5, [r0, #4]
-    strh    r9, [r2], #2
-    ldrne       r6, [r1, #4]
-    strh    lr, [r2], #2
-
-    bne     dequant_loop
-
-    ldmia   sp!, {r4-r9, pc}
-    ENDP    ;|vp8_dequantize_b_loop_v6|
-
-    END
--- a/vp8/decoder/arm/armv6/idct_blk_v6.c
+++ /dev/null
@@ -1,136 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
-
-void vp8_dequant_dc_idct_add_y_block_v6
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dst, int stride, char *eobs, short *dc) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (eobs[0] > 1)
-      vp8_dequant_dc_idct_add_v6(q, dq, pre, dst, 16, stride, dc[0]);
-    else
-      vp8_dc_only_idct_add_v6(dc[0], pre, dst, 16, stride);
-
-    if (eobs[1] > 1)
-      vp8_dequant_dc_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride, dc[1]);
-    else
-      vp8_dc_only_idct_add_v6(dc[1], pre + 4, dst + 4, 16, stride);
-
-    if (eobs[2] > 1)
-      vp8_dequant_dc_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride, dc[2]);
-    else
-      vp8_dc_only_idct_add_v6(dc[2], pre + 8, dst + 8, 16, stride);
-
-    if (eobs[3] > 1)
-      vp8_dequant_dc_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride, dc[3]);
-    else
-      vp8_dc_only_idct_add_v6(dc[3], pre + 12, dst + 12, 16, stride);
-
-    q    += 64;
-    dc   += 4;
-    pre  += 64;
-    dst  += 4 * stride;
-    eobs += 4;
-  }
-}
-
-void vp8_dequant_idct_add_y_block_v6
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dst, int stride, char *eobs) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (eobs[0] > 1)
-      vp8_dequant_idct_add_v6(q, dq, pre, dst, 16, stride);
-    else {
-      vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dst, 16, stride);
-      ((int *)q)[0] = 0;
-    }
-
-    if (eobs[1] > 1)
-      vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride);
-    else {
-      vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dst + 4, 16, stride);
-      ((int *)(q + 16))[0] = 0;
-    }
-
-    if (eobs[2] > 1)
-      vp8_dequant_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride);
-    else {
-      vp8_dc_only_idct_add_v6(q[32]*dq[0], pre + 8, dst + 8, 16, stride);
-      ((int *)(q + 32))[0] = 0;
-    }
-
-    if (eobs[3] > 1)
-      vp8_dequant_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride);
-    else {
-      vp8_dc_only_idct_add_v6(q[48]*dq[0], pre + 12, dst + 12, 16, stride);
-      ((int *)(q + 48))[0] = 0;
-    }
-
-    q    += 64;
-    pre  += 64;
-    dst  += 4 * stride;
-    eobs += 4;
-  }
-}
-
-void vp8_dequant_idct_add_uv_block_v6
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) {
-  int i;
-
-  for (i = 0; i < 2; i++) {
-    if (eobs[0] > 1)
-      vp8_dequant_idct_add_v6(q, dq, pre, dstu, 8, stride);
-    else {
-      vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstu, 8, stride);
-      ((int *)q)[0] = 0;
-    }
-
-    if (eobs[1] > 1)
-      vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstu + 4, 8, stride);
-    else {
-      vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstu + 4, 8, stride);
-      ((int *)(q + 16))[0] = 0;
-    }
-
-    q    += 32;
-    pre  += 32;
-    dstu += 4 * stride;
-    eobs += 2;
-  }
-
-  for (i = 0; i < 2; i++) {
-    if (eobs[0] > 1)
-      vp8_dequant_idct_add_v6(q, dq, pre, dstv, 8, stride);
-    else {
-      vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstv, 8, stride);
-      ((int *)q)[0] = 0;
-    }
-
-    if (eobs[1] > 1)
-      vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstv + 4, 8, stride);
-    else {
-      vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstv + 4, 8, stride);
-      ((int *)(q + 16))[0] = 0;
-    }
-
-    q    += 32;
-    pre  += 32;
-    dstv += 4 * stride;
-    eobs += 2;
-  }
-}
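Each eobs[i] > 1 branch above exploits zig-zag coefficient order: an end-of-block value of 0 or 1 means at most the DC coefficient is nonzero, so the whole 4x4 inverse transform collapses to adding one rounded constant to the prediction. A scalar model of the DC-only kernel, matching the (dc + 4) >> 3 rounding used by the full transform:

static void dc_only_idct_add_sketch(int dc, const unsigned char *pred,
                                    unsigned char *dst, int pitch,
                                    int stride) {
  const int offset = (dc + 4) >> 3;  /* dc is already dequantized */
  int r, c;
  for (r = 0; r < 4; r++)
    for (c = 0; c < 4; c++) {
      int v = pred[r * pitch + c] + offset;
      dst[r * stride + c] = v < 0 ? 0 : v > 255 ? 255 : v;
    }
}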
--- a/vp8/decoder/arm/dequantize_arm.c
+++ /dev/null
@@ -1,44 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vp8/decoder/dequantize.h"
-#include "vp8/common/idct.h"
-#include "vpx_mem/vpx_mem.h"
-
-#if HAVE_ARMV7
-extern void vp9_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
-#endif
-
-#if HAVE_ARMV6
-extern void vp9_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
-#endif
-
-#if HAVE_ARMV7
-
-void vp9_dequantize_b_neon(BLOCKD *d) {
-  short *DQ  = d->dqcoeff;
-  short *Q   = d->qcoeff;
-  short *DQC = d->dequant;
-
-  vp9_dequantize_b_loop_neon(Q, DQC, DQ);
-}
-#endif
-
-#if HAVE_ARMV6
-void vp9_dequantize_b_v6(BLOCKD *d) {
-  short *DQ  = d->dqcoeff;
-  short *Q   = d->qcoeff;
-  short *DQC = d->dequant;
-
-  vp9_dequantize_b_loop_v6(Q, DQC, DQ);
-}
-#endif
--- a/vp8/decoder/arm/neon/dequant_idct_neon.asm
+++ /dev/null
@@ -1,129 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dequant_idct_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *pred,
-;                               unsigned char *dest, int pitch, int stride)
-; r0    short *input,
-; r1    short *dq,
-; r2    unsigned char *pred
-; r3    unsigned char *dest
-; sp    int pitch
-; sp+4  int stride
-
-|vp8_dequant_idct_add_neon| PROC
-    vld1.16         {q3, q4}, [r0]
-    vld1.16         {q5, q6}, [r1]
-    ldr             r1, [sp]                ; pitch
-    vld1.32         {d14[0]}, [r2], r1
-    vld1.32         {d14[1]}, [r2], r1
-    vld1.32         {d15[0]}, [r2], r1
-    vld1.32         {d15[1]}, [r2]
-
-    ldr             r1, [sp, #4]            ; stride
-
-    adr             r12, cospi8sqrt2minus1  ; pointer to the first constant
-
-    vmul.i16        q1, q3, q5              ;input for short_idct4x4llm_neon
-    vmul.i16        q2, q4, q6
-
-;|short_idct4x4llm_neon| PROC
-    vld1.16         {d0}, [r12]
-    vswp            d3, d4                  ;q2(vp[4] vp[12])
-
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2
-    vqadd.s16       q4, q4, q2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-; memset(input, 0, 32) -- 32 bytes
-    vmov.i16        q14, #0
-
-    vswp            d3, d4
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vmov            q15, q14
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2
-    vqadd.s16       q4, q4, q2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vst1.16         {q14, q15}, [r0]
-
-    vrshr.s16       d2, d2, #3
-    vrshr.s16       d3, d3, #3
-    vrshr.s16       d4, d4, #3
-    vrshr.s16       d5, d5, #3
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-    vaddw.u8        q1, q1, d14
-    vaddw.u8        q2, q2, d15
-
-    vqmovun.s16     d0, q1
-    vqmovun.s16     d1, q2
-
-    vst1.32         {d0[0]}, [r3], r1
-    vst1.32         {d0[1]}, [r3], r1
-    vst1.32         {d1[0]}, [r3], r1
-    vst1.32         {d1[1]}, [r3]
-
-    bx             lr
-
-    ENDP           ; |vp8_dequant_idct_add_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b4e7b
-sinpi8sqrt2       DCD 0x8a8c8a8c
-
-    END
--- a/vp8/decoder/arm/neon/dequantizeb_neon.asm
+++ /dev/null
@@ -1,34 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dequantize_b_loop_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    short *Q,
-; r1    short *DQC
-; r2    short *DQ
-|vp8_dequantize_b_loop_neon| PROC
-    vld1.16         {q0, q1}, [r0]
-    vld1.16         {q2, q3}, [r1]
-
-    vmul.i16        q4, q0, q2
-    vmul.i16        q5, q1, q3
-
-    vst1.16         {q4, q5}, [r2]
-
-    bx             lr
-
-    ENDP
-
-    END
--- a/vp8/decoder/arm/neon/idct_blk_neon.c
+++ /dev/null
@@ -1,110 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
-
-/* place these declarations here because we don't want to maintain them
- * outside of this scope
- */
-void idct_dequant_dc_full_2x_neon
-(short *input, short *dq, unsigned char *pre, unsigned char *dst,
- int stride, short *dc);
-void idct_dequant_dc_0_2x_neon
-(short *dc, unsigned char *pre, unsigned char *dst, int stride);
-void idct_dequant_full_2x_neon
-(short *q, short *dq, unsigned char *pre, unsigned char *dst,
- int pitch, int stride);
-void idct_dequant_0_2x_neon
-(short *q, short dq, unsigned char *pre, int pitch,
- unsigned char *dst, int stride);
-
-void vp8_dequant_dc_idct_add_y_block_neon
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dst, int stride, char *eobs, short *dc) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (((short *)eobs)[0] & 0xfefe)
-      idct_dequant_dc_full_2x_neon(q, dq, pre, dst, stride, dc);
-    else
-      idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);
-
-    if (((short *)eobs)[1] & 0xfefe)
-      idct_dequant_dc_full_2x_neon(q + 32, dq, pre + 8, dst + 8, stride, dc + 2);
-    else
-      idct_dequant_dc_0_2x_neon(dc + 2, pre + 8, dst + 8, stride);
-
-    q    += 64;
-    dc   += 4;
-    pre  += 64;
-    dst  += 4 * stride;
-    eobs += 4;
-  }
-}
-
-void vp8_dequant_idct_add_y_block_neon
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dst, int stride, char *eobs) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (((short *)eobs)[0] & 0xfefe)
-      idct_dequant_full_2x_neon(q, dq, pre, dst, 16, stride);
-    else
-      idct_dequant_0_2x_neon(q, dq[0], pre, 16, dst, stride);
-
-    if (((short *)eobs)[1] & 0xfefe)
-      idct_dequant_full_2x_neon(q + 32, dq, pre + 8, dst + 8, 16, stride);
-    else
-      idct_dequant_0_2x_neon(q + 32, dq[0], pre + 8, 16, dst + 8, stride);
-
-    q    += 64;
-    pre  += 64;
-    dst  += 4 * stride;
-    eobs += 4;
-  }
-}
-
-void vp8_dequant_idct_add_uv_block_neon
-(short *q, short *dq, unsigned char *pre,
- unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) {
-  if (((short *)eobs)[0] & 0xfefe)
-    idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride);
-  else
-    idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride);
-
-  q    += 32;
-  pre  += 32;
-  dstu += 4 * stride;
-
-  if (((short *)eobs)[1] & 0xfefe)
-    idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride);
-  else
-    idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride);
-
-  q += 32;
-  pre += 32;
-
-  if (((short *)eobs)[2] & 0xfefe)
-    idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride);
-  else
-    idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride);
-
-  q    += 32;
-  pre  += 32;
-  dstv += 4 * stride;
-
-  if (((short *)eobs)[3] & 0xfefe)
-    idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride);
-  else
-    idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride);
-}
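The ((short *)eobs)[i] & 0xfefe tests above fold two per-block checks into one 16-bit load covering a pair of adjacent 4x4 blocks: clearing bit 0 of each byte leaves the value nonzero exactly when either block's eob exceeds 1. The trick is endian-safe because the mask is symmetric, though it does assume the eobs array is 2-byte aligned. A scalar equivalent of the test:

static int pair_needs_full_idct(const char *eobs) {
  return eobs[0] > 1 || eobs[1] > 1;  /* same as (pair & 0xfefe) != 0 */
}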
--- a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
+++ /dev/null
@@ -1,79 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |idct_dequant_0_2x_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
-;                            int pitch, unsigned char *dst, int stride);
-; r0   *q
-; r1   dq
-; r2   *pre
-; r3   pitch
-; sp   *dst
-; sp+4 stride
-|idct_dequant_0_2x_neon| PROC
-    add             r12, r2, #4
-    vld1.32         {d2[0]}, [r2], r3
-    vld1.32         {d2[1]}, [r2], r3
-    vld1.32         {d4[0]}, [r2], r3
-    vld1.32         {d4[1]}, [r2]
-    vld1.32         {d8[0]}, [r12], r3
-    vld1.32         {d8[1]}, [r12], r3
-    vld1.32         {d10[0]}, [r12], r3
-    vld1.32         {d10[1]}, [r12]
-
-    ldrh            r12, [r0]               ; lo q
-    ldrh            r2, [r0, #32]           ; hi q
-    mov             r3, #0
-    strh            r3, [r0]
-    strh            r3, [r0, #32]
-
-    sxth            r12, r12                ; lo
-    mul             r0, r12, r1
-    add             r0, r0, #4
-    asr             r0, r0, #3
-    vdup.16         q0, r0
-    sxth            r2, r2                  ; hi
-    mul             r0, r2, r1
-    add             r0, r0, #4
-    asr             r0, r0, #3
-    vdup.16         q3, r0
-
-    vaddw.u8        q1, q0, d2              ; lo
-    vaddw.u8        q2, q0, d4
-    vaddw.u8        q4, q3, d8              ; hi
-    vaddw.u8        q5, q3, d10
-
-    ldr             r2, [sp]                ; dst
-    ldr             r3, [sp, #4]            ; stride
-
-    vqmovun.s16     d2, q1                  ; lo
-    vqmovun.s16     d4, q2
-    vqmovun.s16     d8, q4                  ; hi
-    vqmovun.s16     d10, q5
-
-    add             r0, r2, #4
-    vst1.32         {d2[0]}, [r2], r3       ; lo
-    vst1.32         {d2[1]}, [r2], r3
-    vst1.32         {d4[0]}, [r2], r3
-    vst1.32         {d4[1]}, [r2]
-    vst1.32         {d8[0]}, [r0], r3       ; hi
-    vst1.32         {d8[1]}, [r0], r3
-    vst1.32         {d10[0]}, [r0], r3
-    vst1.32         {d10[1]}, [r0]
-
-    bx             lr
-
-    ENDP           ; |idct_dequant_0_2x_neon|
-    END
--- a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
+++ /dev/null
@@ -1,69 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |idct_dequant_dc_0_2x_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
-;                               unsigned char *dst, int stride);
-; r0  *dc
-; r1  *pre
-; r2  *dst
-; r3  stride
-|idct_dequant_dc_0_2x_neon| PROC
-    ldr             r0, [r0]                ; *dc
-    mov             r12, #16
-
-    vld1.32         {d2[0]}, [r1], r12      ; lo
-    vld1.32         {d2[1]}, [r1], r12
-    vld1.32         {d4[0]}, [r1], r12
-    vld1.32         {d4[1]}, [r1]
-    sub             r1, r1, #44
-    vld1.32         {d8[0]}, [r1], r12      ; hi
-    vld1.32         {d8[1]}, [r1], r12
-    vld1.32         {d10[0]}, [r1], r12
-    vld1.32         {d10[1]}, [r1]
-
-    sxth            r1, r0                  ; lo *dc
-    add             r1, r1, #4
-    asr             r1, r1, #3
-    vdup.16         q0, r1
-    sxth            r0, r0, ror #16         ; hi *dc
-    add             r0, r0, #4
-    asr             r0, r0, #3
-    vdup.16         q3, r0
-
-    vaddw.u8        q1, q0, d2              ; lo
-    vaddw.u8        q2, q0, d4
-    vaddw.u8        q4, q3, d8              ; hi
-    vaddw.u8        q5, q3, d10
-
-    vqmovun.s16     d2, q1                  ; lo
-    vqmovun.s16     d4, q2
-    vqmovun.s16     d8, q4                  ; hi
-    vqmovun.s16     d10, q5
-
-    add             r0, r2, #4
-    vst1.32         {d2[0]}, [r2], r3       ; lo
-    vst1.32         {d2[1]}, [r2], r3
-    vst1.32         {d4[0]}, [r2], r3
-    vst1.32         {d4[1]}, [r2]
-    vst1.32         {d8[0]}, [r0], r3       ; hi
-    vst1.32         {d8[1]}, [r0], r3
-    vst1.32         {d10[0]}, [r0], r3
-    vst1.32         {d10[1]}, [r0]
-
-    bx             lr
-
-    ENDP           ;|idct_dequant_dc_0_2x_neon|
-    END
--- a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
+++ /dev/null
@@ -1,205 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |idct_dequant_dc_full_2x_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
-;                                  unsigned char *dst, int stride, short *dc);
-; r0    *q,
-; r1    *dq,
-; r2    *pre
-; r3    *dst
-; sp    stride
-; sp+4  *dc
-|idct_dequant_dc_full_2x_neon| PROC
-    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
-    vld1.16         {q2, q3}, [r0]          ; l q
-    mov             r1, #16                 ; pitch
-    add             r0, r0, #32
-    vld1.16         {q4, q5}, [r0]          ; r q
-    add             r12, r2, #4
-    ; interleave the predictors
-    vld1.32         {d28[0]}, [r2], r1      ; l pre
-    vld1.32         {d28[1]}, [r12], r1     ; r pre
-    vld1.32         {d29[0]}, [r2], r1
-    vld1.32         {d29[1]}, [r12], r1
-    vld1.32         {d30[0]}, [r2], r1
-    vld1.32         {d30[1]}, [r12], r1
-    vld1.32         {d31[0]}, [r2]
-    ldr             r1, [sp, #4]
-    vld1.32         {d31[1]}, [r12]
-
-    adr             r2, cospi8sqrt2minus1   ; pointer to the first constant
-
-    ldrh            r12, [r1], #2           ; lo *dc
-    ldrh            r1, [r1]                ; hi *dc
-
-    ; dequant: q[i] = q[i] * dq[i]
-    vmul.i16        q2, q2, q0
-    vmul.i16        q3, q3, q1
-    vmul.i16        q4, q4, q0
-    vmul.i16        q5, q5, q1
-
-    ; move dc up to neon and overwrite first element
-    vmov.16         d4[0], r12
-    vmov.16         d8[0], r1
-
-    vld1.16         {d0}, [r2]
-
-    ; q2: l0r0  q3: l8r8
-    ; q4: l4r4  q5: l12r12
-    vswp            d5, d8
-    vswp            d7, d10
-
-    ; _CONSTANTS_ * 4,12 >> 16
-    ; q6:  4 * sinpi : c1/temp1
-    ; q7: 12 * sinpi : d1/temp2
-    ; q8:  4 * cospi
-    ; q9: 12 * cospi
-    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
-    vqdmulh.s16     q7, q5, d0[2]
-    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
-    vqdmulh.s16     q9, q5, d0[0]
-
-    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
-    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
-
-    ; vqdmulh only accepts signed values. this was a problem because our
-    ; constant had the high bit set, and so was treated as a negative value.
-    ; vqdmulh also doubles the value before it shifts by 16; we need to
-    ; compensate for this. in the case of sinpi8sqrt2 the lowest bit is 0,
-    ; so we can pre-shift the constant without losing precision. this avoids
-    ; having to shift again afterward, and also avoids the sign issue. win win!
-    ; for cospi8sqrt2minus1 the lowest bit is 1, so we would lose precision
-    ; if we pre-shifted it
-    vshr.s16        q8, q8, #1
-    vshr.s16        q9, q9, #1
-
-    ; q4:  4 +  4 * cospi : d1/temp1
-    ; q5: 12 + 12 * cospi : c1/temp2
-    vqadd.s16       q4, q4, q8
-    vqadd.s16       q5, q5, q9
-
-    ; c1 = temp1 - temp2
-    ; d1 = temp1 + temp2
-    vqsub.s16       q2, q6, q5
-    vqadd.s16       q3, q4, q7
-
-    ; [0]: a1+d1
-    ; [1]: b1+c1
-    ; [2]: b1-c1
-    ; [3]: a1-d1
-    vqadd.s16       q4, q10, q3
-    vqadd.s16       q5, q11, q2
-    vqsub.s16       q6, q11, q2
-    vqsub.s16       q7, q10, q3
-
-    ; rotate
-    vtrn.32         q4, q6
-    vtrn.32         q5, q7
-    vtrn.16         q4, q5
-    vtrn.16         q6, q7
-    ; idct loop 2
-    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
-    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
-    ; q6: l 2, 6,10,14 r 2, 6,10,14
-    ; q7: l 3, 7,11,15 r 3, 7,11,15
-
-    ; q8:  1 * sinpi : c1/temp1
-    ; q9:  3 * sinpi : d1/temp2
-    ; q10: 1 * cospi
-    ; q11: 3 * cospi
-    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
-    vqdmulh.s16     q9, q7, d0[2]
-    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
-    vqdmulh.s16     q11, q7, d0[0]
-
-    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
-    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
-
-    ; see note on shifting above
-    vshr.s16        q10, q10, #1
-    vshr.s16        q11, q11, #1
-
-    ; q10: 1 + 1 * cospi : d1/temp1
-    ; q11: 3 + 3 * cospi : c1/temp2
-    vqadd.s16       q10, q5, q10
-    vqadd.s16       q11, q7, q11
-
-    ; q8: c1 = temp1 - temp2
-    ; q9: d1 = temp1 + temp2
-    vqsub.s16       q8, q8, q11
-    vqadd.s16       q9, q10, q9
-
-    ; a1+d1
-    ; b1+c1
-    ; b1-c1
-    ; a1-d1
-    vqadd.s16       q4, q2, q9
-    vqadd.s16       q5, q3, q8
-    vqsub.s16       q6, q3, q8
-    vqsub.s16       q7, q2, q9
-
-    ; +4 >> 3 (rounding)
-    vrshr.s16       q4, q4, #3              ; lo
-    vrshr.s16       q5, q5, #3
-    vrshr.s16       q6, q6, #3              ; hi
-    vrshr.s16       q7, q7, #3
-
-    vtrn.32         q4, q6
-    vtrn.32         q5, q7
-    vtrn.16         q4, q5
-    vtrn.16         q6, q7
-
-    ; adding pre
-    ; input is still packed. pre was read interleaved
-    vaddw.u8        q4, q4, d28
-    vaddw.u8        q5, q5, d29
-    vaddw.u8        q6, q6, d30
-    vaddw.u8        q7, q7, d31
-
-    vmov.i16        q14, #0
-    vmov            q15, q14
-    vst1.16         {q14, q15}, [r0]        ; write over high input
-    sub             r0, r0, #32
-    vst1.16         {q14, q15}, [r0]        ; write over low input
-
-    ;saturate and narrow
-    vqmovun.s16     d0, q4                  ; lo
-    vqmovun.s16     d1, q5
-    vqmovun.s16     d2, q6                  ; hi
-    vqmovun.s16     d3, q7
-
-    ldr             r1, [sp]                ; stride
-    add             r2, r3, #4              ; hi
-    vst1.32         {d0[0]}, [r3], r1       ; lo
-    vst1.32         {d0[1]}, [r2], r1       ; hi
-    vst1.32         {d1[0]}, [r3], r1
-    vst1.32         {d1[1]}, [r2], r1
-    vst1.32         {d2[0]}, [r3], r1
-    vst1.32         {d2[1]}, [r2], r1
-    vst1.32         {d3[0]}, [r3]
-    vst1.32         {d3[1]}, [r2]
-
-    bx             lr
-
-    ENDP           ; |idct_dequant_dc_full_2x_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b
-; because the lowest bit in 0x8a8c is 0, we can pre-shift this
-sinpi8sqrt2       DCD 0x4546
-
-    END
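The pre-shift comments above are easiest to check in scalar form. Below is a
rough C model of vqdmulh.s16 and of the two constant treatments (a sketch,
not part of this patch):

    #include <stdint.h>

    /* vqdmulh.s16 computes saturate((a * b * 2) >> 16). */
    static int16_t vqdmulh_s16(int16_t a, int16_t b) {
      int32_t p = ((int32_t)a * b) >> 15;
      return (int16_t)(p > 32767 ? 32767 : p);
    }

    /* sinpi8sqrt2 = 0x8a8c would read as negative in s16, so the pool
       stores 0x4546 = 0x8a8c >> 1 (lossless, the low bit is 0); the
       implicit doubling in vqdmulh restores the full constant. */
    static int16_t mul_sinpi(int16_t a) {
      return vqdmulh_s16(a, 0x4546);            /* == (a * 0x8a8c) >> 16 */
    }

    /* cospi8sqrt2minus1 = 0x4e7b has its low bit set and cannot be
       pre-shifted losslessly; the doubled product is halved afterwards,
       matching the vshr.s16 #1 in the code above. */
    static int16_t mul_cospi_minus1(int16_t a) {
      return (int16_t)(vqdmulh_s16(a, 0x4e7b) >> 1);
    }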
--- a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
+++ /dev/null
@@ -1,197 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |idct_dequant_full_2x_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
-;                               unsigned char *dst, int pitch, int stride);
-; r0    *q,
-; r1    *dq,
-; r2    *pre
-; r3    *dst
-; sp    pitch
-; sp+4  stride
-|idct_dequant_full_2x_neon| PROC
-    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
-    vld1.16         {q2, q3}, [r0]          ; l q
-    ldr             r1, [sp]                ; pitch
-    add             r0, r0, #32
-    vld1.16         {q4, q5}, [r0]          ; r q
-    add             r12, r2, #4
-    ; interleave the predictors
-    vld1.32         {d28[0]}, [r2], r1      ; l pre
-    vld1.32         {d28[1]}, [r12], r1     ; r pre
-    vld1.32         {d29[0]}, [r2], r1
-    vld1.32         {d29[1]}, [r12], r1
-    vld1.32         {d30[0]}, [r2], r1
-    vld1.32         {d30[1]}, [r12], r1
-    vld1.32         {d31[0]}, [r2]
-    vld1.32         {d31[1]}, [r12]
-
-    adr             r2, cospi8sqrt2minus1   ; pointer to the first constant
-
-    ; dequant: q[i] = q[i] * dq[i]
-    vmul.i16        q2, q2, q0
-    vmul.i16        q3, q3, q1
-    vmul.i16        q4, q4, q0
-    vmul.i16        q5, q5, q1
-
-    vld1.16         {d0}, [r2]
-
-    ; q2: l0r0  q3: l8r8
-    ; q4: l4r4  q5: l12r12
-    vswp            d5, d8
-    vswp            d7, d10
-
-    ; _CONSTANTS_ * 4,12 >> 16
-    ; q6:  4 * sinpi : c1/temp1
-    ; q7: 12 * sinpi : d1/temp2
-    ; q8:  4 * cospi
-    ; q9: 12 * cospi
-    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
-    vqdmulh.s16     q7, q5, d0[2]
-    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
-    vqdmulh.s16     q9, q5, d0[0]
-
-    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
-    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
-
-    ; vqdmulh only accepts signed values. this was a problem because
-    ; our constant had the high bit set, and was treated as a negative value.
-    ; vqdmulh also doubles the value before it shifts by 16. we need to
-    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
-    ; so we can shift the constant without losing precision. this avoids
-; shifting again afterward, and also avoids the sign issue. win win!
-    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
-    ; pre-shift it
-    vshr.s16        q8, q8, #1
-    vshr.s16        q9, q9, #1
-
-    ; q4:  4 +  4 * cospi : d1/temp1
-    ; q5: 12 + 12 * cospi : c1/temp2
-    vqadd.s16       q4, q4, q8
-    vqadd.s16       q5, q5, q9
-
-    ; c1 = temp1 - temp2
-    ; d1 = temp1 + temp2
-    vqsub.s16       q2, q6, q5
-    vqadd.s16       q3, q4, q7
-
-    ; [0]: a1+d1
-    ; [1]: b1+c1
-    ; [2]: b1-c1
-    ; [3]: a1-d1
-    vqadd.s16       q4, q10, q3
-    vqadd.s16       q5, q11, q2
-    vqsub.s16       q6, q11, q2
-    vqsub.s16       q7, q10, q3
-
-    ; rotate
-    vtrn.32         q4, q6
-    vtrn.32         q5, q7
-    vtrn.16         q4, q5
-    vtrn.16         q6, q7
-    ; idct loop 2
-    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
-    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
-    ; q6: l 2, 6,10,14 r 2, 6,10,14
-    ; q7: l 3, 7,11,15 r 3, 7,11,15
-
-    ; q8:  1 * sinpi : c1/temp1
-    ; q9:  3 * sinpi : d1/temp2
-    ; q10: 1 * cospi
-    ; q11: 3 * cospi
-    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
-    vqdmulh.s16     q9, q7, d0[2]
-    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
-    vqdmulh.s16     q11, q7, d0[0]
-
-    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
-    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
-
-    ; see note on shifting above
-    vshr.s16        q10, q10, #1
-    vshr.s16        q11, q11, #1
-
-    ; q10: 1 + 1 * cospi : d1/temp1
-    ; q11: 3 + 3 * cospi : c1/temp2
-    vqadd.s16       q10, q5, q10
-    vqadd.s16       q11, q7, q11
-
-    ; q8: c1 = temp1 - temp2
-    ; q9: d1 = temp1 + temp2
-    vqsub.s16       q8, q8, q11
-    vqadd.s16       q9, q10, q9
-
-    ; a1+d1
-    ; b1+c1
-    ; b1-c1
-    ; a1-d1
-    vqadd.s16       q4, q2, q9
-    vqadd.s16       q5, q3, q8
-    vqsub.s16       q6, q3, q8
-    vqsub.s16       q7, q2, q9
-
-    ; +4 >> 3 (rounding)
-    vrshr.s16       q4, q4, #3              ; lo
-    vrshr.s16       q5, q5, #3
-    vrshr.s16       q6, q6, #3              ; hi
-    vrshr.s16       q7, q7, #3
-
-    vtrn.32         q4, q6
-    vtrn.32         q5, q7
-    vtrn.16         q4, q5
-    vtrn.16         q6, q7
-
-    ; adding pre
-    ; input is still packed. pre was read interleaved
-    vaddw.u8        q4, q4, d28
-    vaddw.u8        q5, q5, d29
-    vaddw.u8        q6, q6, d30
-    vaddw.u8        q7, q7, d31
-
-    vmov.i16        q14, #0
-    vmov            q15, q14
-    vst1.16         {q14, q15}, [r0]        ; write over high input
-    sub             r0, r0, #32
-    vst1.16         {q14, q15}, [r0]        ; write over low input
-
-    ;saturate and narrow
-    vqmovun.s16     d0, q4                  ; lo
-    vqmovun.s16     d1, q5
-    vqmovun.s16     d2, q6                  ; hi
-    vqmovun.s16     d3, q7
-
-    ldr             r1, [sp, #4]            ; stride
-    add             r2, r3, #4              ; hi
-    vst1.32         {d0[0]}, [r3], r1       ; lo
-    vst1.32         {d0[1]}, [r2], r1       ; hi
-    vst1.32         {d1[0]}, [r3], r1
-    vst1.32         {d1[1]}, [r2], r1
-    vst1.32         {d2[0]}, [r3], r1
-    vst1.32         {d2[1]}, [r2], r1
-    vst1.32         {d3[0]}, [r3]
-    vst1.32         {d3[1]}, [r2]
-
-    bx             lr
-
-    ENDP           ; |idct_dequant_full_2x_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b
-; because the lowest bit in 0x8a8c is 0, we can pre-shift this
-sinpi8sqrt2       DCD 0x4546
-
-    END
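Both NEON routines vectorize the same 4-point transform that the a1/b1/c1/d1
comments trace. A scalar sketch of one column pass, with the pool constants
written out in Q16 form (illustrative only, not part of this patch):

    /* ip/op hold a 4x4 block of shorts in raster order; one column pass. */
    static void idct4_col(const short *ip, short *op) {
      int a1 = ip[0] + ip[8];                      /* a1 = 0 + 8          */
      int b1 = ip[0] - ip[8];                      /* b1 = 0 - 8          */
      int t1 = (ip[4] * 35468) >> 16;              /*  4 * sinpi          */
      int t2 = ip[12] + ((ip[12] * 20091) >> 16);  /* 12 + 12 * cospi     */
      int c1 = t1 - t2;
      int t3 = ip[4] + ((ip[4] * 20091) >> 16);    /*  4 +  4 * cospi     */
      int t4 = (ip[12] * 35468) >> 16;             /* 12 * sinpi          */
      int d1 = t3 + t4;
      op[0]  = (short)(a1 + d1);
      op[4]  = (short)(b1 + c1);
      op[8]  = (short)(b1 - c1);
      op[12] = (short)(a1 - d1);
    }

The assembly runs this twice (columns, then rows, with the vtrn transposes in
between) and folds the final (x + 4) >> 3 rounding into vrshr.s16 #3.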
--- a/vp8/decoder/asm_dec_offsets.c
+++ /dev/null
@@ -1,39 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/asm_offsets.h"
-#include "onyxd_int.h"
-
-BEGIN
-
-DEFINE(detok_scan,                              offsetof(DETOK, scan));
-DEFINE(detok_ptr_block2leftabove,               offsetof(DETOK, ptr_block2leftabove));
-DEFINE(detok_coef_tree_ptr,                     offsetof(DETOK, vp9_coef_tree_ptr));
-DEFINE(detok_norm_ptr,                          offsetof(DETOK, norm_ptr));
-DEFINE(detok_ptr_coef_bands_x,                  offsetof(DETOK, ptr_coef_bands_x));
-
-DEFINE(detok_A,                                 offsetof(DETOK, A));
-DEFINE(detok_L,                                 offsetof(DETOK, L));
-
-DEFINE(detok_qcoeff_start_ptr,                  offsetof(DETOK, qcoeff_start_ptr));
-DEFINE(detok_coef_probs,                        offsetof(DETOK, coef_probs));
-DEFINE(detok_eob,                               offsetof(DETOK, eob));
-
-DEFINE(bool_decoder_user_buffer_end,            offsetof(BOOL_DECODER, user_buffer_end));
-DEFINE(bool_decoder_user_buffer,                offsetof(BOOL_DECODER, user_buffer));
-DEFINE(bool_decoder_value,                      offsetof(BOOL_DECODER, value));
-DEFINE(bool_decoder_count,                      offsetof(BOOL_DECODER, count));
-DEFINE(bool_decoder_range,                      offsetof(BOOL_DECODER, range));
-
-END
-
-/* add asserts for any offset that is not supported by assembly code */
-/* add asserts for any size that is not supported by assembly code */
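This file produces nothing useful at runtime; it exists so the build can
scrape structure offsets for the hand-written assembly. The general technique
(sketched with a hypothetical EMIT macro; the real macros live in
vpx_ports/asm_offsets.h and are platform-specific) looks like:

    #include <stddef.h>

    /* Emit a marker line into the generated .s file for a build script
       to scrape; this is a sketch, not the actual vpx macro. */
    #define EMIT(name, value) \
      __asm__ volatile("\n->" #name " %0" : : "i"(value))

    struct example { int a; char pad[12]; int b; };

    void emit_offsets(void) {
      EMIT(example_a, offsetof(struct example, a));   /* -> example_a 0  */
      EMIT(example_b, offsetof(struct example, b));   /* -> example_b 16 */
    }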
--- a/vp8/decoder/dboolhuff.c
+++ /dev/null
@@ -1,100 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "dboolhuff.h"
-#include "vpx_ports/mem.h"
-#include "vpx_mem/vpx_mem.h"
-
-int vp9_start_decode(BOOL_DECODER *br,
-                     const unsigned char *source,
-                     unsigned int source_sz) {
-  br->user_buffer_end = source + source_sz;
-  br->user_buffer     = source;
-  br->value    = 0;
-  br->count    = -8;
-  br->range    = 255;
-
-  if (source_sz && !source)
-    return 1;
-
-  /* Populate the buffer */
-  vp9_bool_decoder_fill(br);
-
-  return 0;
-}
-
-
-void vp9_bool_decoder_fill(BOOL_DECODER *br) {
-  const unsigned char *bufptr;
-  const unsigned char *bufend;
-  VP9_BD_VALUE         value;
-  int                  count;
-  bufend = br->user_buffer_end;
-  bufptr = br->user_buffer;
-  value = br->value;
-  count = br->count;
-
-  VP9DX_BOOL_DECODER_FILL(count, value, bufptr, bufend);
-
-  br->user_buffer = bufptr;
-  br->value = value;
-  br->count = count;
-}
-
-
-static int get_unsigned_bits(unsigned num_values) {
-  int cat = 0;
-  if ((num_values--) <= 1) return 0;
-  while (num_values > 0) {
-    cat++;
-    num_values >>= 1;
-  }
-  return cat;
-}
-
-int vp9_inv_recenter_nonneg(int v, int m) {
-  if (v > (m << 1)) return v;
-  else if ((v & 1) == 0) return (v >> 1) + m;
-  else return m - ((v + 1) >> 1);
-}
-
-int vp9_decode_uniform(BOOL_DECODER *br, int n) {
-  int v;
-  int l = get_unsigned_bits(n);
-  int m = (1 << l) - n;
-  if (!l) return 0;
-  v = decode_value(br, l - 1);
-  if (v < m)
-    return v;
-  else
-    return (v << 1) - m + decode_value(br, 1);
-}
-
-int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms) {
-  int i = 0, mk = 0, word;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (num_syms <= mk + 3 * a) {
-      word = vp9_decode_uniform(br, num_syms - mk) + mk;
-      break;
-    } else {
-      if (decode_value(br, 1)) {
-        i++;
-        mk += a;
-      } else {
-        word = decode_value(br, b) + mk;
-        break;
-      }
-    }
-  }
-  return word;
-}
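vp9_decode_uniform() above reads a truncated binary code: of n symbols, the
first m = 2^l - n get l - 1 bits and the remainder get l bits. A standalone
check of that split (a sketch, not part of this patch):

    #include <stdio.h>

    static int bits_for(unsigned num_values) {  /* mirrors get_unsigned_bits() */
      int cat = 0;
      if (num_values-- <= 1) return 0;
      while (num_values > 0) { cat++; num_values >>= 1; }
      return cat;
    }

    int main(void) {
      int n = 5, l = bits_for(n), m = (1 << l) - n;  /* l = 3, m = 3 */
      /* Symbols 0..m-1 take l-1 bits; symbols m..n-1 take l bits. */
      printf("n=%d: %d short (%d-bit) codes, %d long (%d-bit) codes\n",
             n, m, l - 1, n - m, l);
      return 0;
    }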
--- a/vp8/decoder/dboolhuff.h
+++ /dev/null
@@ -1,153 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef DBOOLHUFF_H
-#define DBOOLHUFF_H
-#include <stddef.h>
-#include <limits.h>
-#include "vpx_ports/config.h"
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_integer.h"
-
-typedef size_t VP9_BD_VALUE;
-
-# define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT)
-/*This is meant to be a large, positive constant that can still be efficiently
-   loaded as an immediate (on platforms like ARM, for example).
-  Even relatively modest values like 100 would work fine.*/
-# define VP9_LOTS_OF_BITS (0x40000000)
-
-typedef struct {
-  const unsigned char *user_buffer_end;
-  const unsigned char *user_buffer;
-  VP9_BD_VALUE         value;
-  int                  count;
-  unsigned int         range;
-} BOOL_DECODER;
-
-DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
-
-int vp9_start_decode(BOOL_DECODER *br,
-                     const unsigned char *source,
-                     unsigned int source_sz);
-
-void vp9_bool_decoder_fill(BOOL_DECODER *br);
-
-int vp9_decode_uniform(BOOL_DECODER *br, int n);
-int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms);
-int vp9_inv_recenter_nonneg(int v, int m);
-
-/*The refill loop is used in several places, so define it in a macro to make
-   sure they're all consistent.
-  An inline function would be cleaner, but has a significant penalty, because
-   multiple BOOL_DECODER fields must be modified, and the compiler is not smart
-   enough to eliminate the stores to those fields and the subsequent reloads
-   from them when inlining the function.*/
-#define VP9DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \
-  do \
-  { \
-    int shift = VP9_BD_VALUE_SIZE - 8 - ((_count) + 8); \
-    int loop_end, x; \
-    size_t bits_left = ((_bufend)-(_bufptr))*CHAR_BIT; \
-    \
-    x = shift + CHAR_BIT - bits_left; \
-    loop_end = 0; \
-    if(x >= 0) \
-    { \
-      (_count) += VP9_LOTS_OF_BITS; \
-      loop_end = x; \
-      if(!bits_left) break; \
-    } \
-    while(shift >= loop_end) \
-    { \
-      (_count) += CHAR_BIT; \
-      (_value) |= (VP9_BD_VALUE)*(_bufptr)++ << shift; \
-      shift -= CHAR_BIT; \
-    } \
-  } \
-  while(0) \
-
-
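For readability, here is the same refill logic as a plain function (a sketch
only; the shipped code keeps the macro form for the store/reload reason given
in the comment above):

    static void bool_decoder_refill(BOOL_DECODER *br) {
      int shift = VP9_BD_VALUE_SIZE - 8 - (br->count + 8);
      size_t bits_left =
          (size_t)(br->user_buffer_end - br->user_buffer) * CHAR_BIT;
      int x = (int)(shift + CHAR_BIT - bits_left);
      int loop_end = 0;

      if (x >= 0) {
        /* Source exhausted: credit LOTS_OF_BITS so bool_error() can later
           distinguish real data from end-of-stream padding. */
        br->count += VP9_LOTS_OF_BITS;
        loop_end = x;
        if (!bits_left) return;
      }
      while (shift >= loop_end) {  /* pack whole bytes, MSB first */
        br->count += CHAR_BIT;
        br->value |= (VP9_BD_VALUE)*br->user_buffer++ << shift;
        shift -= CHAR_BIT;
      }
    }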
-static int decode_bool(BOOL_DECODER *br, int probability) {
-  unsigned int bit = 0;
-  VP9_BD_VALUE value;
-  unsigned int split;
-  VP9_BD_VALUE bigsplit;
-  int count;
-  unsigned int range;
-
-  split = 1 + (((br->range - 1) * probability) >> 8);
-
-  if (br->count < 0)
-    vp9_bool_decoder_fill(br);
-
-  value = br->value;
-  count = br->count;
-
-  bigsplit = (VP9_BD_VALUE)split << (VP9_BD_VALUE_SIZE - 8);
-
-  range = split;
-
-  if (value >= bigsplit) {
-    range = br->range - split;
-    value = value - bigsplit;
-    bit = 1;
-  }
-
-  {
-    register unsigned int shift = vp9_norm[range];
-    range <<= shift;
-    value <<= shift;
-    count -= shift;
-  }
-  br->value = value;
-  br->count = count;
-  br->range = range;
-
-  return bit;
-}
-
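A worked pass through decode_bool() above, starting from the common state
range = 255 with probability = 128 (illustrative, not part of this patch):

    /*   split    = 1 + (((255 - 1) * 128) >> 8) = 128
     *   bigsplit = (VP9_BD_VALUE)128 << (VP9_BD_VALUE_SIZE - 8)
     *   value <  bigsplit: bit = 0, range = 128; vp9_norm[128] == 0,
     *                      so no renormalization is needed.
     *   value >= bigsplit: bit = 1, range = 255 - 128 = 127;
     *                      vp9_norm[127] == 1, so range and value shift
     *                      left once to bring range back into [128, 255].
     */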
-static int decode_value(BOOL_DECODER *br, int bits) {
-  int z = 0;
-  int bit;
-
-  for (bit = bits - 1; bit >= 0; bit--) {
-    z |= (decode_bool(br, 0x80) << bit);
-  }
-
-  return z;
-}
-
-static int bool_error(BOOL_DECODER *br) {
-  /* Check if we have reached the end of the buffer.
-   *
-   * Variable 'count' stores the number of bits in the 'value' buffer, minus
-   * 8. The top byte is part of the algorithm, and the remainder is buffered
-   * to be shifted into it. So if count == 8, the top 16 bits of 'value' are
-   * occupied, 8 for the algorithm and 8 in the buffer.
-   *
-   * When reading a byte from the user's buffer, count is filled with 8 and
-   * one byte is filled into the value buffer. When we reach the end of the
-   * data, count is additionally filled with VP9_LOTS_OF_BITS. So when
-   * count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted.
-   */
-  if ((br->count > VP9_BD_VALUE_SIZE) && (br->count < VP9_LOTS_OF_BITS)) {
-    /* We have tried to decode bits after the end of
-     * stream was encountered.
-     */
-    return 1;
-  }
-
-  /* No error. */
-  return 0;
-}
-
-#endif
--- a/vp8/decoder/decodemv.c
+++ /dev/null
@@ -1,1199 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "treereader.h"
-#include "vp8/common/entropymv.h"
-#include "vp8/common/entropymode.h"
-#include "onyxd_int.h"
-#include "vp8/common/findnearmv.h"
-
-#include "vp8/common/seg_common.h"
-#include "vp8/common/pred_common.h"
-#include "vp8/common/entropy.h"
-#include "vp8/decoder/decodemv.h"
-#if CONFIG_DEBUG
-#include <assert.h>
-#endif
-
-// #define DEBUG_DEC_MV
-#ifdef DEBUG_DEC_MV
-int dec_mvcount = 0;
-#endif
-
-static int read_bmode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_bmode_tree, p);
-}
-
-static int read_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_ymode_tree, p);
-}
-
-#if CONFIG_SUPERBLOCKS
-static int read_kf_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_uv_mode_tree, p);
-}
-#endif
-
-static int read_kf_mb_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_kf_ymode_tree, p);
-}
-
-static int read_i8x8_mode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_i8x8_mode_tree, p);
-}
-
-static int read_uv_mode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_uv_mode_tree, p);
-}
-
-// This function reads the current macroblock's segment id from the bitstream.
-// It should only be called if a segment map update is indicated.
-static void read_mb_segid(vp9_reader *r, MB_MODE_INFO *mi,
-                          MACROBLOCKD *xd) {
-  /* Is segmentation enabled */
-  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
-    /* If so then read the segment id. */
-    if (vp9_read(r, xd->mb_segment_tree_probs[0]))
-      mi->segment_id =
-        (unsigned char)(2 + vp9_read(r, xd->mb_segment_tree_probs[2]));
-    else
-      mi->segment_id =
-        (unsigned char)(vp9_read(r, xd->mb_segment_tree_probs[1]));
-  }
-}
-
-#if CONFIG_NEW_MVREF
-int vp9_read_mv_ref_id(vp9_reader *r,
-                       vp9_prob * ref_id_probs) {
-  int ref_index = 0;
-
-  if (vp9_read(r, ref_id_probs[0])) {
-    ref_index++;
-    if (vp9_read(r, ref_id_probs[1])) {
-      ref_index++;
-      if (vp9_read(r, ref_id_probs[2]))
-        ref_index++;
-    }
-  }
-  return ref_index;
-}
-#endif
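vp9_read_mv_ref_id() above decodes a unary code over the three per-position
probabilities (a summary of the control flow, not part of this patch):

    /*  bits read      index
     *  0          ->  0
     *  1 0        ->  1
     *  1 1 0      ->  2
     *  1 1 1      ->  3
     */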
-
-extern const int vp9_i8x8_block[4];
-static void kfread_modes(VP9D_COMP *pbi,
-                         MODE_INFO *m,
-                         int mb_row,
-                         int mb_col,
-                         BOOL_DECODER* const bc) {
-  VP9_COMMON *const cm = &pbi->common;
-  const int mis = pbi->common.mode_info_stride;
-  int map_index = mb_row * pbi->common.mb_cols + mb_col;
-  MB_PREDICTION_MODE y_mode;
-
-  // Read the Macroblock segmentation map if it is being updated explicitly
-  // this frame (reset to 0 by default).
-  m->mbmi.segment_id = 0;
-  if (pbi->mb.update_mb_segmentation_map) {
-    read_mb_segid(bc, &m->mbmi, &pbi->mb);
-    pbi->common.last_frame_seg_map[map_index] = m->mbmi.segment_id;
-  }
-
-  m->mbmi.mb_skip_coeff = 0;
-  if (pbi->common.mb_no_coeff_skip &&
-      (!vp9_segfeature_active(&pbi->mb,
-                              m->mbmi.segment_id, SEG_LVL_EOB) ||
-       (vp9_get_segdata(&pbi->mb,
-                        m->mbmi.segment_id, SEG_LVL_EOB) != 0))) {
-    MACROBLOCKD *const xd  = &pbi->mb;
-    m->mbmi.mb_skip_coeff =
-      vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
-  } else {
-    if (vp9_segfeature_active(&pbi->mb,
-                              m->mbmi.segment_id, SEG_LVL_EOB) &&
-        (vp9_get_segdata(&pbi->mb,
-                         m->mbmi.segment_id, SEG_LVL_EOB) == 0)) {
-      m->mbmi.mb_skip_coeff = 1;
-    } else
-      m->mbmi.mb_skip_coeff = 0;
-  }
-
-#if CONFIG_SUPERBLOCKS
-  if (m->mbmi.encoded_as_sb) {
-    y_mode = (MB_PREDICTION_MODE) read_kf_sb_ymode(bc,
-      pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
-  } else
-#endif
-  y_mode = (MB_PREDICTION_MODE) read_kf_mb_ymode(bc,
-    pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
-#if CONFIG_COMP_INTRA_PRED
-  m->mbmi.second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
-  m->mbmi.ref_frame = INTRA_FRAME;
-
-  if ((m->mbmi.mode = y_mode) == B_PRED) {
-    int i = 0;
-#if CONFIG_COMP_INTRA_PRED
-    int use_comp_pred = vp9_read(bc, 128);
-#endif
-    do {
-      const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE L = left_block_mode(m, i);
-
-      m->bmi[i].as_mode.first =
-        (B_PREDICTION_MODE) read_bmode(
-          bc, pbi->common.kf_bmode_prob [A] [L]);
-#if CONFIG_COMP_INTRA_PRED
-      if (use_comp_pred) {
-        m->bmi[i].as_mode.second =
-          (B_PREDICTION_MODE) read_bmode(
-            bc, pbi->common.kf_bmode_prob [A] [L]);
-      } else {
-        m->bmi[i].as_mode.second = (B_PREDICTION_MODE)(B_DC_PRED - 1);
-      }
-#endif
-    } while (++i < 16);
-  }
-  if (m->mbmi.mode == I8X8_PRED) {
-    int i;
-    int mode8x8;
-    for (i = 0; i < 4; i++) {
-      int ib = vp9_i8x8_block[i];
-      mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
-      m->bmi[ib + 0].as_mode.first = mode8x8;
-      m->bmi[ib + 1].as_mode.first = mode8x8;
-      m->bmi[ib + 4].as_mode.first = mode8x8;
-      m->bmi[ib + 5].as_mode.first = mode8x8;
-#if CONFIG_COMP_INTRA_PRED
-      m->bmi[ib + 0].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-      m->bmi[ib + 1].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-      m->bmi[ib + 4].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-      m->bmi[ib + 5].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-    }
-  } else
-    m->mbmi.uv_mode = (MB_PREDICTION_MODE)read_uv_mode(bc,
-                                                       pbi->common.kf_uv_mode_prob[m->mbmi.mode]);
-#if CONFIG_COMP_INTRA_PRED
-  m->mbmi.second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
-#if CONFIG_SUPERBLOCKS
-  if (m->mbmi.encoded_as_sb)
-    m->mbmi.txfm_size = TX_8X8;
-  else
-#endif
-  if (cm->txfm_mode == TX_MODE_SELECT && m->mbmi.mb_skip_coeff == 0 &&
-      m->mbmi.mode <= I8X8_PRED) {
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    m->mbmi.txfm_size = vp9_read(bc, cm->prob_tx[0]);
-    if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED)
-      m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[1]);
-  } else if (cm->txfm_mode >= ALLOW_16X16 && m->mbmi.mode <= TM_PRED) {
-    m->mbmi.txfm_size = TX_16X16;
-  } else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != B_PRED) {
-    m->mbmi.txfm_size = TX_8X8;
-  } else {
-    m->mbmi.txfm_size = TX_4X4;
-  }
-}
-
-static int read_nmv_component(vp9_reader *r,
-                              int rv,
-                              const nmv_component *mvcomp) {
-  int v, s, z, c, o, d;
-  s = vp9_read(r, mvcomp->sign);
-  c = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
-  if (c == MV_CLASS_0) {
-    d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0);
-  } else {
-    int i, b;
-    d = 0;
-    b = c + CLASS0_BITS - 1;  /* number of bits */
-    for (i = 0; i < b; ++i)
-      d |= (vp9_read(r, mvcomp->bits[i]) << i);
-  }
-  o = d << 3;
-
-  z = vp9_get_mv_mag(c, o);
-  v = (s ? -(z + 8) : (z + 8));
-  return v;
-}
-
-static int read_nmv_component_fp(vp9_reader *r,
-                                 int v,
-                                 int rv,
-                                 const nmv_component *mvcomp,
-                                 int usehp) {
-  int s, z, c, o, d, e, f;
-  s = v < 0;
-  z = (s ? -v : v) - 1;       /* magnitude - 1 */
-  z &= ~7;
-
-  c = vp9_get_mv_class(z, &o);
-  d = o >> 3;
-
-  if (c == MV_CLASS_0) {
-    f = treed_read(r, vp9_mv_fp_tree, mvcomp->class0_fp[d]);
-  } else {
-    f = treed_read(r, vp9_mv_fp_tree, mvcomp->fp);
-  }
-  o += (f << 1);
-
-  if (usehp) {
-    if (c == MV_CLASS_0) {
-      e = vp9_read(r, mvcomp->class0_hp);
-    } else {
-      e = vp9_read(r, mvcomp->hp);
-    }
-    o += e;
-  } else {
-    ++o;  /* Note if hp is not used, the default value of the hp bit is 1 */
-  }
-  z = vp9_get_mv_mag(c, o);
-  v = (s ? -(z + 1) : (z + 1));
-  return v;
-}
-
-static void read_nmv(vp9_reader *r, MV *mv, const MV *ref,
-                     const nmv_context *mvctx) {
-  MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, mvctx->joints);
-  mv->row = mv->col = 0;
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    mv->row = read_nmv_component(r, ref->row, &mvctx->comps[0]);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    mv->col = read_nmv_component(r, ref->col, &mvctx->comps[1]);
-  }
-}
-
-static void read_nmv_fp(vp9_reader *r, MV *mv, const MV *ref,
-                        const nmv_context *mvctx, int usehp) {
-  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
-  usehp = usehp && vp9_use_nmv_hp(ref);
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    mv->row = read_nmv_component_fp(r, mv->row, ref->row, &mvctx->comps[0],
-                                    usehp);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    mv->col = read_nmv_component_fp(r, mv->col, ref->col, &mvctx->comps[1],
-                                    usehp);
-  }
-  //printf("  %d: %d %d ref: %d %d\n", usehp, mv->row, mv-> col, ref->row, ref->col);
-}
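Taken together, the two component readers reconstruct one MV component in
eighth-pel units (a summary of the code above, not part of this patch):

    /*  coded as: sign s | class c | integer offset d | fraction f | hp e
     *  o  = d << 3;   integer part, fraction still zero
     *  o += f << 1;   quarter-pel position from the fp tree
     *  o += e;        eighth-pel bit when high precision is allowed;
     *                 otherwise the hp bit defaults to 1 (the ++o)
     *  magnitude = vp9_get_mv_mag(c, o), then +8 (integer pass) or
     *  +1 (fractional pass), negated when s is set.
     */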
-
-static void update_nmv(vp9_reader *bc, vp9_prob *const p,
-                       const vp9_prob upd_p) {
-  if (vp9_read(bc, upd_p)) {
-#ifdef LOW_PRECISION_MV_UPDATE
-    *p = (vp9_read_literal(bc, 7) << 1) | 1;
-#else
-    *p = (vp9_read_literal(bc, 8));
-#endif
-  }
-}
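With LOW_PRECISION_MV_UPDATE, an updated probability is sent as a 7-bit
payload and reconstructed as (payload << 1) | 1, so updated values are always
odd and one bit per probability is saved in the header:

    /* e.g. payload 0x40 -> *p = (0x40 << 1) | 1 = 0x81 = 129 */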
-
-static void read_nmvprobs(vp9_reader *bc, nmv_context *mvctx,
-                          int usehp) {
-  int i, j, k;
-#ifdef MV_GROUP_UPDATE
-  if (!vp9_read_bit(bc)) return;
-#endif
-  for (j = 0; j < MV_JOINTS - 1; ++j) {
-    update_nmv(bc, &mvctx->joints[j],
-               VP9_NMV_UPDATE_PROB);
-  }
-  for (i = 0; i < 2; ++i) {
-    update_nmv(bc, &mvctx->comps[i].sign,
-               VP9_NMV_UPDATE_PROB);
-    for (j = 0; j < MV_CLASSES - 1; ++j) {
-      update_nmv(bc, &mvctx->comps[i].classes[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
-      update_nmv(bc, &mvctx->comps[i].class0[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      update_nmv(bc, &mvctx->comps[i].bits[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-  }
-
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      for (k = 0; k < 3; ++k)
-        update_nmv(bc, &mvctx->comps[i].class0_fp[j][k],
-                   VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < 3; ++j) {
-      update_nmv(bc, &mvctx->comps[i].fp[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-  }
-
-  if (usehp) {
-    for (i = 0; i < 2; ++i) {
-      update_nmv(bc, &mvctx->comps[i].class0_hp,
-                 VP9_NMV_UPDATE_PROB);
-      update_nmv(bc, &mvctx->comps[i].hp,
-                 VP9_NMV_UPDATE_PROB);
-    }
-  }
-}
-
-// Read the reference frame
-static MV_REFERENCE_FRAME read_ref_frame(VP9D_COMP *pbi,
-                                         vp9_reader *const bc,
-                                         unsigned char segment_id) {
-  MV_REFERENCE_FRAME ref_frame;
-  int seg_ref_active;
-  int seg_ref_count = 0;
-
-  VP9_COMMON *const cm = &pbi->common;
-  MACROBLOCKD *const xd = &pbi->mb;
-
-  seg_ref_active = vp9_segfeature_active(xd,
-                                         segment_id,
-                                         SEG_LVL_REF_FRAME);
-
-  // If segment coding is enabled, does the segment allow for more than one
-  // possible reference frame?
-  if (seg_ref_active) {
-    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
-                    vp9_check_segref(xd, segment_id, LAST_FRAME) +
-                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
-                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
-  }
-
-  // Segment reference frame features are not available, or allow for
-  // multiple reference frame options.
-  if (!seg_ref_active || (seg_ref_count > 1)) {
-    // Values used in prediction model coding
-    unsigned char prediction_flag;
-    vp9_prob pred_prob;
-    MV_REFERENCE_FRAME pred_ref;
-
-    // Get the context probability for the prediction flag
-    pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
-
-    // Read the prediction status flag
-    prediction_flag = (unsigned char)vp9_read(bc, pred_prob);
-
-    // Store the prediction flag.
-    vp9_set_pred_flag(xd, PRED_REF, prediction_flag);
-
-    // Get the predicted reference frame.
-    pred_ref = vp9_get_pred_ref(cm, xd);
-
-    // If correctly predicted then use the predicted value
-    if (prediction_flag) {
-      ref_frame = pred_ref;
-    }
-    // else decode the explicitly coded value
-    else {
-      vp9_prob mod_refprobs[PREDICTION_PROBS];
-      vpx_memcpy(mod_refprobs,
-                 cm->mod_refprobs[pred_ref], sizeof(mod_refprobs));
-
-      // If segment coding is enabled, blank out options that can't occur by
-      // setting the branch probability to 0.
-      if (seg_ref_active) {
-        mod_refprobs[INTRA_FRAME] *=
-          vp9_check_segref(xd, segment_id, INTRA_FRAME);
-        mod_refprobs[LAST_FRAME] *=
-          vp9_check_segref(xd, segment_id, LAST_FRAME);
-        mod_refprobs[GOLDEN_FRAME] *=
-          (vp9_check_segref(xd, segment_id, GOLDEN_FRAME) *
-           vp9_check_segref(xd, segment_id, ALTREF_FRAME));
-      }
-
-      // Default to INTRA_FRAME (value 0)
-      ref_frame = INTRA_FRAME;
-
-      // Do we need to decode the Intra/Inter branch
-      if (mod_refprobs[0])
-        ref_frame = (MV_REFERENCE_FRAME) vp9_read(bc, mod_refprobs[0]);
-      else
-        ref_frame++;
-
-      if (ref_frame) {
-        // Do we need to decode the Last/Gf_Arf branch
-        if (mod_refprobs[1])
-          ref_frame += vp9_read(bc, mod_refprobs[1]);
-        else
-          ref_frame++;
-
-        if (ref_frame > 1) {
-          // Do we need to decode the GF/Arf branch
-          if (mod_refprobs[2])
-            ref_frame += vp9_read(bc, mod_refprobs[2]);
-          else {
-            if (seg_ref_active) {
-              if ((pred_ref == GOLDEN_FRAME) ||
-                  !vp9_check_segref(xd, segment_id, GOLDEN_FRAME)) {
-                ref_frame = ALTREF_FRAME;
-              } else
-                ref_frame = GOLDEN_FRAME;
-            } else
-              ref_frame = (pred_ref == GOLDEN_FRAME)
-                          ? ALTREF_FRAME : GOLDEN_FRAME;
-          }
-        }
-      }
-    }
-  }
-
-  // Segment reference frame features are enabled
-  else {
-    // The reference frame for the mb is considered correctly predicted
-    // if it is signaled at the segment level for the purposes of the
-    // common prediction model
-    vp9_set_pred_flag(xd, PRED_REF, 1);
-    ref_frame = vp9_get_pred_ref(cm, xd);
-  }
-
-  return (MV_REFERENCE_FRAME)ref_frame;
-}
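When the prediction flag is 0, the explicit value is decoded over three
binary branches (a summary of the code above, not part of this patch):

    /*  intra/inter -> last/(gf or arf) -> golden/altref
     *  A branch whose mod_refprobs[] entry was zeroed out by the segment
     *  feature is never read from the bitstream; its outcome is inferred,
     *  which is why ref_frame is simply incremented in those cases.
     */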
-
-#if CONFIG_SUPERBLOCKS
-static MB_PREDICTION_MODE read_sb_mv_ref(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE) treed_read(bc, vp9_sb_mv_ref_tree, p);
-}
-#endif
-
-static MB_PREDICTION_MODE read_mv_ref(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE) treed_read(bc, vp9_mv_ref_tree, p);
-}
-
-static B_PREDICTION_MODE sub_mv_ref(vp9_reader *bc, const vp9_prob *p) {
-  return (B_PREDICTION_MODE) treed_read(bc, vp9_sub_mv_ref_tree, p);
-}
-
-#ifdef VPX_MODE_COUNT
-unsigned int vp9_mv_cont_count[5][4] = {
-  { 0, 0, 0, 0 },
-  { 0, 0, 0, 0 },
-  { 0, 0, 0, 0 },
-  { 0, 0, 0, 0 },
-  { 0, 0, 0, 0 }
-};
-#endif
-
-static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1};
-static const unsigned char mbsplit_fill_offset[4][16] = {
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
-  { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,   6,  7, 10, 11, 14, 15},
-  { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9,  12, 13, 10, 11, 14, 15},
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
-};
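The SPLITMV fill tables above map a split type s to its partitions' 4x4 block
indices (a summary, not part of this patch):

    /*  s = 0: two 16x8 halves   -> 2 partitions of 8 blocks each
     *  s = 1: two 8x16 halves   -> 2 partitions of 8 blocks each
     *  s = 2: four 8x8 quarters -> 4 partitions of 4 blocks each
     *  s = 3: sixteen 4x4       -> 16 partitions of 1 block each
     *  mbsplit_fill_offset[s] lists the 16 block indices grouped by
     *  partition; mbsplit_fill_count[s] is the per-partition count
     *  consumed by the fill loop in read_mb_modes_mv().
     */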
-
-static void read_switchable_interp_probs(VP9D_COMP* const pbi,
-                                         BOOL_DECODER* const bc) {
-  VP9_COMMON *const cm = &pbi->common;
-  int i, j;
-  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
-    for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
-      cm->fc.switchable_interp_prob[j][i] = vp9_read_literal(bc, 8);
-    }
-  }
-  //printf("DECODER: %d %d\n", cm->fc.switchable_interp_prob[0],
-  //cm->fc.switchable_interp_prob[1]);
-}
-
-static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) {
-  VP9_COMMON *const cm = &pbi->common;
-  nmv_context *const nmvc = &pbi->common.fc.nmvc;
-  MACROBLOCKD *const xd  = &pbi->mb;
-
-  if (cm->frame_type == KEY_FRAME) {
-    if (!cm->kf_ymode_probs_update)
-      cm->kf_ymode_probs_index = vp9_read_literal(bc, 3);
-  } else {
-#if CONFIG_PRED_FILTER
-    cm->pred_filter_mode = (vp9_prob)vp9_read_literal(bc, 2);
-
-    if (cm->pred_filter_mode == 2)
-      cm->prob_pred_filter_off = (vp9_prob)vp9_read_literal(bc, 8);
-#endif
-    if (cm->mcomp_filter_type == SWITCHABLE)
-      read_switchable_interp_probs(pbi, bc);
-    // Decode the baseline probabilities for decoding reference frame
-    cm->prob_intra_coded = (vp9_prob)vp9_read_literal(bc, 8);
-    cm->prob_last_coded  = (vp9_prob)vp9_read_literal(bc, 8);
-    cm->prob_gf_coded    = (vp9_prob)vp9_read_literal(bc, 8);
-
-    // Computes a modified set of probabilities for use when reference
-    // frame prediction fails.
-    vp9_compute_mod_refprobs(cm);
-
-    pbi->common.comp_pred_mode = vp9_read(bc, 128);
-    if (cm->comp_pred_mode)
-      cm->comp_pred_mode += vp9_read(bc, 128);
-    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
-      int i;
-      for (i = 0; i < COMP_PRED_CONTEXTS; i++)
-        cm->prob_comppred[i] = (vp9_prob)vp9_read_literal(bc, 8);
-    }
-
-    if (vp9_read_bit(bc)) {
-      int i = 0;
-
-      do {
-        cm->fc.ymode_prob[i] = (vp9_prob) vp9_read_literal(bc, 8);
-      } while (++i < VP9_YMODES - 1);
-    }
-
-#if CONFIG_NEW_MVREF
-    // Temporary default probabilities for decoding the MV ref id signal
-    vpx_memset(xd->mb_mv_ref_id_probs, 192, sizeof(xd->mb_mv_ref_id_probs));
-#endif
-
-    read_nmvprobs(bc, nmvc, xd->allow_high_precision_mv);
-  }
-}
-
-// This function either reads the segment id for the current macroblock from
-// the bitstream or, if the value is temporally predicted, uses the
-// predicted value.
-static void read_mb_segment_id(VP9D_COMP *pbi,
-                               int mb_row, int mb_col,
-                               BOOL_DECODER* const bc) {
-  VP9_COMMON *const cm = &pbi->common;
-  MACROBLOCKD *const xd  = &pbi->mb;
-  MODE_INFO *mi = xd->mode_info_context;
-  MB_MODE_INFO *mbmi = &mi->mbmi;
-  int index = mb_row * pbi->common.mb_cols + mb_col;
-
-  if (xd->segmentation_enabled) {
-    if (xd->update_mb_segmentation_map) {
-      // Is temporal coding of the segment id for this mb enabled.
-      if (cm->temporal_update) {
-        // Get the context based probability for reading the
-        // prediction status flag
-        vp9_prob pred_prob =
-          vp9_get_pred_prob(cm, xd, PRED_SEG_ID);
-
-        // Read the prediction status flag
-        unsigned char seg_pred_flag =
-          (unsigned char)vp9_read(bc, pred_prob);
-
-        // Store the prediction flag.
-        vp9_set_pred_flag(xd, PRED_SEG_ID, seg_pred_flag);
-
-        // If the value is flagged as correctly predicted
-        // then use the predicted value
-        if (seg_pred_flag) {
-          mbmi->segment_id = vp9_get_pred_mb_segid(cm, xd, index);
-        }
-        // Else .... decode it explicitly
-        else {
-          read_mb_segid(bc, mbmi, xd);
-        }
-      }
-      // Normal unpredicted coding mode
-      else {
-        read_mb_segid(bc, mbmi, xd);
-      }
-#if CONFIG_SUPERBLOCKS
-      if (mbmi->encoded_as_sb) {
-        cm->last_frame_seg_map[index] = mbmi->segment_id;
-        if (mb_col + 1 < cm->mb_cols)
-          cm->last_frame_seg_map[index + 1] = mbmi->segment_id;
-        if (mb_row + 1 < cm->mb_rows) {
-          cm->last_frame_seg_map[index + cm->mb_cols] = mbmi->segment_id;
-          if (mb_col + 1 < cm->mb_cols)
-            cm->last_frame_seg_map[index + cm->mb_cols + 1] = mbmi->segment_id;
-        }
-      } else
-#endif
-      {
-        cm->last_frame_seg_map[index] = mbmi->segment_id;
-      }
-    } else {
-#if CONFIG_SUPERBLOCKS
-      if (mbmi->encoded_as_sb) {
-        mbmi->segment_id = cm->last_frame_seg_map[index];
-        if (mb_col < cm->mb_cols - 1)
-          mbmi->segment_id = mbmi->segment_id &&
-                             cm->last_frame_seg_map[index + 1];
-        if (mb_row < cm->mb_rows - 1) {
-          mbmi->segment_id = mbmi->segment_id &&
-                             cm->last_frame_seg_map[index + cm->mb_cols];
-          if (mb_col < cm->mb_cols - 1)
-            mbmi->segment_id = mbmi->segment_id &&
-                               cm->last_frame_seg_map[index + cm->mb_cols + 1];
-        }
-      } else
-#endif
-      {
-        mbmi->segment_id = cm->last_frame_seg_map[index];
-      }
-    }
-  } else {
-    // The encoder explicitly sets the segment_id to 0
-    // when segmentation is disabled
-    mbmi->segment_id = 0;
-  }
-}
-
-static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
-                             MODE_INFO *prev_mi,
-                             int mb_row, int mb_col,
-                             BOOL_DECODER* const bc) {
-  VP9_COMMON *const cm = &pbi->common;
-  nmv_context *const nmvc = &pbi->common.fc.nmvc;
-  const int mis = pbi->common.mode_info_stride;
-  MACROBLOCKD *const xd  = &pbi->mb;
-
-  int_mv *const mv = &mbmi->mv;
-  int mb_to_left_edge;
-  int mb_to_right_edge;
-  int mb_to_top_edge;
-  int mb_to_bottom_edge;
-
-  mb_to_top_edge = xd->mb_to_top_edge;
-  mb_to_bottom_edge = xd->mb_to_bottom_edge;
-  mb_to_top_edge -= LEFT_TOP_MARGIN;
-  mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN;
-  mbmi->need_to_clamp_mvs = 0;
-  mbmi->need_to_clamp_secondmv = 0;
-  mbmi->second_ref_frame = 0;
-  /* Distance of Mb to the various image edges.
-   * These are specified to 1/8th pel as they are always compared to MV values that are in 1/8th pel units.
-   */
-  xd->mb_to_left_edge =
-    mb_to_left_edge = -((mb_col * 16) << 3);
-  mb_to_left_edge -= LEFT_TOP_MARGIN;
-
-  xd->mb_to_right_edge =
-    mb_to_right_edge = ((pbi->common.mb_cols - 1 - mb_col) * 16) << 3;
-  mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
-
-  // Make sure the MACROBLOCKD mode info pointer is pointed at the
-  // correct entry for the current macroblock.
-  xd->mode_info_context = mi;
-  xd->prev_mode_info_context = prev_mi;
-
-  // Read the macroblock segment id.
-  read_mb_segment_id(pbi, mb_row, mb_col, bc);
-
-  if (pbi->common.mb_no_coeff_skip &&
-      (!vp9_segfeature_active(xd,
-                              mbmi->segment_id, SEG_LVL_EOB) ||
-       (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) != 0))) {
-    // Read the macroblock coeff skip flag if this feature is in use,
-    // else default to 0
-    mbmi->mb_skip_coeff = vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
-  } else {
-    if (vp9_segfeature_active(xd,
-                              mbmi->segment_id, SEG_LVL_EOB) &&
-        (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) == 0)) {
-      mbmi->mb_skip_coeff = 1;
-    } else
-      mbmi->mb_skip_coeff = 0;
-  }
-
-  // Read the reference frame
-  mbmi->ref_frame = read_ref_frame(pbi, bc, mbmi->segment_id);
-
-  // If reference frame is an Inter frame
-  if (mbmi->ref_frame) {
-    int rct[4];
-    int_mv nearest, nearby, best_mv;
-    int_mv nearest_second, nearby_second, best_mv_second;
-    vp9_prob mv_ref_p [VP9_MVREFS - 1];
-
-#if CONFIG_NEWBESTREFMV
-    int recon_y_stride, recon_yoffset;
-    int recon_uv_stride, recon_uvoffset;
-#endif
-
-    vp9_find_near_mvs(xd, mi,
-                      prev_mi,
-                      &nearest, &nearby, &best_mv, rct,
-                      mbmi->ref_frame, cm->ref_frame_sign_bias);
-
-#if CONFIG_NEWBESTREFMV
-    {
-      int ref_fb_idx;
-      MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
-
-      /* Select the appropriate reference frame for this MB */
-      if (ref_frame == LAST_FRAME)
-        ref_fb_idx = cm->lst_fb_idx;
-      else if (ref_frame == GOLDEN_FRAME)
-        ref_fb_idx = cm->gld_fb_idx;
-      else
-        ref_fb_idx = cm->alt_fb_idx;
-
-      recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-      recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-
-      recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-      recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
-
-      xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
-      xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
-      xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
-
-      vp9_find_mv_refs(xd, mi, prev_mi,
-                       ref_frame, mbmi->ref_mvs[ref_frame],
-                       cm->ref_frame_sign_bias);
-
-      vp9_find_best_ref_mvs(xd,
-                            xd->pre.y_buffer,
-                            recon_y_stride,
-                            mbmi->ref_mvs[ref_frame],
-                            &best_mv, &nearest, &nearby);
-    }
-#endif
-
-    vp9_mv_ref_probs(&pbi->common, mv_ref_p, rct);
-
-    // Is the segment level mode feature enabled for this segment
-    if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) {
-      mbmi->mode =
-        vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
-    } else {
-#if CONFIG_SUPERBLOCKS
-      if (mbmi->encoded_as_sb) {
-        mbmi->mode = read_sb_mv_ref(bc, mv_ref_p);
-      } else
-#endif
-      mbmi->mode = read_mv_ref(bc, mv_ref_p);
-
-      vp9_accum_mv_refs(&pbi->common, mbmi->mode, rct);
-    }
-
-#if CONFIG_PRED_FILTER
-    if (mbmi->mode >= NEARESTMV && mbmi->mode < SPLITMV) {
-      // Is the prediction filter enabled
-      if (cm->pred_filter_mode == 2)
-        mbmi->pred_filter_enabled =
-          vp9_read(bc, cm->prob_pred_filter_off);
-      else
-        mbmi->pred_filter_enabled = cm->pred_filter_mode;
-    }
-#endif
-    if (mbmi->mode >= NEARESTMV && mbmi->mode <= SPLITMV) {
-      if (cm->mcomp_filter_type == SWITCHABLE) {
-        mbmi->interp_filter = vp9_switchable_interp[
-            treed_read(bc, vp9_switchable_interp_tree,
-                       vp9_get_pred_probs(cm, xd, PRED_SWITCHABLE_INTERP))];
-      } else {
-        mbmi->interp_filter = cm->mcomp_filter_type;
-      }
-    }
-
-    if (cm->comp_pred_mode == COMP_PREDICTION_ONLY ||
-        (cm->comp_pred_mode == HYBRID_PREDICTION &&
-         vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_COMP)))) {
-      /* Since we have 3 reference frames, we can only have 3 unique
-       * combinations of 2 different reference frames
-       * (A-G, G-L or A-L). In the bitstream, we use this to simply
-       * derive the second reference frame from the first reference
-       * frame, by saying it's the next one in the enumerator, and
-       * if that's > n_refs, then the second reference frame is the
-       * first one in the enumerator. */
-      mbmi->second_ref_frame = mbmi->ref_frame + 1;
-      if (mbmi->second_ref_frame == 4)
-        mbmi->second_ref_frame = 1;
-#if CONFIG_NEWBESTREFMV
-      if (mbmi->second_ref_frame) {
-        int second_ref_fb_idx;
-        /* Select the appropriate reference frame for this MB */
-        if (mbmi->second_ref_frame == LAST_FRAME)
-          second_ref_fb_idx = cm->lst_fb_idx;
-        else if (mbmi->second_ref_frame ==
-          GOLDEN_FRAME)
-          second_ref_fb_idx = cm->gld_fb_idx;
-        else
-          second_ref_fb_idx = cm->alt_fb_idx;
-
-        xd->second_pre.y_buffer =
-          cm->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
-        xd->second_pre.u_buffer =
-          cm->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
-        xd->second_pre.v_buffer =
-          cm->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
-        vp9_find_near_mvs(xd, mi, prev_mi,
-                          &nearest_second, &nearby_second, &best_mv_second,
-                          rct,
-                          mbmi->second_ref_frame,
-                          cm->ref_frame_sign_bias);
-
-        vp9_find_mv_refs(xd, mi, prev_mi,
-                         mbmi->second_ref_frame,
-                         mbmi->ref_mvs[mbmi->second_ref_frame],
-                         cm->ref_frame_sign_bias);
-
-        vp9_find_best_ref_mvs(xd,
-                              xd->second_pre.y_buffer,
-                              recon_y_stride,
-                              mbmi->ref_mvs[mbmi->second_ref_frame],
-                              &best_mv_second,
-                              &nearest_second,
-                              &nearby_second);
-      }
-#else
-      vp9_find_near_mvs(xd, mi, prev_mi,
-                        &nearest_second, &nearby_second, &best_mv_second,
-                        rct,
-                        mbmi->second_ref_frame,
-                        pbi->common.ref_frame_sign_bias);
-#endif
-    } else {
-      mbmi->second_ref_frame = 0;
-    }
-
-    mbmi->uv_mode = DC_PRED;
-    switch (mbmi->mode) {
-      case SPLITMV: {
-        const int s = mbmi->partitioning =
-                        treed_read(bc, vp9_mbsplit_tree, cm->fc.mbsplit_prob);
-        const int num_p = vp9_mbsplit_count [s];
-        int j = 0;
-        cm->fc.mbsplit_counts[s]++;
-
-        mbmi->need_to_clamp_mvs = 0;
-        do { /* for each subset j */
-          int_mv leftmv, abovemv, second_leftmv, second_abovemv;
-          int_mv blockmv, secondmv;
-          int k;  /* first block in subset j */
-          int mv_contz;
-          int blockmode;
-
-          k = vp9_mbsplit_offset[s][j];
-
-          leftmv.as_int = left_block_mv(mi, k);
-          abovemv.as_int = above_block_mv(mi, k, mis);
-          if (mbmi->second_ref_frame) {
-            second_leftmv.as_int = left_block_second_mv(mi, k);
-            second_abovemv.as_int = above_block_second_mv(mi, k, mis);
-          }
-          mv_contz = vp9_mv_cont(&leftmv, &abovemv);
-          blockmode = sub_mv_ref(bc, cm->fc.sub_mv_ref_prob [mv_contz]);
-          cm->fc.sub_mv_ref_counts[mv_contz][blockmode - LEFT4X4]++;
-
-          switch (blockmode) {
-            case NEW4X4:
-              read_nmv(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc);
-              read_nmv_fp(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc,
-                          xd->allow_high_precision_mv);
-              vp9_increment_nmv(&blockmv.as_mv, &best_mv.as_mv,
-                                &cm->fc.NMVcount, xd->allow_high_precision_mv);
-              blockmv.as_mv.row += best_mv.as_mv.row;
-              blockmv.as_mv.col += best_mv.as_mv.col;
-
-              if (mbmi->second_ref_frame) {
-                read_nmv(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc);
-                read_nmv_fp(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
-                            xd->allow_high_precision_mv);
-                vp9_increment_nmv(&secondmv.as_mv, &best_mv_second.as_mv,
-                                  &cm->fc.NMVcount, xd->allow_high_precision_mv);
-                secondmv.as_mv.row += best_mv_second.as_mv.row;
-                secondmv.as_mv.col += best_mv_second.as_mv.col;
-              }
-#ifdef VPX_MODE_COUNT
-              vp9_mv_cont_count[mv_contz][3]++;
-#endif
-              break;
-            case LEFT4X4:
-              blockmv.as_int = leftmv.as_int;
-              if (mbmi->second_ref_frame)
-                secondmv.as_int = second_leftmv.as_int;
-#ifdef VPX_MODE_COUNT
-              vp9_mv_cont_count[mv_contz][0]++;
-#endif
-              break;
-            case ABOVE4X4:
-              blockmv.as_int = abovemv.as_int;
-              if (mbmi->second_ref_frame)
-                secondmv.as_int = second_abovemv.as_int;
-#ifdef VPX_MODE_COUNT
-              vp9_mv_cont_count[mv_contz][1]++;
-#endif
-              break;
-            case ZERO4X4:
-              blockmv.as_int = 0;
-              if (mbmi->second_ref_frame)
-                secondmv.as_int = 0;
-#ifdef VPX_MODE_COUNT
-              vp9_mv_cont_count[mv_contz][2]++;
-#endif
-              break;
-            default:
-              break;
-          }
-
-          mbmi->need_to_clamp_mvs |= check_mv_bounds(&blockmv,
-                                                     mb_to_left_edge,
-                                                     mb_to_right_edge,
-                                                     mb_to_top_edge,
-                                                     mb_to_bottom_edge);
-          if (mbmi->second_ref_frame) {
-            mbmi->need_to_clamp_mvs |= check_mv_bounds(&secondmv,
-                                                       mb_to_left_edge,
-                                                       mb_to_right_edge,
-                                                       mb_to_top_edge,
-                                                       mb_to_bottom_edge);
-          }
-
-          {
-            /* Fill (uniform) modes, mvs of jth subset.
-             Must do it here because ensuing subsets can
-             refer back to us via "left" or "above". */
-            const unsigned char *fill_offset;
-            unsigned int fill_count = mbsplit_fill_count[s];
-
-            fill_offset = &mbsplit_fill_offset[s][(unsigned char)j * mbsplit_fill_count[s]];
-
-            do {
-              mi->bmi[ *fill_offset].as_mv.first.as_int = blockmv.as_int;
-              if (mbmi->second_ref_frame)
-                mi->bmi[ *fill_offset].as_mv.second.as_int = secondmv.as_int;
-              fill_offset++;
-            } while (--fill_count);
-          }
-
-        } while (++j < num_p);
-      }
-
-      mv->as_int = mi->bmi[15].as_mv.first.as_int;
-      mbmi->mv[1].as_int = mi->bmi[15].as_mv.second.as_int;
-
-      break;  /* done with SPLITMV */
-
-      case NEARMV:
-        mv->as_int = nearby.as_int;
-        /* Clip "next_nearest" so that it does not extend to far out of image */
-        clamp_mv(mv, mb_to_left_edge, mb_to_right_edge,
-                 mb_to_top_edge, mb_to_bottom_edge);
-        if (mbmi->second_ref_frame) {
-          mbmi->mv[1].as_int = nearby_second.as_int;
-          clamp_mv(&mbmi->mv[1], mb_to_left_edge, mb_to_right_edge,
-                   mb_to_top_edge, mb_to_bottom_edge);
-        }
-        break;
-
-      case NEARESTMV:
-        mv->as_int = nearest.as_int;
-        /* Clip "next_nearest" so that it does not extend to far out of image */
-        clamp_mv(mv, mb_to_left_edge, mb_to_right_edge,
-                 mb_to_top_edge, mb_to_bottom_edge);
-        if (mbmi->second_ref_frame) {
-          mbmi->mv[1].as_int = nearest_second.as_int;
-          clamp_mv(&mbmi->mv[1], mb_to_left_edge, mb_to_right_edge,
-                   mb_to_top_edge, mb_to_bottom_edge);
-        }
-        break;
-
-      case ZEROMV:
-        mv->as_int = 0;
-        if (mbmi->second_ref_frame)
-          mbmi->mv[1].as_int = 0;
-        break;
-
-      case NEWMV:
-
-#if CONFIG_NEW_MVREF
-        {
-          int best_index;
-          MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
-
-          // Decode the index of the choice.
-          best_index =
-            vp9_read_mv_ref_id(bc, xd->mb_mv_ref_id_probs[ref_frame]);
-
-          best_mv.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
-        }
-#endif
-
-        read_nmv(bc, &mv->as_mv, &best_mv.as_mv, nmvc);
-        read_nmv_fp(bc, &mv->as_mv, &best_mv.as_mv, nmvc,
-                    xd->allow_high_precision_mv);
-        vp9_increment_nmv(&mv->as_mv, &best_mv.as_mv, &cm->fc.NMVcount,
-                          xd->allow_high_precision_mv);
-
-        mv->as_mv.row += best_mv.as_mv.row;
-        mv->as_mv.col += best_mv.as_mv.col;
-
-        /* Don't need to check this on NEARMV and NEARESTMV modes
-         * since those modes clamp the MV. The NEWMV mode does not,
-         * so signal to the prediction stage whether special
-         * handling may be required.
-         */
-        mbmi->need_to_clamp_mvs = check_mv_bounds(mv,
-                                                  mb_to_left_edge,
-                                                  mb_to_right_edge,
-                                                  mb_to_top_edge,
-                                                  mb_to_bottom_edge);
-
-        if (mbmi->second_ref_frame) {
-#if CONFIG_NEW_MVREF
-        {
-          int best_index;
-          MV_REFERENCE_FRAME ref_frame = mbmi->second_ref_frame;
-
-          // Decode the index of the choice.
-          best_index =
-            vp9_read_mv_ref_id(bc, xd->mb_mv_ref_id_probs[ref_frame]);
-          best_mv_second.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
-        }
-#endif
-
-          read_nmv(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc);
-          read_nmv_fp(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc,
-                      xd->allow_high_precision_mv);
-          vp9_increment_nmv(&mbmi->mv[1].as_mv, &best_mv_second.as_mv,
-                            &cm->fc.NMVcount, xd->allow_high_precision_mv);
-          mbmi->mv[1].as_mv.row += best_mv_second.as_mv.row;
-          mbmi->mv[1].as_mv.col += best_mv_second.as_mv.col;
-          mbmi->need_to_clamp_secondmv |=
-            check_mv_bounds(&mbmi->mv[1],
-                            mb_to_left_edge, mb_to_right_edge,
-                            mb_to_top_edge, mb_to_bottom_edge);
-        }
-        break;
-      default:
-;
-#if CONFIG_DEBUG
-        assert(0);
-#endif
-    }
-  } else {
-    /* required for left and above block mv */
-    mbmi->mv[0].as_int = 0;
-
-    if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE))
-      mbmi->mode = (MB_PREDICTION_MODE)
-                   vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
-    else {
-      // FIXME write using SB mode tree
-      mbmi->mode = (MB_PREDICTION_MODE)
-                   read_ymode(bc, pbi->common.fc.ymode_prob);
-      pbi->common.fc.ymode_counts[mbmi->mode]++;
-    }
-#if CONFIG_COMP_INTRA_PRED
-    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
-    // If MB mode is BPRED read the block modes
-    if (mbmi->mode == B_PRED) {
-      int j = 0;
-#if CONFIG_COMP_INTRA_PRED
-      int use_comp_pred = vp9_read(bc, 128);
-#endif
-      do {
-        mi->bmi[j].as_mode.first = (B_PREDICTION_MODE)read_bmode(bc, pbi->common.fc.bmode_prob);
-        /*
-        {
-          int p;
-          for (p = 0; p < VP9_BINTRAMODES - 1; ++p)
-            printf(" %d", pbi->common.fc.bmode_prob[p]);
-          printf("\nbmode[%d][%d]: %d\n", pbi->common.current_video_frame, j, mi->bmi[j].as_mode.first);
-        }
-        */
-        pbi->common.fc.bmode_counts[mi->bmi[j].as_mode.first]++;
-#if CONFIG_COMP_INTRA_PRED
-        if (use_comp_pred) {
-          mi->bmi[j].as_mode.second = (B_PREDICTION_MODE)read_bmode(bc, pbi->common.fc.bmode_prob);
-        } else {
-          mi->bmi[j].as_mode.second = (B_PREDICTION_MODE)(B_DC_PRED - 1);
-        }
-#endif
-      } while (++j < 16);
-    }
-
-    if (mbmi->mode == I8X8_PRED) {
-      int i;
-      int mode8x8;
-      for (i = 0; i < 4; i++) {
-        int ib = vp9_i8x8_block[i];
-        mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
-        mi->bmi[ib + 0].as_mode.first = mode8x8;
-        mi->bmi[ib + 1].as_mode.first = mode8x8;
-        mi->bmi[ib + 4].as_mode.first = mode8x8;
-        mi->bmi[ib + 5].as_mode.first = mode8x8;
-        pbi->common.fc.i8x8_mode_counts[mode8x8]++;
-#if CONFIG_COMP_INTRA_PRED
-        mi->bmi[ib + 0].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-        mi->bmi[ib + 1].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-        mi->bmi[ib + 4].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-        mi->bmi[ib + 5].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-      }
-    } else {
-      mbmi->uv_mode = (MB_PREDICTION_MODE)read_uv_mode(
-        bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);
-      pbi->common.fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
-    }
-
-#if CONFIG_COMP_INTRA_PRED
-    mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-  }
-
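-  // Transform size selection: under TX_MODE_SELECT one coded bit separates
-  // TX_4X4 from the larger sizes and, for modes where 16x16 is legal, a
-  // second bit chooses between TX_8X8 and TX_16X16; otherwise the frame's
-  // txfm_mode fixes the size directly.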
-#if CONFIG_SUPERBLOCKS
-  if (mbmi->encoded_as_sb)
-    mbmi->txfm_size = TX_8X8;
-  else
-#endif
-  if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
-      ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= I8X8_PRED) ||
-       (mbmi->ref_frame != INTRA_FRAME && !(mbmi->mode == SPLITMV &&
-                           mbmi->partitioning == PARTITIONING_4X4)))) {
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    mbmi->txfm_size = vp9_read(bc, cm->prob_tx[0]);
-    if (mbmi->txfm_size != TX_4X4 && mbmi->mode != I8X8_PRED &&
-        mbmi->mode != SPLITMV)
-      mbmi->txfm_size += vp9_read(bc, cm->prob_tx[1]);
-  } else if (cm->txfm_mode >= ALLOW_16X16 &&
-      ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
-       (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
-    mbmi->txfm_size = TX_16X16;
-  } else if (cm->txfm_mode >= ALLOW_8X8 &&
-      (!(mbmi->ref_frame == INTRA_FRAME && mbmi->mode == B_PRED) &&
-       !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV &&
-         mbmi->partitioning == PARTITIONING_4X4))) {
-    mbmi->txfm_size = TX_8X8;
-  } else {
-    mbmi->txfm_size = TX_4X4;
-  }
-}
-
-void vp9_decode_mode_mvs_init(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
-  VP9_COMMON *cm = &pbi->common;
-
-  vpx_memset(cm->mbskip_pred_probs, 0, sizeof(cm->mbskip_pred_probs));
-  if (pbi->common.mb_no_coeff_skip) {
-    int k;
-    for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-      cm->mbskip_pred_probs[k] = (vp9_prob)vp9_read_literal(bc, 8);
-  }
-
-  mb_mode_mv_init(pbi, bc);
-}
-
-void vp9_decode_mb_mode_mv(VP9D_COMP *pbi,
-                           MACROBLOCKD *xd,
-                           int mb_row,
-                           int mb_col,
-                           BOOL_DECODER* const bc) {
-  MODE_INFO *mi = xd->mode_info_context;
-  MODE_INFO *prev_mi = xd->prev_mode_info_context;
-
-  if (pbi->common.frame_type == KEY_FRAME)
-    kfread_modes(pbi, mi, mb_row, mb_col, bc);
-  else
-    read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row, mb_col, bc);
-}
--- a/vp8/decoder/decodemv.h
+++ /dev/null
@@ -1,19 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "onyxd_int.h"
-
-void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
-                           MACROBLOCKD* const xd,
-                           int mb_row,
-                           int mb_col,
-                           BOOL_DECODER* const bc);
-void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, BOOL_DECODER* const bc);
--- a/vp8/decoder/decodframe.c
+++ /dev/null
@@ -1,1337 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "onyxd_int.h"
-#include "vp8/common/header.h"
-#include "vp8/common/reconintra.h"
-#include "vp8/common/reconintra4x4.h"
-#include "vp8/common/reconinter.h"
-#include "detokenize.h"
-#include "vp8/common/invtrans.h"
-#include "vp8/common/alloccommon.h"
-#include "vp8/common/entropymode.h"
-#include "vp8/common/quant_common.h"
-#include "vpx_scale/vpxscale.h"
-#include "vpx_scale/yv12extend.h"
-#include "vp8/common/setupintrarecon.h"
-
-#include "decodemv.h"
-#include "vp8/common/extend.h"
-#include "vp8/common/modecont.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/idct.h"
-#include "dboolhuff.h"
-
-#include "vp8/common/seg_common.h"
-#include "vp8/common/entropy.h"
-#include "vpx_rtcd.h"
-
-#include <assert.h>
-#include <stdio.h>
-
-
-#define COEFCOUNT_TESTING
-
-static int merge_index(int v, int n, int modulus) {
-  int max1 = (n - 1 - modulus / 2) / modulus + 1;
-  if (v < max1) v = v * modulus + modulus / 2;
-  else {
-    int w;
-    v -= max1;
-    w = v;
-    v += (v + modulus - modulus / 2) / modulus;
-    while (v % modulus == modulus / 2 ||
-           w != v - (v + modulus - modulus / 2) / modulus) v++;
-  }
-  return v;
-}
-
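-/* Inverse of the encoder-side probability remapping: merge_index() undoes
- * the modulus-based folding of the coded index, after which
- * inv_remap_prob() recovers the new probability relative to the previous
- * value m via vp9_inv_recenter_nonneg().
- */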
-static int inv_remap_prob(int v, int m) {
-  const int n = 256;
-  const int modulus = MODULUS_PARAM;
-  int i;
-  v = merge_index(v, n - 1, modulus);
-  if ((m << 1) <= n) {
-    i = vp9_inv_recenter_nonneg(v + 1, m);
-  } else {
-    i = n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m);
-  }
-  return i;
-}
-
-static vp9_prob read_prob_diff_update(vp9_reader *const bc, int oldp) {
-  int delp = vp9_decode_term_subexp(bc, SUBEXP_PARAM, 255);
-  return (vp9_prob)inv_remap_prob(delp, oldp);
-}
-
-void vp9_init_de_quantizer(VP9D_COMP *pbi) {
-  int i;
-  int Q;
-  VP9_COMMON *const pc = &pbi->common;
-
-  for (Q = 0; Q < QINDEX_RANGE; Q++) {
-    pc->Y1dequant[Q][0] = (short)vp9_dc_quant(Q, pc->y1dc_delta_q);
-    pc->Y2dequant[Q][0] = (short)vp9_dc2quant(Q, pc->y2dc_delta_q);
-    pc->UVdequant[Q][0] = (short)vp9_dc_uv_quant(Q, pc->uvdc_delta_q);
-
-    /* set all the AC dequant values */
-    for (i = 1; i < 16; i++) {
-      int rc = vp9_default_zig_zag1d[i];
-
-      pc->Y1dequant[Q][rc] = (short)vp9_ac_yquant(Q);
-      pc->Y2dequant[Q][rc] = (short)vp9_ac2quant(Q, pc->y2ac_delta_q);
-      pc->UVdequant[Q][rc] = (short)vp9_ac_uv_quant(Q, pc->uvac_delta_q);
-    }
-  }
-}
-
-static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *xd) {
-  int i;
-  int QIndex;
-  VP9_COMMON *const pc = &pbi->common;
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-
-  // Set the Q baseline allowing for any segment level adjustment
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
-    /* Abs Value */
-    if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA)
-      QIndex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-
-    /* Delta Value */
-    else {
-      QIndex = pc->base_qindex +
-               vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-      QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;    /* Clamp to valid range */
-    }
-  } else
-    QIndex = pc->base_qindex;
-  xd->q_index = QIndex;
-
-  /* Set up the block level dequant pointers */
-  for (i = 0; i < 16; i++) {
-    xd->block[i].dequant = pc->Y1dequant[QIndex];
-  }
-
-#if CONFIG_LOSSLESS
-  if (!QIndex) {
-    pbi->common.rtcd.idct.idct1        = vp9_short_inv_walsh4x4_1_x8_c;
-    pbi->common.rtcd.idct.idct16       = vp9_short_inv_walsh4x4_x8_c;
-    pbi->common.rtcd.idct.idct1_scalar_add  = vp9_dc_only_inv_walsh_add_c;
-    pbi->common.rtcd.idct.iwalsh1      = vp9_short_inv_walsh4x4_1_lossless_c;
-    pbi->common.rtcd.idct.iwalsh16     = vp9_short_inv_walsh4x4_lossless_c;
-    pbi->idct_add            = vp9_dequant_idct_add_lossless_c;
-    pbi->dc_idct_add         = vp9_dequant_dc_idct_add_lossless_c;
-    pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block_lossless_c;
-    pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block_lossless_c;
-    pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block_lossless_c;
-  } else {
-    pbi->common.rtcd.idct.idct1        = vp9_short_idct4x4llm_1_c;
-    pbi->common.rtcd.idct.idct16       = vp9_short_idct4x4llm_c;
-    pbi->common.rtcd.idct.idct1_scalar_add  = vp9_dc_only_idct_add_c;
-    pbi->common.rtcd.idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
-    pbi->common.rtcd.idct.iwalsh16     = vp9_short_inv_walsh4x4_c;
-    pbi->idct_add            = vp9_dequant_idct_add;
-    pbi->dc_idct_add         = vp9_dequant_dc_idct_add;
-    pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
-    pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block;
-    pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block;
-  }
-#else
-  pbi->idct_add            = vp9_dequant_idct_add;
-  pbi->dc_idct_add         = vp9_dequant_dc_idct_add;
-  pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
-  pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block;
-  pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block;
-#endif
-
-  for (i = 16; i < 24; i++) {
-    xd->block[i].dequant = pc->UVdequant[QIndex];
-  }
-
-  xd->block[24].dequant = pc->Y2dequant[QIndex];
-}
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define RTCD_VTABLE(x) (&(pbi)->common.rtcd.x)
-#else
-#define RTCD_VTABLE(x) NULL
-#endif
-
-/* skip_recon_mb() writes the prediction result directly to the dst buffer
- * instead of writing it to the predictor buffer and then copying it across,
- * which eliminates an unnecessary copy.
- */
-static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) {
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      vp9_build_intra_predictors_sbuv_s(xd);
-      vp9_build_intra_predictors_sby_s(xd);
-    } else {
-#endif
-    vp9_build_intra_predictors_mbuv_s(xd);
-    vp9_build_intra_predictors_mby_s(xd);
-#if CONFIG_SUPERBLOCKS
-    }
-#endif
-  } else {
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
-                                         xd->dst.u_buffer, xd->dst.v_buffer,
-                                         xd->dst.y_stride, xd->dst.uv_stride);
-    } else {
-#endif
-    vp9_build_1st_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
-                                           xd->dst.u_buffer, xd->dst.v_buffer,
-                                           xd->dst.y_stride, xd->dst.uv_stride);
-
-    if (xd->mode_info_context->mbmi.second_ref_frame) {
-      vp9_build_2nd_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
-                                             xd->dst.u_buffer, xd->dst.v_buffer,
-                                             xd->dst.y_stride, xd->dst.uv_stride);
-    }
-#if CONFIG_SUPERBLOCKS
-    }
-#endif
-  }
-}
-
-static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                              int mb_row, unsigned int mb_col,
-                              BOOL_DECODER* const bc) {
-  int eobtotal = 0;
-  MB_PREDICTION_MODE mode;
-  int i;
-  int tx_size;
-  TX_TYPE tx_type;
-  VP9_COMMON *pc = &pbi->common;
-#if CONFIG_SUPERBLOCKS
-  int orig_skip_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
-#endif
-
-  // re-initialize macroblock dequantizer before detokenization
-  if (xd->segmentation_enabled)
-    mb_init_dequantizer(pbi, xd);
-
-  tx_size = xd->mode_info_context->mbmi.txfm_size;
-  mode = xd->mode_info_context->mbmi.mode;
-
-  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
-    vp9_reset_mb_tokens_context(xd);
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb &&
-        (mb_col < pc->mb_cols - 1 || mb_row < pc->mb_rows - 1)) {
-      if (mb_col < pc->mb_cols - 1)
-        xd->above_context++;
-      if (mb_row < pc->mb_rows - 1)
-        xd->left_context++;
-      vp9_reset_mb_tokens_context(xd);
-      if (mb_col < pc->mb_cols - 1)
-        xd->above_context--;
-      if (mb_row < pc->mb_rows - 1)
-        xd->left_context--;
-    }
-#endif
-  } else if (!bool_error(bc)) {
-    for (i = 0; i < 25; i++) {
-      xd->block[i].eob = 0;
-      xd->eobs[i] = 0;
-    }
-    if (tx_size == TX_16X16) {
-      eobtotal = vp9_decode_mb_tokens_16x16(pbi, xd, bc);
-    } else if (tx_size == TX_8X8) {
-      eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
-    } else {
-      eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
-    }
-  }
-
-  //mode = xd->mode_info_context->mbmi.mode;
-  if (pbi->common.frame_type != KEY_FRAME)
-    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter,
-                             &pbi->common);
-
-  if (eobtotal == 0 && mode != B_PRED && mode != SPLITMV
-      && mode != I8X8_PRED
-      && !bool_error(bc)) {
-    /* Special case: force the loopfilter to skip when eobtotal and
-     * mb_skip_coeff are zero.
-     */
-    xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-
-#if CONFIG_SUPERBLOCKS
-    if (!xd->mode_info_context->mbmi.encoded_as_sb || orig_skip_flag)
-#endif
-    {
-      skip_recon_mb(pbi, xd);
-      return;
-    }
-  }
-
-  // moved to be performed before detokenization
-//  if (xd->segmentation_enabled)
-//    mb_init_dequantizer(pbi, xd);
-
-  /* do prediction */
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      vp9_build_intra_predictors_sby_s(xd);
-      vp9_build_intra_predictors_sbuv_s(xd);
-    } else
-#endif
-    if (mode != I8X8_PRED) {
-      vp9_build_intra_predictors_mbuv(xd);
-      if (mode != B_PRED) {
-        vp9_build_intra_predictors_mby(xd);
-      }
-    }
-  } else {
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
-                                         xd->dst.u_buffer, xd->dst.v_buffer,
-                                         xd->dst.y_stride, xd->dst.uv_stride);
-    } else
-#endif
-    vp9_build_inter_predictors_mb(xd);
-  }
-
-  /* dequantization and idct */
-  if (mode == I8X8_PRED) {
-    for (i = 0; i < 4; i++) {
-      int ib = vp9_i8x8_block[i];
-      const int iblock[4] = {0, 1, 4, 5};
-      int j;
-      int i8x8mode;
-      BLOCKD *b;
-
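-      // Map the 8x8 block's raster 4x4 index (0, 2, 8 or 10) to the start
-      // of its 64 coefficients in qcoeff (4x4-block units 0, 4, 8 and 12).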
-      int idx = (ib & 0x02) ? (ib + 2) : ib;
-
-      short *q  = xd->block[idx].qcoeff;
-      short *dq = xd->block[0].dequant;
-      unsigned char *pre = xd->block[ib].predictor;
-      unsigned char *dst = *(xd->block[ib].base_dst) + xd->block[ib].dst;
-      int stride = xd->dst.y_stride;
-
-      b = &xd->block[ib];
-      i8x8mode = b->bmi.as_mode.first;
-      vp9_intra8x8_predict(b, i8x8mode, b->predictor);
-
-      if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-        tx_type = get_tx_type(xd, &xd->block[idx]);
-        if (tx_type != DCT_DCT) {
-          vp9_ht_dequant_idct_add_8x8_c(tx_type,
-                                        q, dq, pre, dst, 16, stride);
-        } else {
-          vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
-        }
-        q += 64;
-      } else {
-        for (j = 0; j < 4; j++) {
-          b = &xd->block[ib + iblock[j]];
-          vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
-                                 *(b->base_dst) + b->dst, 16, b->dst_stride);
-        }
-      }
-      b = &xd->block[16 + i];
-      vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor);
-      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
-                    *(b->base_dst) + b->dst, 8, b->dst_stride);
-      b = &xd->block[20 + i];
-      vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor);
-      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
-                    *(b->base_dst) + b->dst, 8, b->dst_stride);
-    }
-  } else if (mode == B_PRED) {
-    for (i = 0; i < 16; i++) {
-      BLOCKD *b = &xd->block[i];
-      int b_mode = xd->mode_info_context->bmi[i].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-      int b_mode2 = xd->mode_info_context->bmi[i].as_mode.second;
-
-      if (b_mode2 == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
-#endif
-        vp9_intra4x4_predict(b, b_mode, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-      } else {
-        vp9_comp_intra4x4_predict(b, b_mode, b_mode2, b->predictor);
-      }
-#endif
-
-      tx_type = get_tx_type(xd, b);
-      if (tx_type != DCT_DCT) {
-        vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
-                                  b->dequant, b->predictor,
-                                  *(b->base_dst) + b->dst, 16, b->dst_stride);
-      } else {
-        vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
-                               *(b->base_dst) + b->dst, 16, b->dst_stride);
-      }
-    }
-  } else if (mode == SPLITMV) {
-    if (tx_size == TX_8X8) {
-      vp9_dequant_idct_add_y_block_8x8(xd->qcoeff, xd->block[0].dequant,
-                                         xd->predictor, xd->dst.y_buffer,
-                                         xd->dst.y_stride, xd->eobs, xd);
-    } else {
-      pbi->idct_add_y_block(xd->qcoeff, xd->block[0].dequant,
-                                       xd->predictor, xd->dst.y_buffer,
-                                       xd->dst.y_stride, xd->eobs);
-    }
-  } else {
-    BLOCKD *b = &xd->block[24];
-
-    if (tx_size == TX_16X16) {
-      BLOCKD *bd = &xd->block[0];
-      tx_type = get_tx_type(xd, bd);
-      if (tx_type != DCT_DCT) {
-        vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff,
-                                        xd->block[0].dequant, xd->predictor,
-                                        xd->dst.y_buffer, 16, xd->dst.y_stride);
-      } else {
-        vp9_dequant_idct_add_16x16(xd->qcoeff, xd->block[0].dequant,
-                                     xd->predictor, xd->dst.y_buffer,
-                                     16, xd->dst.y_stride);
-      }
-    } else if (tx_size == TX_8X8) {
-#if CONFIG_SUPERBLOCKS
-      void *orig = xd->mode_info_context;
-      int n, num = xd->mode_info_context->mbmi.encoded_as_sb ? 4 : 1;
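-      // For a superblock, decode the four 16x16 MBs of the 2x2 grid in
-      // raster order, re-pointing the entropy contexts and mode info for
-      // each before its tokens are read.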
-      for (n = 0; n < num; n++) {
-        int x_idx = n & 1, y_idx = n >> 1;
-        if (num == 4 && (mb_col + x_idx >= pc->mb_cols ||
-                         mb_row + y_idx >= pc->mb_rows))
-          continue;
-
-        if (n != 0) {
-          for (i = 0; i < 25; i++) {
-            xd->block[i].eob = 0;
-            xd->eobs[i] = 0;
-          }
-          xd->above_context = pc->above_context + mb_col + (n & 1);
-          xd->left_context = pc->left_context + (n >> 1);
-          xd->mode_info_context = orig;
-          xd->mode_info_context += (n & 1);
-          xd->mode_info_context += (n >> 1) * pc->mode_info_stride;
-          if (!orig_skip_flag) {
-            eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
-            if (eobtotal == 0) // skip loopfilter
-              xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-          } else {
-            vp9_reset_mb_tokens_context(xd);
-          }
-        }
-
-        if (xd->mode_info_context->mbmi.mb_skip_coeff)
-          continue; // only happens for SBs, which are already in dest buffer
-#endif
-      vp9_dequantize_b_2x2(b);
-      IDCT_INVOKE(RTCD_VTABLE(idct), ihaar2)(&b->dqcoeff[0], b->diff, 8);
-      // the 2nd-order block is zeroed after the inverse transform
-      ((int *)b->qcoeff)[0] = 0;
-      ((int *)b->qcoeff)[1] = 0;
-      ((int *)b->qcoeff)[2] = 0;
-      ((int *)b->qcoeff)[3] = 0;
-      ((int *)b->qcoeff)[4] = 0;
-      ((int *)b->qcoeff)[5] = 0;
-      ((int *)b->qcoeff)[6] = 0;
-      ((int *)b->qcoeff)[7] = 0;
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(xd->qcoeff,
-          xd->block[0].dequant,
-          xd->dst.y_buffer + (n >> 1) * 16 * xd->dst.y_stride + (n & 1) * 16,
-          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
-        // do UV inline also
-        vp9_dequant_idct_add_uv_block_8x8_inplace_c(xd->qcoeff + 16 * 16,
-          xd->block[16].dequant,
-          xd->dst.u_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
-          xd->dst.v_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
-          xd->dst.uv_stride, xd->eobs + 16, xd);
-      } else
-#endif
-        vp9_dequant_dc_idct_add_y_block_8x8(xd->qcoeff,
-          xd->block[0].dequant, xd->predictor, xd->dst.y_buffer,
-          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
-#if CONFIG_SUPERBLOCKS
-      }
-      xd->mode_info_context = orig;
-#endif
-    } else {
-      vp9_dequantize_b(b);
-      if (xd->eobs[24] > 1) {
-        IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
-        ((int *)b->qcoeff)[0] = 0;
-        ((int *)b->qcoeff)[1] = 0;
-        ((int *)b->qcoeff)[2] = 0;
-        ((int *)b->qcoeff)[3] = 0;
-        ((int *)b->qcoeff)[4] = 0;
-        ((int *)b->qcoeff)[5] = 0;
-        ((int *)b->qcoeff)[6] = 0;
-        ((int *)b->qcoeff)[7] = 0;
-      } else {
-        IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
-        ((int *)b->qcoeff)[0] = 0;
-      }
-
-      pbi->dc_idct_add_y_block(xd->qcoeff, xd->block[0].dequant, xd->predictor,
-                               xd->dst.y_buffer, xd->dst.y_stride, xd->eobs,
-                               xd->block[24].diff);
-    }
-  }
-
-#if CONFIG_SUPERBLOCKS
-  if (!xd->mode_info_context->mbmi.encoded_as_sb) {
-#endif
-    if ((tx_size == TX_8X8 &&
-         xd->mode_info_context->mbmi.mode != I8X8_PRED &&
-         xd->mode_info_context->mbmi.mode != SPLITMV)
-        || tx_size == TX_16X16
-       )
-      vp9_dequant_idct_add_uv_block_8x8
-          (xd->qcoeff + 16 * 16, xd->block[16].dequant,
-           xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-           xd->dst.uv_stride, xd->eobs + 16, xd);
-    else if (xd->mode_info_context->mbmi.mode != I8X8_PRED)
-      pbi->idct_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,
-           xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-           xd->dst.uv_stride, xd->eobs + 16);
-#if CONFIG_SUPERBLOCKS
-  }
-#endif
-}
-
-
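-/* Delta-Q values are coded as an update flag, a 4-bit magnitude and a sign
- * bit; *q_update is raised when the decoded value differs from the previous
- * one so that the dequantizer tables get rebuilt.
- */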
-static int get_delta_q(vp9_reader *bc, int prev, int *q_update) {
-  int ret_val = 0;
-
-  if (vp9_read_bit(bc)) {
-    ret_val = vp9_read_literal(bc, 4);
-
-    if (vp9_read_bit(bc))
-      ret_val = -ret_val;
-  }
-
-  /* Trigger a quantizer update if the delta-q value has changed */
-  if (ret_val != prev)
-    *q_update = 1;
-
-  return ret_val;
-}
-
-#ifdef PACKET_TESTING
-#include <stdio.h>
-FILE *vpxlog = 0;
-#endif
-
-/* Decode a row of Superblocks (2x2 region of MBs) */
-static void
-decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc, int mbrow, MACROBLOCKD *xd,
-              BOOL_DECODER* const bc) {
-  int i;
-  int sb_col;
-  int mb_row, mb_col;
-  int recon_yoffset, recon_uvoffset;
-  int ref_fb_idx = pc->lst_fb_idx;
-  int dst_fb_idx = pc->new_fb_idx;
-  int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
-  int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
-  int row_delta[4] = {  0, +1,  0, -1 };
-  int col_delta[4] = { +1, -1, +1, +1 };
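-  // After each MB is handled, the delta pair at the matching index advances
-  // to the next MB of the SB: right, then down-left, then right again, and
-  // finally up-right into the next SB.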
-  int sb_cols = (pc->mb_cols + 1) >> 1;
-
-  // For a SB there are 2 left contexts, each pertaining to one MB row
-  // within the SB
-  vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
-
-  mb_row = mbrow;
-  mb_col = 0;
-
-  for (sb_col = 0; sb_col < sb_cols; sb_col++) {
-    MODE_INFO *mi = xd->mode_info_context;
-
-#if CONFIG_SUPERBLOCKS
-    mi->mbmi.encoded_as_sb = vp9_read(bc, pc->sb_coded);
-#endif
-
-    // Process the 4 MBs within the SB in the order:
-    // top-left, top-right, bottom-left, bottom-right
-    for (i = 0; i < 4; i++) {
-      int dy = row_delta[i];
-      int dx = col_delta[i];
-      int offset_extended = dy * xd->mode_info_stride + dx;
-
-      xd->mb_index = i;
-
-      mi = xd->mode_info_context;
-      if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
-        // MB lies outside frame, skip on to next
-        mb_row += dy;
-        mb_col += dx;
-        xd->mode_info_context += offset_extended;
-        xd->prev_mode_info_context += offset_extended;
-        continue;
-      }
-
-      // Set above context pointer
-      xd->above_context = pc->above_context + mb_col;
-      xd->left_context = pc->left_context + (i >> 1);
-
-      /* Distance of Mb to the various image edges.
-       * These are specified to 8th pel as they are always compared to
-       * values that are in 1/8th pel units
-       */
-      xd->mb_to_top_edge = -((mb_row * 16) << 3);
-      xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
-
-      xd->mb_to_left_edge = -((mb_col * 16) << 3);
-      xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
-
-      xd->up_available = (mb_row != 0);
-      xd->left_available = (mb_col != 0);
-
-
-      recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-      recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
-
-      xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-      xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-      xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-
-#if CONFIG_SUPERBLOCKS
-      if (i)
-        mi->mbmi.encoded_as_sb = 0;
-#endif
-      vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc);
-
-      update_blockd_bmi(xd);
-
-      /* Select the appropriate reference frame for this MB */
-      if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-        ref_fb_idx = pc->lst_fb_idx;
-      else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-        ref_fb_idx = pc->gld_fb_idx;
-      else
-        ref_fb_idx = pc->alt_fb_idx;
-
-      xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
-      xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
-      xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
-
-      if (xd->mode_info_context->mbmi.second_ref_frame) {
-        int second_ref_fb_idx;
-
-        /* Select the appropriate reference frame for this MB */
-        if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
-          second_ref_fb_idx = pc->lst_fb_idx;
-        else if (xd->mode_info_context->mbmi.second_ref_frame ==
-                 GOLDEN_FRAME)
-          second_ref_fb_idx = pc->gld_fb_idx;
-        else
-          second_ref_fb_idx = pc->alt_fb_idx;
-
-        xd->second_pre.y_buffer =
-          pc->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
-        xd->second_pre.u_buffer =
-          pc->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
-        xd->second_pre.v_buffer =
-          pc->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
-      }
-
-      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
-        /* propagate errors from reference frames */
-        xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
-      }
-
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        if (mb_col < pc->mb_cols - 1)
-          mi[1] = mi[0];
-        if (mb_row < pc->mb_rows - 1) {
-          mi[pc->mode_info_stride] = mi[0];
-          if (mb_col < pc->mb_cols - 1)
-            mi[pc->mode_info_stride + 1] = mi[0];
-        }
-      }
-#endif
-      vp9_intra_prediction_down_copy(xd);
-      decode_macroblock(pbi, xd, mb_row, mb_col, bc);
-
-      /* check if the boolean decoder has suffered an error */
-      xd->corrupted |= bool_error(bc);
-
-#if CONFIG_SUPERBLOCKS
-      if (mi->mbmi.encoded_as_sb) {
-        assert(!i);
-        mb_col += 2;
-        xd->mode_info_context += 2;
-        xd->prev_mode_info_context += 2;
-        break;
-      }
-#endif
-
-      // skip to next MB
-      xd->mode_info_context += offset_extended;
-      xd->prev_mode_info_context += offset_extended;
-      mb_row += dy;
-      mb_col += dx;
-    }
-  }
-
-  /* skip prediction column */
-  xd->mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
-  xd->prev_mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
-}
-
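-/* Partition sizes are stored in the bitstream as 24-bit little-endian
- * values, e.g. the bytes {0x34, 0x12, 0x00} decode to 0x001234.
- */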
-static unsigned int read_partition_size(const unsigned char *cx_size) {
-  const unsigned int size =
-    cx_size[0] + (cx_size[1] << 8) + (cx_size[2] << 16);
-  return size;
-}
-
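-/* Returns nonzero if the range [start, start + len) lies within the buffer
- * ending at end; the first comparison also rejects empty and wrapping
- * ranges.
- */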
-static int read_is_valid(const unsigned char *start,
-                         size_t               len,
-                         const unsigned char *end) {
-  return (start + len > start && start + len <= end);
-}
-
-
-static void setup_token_decoder(VP9D_COMP *pbi,
-                                const unsigned char *cx_data,
-                                BOOL_DECODER* const bool_decoder) {
-  VP9_COMMON          *pc = &pbi->common;
-  const unsigned char *user_data_end = pbi->Source + pbi->source_sz;
-  const unsigned char *partition;
-
-  ptrdiff_t            partition_size;
-  ptrdiff_t            bytes_left;
-
-  // Set up pointers to token partition
-  partition = cx_data;
-  bytes_left = user_data_end - partition;
-  partition_size = bytes_left;
-
-  /* Validate the calculated partition length. If the buffer
-   * described by the partition can't be fully read, then restrict
-   * it to the portion that can be (for EC mode) or throw an error.
-   */
-  if (!read_is_valid(partition, partition_size, user_data_end)) {
-    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                       "Truncated packet or corrupt partition "
-                       "%d length", 1);
-  }
-
-  if (vp9_start_decode(bool_decoder, partition, partition_size))
-    vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate bool decoder %d", 1);
-}
-
-static void init_frame(VP9D_COMP *pbi) {
-  VP9_COMMON *const pc = &pbi->common;
-  MACROBLOCKD *const xd  = &pbi->mb;
-
-  if (pc->frame_type == KEY_FRAME) {
-    /* Various keyframe initializations */
-    vp9_init_mv_probs(pc);
-
-    vp9_init_mbmode_probs(pc);
-    vp9_default_bmode_probs(pc->fc.bmode_prob);
-
-    vp9_default_coef_probs(pc);
-    vp9_kf_default_bmode_probs(pc->kf_bmode_prob);
-
-    // Reset the segment feature data to the default stats:
-    // Features disabled, 0, with delta coding (Default state).
-    vp9_clearall_segfeatures(xd);
-
-    xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
-
-    /* reset the mode ref deltas for the loop filter */
-    vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
-    vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
-
-    /* All buffers are implicitly updated on key frames. */
-    pc->refresh_golden_frame = 1;
-    pc->refresh_alt_ref_frame = 1;
-    pc->copy_buffer_to_gf = 0;
-    pc->copy_buffer_to_arf = 0;
-
-    /* Note that Golden and Altref modes cannot be used on a key frame so
-     * ref_frame_sign_bias[] is undefined and meaningless
-     */
-    pc->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
-    pc->ref_frame_sign_bias[ALTREF_FRAME] = 0;
-
-    vp9_init_mode_contexts(&pbi->common);
-    vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
-    vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc));
-
-    vpx_memcpy(pbi->common.fc.vp8_mode_contexts,
-               pbi->common.fc.mode_context,
-               sizeof(pbi->common.fc.mode_context));
-    vpx_memset(pc->prev_mip, 0,
-               (pc->mb_cols + 1) * (pc->mb_rows + 1) * sizeof(MODE_INFO));
-    vpx_memset(pc->mip, 0,
-               (pc->mb_cols + 1) * (pc->mb_rows + 1) * sizeof(MODE_INFO));
-
-    vp9_update_mode_info_border(pc, pc->mip);
-    vp9_update_mode_info_in_image(pc, pc->mi);
-
-  } else {
-
-    if (!pc->use_bilinear_mc_filter)
-      pc->mcomp_filter_type = EIGHTTAP;
-    else
-      pc->mcomp_filter_type = BILINEAR;
-
-    /* To enable choice of different interpolation filters */
-    vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
-  }
-
-  xd->mode_info_context = pc->mi;
-  xd->prev_mode_info_context = pc->prev_mi;
-  xd->frame_type = pc->frame_type;
-  xd->mode_info_context->mbmi.mode = DC_PRED;
-  xd->mode_info_stride = pc->mode_info_stride;
-  xd->corrupted = 0; /* init without corruption */
-
-  xd->fullpixel_mask = 0xffffffff;
-  if (pc->full_pixel)
-    xd->fullpixel_mask = 0xfffffff8;
-}
-
-#if 0
-static void read_coef_probs2(VP9D_COMP *pbi) {
-  const vp9_prob grpupd = 192;
-  int i, j, k, l;
-  vp9_reader *const bc = &pbi->bc;
-  VP9_COMMON *const pc = &pbi->common;
-  for (l = 0; l < ENTROPY_NODES; l++) {
-    if (vp9_read(bc, grpupd)) {
-      // printf("Decoding %d\n", l);
-      for (i = 0; i < BLOCK_TYPES; i++)
-        for (j = !i; j < COEF_BANDS; j++)
-          for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-            if (k >= 3 && ((i == 0 && j == 1) ||
-                           (i > 0 && j == 0)))
-              continue;
-            {
-              vp9_prob *const p = pc->fc.coef_probs [i][j][k] + l;
-              int u = vp9_read(bc, COEF_UPDATE_PROB);
-              if (u) *p = read_prob_diff_update(bc, *p);
-            }
-          }
-    }
-  }
-  if (pbi->common.txfm_mode == ALLOW_8X8) {
-    for (l = 0; l < ENTROPY_NODES; l++) {
-      if (vp9_read(bc, grpupd)) {
-        for (i = 0; i < BLOCK_TYPES_8X8; i++)
-          for (j = !i; j < COEF_BANDS; j++)
-            for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-              if (k >= 3 && ((i == 0 && j == 1) ||
-                             (i > 0 && j == 0)))
-                continue;
-              {
-                vp9_prob *const p = pc->fc.coef_probs_8x8 [i][j][k] + l;
-
-                int u = vp9_read(bc, COEF_UPDATE_PROB_8X8);
-                if (u) *p = read_prob_diff_update(bc, *p);
-              }
-            }
-      }
-    }
-  }
-}
-#endif
-
-static void read_coef_probs_common(
-    BOOL_DECODER* const bc,
-    vp9_prob coef_probs[BLOCK_TYPES][COEF_BANDS]
-                       [PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
-  int i, j, k, l;
-
-  if (vp9_read_bit(bc)) {
-    for (i = 0; i < BLOCK_TYPES; i++) {
-      for (j = !i; j < COEF_BANDS; j++) {
-        /* NB: This j loop starts from 1 on block type i == 0 */
-        for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-          if (k >= 3 && ((i == 0 && j == 1) ||
-                         (i > 0 && j == 0)))
-            continue;
-          for (l = 0; l < ENTROPY_NODES; l++) {
-            vp9_prob *const p = coef_probs[i][j][k] + l;
-
-            if (vp9_read(bc, COEF_UPDATE_PROB)) {
-              *p = read_prob_diff_update(bc, *p);
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
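-/* Per-frame coefficient probability updates: the 8x8 and 16x16 tables (and
- * their hybrid-transform counterparts) are only coded when the frame's
- * txfm_mode permits those transform sizes.
- */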
-static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
-  VP9_COMMON *const pc = &pbi->common;
-
-  read_coef_probs_common(bc, pc->fc.coef_probs);
-  read_coef_probs_common(bc, pc->fc.hybrid_coef_probs);
-
-  if (pbi->common.txfm_mode != ONLY_4X4) {
-    read_coef_probs_common(bc, pc->fc.coef_probs_8x8);
-    read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_8x8);
-  }
-  if (pbi->common.txfm_mode > ALLOW_8X8) {
-    read_coef_probs_common(bc, pc->fc.coef_probs_16x16);
-    read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_16x16);
-  }
-}
-
-int vp9_decode_frame(VP9D_COMP *pbi) {
-  BOOL_DECODER header_bc, residual_bc;
-  VP9_COMMON *const pc = &pbi->common;
-  MACROBLOCKD *const xd  = &pbi->mb;
-  const unsigned char *data = (const unsigned char *)pbi->Source;
-  const unsigned char *data_end = data + pbi->source_sz;
-  ptrdiff_t first_partition_length_in_bytes = 0;
-
-  int mb_row;
-  int i, j;
-  int corrupt_tokens = 0;
-
-  /* start with no corruption of current frame */
-  xd->corrupted = 0;
-  pc->yv12_fb[pc->new_fb_idx].corrupted = 0;
-
-  if (data_end - data < 3) {
-    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                       "Truncated packet");
-  } else {
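-    /* The three-byte uncompressed chunk packs, LSB first: the frame type
-     * (1 bit), the codec version (3 bits), the show_frame flag (1 bit) and
-     * the 19-bit length of the first (header) partition.
-     */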
-    pc->last_frame_type = pc->frame_type;
-    pc->frame_type = (FRAME_TYPE)(data[0] & 1);
-    pc->version = (data[0] >> 1) & 7;
-    pc->show_frame = (data[0] >> 4) & 1;
-    first_partition_length_in_bytes =
-      (data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
-
-    if ((data + first_partition_length_in_bytes > data_end
-         || data + first_partition_length_in_bytes < data))
-      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Truncated packet or corrupt partition 0 length");
-
-    data += 3;
-
-    vp9_setup_version(pc);
-
-    if (pc->frame_type == KEY_FRAME) {
-      const int Width = pc->Width;
-      const int Height = pc->Height;
-
-      /* vet via sync code */
-      /* When error concealment is enabled we should only check the sync
-       * code if we have enough bits available
-       */
-      if (data + 3 < data_end) {
-        if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a)
-          vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
-                             "Invalid frame sync code");
-      }
-
-      /* If error concealment is enabled we should only parse the new size
-       * if we have enough data. Otherwise we will end up with the wrong
-       * size.
-       */
-      if (data + 6 < data_end) {
-        pc->Width = (data[3] | (data[4] << 8)) & 0x3fff;
-        pc->horiz_scale = data[4] >> 6;
-        pc->Height = (data[5] | (data[6] << 8)) & 0x3fff;
-        pc->vert_scale = data[6] >> 6;
-      }
-      data += 7;
-
-      if (Width != pc->Width  ||  Height != pc->Height) {
-        if (pc->Width <= 0) {
-          pc->Width = Width;
-          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                             "Invalid frame width");
-        }
-
-        if (pc->Height <= 0) {
-          pc->Height = Height;
-          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                             "Invalid frame height");
-        }
-
-        if (vp9_alloc_frame_buffers(pc, pc->Width, pc->Height))
-          vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
-                             "Failed to allocate frame buffers");
-      }
-    }
-  }
-
-  if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME) ||
-      pc->Width == 0 || pc->Height == 0) {
-    return -1;
-  }
-
-  init_frame(pbi);
-
-  if (vp9_start_decode(&header_bc, data, first_partition_length_in_bytes))
-    vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate bool decoder 0");
-  if (pc->frame_type == KEY_FRAME) {
-    pc->clr_type    = (YUV_TYPE)vp9_read_bit(&header_bc);
-    pc->clamp_type  = (CLAMP_TYPE)vp9_read_bit(&header_bc);
-  }
-
-  /* Is segmentation enabled */
-  xd->segmentation_enabled = (unsigned char)vp9_read_bit(&header_bc);
-
-  if (xd->segmentation_enabled) {
-    // Read whether or not the segmentation map is being explicitly
-    // updated this frame.
-    xd->update_mb_segmentation_map = (unsigned char)vp9_read_bit(&header_bc);
-
-    // If so what method will be used.
-    if (xd->update_mb_segmentation_map) {
-      // Which macro block level features are enabled
-
-      // Read the probs used to decode the segment id for each macro
-      // block.
-      for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
-          xd->mb_segment_tree_probs[i] = vp9_read_bit(&header_bc) ?
-              (vp9_prob)vp9_read_literal(&header_bc, 8) : 255;
-      }
-
-      // Read the prediction probs needed to decode the segment id
-      pc->temporal_update = (unsigned char)vp9_read_bit(&header_bc);
-      for (i = 0; i < PREDICTION_PROBS; i++) {
-        if (pc->temporal_update) {
-          pc->segment_pred_probs[i] = vp9_read_bit(&header_bc) ?
-              (vp9_prob)vp9_read_literal(&header_bc, 8) : 255;
-        } else {
-          pc->segment_pred_probs[i] = 255;
-        }
-      }
-    }
-    // Is the segment data being updated
-    xd->update_mb_segmentation_data = (unsigned char)vp9_read_bit(&header_bc);
-
-    if (xd->update_mb_segmentation_data) {
-      int data;
-
-      xd->mb_segment_abs_delta = (unsigned char)vp9_read_bit(&header_bc);
-
-      vp9_clearall_segfeatures(xd);
-
-      // For each segment...
-      for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-        // For each of the segment's features...
-        for (j = 0; j < SEG_LVL_MAX; j++) {
-          // Is the feature enabled
-          if (vp9_read_bit(&header_bc)) {
-            // Update the feature data and mask
-            vp9_enable_segfeature(xd, i, j);
-
-            data = (signed char)vp9_read_literal(
-                     &header_bc, vp9_seg_feature_data_bits(j));
-
-            // Is the segment data signed?
-            if (vp9_is_segfeature_signed(j)) {
-              if (vp9_read_bit(&header_bc))
-                data = -data;
-            }
-          } else
-            data = 0;
-
-          vp9_set_segdata(xd, i, j, data);
-        }
-      }
-    }
-  }
-
-  // Read common prediction model status flag probability updates for the
-  // reference frame
-  if (pc->frame_type == KEY_FRAME) {
-    // Set the prediction probabilities to defaults
-    pc->ref_pred_probs[0] = 120;
-    pc->ref_pred_probs[1] = 80;
-    pc->ref_pred_probs[2] = 40;
-  } else {
-    for (i = 0; i < PREDICTION_PROBS; i++) {
-      if (vp9_read_bit(&header_bc))
-        pc->ref_pred_probs[i] = (vp9_prob)vp9_read_literal(&header_bc, 8);
-    }
-  }
-
-#if CONFIG_SUPERBLOCKS
-  pc->sb_coded = vp9_read_literal(&header_bc, 8);
-#endif
-
-  /* Read the per-frame transform coding mode */
-  pc->txfm_mode = vp9_read_literal(&header_bc, 2);
-  if (pc->txfm_mode == TX_MODE_SELECT) {
-    pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);
-    pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);
-  }
-
-  /* Read the loop filter level and type */
-  pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(&header_bc);
-  pc->filter_level = vp9_read_literal(&header_bc, 6);
-  pc->sharpness_level = vp9_read_literal(&header_bc, 3);
-
-  /* Read in loop filter deltas applied at the MB level based on mode or ref frame. */
-  xd->mode_ref_lf_delta_update = 0;
-  xd->mode_ref_lf_delta_enabled = (unsigned char)vp9_read_bit(&header_bc);
-
-  if (xd->mode_ref_lf_delta_enabled) {
-    /* Do the deltas need to be updated */
-    xd->mode_ref_lf_delta_update = (unsigned char)vp9_read_bit(&header_bc);
-
-    if (xd->mode_ref_lf_delta_update) {
-      /* Send update */
-      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
-        if (vp9_read_bit(&header_bc)) {
-          /*sign = vp9_read_bit( &header_bc );*/
-          xd->ref_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);
-
-          if (vp9_read_bit(&header_bc))        /* Apply sign */
-            xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1;
-        }
-      }
-
-      /* Send update */
-      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
-        if (vp9_read_bit(&header_bc)) {
-          /*sign = vp9_read_bit( &header_bc );*/
-          xd->mode_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);
-
-          if (vp9_read_bit(&header_bc))        /* Apply sign */
-            xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1;
-        }
-      }
-    }
-  }
-
-  // Dummy read for now
-  vp9_read_literal(&header_bc, 2);
-
-  setup_token_decoder(pbi, data + first_partition_length_in_bytes,
-                      &residual_bc);
-
-  /* Read the default quantizers. */
-  {
-    int Q, q_update;
-
-    Q = vp9_read_literal(&header_bc, QINDEX_BITS);
-    pc->base_qindex = Q;
-    q_update = 0;
-    /* AC 1st order Q = default */
-    pc->y1dc_delta_q = get_delta_q(&header_bc, pc->y1dc_delta_q, &q_update);
-    pc->y2dc_delta_q = get_delta_q(&header_bc, pc->y2dc_delta_q, &q_update);
-    pc->y2ac_delta_q = get_delta_q(&header_bc, pc->y2ac_delta_q, &q_update);
-    pc->uvdc_delta_q = get_delta_q(&header_bc, pc->uvdc_delta_q, &q_update);
-    pc->uvac_delta_q = get_delta_q(&header_bc, pc->uvac_delta_q, &q_update);
-
-    if (q_update)
-      vp9_init_de_quantizer(pbi);
-
-    /* MB level dequantizer setup */
-    mb_init_dequantizer(pbi, &pbi->mb);
-  }
-
-  /* Determine if the golden frame or ARF buffer should be updated and how.
-   * For all non key frames the GF and ARF refresh flags and sign bias
-   * flags must be set explicitly.
-   */
-  if (pc->frame_type != KEY_FRAME) {
-    /* Should the GF or ARF be updated from the current frame */
-    pc->refresh_golden_frame = vp9_read_bit(&header_bc);
-    pc->refresh_alt_ref_frame = vp9_read_bit(&header_bc);
-
-    if (pc->refresh_alt_ref_frame) {
-      vpx_memcpy(&pc->fc, &pc->lfc_a, sizeof(pc->fc));
-      vpx_memcpy(pc->fc.vp8_mode_contexts,
-                 pc->fc.mode_context_a,
-                 sizeof(pc->fc.vp8_mode_contexts));
-    } else {
-      vpx_memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
-      vpx_memcpy(pc->fc.vp8_mode_contexts,
-                 pc->fc.mode_context,
-                 sizeof(pc->fc.vp8_mode_contexts));
-    }
-
-    /* Buffer to buffer copy flags. */
-    pc->copy_buffer_to_gf = 0;
-
-    if (!pc->refresh_golden_frame)
-      pc->copy_buffer_to_gf = vp9_read_literal(&header_bc, 2);
-
-    pc->copy_buffer_to_arf = 0;
-
-    if (!pc->refresh_alt_ref_frame)
-      pc->copy_buffer_to_arf = vp9_read_literal(&header_bc, 2);
-
-    pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp9_read_bit(&header_bc);
-    pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc);
-
-    /* Is high precision mv allowed */
-    xd->allow_high_precision_mv = (unsigned char)vp9_read_bit(&header_bc);
-    // Read the type of subpel filter to use
-    if (vp9_read_bit(&header_bc)) {
-      pc->mcomp_filter_type = SWITCHABLE;
-    } else {
-      pc->mcomp_filter_type = vp9_read_literal(&header_bc, 2);
-    }
-    /* To enable choice of different interpolation filters */
-    vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
-  }
-
-  pc->refresh_entropy_probs = vp9_read_bit(&header_bc);
-  if (pc->refresh_entropy_probs == 0) {
-    vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
-  }
-
-  pc->refresh_last_frame = (pc->frame_type == KEY_FRAME)
-                           || vp9_read_bit(&header_bc);
-
-  if (0) {
-    FILE *z = fopen("decodestats.stt", "a");
-    fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
-            pc->current_video_frame,
-            pc->frame_type,
-            pc->refresh_golden_frame,
-            pc->refresh_alt_ref_frame,
-            pc->refresh_last_frame,
-            pc->base_qindex);
-    fclose(z);
-  }
-
-  vp9_copy(pbi->common.fc.pre_coef_probs,
-           pbi->common.fc.coef_probs);
-  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs,
-           pbi->common.fc.hybrid_coef_probs);
-  vp9_copy(pbi->common.fc.pre_coef_probs_8x8,
-           pbi->common.fc.coef_probs_8x8);
-  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_8x8,
-           pbi->common.fc.hybrid_coef_probs_8x8);
-  vp9_copy(pbi->common.fc.pre_coef_probs_16x16,
-           pbi->common.fc.coef_probs_16x16);
-  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_16x16,
-           pbi->common.fc.hybrid_coef_probs_16x16);
-  vp9_copy(pbi->common.fc.pre_ymode_prob, pbi->common.fc.ymode_prob);
-  vp9_copy(pbi->common.fc.pre_uv_mode_prob, pbi->common.fc.uv_mode_prob);
-  vp9_copy(pbi->common.fc.pre_bmode_prob, pbi->common.fc.bmode_prob);
-  vp9_copy(pbi->common.fc.pre_i8x8_mode_prob, pbi->common.fc.i8x8_mode_prob);
-  vp9_copy(pbi->common.fc.pre_sub_mv_ref_prob, pbi->common.fc.sub_mv_ref_prob);
-  vp9_copy(pbi->common.fc.pre_mbsplit_prob, pbi->common.fc.mbsplit_prob);
-  pbi->common.fc.pre_nmvc = pbi->common.fc.nmvc;
-  vp9_zero(pbi->common.fc.coef_counts);
-  vp9_zero(pbi->common.fc.hybrid_coef_counts);
-  vp9_zero(pbi->common.fc.coef_counts_8x8);
-  vp9_zero(pbi->common.fc.hybrid_coef_counts_8x8);
-  vp9_zero(pbi->common.fc.coef_counts_16x16);
-  vp9_zero(pbi->common.fc.hybrid_coef_counts_16x16);
-  vp9_zero(pbi->common.fc.ymode_counts);
-  vp9_zero(pbi->common.fc.uv_mode_counts);
-  vp9_zero(pbi->common.fc.bmode_counts);
-  vp9_zero(pbi->common.fc.i8x8_mode_counts);
-  vp9_zero(pbi->common.fc.sub_mv_ref_counts);
-  vp9_zero(pbi->common.fc.mbsplit_counts);
-  vp9_zero(pbi->common.fc.NMVcount);
-  vp9_zero(pbi->common.fc.mv_ref_ct);
-  vp9_zero(pbi->common.fc.mv_ref_ct_a);
-
-  read_coef_probs(pbi, &header_bc);
-
-  vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->lst_fb_idx], sizeof(YV12_BUFFER_CONFIG));
-  vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));
-
-  // Create the segmentation map structure and set to 0
-  if (!pc->last_frame_seg_map)
-    CHECK_MEM_ERROR(pc->last_frame_seg_map,
-                    vpx_calloc((pc->mb_rows * pc->mb_cols), 1));
-
-  /* set up the new frame for intra coded blocks */
-  vp9_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);
-
-  vp9_setup_block_dptrs(xd);
-
-  vp9_build_block_doffsets(xd);
-
-  /* clear out the coeff buffer */
-  vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
-
-  /* Read the mb_no_coeff_skip flag */
-  pc->mb_no_coeff_skip = (int)vp9_read_bit(&header_bc);
-
-  vp9_decode_mode_mvs_init(pbi, &header_bc);
-
-  vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
-
-  // Reset the macroblock mode info context to the start of the list
-  xd->mode_info_context = pc->mi;
-  xd->prev_mode_info_context = pc->prev_mi;
-
-  /* Decode a row of superblocks */
-  for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 2) {
-    decode_sb_row(pbi, pc, mb_row, xd, &residual_bc);
-  }
-  corrupt_tokens |= xd->corrupted;
-
-  /* Collect information about decoder corruption. */
-  /* 1. Check first boolean decoder for errors. */
-  pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc);
-  /* 2. Check the macroblock information */
-  pc->yv12_fb[pc->new_fb_idx].corrupted |= corrupt_tokens;
-
-  if (!pbi->decoded_key_frame) {
-    if (pc->frame_type == KEY_FRAME &&
-        !pc->yv12_fb[pc->new_fb_idx].corrupted)
-      pbi->decoded_key_frame = 1;
-    else
-      vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
-                         "A stream must start with a complete key frame");
-  }
-
-  vp9_adapt_coef_probs(pc);
-  if (pc->frame_type != KEY_FRAME) {
-    vp9_adapt_mode_probs(pc);
-    vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
-    vp9_update_mode_context(&pbi->common);
-  }
-
-  /* If this was a kf or GF, note the Q used */
-  if ((pc->frame_type == KEY_FRAME) ||
-      pc->refresh_golden_frame || pc->refresh_alt_ref_frame) {
-    pc->last_kf_gf_q = pc->base_qindex;
-  }
-  if (pc->refresh_entropy_probs) {
-    if (pc->refresh_alt_ref_frame)
-      vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc));
-    else
-      vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
-  }
-
-#ifdef PACKET_TESTING
-  {
-    FILE *f = fopen("decompressor.VP8", "ab");
-    unsigned int size = residual_bc.pos + header_bc.pos + 8;
-    fwrite((void *) &size, 4, 1, f);
-    fwrite((void *) pbi->Source, size, 1, f);
-    fclose(f);
-  }
-#endif
-  // printf("Frame %d Done\n", frame_count++);
-
-  return 0;
-}
--- a/vp8/decoder/dequantize.c
+++ /dev/null
@@ -1,543 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "dequantize.h"
-#include "vp8/common/idct.h"
-#include "vpx_mem/vpx_mem.h"
-#include "onyxd_int.h"
-
-extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch);
-extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-extern void vp9_short_idct8x8_c(short *input, short *output, int pitch);
-extern void vp9_short_idct8x8_1_c(short *input, short *output, int pitch);
-
-#if CONFIG_LOSSLESS
-extern void vp9_short_inv_walsh4x4_x8_c(short *input, short *output,
-                                        int pitch);
-extern void vp9_short_inv_walsh4x4_1_x8_c(short *input, short *output,
-                                          int pitch);
-#endif
-
-#ifdef DEC_DEBUG
-extern int dec_debug;
-#endif
-
-void vp9_dequantize_b_c(BLOCKD *d) {
-
-  int i;
-  short *DQ  = d->dqcoeff;
-  short *Q   = d->qcoeff;
-  short *DQC = d->dequant;
-
-  for (i = 0; i < 16; i++) {
-    DQ[i] = Q[i] * DQC[i];
-  }
-}
-
-
-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
-                               unsigned char *pred, unsigned char *dest,
-                               int pitch, int stride) {
-  short output[16];
-  short *diff_ptr = output;
-  int r, c;
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    input[i] = dq[i] * input[i];
-  }
-
-  vp9_ihtllm_c(input, output, 4 << 1, tx_type, 4);
-
-  vpx_memset(input, 0, 32);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = diff_ptr[c] + pred[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dest[c] = (unsigned char) a;
-    }
-
-    dest += stride;
-    diff_ptr += 4;
-    pred += pitch;
-  }
-}
-
-void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
-                                   unsigned char *pred, unsigned char *dest,
-                                   int pitch, int stride) {
-  short output[64];
-  short *diff_ptr = output;
-  int b, r, c;
-  int i;
-  unsigned char *origdest = dest;
-  unsigned char *origpred = pred;
-
-  input[0] = dq[0] * input[0];
-  for (i = 1; i < 64; i++) {
-    input[i] = dq[1] * input[i];
-  }
-
-  vp9_ihtllm_c(input, output, 16, tx_type, 8);
-
-  vpx_memset(input, 0, 128);
-
-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        int a = diff_ptr[c] + pred[c];
-
-        if (a < 0)
-          a = 0;
-
-        if (a > 255)
-          a = 255;
-
-        dest[c] = (unsigned char) a;
-      }
-
-      dest += stride;
-      diff_ptr += 8;
-      pred += pitch;
-    }
-    // shift buffer pointers to next 4x4 block in the submacroblock
-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + ((b + 1) % 2) * 4;
-    dest = origdest + (b + 1) / 2 * 4 * stride + ((b + 1) % 2) * 4;
-    pred = origpred + (b + 1) / 2 * 4 * pitch + ((b + 1) % 2) * 4;
-  }
-}
-
-void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
-                            unsigned char *dest, int pitch, int stride) {
-  short output[16];
-  short *diff_ptr = output;
-  int r, c;
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    input[i] = dq[i] * input[i];
-  }
-
-  /* the idct halves ( >> 1) the pitch */
-  vp9_short_idct4x4llm_c(input, output, 4 << 1);
-
-  vpx_memset(input, 0, 32);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = diff_ptr[c] + pred[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dest[c] = (unsigned char) a;
-    }
-
-    dest += stride;
-    diff_ptr += 4;
-    pred += pitch;
-  }
-}
-
-void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
-                               unsigned char *dest, int pitch, int stride,
-                               int Dc) {
-  int i;
-  short output[16];
-  short *diff_ptr = output;
-  int r, c;
-
-  input[0] = (short)Dc;
-
-  for (i = 1; i < 16; i++) {
-    input[i] = dq[i] * input[i];
-  }
-
-  /* the idct halves ( >> 1) the pitch */
-  vp9_short_idct4x4llm_c(input, output, 4 << 1);
-
-  vpx_memset(input, 0, 32);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = diff_ptr[c] + pred[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dest[c] = (unsigned char) a;
-    }
-
-    dest += stride;
-    diff_ptr += 4;
-    pred += pitch;
-  }
-}
-
-#if CONFIG_LOSSLESS
-void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
-                                     unsigned char *pred, unsigned char *dest,
-                                     int pitch, int stride) {
-  short output[16];
-  short *diff_ptr = output;
-  int r, c;
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    input[i] = dq[i] * input[i];
-  }
-
-  vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
-
-  vpx_memset(input, 0, 32);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = diff_ptr[c] + pred[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dest[c] = (unsigned char) a;
-    }
-
-    dest += stride;
-    diff_ptr += 4;
-    pred += pitch;
-  }
-}
-
-void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
-                                        unsigned char *pred,
-                                        unsigned char *dest,
-                                        int pitch, int stride, int dc) {
-  int i;
-  short output[16];
-  short *diff_ptr = output;
-  int r, c;
-
-  input[0] = (short)dc;
-
-  for (i = 1; i < 16; i++) {
-    input[i] = dq[i] * input[i];
-  }
-
-  vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
-  vpx_memset(input, 0, 32);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      int a = diff_ptr[c] + pred[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dest[c] = (unsigned char) a;
-    }
-
-    dest += stride;
-    diff_ptr += 4;
-    pred += pitch;
-  }
-}
-#endif
-
-void vp9_dequantize_b_2x2_c(BLOCKD *d) {
-  int i;
-  short *DQ  = d->dqcoeff;
-  short *Q   = d->qcoeff;
-  short *DQC = d->dequant;
-
-  for (i = 0; i < 16; i++) {
-    DQ[i] = (short)(Q[i] * DQC[i]);
-  }
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Dequantize 2x2\n");
-    for (j = 0; j < 16; j++) printf("%d ", Q[j]);
-    printf("\n");
-    for (j = 0; j < 16; j++) printf("%d ", DQ[j]);
-    printf("\n");
-  }
-#endif
-}
-
-void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
-                                unsigned char *dest, int pitch, int stride) {
-  short output[64];
-  short *diff_ptr = output;
-  int r, c, b;
-  int i;
-  unsigned char *origdest = dest;
-  unsigned char *origpred = pred;
-
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Input 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", input[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
-
-  input[0] = input[0] * dq[0];
-
-  // recover the AC coefficients; they all share the dq[1] dequantizer
-  for (i = 1; i < 64; i++) {
-    input[i] = input[i] * dq[1];
-  }
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Input DQ 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", input[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
-
-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct8x8_c(input, output, 16);
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Output 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", output[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
-
-  vpx_memset(input, 0, 128);  // clear all 64 short coefficients
-
-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        int a = diff_ptr[c] + pred[c];
-
-        if (a < 0)
-          a = 0;
-
-        if (a > 255)
-          a = 255;
-
-        dest[c] = (unsigned char) a;
-      }
-
-      dest += stride;
-      diff_ptr += 8;
-      pred += pitch;
-    }
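-    // shift buffer pointers to the next 4x4 block in the submacroblock:
-    // b = 0..3 walks the top-left, top-right, bottom-left and bottom-right
-    // quadrants of the 8x8 block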
-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
-    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
-    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
-  }
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int k, j;
-    printf("Final 8x8\n");
-    for (j = 0; j < 8; j++) {
-      for (k = 0; k < 8; k++) {
-        printf("%d ", origdest[k]);
-      }
-      printf("\n");
-      origdest += stride;
-    }
-  }
-#endif
-}
-
-void vp9_dequant_dc_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
-                                   unsigned char *dest, int pitch, int stride,
-                                   int Dc) { // Dc supplies the first-order transform's DC term in some rare cases
-  short output[64];
-  short *diff_ptr = output;
-  int r, c, b;
-  int i;
-  unsigned char *origdest = dest;
-  unsigned char *origpred = pred;
-
-  input[0] = (short)Dc;  // Dc is already the reconstructed DC value, so it needs no dequantization
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Input 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", input[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
-  for (i = 1; i < 64; i++) {
-    input[i] = input[i] * dq[1];
-  }
-
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Input DQ 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", input[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
-
-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct8x8_c(input, output, 16);
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Output 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", output[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
-  vpx_memset(input, 0, 128);
-
-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        int a = diff_ptr[c] + pred[c];
-
-        if (a < 0)
-          a = 0;
-
-        if (a > 255)
-          a = 255;
-
-        dest[c] = (unsigned char) a;
-      }
-
-      dest += stride;
-      diff_ptr += 8;
-      pred += pitch;
-    }
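-    // shift buffer pointers to the next 4x4 block in the submacroblock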
-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
-    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
-    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
-  }
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int k, j;
-    printf("Final 8x8\n");
-    for (j = 0; j < 8; j++) {
-      for (k = 0; k < 8; k++) {
-        printf("%d ", origdest[k]);
-      }
-      printf("\n");
-      origdest += stride;
-    }
-  }
-#endif
-}
-
-void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,
-                                     unsigned char *pred, unsigned char *dest,
-                                     int pitch, int stride) {
-  short output[256];
-  short *diff_ptr = output;
-  int r, c, i;
-
-  input[0] = input[0] * dq[0];
-
-  // recover the AC coefficients; they all share the dq[1] dequantizer
-  for (i = 1; i < 256; i++)
-    input[i] = input[i] * dq[1];
-
-  // inverse hybrid transform
-  vp9_ihtllm_c(input, output, 32, tx_type, 16);
-
-  // the idct halves ( >> 1) the pitch
-  // vp9_short_idct16x16_c(input, output, 32);
-
-  vpx_memset(input, 0, 512);
-
-  for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++) {
-      int a = diff_ptr[c] + pred[c];
-
-      if (a < 0)
-        a = 0;
-      else if (a > 255)
-        a = 255;
-
-      dest[c] = (unsigned char) a;
-    }
-
-    dest += stride;
-    diff_ptr += 16;
-    pred += pitch;
-  }
-}
-
-void vp9_dequant_idct_add_16x16_c(short *input, short *dq, unsigned char *pred,
-                                  unsigned char *dest, int pitch, int stride) {
-  short output[256];
-  short *diff_ptr = output;
-  int r, c, i;
-
-  input[0] = input[0] * dq[0];
-
-  // recover the AC coefficients; they all share the dq[1] dequantizer
-  for (i = 1; i < 256; i++)
-    input[i] = input[i] * dq[1];
-
-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct16x16_c(input, output, 32);
-
-  vpx_memset(input, 0, 512);
-
-  for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++) {
-      int a = diff_ptr[c] + pred[c];
-
-      if (a < 0)
-        a = 0;
-      else if (a > 255)
-        a = 255;
-
-      dest[c] = (unsigned char) a;
-    }
-
-    dest += stride;
-    diff_ptr += 16;
-    pred += pitch;
-  }
-}
--- a/vp8/decoder/dequantize.h
+++ /dev/null
@@ -1,78 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef DEQUANTIZE_H
-#define DEQUANTIZE_H
-#include "vp8/common/blockd.h"
-
-#if CONFIG_LOSSLESS
-extern void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
-                                            unsigned char *pred,
-                                            unsigned char *output,
-                                            int pitch, int stride);
-extern void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
-                                               unsigned char *pred,
-                                               unsigned char *output,
-                                               int pitch, int stride, int dc);
-extern void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q, short *dq,
-                                                       unsigned char *pre,
-                                                       unsigned char *dst,
-                                                       int stride, char *eobs,
-                                                       short *dc);
-extern void vp9_dequant_idct_add_y_block_lossless_c(short *q, short *dq,
-                                                    unsigned char *pre,
-                                                    unsigned char *dst,
-                                                    int stride, char *eobs);
-extern void vp9_dequant_idct_add_uv_block_lossless_c(short *q, short *dq,
-                                                     unsigned char *pre,
-                                                     unsigned char *dst_u,
-                                                     unsigned char *dst_v,
-                                                     int stride, char *eobs);
-#endif
-
-typedef void (*vp9_dequant_idct_add_fn_t)(short *input, short *dq,
-    unsigned char *pred, unsigned char *output, int pitch, int stride);
-typedef void (*vp9_dequant_dc_idct_add_fn_t)(short *input, short *dq,
-    unsigned char *pred, unsigned char *output, int pitch, int stride, int dc);
-
-typedef void (*vp9_dequant_dc_idct_add_y_block_fn_t)(short *q, short *dq,
-    unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc);
-typedef void (*vp9_dequant_idct_add_y_block_fn_t)(short *q, short *dq,
-    unsigned char *pre, unsigned char *dst, int stride, char *eobs);
-typedef void (*vp9_dequant_idct_add_uv_block_fn_t)(short *q, short *dq,
-    unsigned char *pre, unsigned char *dst_u, unsigned char *dst_v, int stride,
-    char *eobs);
-
-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
-                               unsigned char *pred, unsigned char *dest,
-                               int pitch, int stride);
-
-void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
-                                   unsigned char *pred, unsigned char *dest,
-                                   int pitch, int stride);
-
-void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,
-                                     unsigned char *pred, unsigned char *dest,
-                                     int pitch, int stride);
-
-#if CONFIG_SUPERBLOCKS
-void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, short *dq,
-                                                   unsigned char *dst,
-                                                   int stride, char *eobs,
-                                                   short *dc, MACROBLOCKD *xd);
-void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq,
-                                                 unsigned char *dstu,
-                                                 unsigned char *dstv,
-                                                 int stride, char *eobs,
-                                                 MACROBLOCKD *xd);
-#endif
-
-#endif
--- a/vp8/decoder/detokenize.c
+++ /dev/null
@@ -1,640 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/type_aliases.h"
-#include "vp8/common/blockd.h"
-#include "onyxd_int.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/mem.h"
-#include "detokenize.h"
-
-#include "vp8/common/seg_common.h"
-
-#define BOOL_DATA UINT8
-
-#define OCB_X (PREV_COEF_CONTEXTS * ENTROPY_NODES)
-
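-/* Coefficient-band lookup tables, pre-multiplied by OCB_X so that each entry
-   is a direct offset into the flattened per-band probability array used by
-   decode_coefs(). */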
-DECLARE_ALIGNED(16, static const int, coef_bands_x[16]) = {
-  0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X,
-  6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X,
-  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
-  6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X
-};
-DECLARE_ALIGNED(16, static const int, coef_bands_x_8x8[64]) = {
-  0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 4 * OCB_X, 5 * OCB_X,
-  5 * OCB_X, 3 * OCB_X, 6 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 6 * OCB_X, 6 * OCB_X,
-  6 * OCB_X, 5 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
-  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
-  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-};
-
-DECLARE_ALIGNED(16, static const int, coef_bands_x_16x16[256]) = {
-  0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 4 * OCB_X, 5 * OCB_X, 5 * OCB_X, 3 * OCB_X, 6 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 6 * OCB_X, 6 * OCB_X,
-  6 * OCB_X, 5 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
-  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
-  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X
-};
-
-#define EOB_CONTEXT_NODE            0
-#define ZERO_CONTEXT_NODE           1
-#define ONE_CONTEXT_NODE            2
-#define LOW_VAL_CONTEXT_NODE        3
-#define TWO_CONTEXT_NODE            4
-#define THREE_CONTEXT_NODE          5
-#define HIGH_LOW_CONTEXT_NODE       6
-#define CAT_ONE_CONTEXT_NODE        7
-#define CAT_THREEFOUR_CONTEXT_NODE  8
-#define CAT_THREE_CONTEXT_NODE      9
-#define CAT_FIVE_CONTEXT_NODE       10
-
-#define CAT1_MIN_VAL    5
-#define CAT2_MIN_VAL    7
-#define CAT3_MIN_VAL   11
-#define CAT4_MIN_VAL   19
-#define CAT5_MIN_VAL   35
-#define CAT6_MIN_VAL   67
-#define CAT1_PROB0    159
-#define CAT2_PROB0    145
-#define CAT2_PROB1    165
-
-#define CAT3_PROB0 140
-#define CAT3_PROB1 148
-#define CAT3_PROB2 173
-
-#define CAT4_PROB0 135
-#define CAT4_PROB1 140
-#define CAT4_PROB2 155
-#define CAT4_PROB3 176
-
-#define CAT5_PROB0 130
-#define CAT5_PROB1 134
-#define CAT5_PROB2 141
-#define CAT5_PROB3 157
-#define CAT5_PROB4 180
-
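-/* Extra-bit probabilities for category-6 tokens; the bits are decoded MSB
-   first and the trailing zero terminates the loop in decode_coefs(). */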
-static const unsigned char cat6_prob[14] =
-{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
-
-void vp9_reset_mb_tokens_context(MACROBLOCKD *xd) {
-  /* Clear entropy contexts for Y2 blocks */
-  if ((xd->mode_info_context->mbmi.mode != B_PRED &&
-      xd->mode_info_context->mbmi.mode != I8X8_PRED &&
-      xd->mode_info_context->mbmi.mode != SPLITMV)
-      || xd->mode_info_context->mbmi.txfm_size == TX_16X16
-      ) {
-    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
-  } else {
-    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
-    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
-  }
-}
-
-DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
-
-// #define PREV_CONTEXT_INC(val) (2+((val)>2))
-// #define PREV_CONTEXT_INC(val) (vp9_prev_token_class[(val)])
-#define PREV_CONTEXT_INC(val) (vp9_prev_token_class[(val) > 10 ? 10 : (val)])
-
-static int get_token(int v) {
-  if (v < 0) v = -v;
-  if (v == 0) return ZERO_TOKEN;
-  else if (v == 1) return ONE_TOKEN;
-  else if (v == 2) return TWO_TOKEN;
-  else if (v == 3) return THREE_TOKEN;
-  else if (v == 4) return FOUR_TOKEN;
-  else if (v <= 6) return DCT_VAL_CATEGORY1;
-  else if (v <= 10) return DCT_VAL_CATEGORY2;
-  else if (v <= 18) return DCT_VAL_CATEGORY3;
-  else if (v <= 34) return DCT_VAL_CATEGORY4;
-  else if (v <= 66) return DCT_VAL_CATEGORY5;
-  else return DCT_VAL_CATEGORY6;
-}
-
-static void count_tokens_adaptive_scan(const MACROBLOCKD *xd, INT16 *qcoeff_ptr,
-                                       int block, PLANE_TYPE type,
-                                       TX_TYPE tx_type,
-                                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                                       int eob, int seg_eob,
-                                       FRAME_CONTEXT *fc) {
-  int c, pt, token, band;
-  const int *scan;
-
-  switch(tx_type) {
-    case ADST_DCT :
-      scan = vp9_row_scan;
-      break;
-
-    case DCT_ADST :
-      scan = vp9_col_scan;
-      break;
-
-    default :
-      scan = vp9_default_zig_zag1d;
-      break;
-  }
-
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  for (c = !type; c < eob; ++c) {
-    int rc = scan[c];
-    int v = qcoeff_ptr[rc];
-    band = vp9_coef_bands[c];
-    token = get_token(v);
-    if (tx_type != DCT_DCT)
-      fc->hybrid_coef_counts[type][band][pt][token]++;
-    else
-      fc->coef_counts[type][band][pt][token]++;
-    pt = vp9_prev_token_class[token];
-  }
-
-  if (eob < seg_eob) {
-    band = vp9_coef_bands[c];
-    if (tx_type != DCT_DCT)
-      fc->hybrid_coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
-    else
-      fc->coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
-  }
-}
-
-static void count_tokens(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
-                         ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                         int eob, int seg_eob, FRAME_CONTEXT *const fc) {
-  int c, pt, token, band;
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  for (c = !type; c < eob; ++c) {
-    int rc = vp9_default_zig_zag1d[c];
-    int v = qcoeff_ptr[rc];
-    band = vp9_coef_bands[c];
-    token = get_token(v);
-    fc->coef_counts[type][band][pt][token]++;
-    pt = vp9_prev_token_class[token];
-  }
-  if (eob < seg_eob) {
-    band = vp9_coef_bands[c];
-    fc->coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
-  }
-}
-
-static void count_tokens_8x8(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
-                             TX_TYPE tx_type,
-                             ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                             int eob, int seg_eob, FRAME_CONTEXT *fc) {
-  int c, pt, token, band;
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  for (c = !type; c < eob; ++c) {
-    int rc = (type == 1 ? vp9_default_zig_zag1d[c] : vp9_default_zig_zag1d_8x8[c]);
-    int v = qcoeff_ptr[rc];
-    band = (type == 1 ? vp9_coef_bands[c] : vp9_coef_bands_8x8[c]);
-    token = get_token(v);
-    if (tx_type != DCT_DCT)
-      fc->hybrid_coef_counts_8x8[type][band][pt][token]++;
-    else
-      fc->coef_counts_8x8[type][band][pt][token]++;
-    pt = vp9_prev_token_class[token];
-  }
-  if (eob < seg_eob) {
-    band = (type == 1 ? vp9_coef_bands[c] : vp9_coef_bands_8x8[c]);
-    if (tx_type != DCT_DCT)
-      fc->hybrid_coef_counts_8x8[type][band][pt][DCT_EOB_TOKEN]++;
-    else
-      fc->coef_counts_8x8[type][band][pt][DCT_EOB_TOKEN]++;
-  }
-}
-
-static void count_tokens_16x16(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
-                               TX_TYPE tx_type,
-                               ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                               int eob, int seg_eob, FRAME_CONTEXT *fc) {
-  int c, pt, token;
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  for (c = !type; c < eob; ++c) {
-    int rc = vp9_default_zig_zag1d_16x16[c];
-    int v = qcoeff_ptr[rc];
-    int band = vp9_coef_bands_16x16[c];
-    token = get_token(v);
-    if (tx_type != DCT_DCT)
-      fc->hybrid_coef_counts_16x16[type][band][pt][token]++;
-    else
-      fc->coef_counts_16x16[type][band][pt][token]++;
-    pt = vp9_prev_token_class[token];
-  }
-  if (eob < seg_eob) {
-    int band = vp9_coef_bands_16x16[c];
-    if (tx_type != DCT_DCT)
-      fc->hybrid_coef_counts_16x16[type][band][pt][DCT_EOB_TOKEN]++;
-    else
-      fc->coef_counts_16x16[type][band][pt][DCT_EOB_TOKEN]++;
-  }
-}
-
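-/* Decode one sign bit at probability one half (split at mid-range) and
-   return value_to_sign with that sign applied. */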
-static int get_signed(BOOL_DECODER *br, int value_to_sign) {
-  const int split = (br->range + 1) >> 1;
-  const VP9_BD_VALUE bigsplit = (VP9_BD_VALUE)split << (VP9_BD_VALUE_SIZE - 8);
-  int v;
-
-  if (br->count < 0)
-    vp9_bool_decoder_fill(br);
-
-  if (br->value < bigsplit) {
-    br->range = split;
-    v = value_to_sign;
-  } else {
-    br->range = br->range - split;
-    br->value = br->value - bigsplit;
-    v = -value_to_sign;
-  }
-  br->range += br->range;
-  br->value += br->value;
-  --br->count;
-
-  return v;
-}
-
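-/* Write the signed coefficient at the current scan position, reselect the
-   probability set from the decoded magnitude, and continue with the next
-   coefficient. */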
-#define WRITE_COEF_CONTINUE(val)                              \
-  {                                                           \
-    prob = coef_probs + (ENTROPY_NODES*PREV_CONTEXT_INC(val));\
-    qcoeff_ptr[scan[c]] = (INT16) get_signed(br, val);        \
-    c++;                                                      \
-    continue;                                                 \
-  }
-
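-/* Decode one extra magnitude bit and, if set, add 2^bits_count to val. */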
-#define ADJUST_COEF(prob, bits_count)  \
-  do {                                 \
-    if (vp9_read(br, prob))            \
-      val += (UINT16)(1 << bits_count);\
-  } while (0);
-
-static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
-                        BOOL_DECODER* const br,
-                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                        PLANE_TYPE type,
-                        TX_TYPE tx_type,
-                        int seg_eob, INT16 *qcoeff_ptr, int i,
-                        const int *const scan, int block_type,
-                        const int *coef_bands) {
-  FRAME_CONTEXT *const fc = &dx->common.fc;
-  int tmp, c = (type == PLANE_TYPE_Y_NO_DC);
-  const vp9_prob *prob, *coef_probs;
-
-  switch (block_type) {
-    default:
-    case TX_4X4:
-      coef_probs =
-        tx_type != DCT_DCT ? fc->hybrid_coef_probs[type][0][0] :
-        fc->coef_probs[type][0][0];
-      break;
-    case TX_8X8:
-      coef_probs =
-        tx_type != DCT_DCT ? fc->hybrid_coef_probs_8x8[type][0][0] :
-        fc->coef_probs_8x8[type][0][0];
-      break;
-    case TX_16X16:
-      coef_probs =
-        tx_type != DCT_DCT ? fc->hybrid_coef_probs_16x16[type][0][0] :
-        fc->coef_probs_16x16[type][0][0];
-      break;
-  }
-
-  VP9_COMBINEENTROPYCONTEXTS(tmp, *a, *l);
-  prob = coef_probs + tmp * ENTROPY_NODES;
-
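-  // Token-tree walk, one iteration per coefficient: test EOB first, skip
-  // runs of zeros, then resolve the magnitude category and its extra bits.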
-  while (1) {
-    int val;
-    const uint8_t *cat6 = cat6_prob;
-    if (c == seg_eob) break;
-    prob += coef_bands[c];
-    if (!vp9_read(br, prob[EOB_CONTEXT_NODE]))
-      break;
-SKIP_START:
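-    // zero-run fast path: consume zeros without re-testing the EOB node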
-    if (c == seg_eob) break;
-    if (!vp9_read(br, prob[ZERO_CONTEXT_NODE])) {
-      ++c;
-      prob = coef_probs + coef_bands[c];
-      goto SKIP_START;
-    }
-    // ONE_CONTEXT_NODE_0_
-    if (!vp9_read(br, prob[ONE_CONTEXT_NODE])) {
-      prob = coef_probs + ENTROPY_NODES;
-      qcoeff_ptr[scan[c]] = (INT16) get_signed(br, 1);
-      ++c;
-      continue;
-    }
-    // LOW_VAL_CONTEXT_NODE_0_
-    if (!vp9_read(br, prob[LOW_VAL_CONTEXT_NODE])) {
-      if (!vp9_read(br, prob[TWO_CONTEXT_NODE])) {
-        WRITE_COEF_CONTINUE(2);
-      }
-      if (!vp9_read(br, prob[THREE_CONTEXT_NODE])) {
-        WRITE_COEF_CONTINUE(3);
-      }
-      WRITE_COEF_CONTINUE(4);
-    }
-    // HIGH_LOW_CONTEXT_NODE_0_
-    if (!vp9_read(br, prob[HIGH_LOW_CONTEXT_NODE])) {
-      if (!vp9_read(br, prob[CAT_ONE_CONTEXT_NODE])) {
-        val = CAT1_MIN_VAL;
-        ADJUST_COEF(CAT1_PROB0, 0);
-        WRITE_COEF_CONTINUE(val);
-      }
-      val = CAT2_MIN_VAL;
-      ADJUST_COEF(CAT2_PROB1, 1);
-      ADJUST_COEF(CAT2_PROB0, 0);
-      WRITE_COEF_CONTINUE(val);
-    }
-    // CAT_THREEFOUR_CONTEXT_NODE_0_
-    if (!vp9_read(br, prob[CAT_THREEFOUR_CONTEXT_NODE])) {
-      if (!vp9_read(br, prob[CAT_THREE_CONTEXT_NODE])) {
-        val = CAT3_MIN_VAL;
-        ADJUST_COEF(CAT3_PROB2, 2);
-        ADJUST_COEF(CAT3_PROB1, 1);
-        ADJUST_COEF(CAT3_PROB0, 0);
-        WRITE_COEF_CONTINUE(val);
-      }
-      val = CAT4_MIN_VAL;
-      ADJUST_COEF(CAT4_PROB3, 3);
-      ADJUST_COEF(CAT4_PROB2, 2);
-      ADJUST_COEF(CAT4_PROB1, 1);
-      ADJUST_COEF(CAT4_PROB0, 0);
-      WRITE_COEF_CONTINUE(val);
-    }
-    // CAT_FIVE_CONTEXT_NODE_0_:
-    if (!vp9_read(br, prob[CAT_FIVE_CONTEXT_NODE])) {
-      val = CAT5_MIN_VAL;
-      ADJUST_COEF(CAT5_PROB4, 4);
-      ADJUST_COEF(CAT5_PROB3, 3);
-      ADJUST_COEF(CAT5_PROB2, 2);
-      ADJUST_COEF(CAT5_PROB1, 1);
-      ADJUST_COEF(CAT5_PROB0, 0);
-      WRITE_COEF_CONTINUE(val);
-    }
-    val = 0;
-    while (*cat6) {
-      val = (val << 1) | vp9_read(br, *cat6++);
-    }
-    val += CAT6_MIN_VAL;
-    WRITE_COEF_CONTINUE(val);
-  }
-
-  if (block_type == TX_4X4) {
-    count_tokens_adaptive_scan(xd, qcoeff_ptr, i, type,
-                               tx_type,
-                               a, l, c, seg_eob, fc);
-  }
-  else if (block_type == TX_8X8)
-    count_tokens_8x8(qcoeff_ptr, i, type,
-                     tx_type,
-                     a, l, c, seg_eob, fc);
-  else
-    count_tokens_16x16(qcoeff_ptr, i, type,
-                       tx_type,
-                       a, l, c, seg_eob, fc);
-  return c;
-}
-
-int vp9_decode_mb_tokens_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                               BOOL_DECODER* const bc) {
-  ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;
-
-  char* const eobs = xd->eobs;
-  PLANE_TYPE type;
-  int c, i, eobtotal = 0, seg_eob;
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int seg_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB);
-  INT16 *qcoeff_ptr = &xd->qcoeff[0];
-  TX_TYPE tx_type = get_tx_type(xd, &xd->block[0]);
-
-  type = PLANE_TYPE_Y_WITH_DC;
-
-  if (seg_active)
-    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-  else
-    seg_eob = 256;
-
-  // Luma block
-  {
-    const int* const scan = vp9_default_zig_zag1d_16x16;
-    c = decode_coefs(pbi, xd, bc, A, L, type,
-                     tx_type,
-                     seg_eob, qcoeff_ptr,
-                     0, scan, TX_16X16, coef_bands_x_16x16);
-    eobs[0] = c;
-    A[0] = L[0] = (c != !type);
-    A[1] = A[2] = A[3] = A[0];
-    L[1] = L[2] = L[3] = L[0];
-    eobtotal += c;
-  }
-
-  // 8x8 chroma blocks
-  qcoeff_ptr += 256;
-  type = PLANE_TYPE_UV;
-  tx_type = DCT_DCT;
-  if (seg_active)
-    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-  else
-    seg_eob = 64;
-  for (i = 16; i < 24; i += 4) {
-    ENTROPY_CONTEXT* const a = A + vp9_block2above_8x8[i];
-    ENTROPY_CONTEXT* const l = L + vp9_block2left_8x8[i];
-    const int* const scan = vp9_default_zig_zag1d_8x8;
-
-    c = decode_coefs(pbi, xd, bc, a, l, type,
-                     tx_type,
-                     seg_eob, qcoeff_ptr,
-                     i, scan, TX_8X8, coef_bands_x_8x8);
-    a[0] = l[0] = ((eobs[i] = c) != !type);
-    a[1] = a[0];
-    l[1] = l[0];
-
-    eobtotal += c;
-    qcoeff_ptr += 64;
-  }
-  vpx_memset(&A[8], 0, sizeof(A[8]));
-  vpx_memset(&L[8], 0, sizeof(L[8]));
-  return eobtotal;
-}
-
-int vp9_decode_mb_tokens_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                             BOOL_DECODER* const bc) {
-  ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
-
-  char *const eobs = xd->eobs;
-  PLANE_TYPE type;
-  int c, i, eobtotal = 0, seg_eob;
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int seg_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB);
-  INT16 *qcoeff_ptr = &xd->qcoeff[0];
-  TX_TYPE tx_type = DCT_DCT;
-
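-  // With I8X8_PRED/SPLITMV only the 16 luma blocks take the 8x8 path
-  // (threshold 16) and chroma is decoded below with 4x4 transforms;
-  // otherwise all 24 blocks go through the 8x8 loop.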
-  int bufthred = (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-                  xd->mode_info_context->mbmi.mode == SPLITMV) ? 16 : 24;
-  if (xd->mode_info_context->mbmi.mode != B_PRED &&
-      xd->mode_info_context->mbmi.mode != SPLITMV &&
-      xd->mode_info_context->mbmi.mode != I8X8_PRED) {
-    ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[24];
-    ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[24];
-    const int *const scan = vp9_default_zig_zag1d;
-    type = PLANE_TYPE_Y2;
-
-    if (seg_active)
-      seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-    else
-      seg_eob = 4;
-    c = decode_coefs(pbi, xd, bc, a, l, type,
-                     tx_type,
-                     seg_eob, qcoeff_ptr + 24 * 16,
-                     24, scan, TX_8X8, coef_bands_x);
-    a[0] = l[0] = ((eobs[24] = c) != !type);
-
-    eobtotal += c - 4;
-
-    type = PLANE_TYPE_Y_NO_DC;
-  } else
-    type = PLANE_TYPE_Y_WITH_DC;
-
-  if (seg_active)
-    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-  else
-    seg_eob = 64;
-
-  for (i = 0; i < bufthred ; i += 4) {
-    ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[i];
-    ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[i];
-    const int *const scan = vp9_default_zig_zag1d_8x8;
-    tx_type = DCT_DCT;
-
-    if (i == 16)
-      type = PLANE_TYPE_UV;
-    if (type == PLANE_TYPE_Y_WITH_DC) {
-      tx_type = get_tx_type(xd, xd->block + i);
-    }
-
-    c = decode_coefs(pbi, xd, bc, a, l, type,
-                     tx_type,
-                     seg_eob, qcoeff_ptr,
-                     i, scan, TX_8X8, coef_bands_x_8x8);
-    a[0] = l[0] = ((eobs[i] = c) != !type);
-    a[1] = a[0];
-    l[1] = l[0];
-
-    eobtotal += c;
-    qcoeff_ptr += 64;
-  }
-
-  if (bufthred == 16) {
-    type = PLANE_TYPE_UV;
-    tx_type = DCT_DCT;
-    seg_eob = 16;
-
-    // use 4x4 transform for U, V components in I8X8 prediction mode
-    for (i = 16; i < 24; i++) {
-      ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
-      ENTROPY_CONTEXT *const l = L + vp9_block2left[i];
-      const int *scan = vp9_default_zig_zag1d;
-
-      c = decode_coefs(pbi, xd, bc, a, l, type,
-                       tx_type,
-                       seg_eob, qcoeff_ptr,
-                       i, scan, TX_4X4, coef_bands_x);
-      a[0] = l[0] = ((eobs[i] = c) != !type);
-
-      eobtotal += c;
-      qcoeff_ptr += 16;
-    }
-  }
-
-  return eobtotal;
-}
-
-
-int vp9_decode_mb_tokens(VP9D_COMP *dx, MACROBLOCKD *xd,
-                         BOOL_DECODER* const bc) {
-  ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
-
-  char *const eobs = xd->eobs;
-  const int *scan = vp9_default_zig_zag1d;
-  PLANE_TYPE type;
-  int c, i, eobtotal = 0, seg_eob = 16;
-  INT16 *qcoeff_ptr = &xd->qcoeff[0];
-
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
-    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-
-  if (xd->mode_info_context->mbmi.mode != B_PRED &&
-      xd->mode_info_context->mbmi.mode != I8X8_PRED &&
-      xd->mode_info_context->mbmi.mode != SPLITMV) {
-    ENTROPY_CONTEXT *const a = A + vp9_block2above[24];
-    ENTROPY_CONTEXT *const l = L + vp9_block2left[24];
-    type = PLANE_TYPE_Y2;
-
-    c = decode_coefs(dx, xd, bc, a, l, type,
-                     DCT_DCT,
-                     seg_eob, qcoeff_ptr + 24 * 16, 24,
-                     scan, TX_4X4, coef_bands_x);
-    a[0] = l[0] = ((eobs[24] = c) != !type);
-    eobtotal += c - 16;
-
-    type = PLANE_TYPE_Y_NO_DC;
-  } else {
-    type = PLANE_TYPE_Y_WITH_DC;
-  }
-
-  for (i = 0; i < 24; ++i) {
-    ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
-    ENTROPY_CONTEXT *const l = L + vp9_block2left[i];
-    TX_TYPE tx_type = DCT_DCT;
-    if (i == 16)
-      type = PLANE_TYPE_UV;
-
-    tx_type = get_tx_type(xd, &xd->block[i]);
-    switch(tx_type) {
-      case ADST_DCT :
-        scan = vp9_row_scan;
-        break;
-
-      case DCT_ADST :
-        scan = vp9_col_scan;
-        break;
-
-      default :
-        scan = vp9_default_zig_zag1d;
-        break;
-    }
-
-    c = decode_coefs(dx, xd, bc, a, l, type, tx_type,
-                     seg_eob, qcoeff_ptr,
-                     i, scan, TX_4X4, coef_bands_x);
-    a[0] = l[0] = ((eobs[i] = c) != !type);
-
-    eobtotal += c;
-    qcoeff_ptr += 16;
-  }
-
-  return eobtotal;
-}
--- a/vp8/decoder/detokenize.h
+++ /dev/null
@@ -1,25 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef DETOKENIZE_H
-#define DETOKENIZE_H
-
-#include "onyxd_int.h"
-
-void vp9_reset_mb_tokens_context(MACROBLOCKD* const);
-int vp9_decode_mb_tokens(VP9D_COMP* const, MACROBLOCKD* const,
-                         BOOL_DECODER* const);
-int vp9_decode_mb_tokens_8x8(VP9D_COMP* const, MACROBLOCKD* const,
-                             BOOL_DECODER* const);
-int vp9_decode_mb_tokens_16x16(VP9D_COMP* const, MACROBLOCKD* const,
-                               BOOL_DECODER* const);
-
-#endif /* DETOKENIZE_H */
--- a/vp8/decoder/idct_blk.c
+++ /dev/null
@@ -1,292 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "dequantize.h"
-
-void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
-                               unsigned char *dest, int pitch, int stride,
-                               int Dc);
-void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
-                            unsigned char *dest, int pitch, int stride);
-void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
-                            unsigned char *dst_ptr, int pitch, int stride);
-#if CONFIG_LOSSLESS
-void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
-                                     unsigned char *pred, unsigned char *dest,
-                                     int pitch, int stride);
-void vp9_dc_only_idct_add_lossless_c(short input_dc, unsigned char *pred_ptr,
-                                     unsigned char *dst_ptr,
-                                     int pitch, int stride);
-#endif
-
-void vp9_dequant_dc_idct_add_y_block_c(short *q, short *dq,
-                                       unsigned char *pre,
-                                       unsigned char *dst,
-                                       int stride, char *eobs,
-                                       short *dc) {
-  int i, j;
-
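-  // Per 4x4 block: an eob above 1 means there are AC coefficients and the
-  // full dequant+IDCT runs; otherwise only the DC term is added.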
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_dc_idct_add_c(q, dq, pre, dst, 16, stride, dc[0]);
-      else
-        vp9_dc_only_idct_add_c(dc[0], pre, dst, 16, stride);
-
-      q   += 16;
-      pre += 4;
-      dst += 4;
-      dc++;
-    }
-
-    pre += 64 - 16;
-    dst += 4 * stride - 16;
-  }
-}
-
-void vp9_dequant_idct_add_y_block_c(short *q, short *dq,
-                                    unsigned char *pre,
-                                    unsigned char *dst,
-                                    int stride, char *eobs) {
-  int i, j;
-
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_c(q, dq, pre, dst, 16, stride);
-      else {
-        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dst, 16, stride);
-        ((int *)q)[0] = 0;
-      }
-
-      q   += 16;
-      pre += 4;
-      dst += 4;
-    }
-
-    pre += 64 - 16;
-    dst += 4 * stride - 16;
-  }
-}
-
-void vp9_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *pre,
-                                     unsigned char *dstu, unsigned char *dstv,
-                                     int stride, char *eobs) {
-  int i, j;
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_c(q, dq, pre, dstu, 8, stride);
-      else {
-        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstu, 8, stride);
-        ((int *)q)[0] = 0;
-      }
-
-      q    += 16;
-      pre  += 4;
-      dstu += 4;
-    }
-
-    pre  += 32 - 8;
-    dstu += 4 * stride - 8;
-  }
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_c(q, dq, pre, dstv, 8, stride);
-      else {
-        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstv, 8, stride);
-        ((int *)q)[0] = 0;
-      }
-
-      q    += 16;
-      pre  += 4;
-      dstv += 4;
-    }
-
-    pre  += 32 - 8;
-    dstv += 4 * stride - 8;
-  }
-}
-
-
-void vp9_dequant_dc_idct_add_y_block_8x8_c(short *q, short *dq,
-                                           unsigned char *pre,
-                                           unsigned char *dst,
-                                           int stride, char *eobs, short *dc,
-                                           MACROBLOCKD *xd) {
-  vp9_dequant_dc_idct_add_8x8_c(q, dq, pre, dst, 16, stride, dc[0]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, pre + 8, dst + 8, 16, stride, dc[1]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, pre + 8 * 16,
-                                dst + 8 * stride, 16, stride, dc[4]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, pre + 8 * 16 + 8,
-                                dst + 8 * stride + 8, 16, stride, dc[8]);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, short *dq,
-                                                   unsigned char *dst,
-                                                   int stride, char *eobs,
-                                                   short *dc, MACROBLOCKD *xd) {
-  vp9_dequant_dc_idct_add_8x8_c(q, dq, dst, dst, stride, stride, dc[0]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, dst + 8,
-                                dst + 8, stride, stride, dc[1]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,
-                                dst + 8 * stride, stride, stride, dc[4]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,
-                                dst + 8 * stride + 8, stride, stride, dc[8]);
-}
-#endif
-
-void vp9_dequant_idct_add_y_block_8x8_c(short *q, short *dq,
-                                        unsigned char *pre,
-                                        unsigned char *dst,
-                                        int stride, char *eobs,
-                                        MACROBLOCKD *xd) {
-  unsigned char *origdest = dst;
-  unsigned char *origpred = pre;
-
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
-  vp9_dequant_idct_add_8x8_c(&q[64], dq, origpred + 8,
-                             origdest + 8, 16, stride);
-  vp9_dequant_idct_add_8x8_c(&q[128], dq, origpred + 8 * 16,
-                             origdest + 8 * stride, 16, stride);
-  vp9_dequant_idct_add_8x8_c(&q[192], dq, origpred + 8 * 16 + 8,
-                             origdest + 8 * stride + 8, 16, stride);
-}
-
-void vp9_dequant_idct_add_uv_block_8x8_c(short *q, short *dq,
-                                         unsigned char *pre,
-                                         unsigned char *dstu,
-                                         unsigned char *dstv,
-                                         int stride, char *eobs,
-                                         MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride);
-
-  q    += 64;
-  pre  += 64;
-
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq,
-                                                 unsigned char *dstu,
-                                                 unsigned char *dstv,
-                                                 int stride, char *eobs,
-                                                 MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride);
-
-  q    += 64;
-
-  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride);
-}
-#endif
-
-#if CONFIG_LOSSLESS
-void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q, short *dq,
-                                                unsigned char *pre,
-                                                unsigned char *dst,
-                                                int stride, char *eobs,
-                                                short *dc) {
-  int i, j;
-
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_dc_idct_add_lossless_c(q, dq, pre, dst, 16, stride, dc[0]);
-      else
-        vp9_dc_only_inv_walsh_add_c(dc[0], pre, dst, 16, stride);
-
-      q   += 16;
-      pre += 4;
-      dst += 4;
-      dc++;
-    }
-
-    pre += 64 - 16;
-    dst += 4 * stride - 16;
-  }
-}
-
-void vp9_dequant_idct_add_y_block_lossless_c(short *q, short *dq,
-                                             unsigned char *pre,
-                                             unsigned char *dst,
-                                             int stride, char *eobs) {
-  int i, j;
-
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_lossless_c(q, dq, pre, dst, 16, stride);
-      else {
-        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dst, 16, stride);
-        ((int *)q)[0] = 0;
-      }
-
-      q   += 16;
-      pre += 4;
-      dst += 4;
-    }
-
-    pre += 64 - 16;
-    dst += 4 * stride - 16;
-  }
-}
-
-void vp9_dequant_idct_add_uv_block_lossless_c(short *q, short *dq,
-                                              unsigned char *pre,
-                                              unsigned char *dstu,
-                                              unsigned char *dstv,
-                                              int stride, char *eobs) {
-  int i, j;
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_lossless_c(q, dq, pre, dstu, 8, stride);
-      else {
-        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstu, 8, stride);
-        ((int *)q)[0] = 0;
-      }
-
-      q    += 16;
-      pre  += 4;
-      dstu += 4;
-    }
-
-    pre  += 32 - 8;
-    dstu += 4 * stride - 8;
-  }
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      if (*eobs++ > 1)
-        vp9_dequant_idct_add_lossless_c(q, dq, pre, dstv, 8, stride);
-      else {
-        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstv, 8, stride);
-        ((int *)q)[0] = 0;
-      }
-
-      q    += 16;
-      pre  += 4;
-      dstv += 4;
-    }
-
-    pre  += 32 - 8;
-    dstv += 4 * stride - 8;
-  }
-}
-#endif
-
--- a/vp8/decoder/onyxd_if.c
+++ /dev/null
@@ -1,506 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/onyxc_int.h"
-#if CONFIG_POSTPROC
-#include "vp8/common/postproc.h"
-#endif
-#include "vp8/common/onyxd.h"
-#include "onyxd_int.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/alloccommon.h"
-#include "vpx_scale/yv12extend.h"
-#include "vp8/common/loopfilter.h"
-#include "vp8/common/swapyv12buffer.h"
-#include <stdio.h>
-#include <assert.h>
-
-#include "vp8/common/quant_common.h"
-#include "vpx_scale/vpxscale.h"
-#include "vp8/common/systemdependent.h"
-#include "vpx_ports/vpx_timer.h"
-#include "detokenize.h"
-#if ARCH_ARM
-#include "vpx_ports/arm.h"
-#endif
-
-extern void vp9_init_de_quantizer(VP9D_COMP *pbi);
-static int get_free_fb(VP9_COMMON *cm);
-static void ref_cnt_fb(int *buf, int *idx, int new_idx);
-
-#if CONFIG_DEBUG
-static void recon_write_yuv_frame(char *name, YV12_BUFFER_CONFIG *s) {
-  FILE *yuv_file = fopen(name, "ab");
-  unsigned char *src = s->y_buffer;
-  int h = s->y_height;
-
-  do {
-    fwrite(src, s->y_width, 1,  yuv_file);
-    src += s->y_stride;
-  } while (--h);
-
-  src = s->u_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1,  yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-
-  src = s->v_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1, yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-
-  fclose(yuv_file);
-}
-#endif
-#define WRITE_RECON_BUFFER 0
-#if WRITE_RECON_BUFFER
-void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-
-  // write the frame
-  FILE *yframe;
-  int i;
-  char filename[255];
-
-  sprintf(filename, "dx\\y%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->y_height; i++)
-    fwrite(frame->y_buffer + i * frame->y_stride,
-           frame->y_width, 1, yframe);
-
-  fclose(yframe);
-  sprintf(filename, "dx\\u%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->u_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-  sprintf(filename, "dx\\v%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->v_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-}
-#endif
-
-void vp9_initialize_dec(void) {
-  static int init_done = 0;
-
-  if (!init_done) {
-    vp9_initialize_common();
-    vp9_init_quant_tables();
-    vp8_scale_machine_specific_config();
-    init_done = 1;
-  }
-}
-
-VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
-  VP9D_COMP *pbi = vpx_memalign(32, sizeof(VP9D_COMP));
-
-  if (!pbi)
-    return NULL;
-
-  vpx_memset(pbi, 0, sizeof(VP9D_COMP));
-
-  if (setjmp(pbi->common.error.jmp)) {
-    pbi->common.error.setjmp = 0;
-    vp9_remove_decompressor(pbi);
-    return 0;
-  }
-
-  pbi->common.error.setjmp = 1;
-  vp9_initialize_dec();
-
-  vp9_create_common(&pbi->common);
-
-  pbi->common.current_video_frame = 0;
-  pbi->ready_for_new_data = 1;
-
-  /* vp9_init_de_quantizer() is first called here. Add check in
-   * frame_init_dequantizer() to avoid unnecessary calling of
-   * vp9_init_de_quantizer() for every frame.
-   */
-  vp9_init_de_quantizer(pbi);
-
-  vp9_loop_filter_init(&pbi->common);
-
-  pbi->common.error.setjmp = 0;
-
-  pbi->decoded_key_frame = 0;
-
-  return (VP9D_PTR) pbi;
-}
-
-void vp9_remove_decompressor(VP9D_PTR ptr) {
-  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
-
-  if (!pbi)
-    return;
-
-  // Delete the segmentation map
-  if (pbi->common.last_frame_seg_map != 0)
-    vpx_free(pbi->common.last_frame_seg_map);
-
-  vp9_remove_common(&pbi->common);
-  vpx_free(pbi->mbc);
-  vpx_free(pbi);
-}
-
-
-vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
-                                      YV12_BUFFER_CONFIG *sd) {
-  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
-  VP9_COMMON *cm = &pbi->common;
-  int ref_fb_idx;
-
-  if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_idx = cm->lst_fb_idx;
-  else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_idx = cm->gld_fb_idx;
-  else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_idx = cm->alt_fb_idx;
-  else {
-    vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
-                       "Invalid reference frame");
-    return pbi->common.error.error_code;
-  }
-
-  if (cm->yv12_fb[ref_fb_idx].y_height != sd->y_height ||
-      cm->yv12_fb[ref_fb_idx].y_width != sd->y_width ||
-      cm->yv12_fb[ref_fb_idx].uv_height != sd->uv_height ||
-      cm->yv12_fb[ref_fb_idx].uv_width != sd->uv_width) {
-    vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
-                       "Incorrect buffer dimensions");
-  } else
-    vp8_yv12_copy_frame_ptr(&cm->yv12_fb[ref_fb_idx], sd);
-
-  return pbi->common.error.error_code;
-}
-
-
-vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
-                                      YV12_BUFFER_CONFIG *sd) {
-  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
-  VP9_COMMON *cm = &pbi->common;
-  int *ref_fb_ptr = NULL;
-  int free_fb;
-
-  if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_ptr = &cm->lst_fb_idx;
-  else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_ptr = &cm->gld_fb_idx;
-  else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_ptr = &cm->alt_fb_idx;
-  else {
-    vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
-                       "Invalid reference frame");
-    return pbi->common.error.error_code;
-  }
-
-  if (cm->yv12_fb[*ref_fb_ptr].y_height != sd->y_height ||
-      cm->yv12_fb[*ref_fb_ptr].y_width != sd->y_width ||
-      cm->yv12_fb[*ref_fb_ptr].uv_height != sd->uv_height ||
-      cm->yv12_fb[*ref_fb_ptr].uv_width != sd->uv_width) {
-    vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
-                       "Incorrect buffer dimensions");
-  } else {
-    /* Find an empty frame buffer. */
-    free_fb = get_free_fb(cm);
-    /* Decrease fb_idx_ref_cnt since it will be increased again in
-     * ref_cnt_fb() below. */
-    cm->fb_idx_ref_cnt[free_fb]--;
-
-    /* Manage the reference counters and copy image. */
-    ref_cnt_fb(cm->fb_idx_ref_cnt, ref_fb_ptr, free_fb);
-    vp8_yv12_copy_frame_ptr(sd, &cm->yv12_fb[*ref_fb_ptr]);
-  }
-
-  return pbi->common.error.error_code;
-}
-
-/* For ARM NEON, d8-d15 are callee-saved registers and need to be saved by us. */
-#if HAVE_ARMV7
-extern void vp9_push_neon(int64_t *store);
-extern void vp9_pop_neon(int64_t *store);
-#endif
-
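-/* Return the index of the first frame buffer with a zero reference count and
-   take a reference on it. */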
-static int get_free_fb(VP9_COMMON *cm) {
-  int i;
-  for (i = 0; i < NUM_YV12_BUFFERS; i++)
-    if (cm->fb_idx_ref_cnt[i] == 0)
-      break;
-
-  assert(i < NUM_YV12_BUFFERS);
-  cm->fb_idx_ref_cnt[i] = 1;
-  return i;
-}
-
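-/* Drop the reference held through *idx, repoint *idx at new_idx, and take a
-   reference on the new buffer. */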
-static void ref_cnt_fb(int *buf, int *idx, int new_idx) {
-  if (buf[*idx] > 0)
-    buf[*idx]--;
-
-  *idx = new_idx;
-
-  buf[new_idx]++;
-}
-
-/* If any buffer copy / swapping is signalled it should be done here. */
-static int swap_frame_buffers(VP9_COMMON *cm) {
-  int err = 0;
-
-  /* The alternate reference frame or golden frame can be updated
-   *  using the new, last, or golden/alt ref frame.  If it
-   *  is updated using the newly decoded frame it is a refresh.
-   *  An update using the last or golden/alt ref frame is a copy.
-   */
-  if (cm->copy_buffer_to_arf) {
-    int new_fb = 0;
-
-    if (cm->copy_buffer_to_arf == 1)
-      new_fb = cm->lst_fb_idx;
-    else if (cm->copy_buffer_to_arf == 2)
-      new_fb = cm->gld_fb_idx;
-    else
-      err = -1;
-
-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, new_fb);
-  }
-
-  if (cm->copy_buffer_to_gf) {
-    int new_fb = 0;
-
-    if (cm->copy_buffer_to_gf == 1)
-      new_fb = cm->lst_fb_idx;
-    else if (cm->copy_buffer_to_gf == 2)
-      new_fb = cm->alt_fb_idx;
-    else
-      err = -1;
-
-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, new_fb);
-  }
-
-  if (cm->refresh_golden_frame)
-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, cm->new_fb_idx);
-
-  if (cm->refresh_alt_ref_frame)
-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, cm->new_fb_idx);
-
-  if (cm->refresh_last_frame) {
-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->lst_fb_idx, cm->new_fb_idx);
-
-    cm->frame_to_show = &cm->yv12_fb[cm->lst_fb_idx];
-  } else
-    cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
-
-  cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
-
-  return err;
-}
-
-int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
-                                const unsigned char *source,
-                                int64_t time_stamp) {
-#if HAVE_ARMV7
-  int64_t dx_store_reg[8];
-#endif
-  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
-  VP9_COMMON *cm = &pbi->common;
-  int retcode = 0;
-
-  /*if(pbi->ready_for_new_data == 0)
-      return -1;*/
-
-  if (ptr == 0) {
-    return -1;
-  }
-
-  pbi->common.error.error_code = VPX_CODEC_OK;
-
-  pbi->Source = source;
-  pbi->source_sz = size;
-
-  if (pbi->source_sz == 0) {
-    /* This is used to signal that we are missing frames.
-     * We do not know if the missing frame(s) were supposed to update
-     * any of the reference buffers, but we act conservatively and
-     * mark only the last buffer as corrupted.
-     */
-    cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
-  }
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (cm->rtcd.flags & HAS_NEON)
-#endif
-  {
-    vp9_push_neon(dx_store_reg);
-  }
-#endif
-
-  cm->new_fb_idx = get_free_fb(cm);
-
-  if (setjmp(pbi->common.error.jmp)) {
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->rtcd.flags & HAS_NEON)
-#endif
-    {
-      vp9_pop_neon(dx_store_reg);
-    }
-#endif
-    pbi->common.error.setjmp = 0;
-
-    /* We do not know if the missing frame(s) were supposed to update
-     * any of the reference buffers, but we act conservatively and
-     * mark only the last buffer as corrupted.
-     */
-    cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
-
-    if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
-      cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
-    return -1;
-  }
-
-  pbi->common.error.setjmp = 1;
-
-  retcode = vp9_decode_frame(pbi);
-
-  if (retcode < 0) {
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->rtcd.flags & HAS_NEON)
-#endif
-    {
-      vp9_pop_neon(dx_store_reg);
-    }
-#endif
-    pbi->common.error.error_code = VPX_CODEC_ERROR;
-    pbi->common.error.setjmp = 0;
-    if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
-      cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
-    return retcode;
-  }
-
-  {
-    if (swap_frame_buffers(cm)) {
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-      if (cm->rtcd.flags & HAS_NEON)
-#endif
-      {
-        vp9_pop_neon(dx_store_reg);
-      }
-#endif
-      pbi->common.error.error_code = VPX_CODEC_ERROR;
-      pbi->common.error.setjmp = 0;
-      return -1;
-    }
-
-#if WRITE_RECON_BUFFER
-    if (cm->show_frame)
-      write_dx_frame_to_file(cm->frame_to_show,
-                             cm->current_video_frame);
-    else
-      write_dx_frame_to_file(cm->frame_to_show,
-                             cm->current_video_frame + 1000);
-#endif
-
-    if (cm->filter_level) {
-      /* Apply the loop filter if appropriate. */
-      vp9_loop_filter_frame(cm, &pbi->mb);
-    }
-    vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
-  }
-
-#if CONFIG_DEBUG
-  if (cm->show_frame)
-    recon_write_yuv_frame("recon.yuv", cm->frame_to_show);
-#endif
-
-  vp9_clear_system_state();
-
-  if (cm->show_frame) {
-    vpx_memcpy(cm->prev_mip, cm->mip,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
-  } else {
-    vpx_memset(cm->prev_mip, 0,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
-  }
-
-  /*vp9_print_modes_and_motion_vectors(cm->mi, cm->mb_rows,cm->mb_cols,
-                                       cm->current_video_frame);*/
-
-  if (cm->show_frame)
-    cm->current_video_frame++;
-
-  pbi->ready_for_new_data = 0;
-  pbi->last_time_stamp = time_stamp;
-  pbi->source_sz = 0;
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (cm->rtcd.flags & HAS_NEON)
-#endif
-  {
-    vp9_pop_neon(dx_store_reg);
-  }
-#endif
-  pbi->common.error.setjmp = 0;
-  return retcode;
-}
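
The setjmp in the middle of this function is the decoder's error boundary:
vpx_internal_error longjmps back here from anywhere inside vp9_decode_frame,
so a corrupt bitstream unwinds to a clean failure instead of crashing. A
minimal standalone sketch of the pattern (names are illustrative):

    #include <setjmp.h>

    static jmp_buf decode_jmp;

    static void parse_something(int corrupt) {
      if (corrupt)
        longjmp(decode_jmp, 1);   /* deep in the parser: bail out */
    }

    static int decode_one_frame(int corrupt) {
      if (setjmp(decode_jmp))
        return -1;                /* longjmp lands here */
      parse_something(corrupt);
      return 0;                   /* normal completion */
    }
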
-
-int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd,
-                      int64_t *time_stamp, int64_t *time_end_stamp,
-                      vp9_ppflags_t *flags) {
-  int ret = -1;
-  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
-
-  if (pbi->ready_for_new_data == 1)
-    return ret;
-
-  /* i.e. no raw frame to show */
-  if (pbi->common.show_frame == 0)
-    return ret;
-
-  pbi->ready_for_new_data = 1;
-  *time_stamp = pbi->last_time_stamp;
-  *time_end_stamp = 0;
-
-  sd->clrtype = pbi->common.clr_type;
-#if CONFIG_POSTPROC
-  ret = vp9_post_proc_frame(&pbi->common, sd, flags);
-#else
-
-  if (pbi->common.frame_to_show) {
-    *sd = *pbi->common.frame_to_show;
-    sd->y_width = pbi->common.Width;
-    sd->y_height = pbi->common.Height;
-    sd->uv_height = pbi->common.Height / 2;
-    ret = 0;
-  } else {
-    ret = -1;
-  }
-
-#endif /*!CONFIG_POSTPROC*/
-  vp9_clear_system_state();
-  return ret;
-}
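
Together with vp9_receive_compressed_data above, this gives the decoder a
two-call pull interface: feed one compressed frame, then ask for the raw
frame. A hedged usage sketch (VP9D_PTR creation and error handling omitted):

    /* Assumes an initialized VP9D_PTR `dec` and one coded frame in buf. */
    void decode_and_fetch(VP9D_PTR dec, const unsigned char *buf,
                          unsigned long size, int64_t pts) {
      YV12_BUFFER_CONFIG raw;
      int64_t t0, t1;
      vp9_ppflags_t flags = {0};

      if (vp9_receive_compressed_data(dec, size, buf, pts) == 0 &&
          vp9_get_raw_frame(dec, &raw, &t0, &t1, &flags) == 0) {
        /* raw.y_buffer / u_buffer / v_buffer now point at the shown frame,
         * valid until the next receive call */
      }
    }
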
--- a/vp8/decoder/onyxd_int.h
+++ /dev/null
@@ -1,106 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ONYXD_INT_H
-#define __INC_ONYXD_INT_H
-#include "vpx_ports/config.h"
-#include "vp8/common/onyxd.h"
-#include "treereader.h"
-#include "vp8/common/onyxc_int.h"
-#include "dequantize.h"
-
-// #define DEC_DEBUG
-
-typedef struct {
-  int ithread;
-  void *ptr1;
-  void *ptr2;
-} DECODETHREAD_DATA;
-
-typedef struct {
-  MACROBLOCKD  mbd;
-  int mb_row;
-  int current_mb_col;
-  short *coef_ptr;
-} MB_ROW_DEC;
-
-typedef struct {
-  int const *scan;
-  int const *scan_8x8;
-  UINT8 const *ptr_block2leftabove;
-  vp9_tree_index const *vp9_coef_tree_ptr;
-  unsigned char *norm_ptr;
-  UINT8 *ptr_coef_bands_x;
-  UINT8 *ptr_coef_bands_x_8x8;
-
-  ENTROPY_CONTEXT_PLANES *A;
-  ENTROPY_CONTEXT_PLANES *L;
-
-  INT16 *qcoeff_start_ptr;
-
-  vp9_prob const *coef_probs[BLOCK_TYPES];
-  vp9_prob const *coef_probs_8x8[BLOCK_TYPES_8X8];
-  vp9_prob const *coef_probs_16X16[BLOCK_TYPES_16X16];
-
-  UINT8 eob[25];
-
-} DETOK;
-
-typedef struct VP9Decompressor {
-  DECLARE_ALIGNED(16, MACROBLOCKD, mb);
-
-  DECLARE_ALIGNED(16, VP9_COMMON, common);
-
-  VP9D_CONFIG oxcf;
-
-
-  const unsigned char *Source;
-  unsigned int   source_sz;
-
-  vp9_reader *mbc;
-  int64_t last_time_stamp;
-  int   ready_for_new_data;
-
-  DETOK detoken;
-
-  vp9_dequant_idct_add_fn_t            idct_add;
-  vp9_dequant_dc_idct_add_fn_t         dc_idct_add;
-  vp9_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;
-  vp9_dequant_idct_add_y_block_fn_t    idct_add_y_block;
-  vp9_dequant_idct_add_uv_block_fn_t   idct_add_uv_block;
-
-  vp9_prob prob_skip_false;
-
-  int decoded_key_frame;
-
-} VP9D_COMP;
-
-int vp9_decode_frame(VP9D_COMP *cpi);
-
-
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval,expr) do {\
-    lval = (expr); \
-    if(!lval) \
-      vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
-                         "Failed to allocate "#lval" at %s:%d", \
-                         __FILE__,__LINE__);\
-  } while(0)
-#else
-#define CHECK_MEM_ERROR(lval,expr) do {\
-    lval = (expr); \
-    if(!lval) \
-      vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
-                         "Failed to allocate "#lval);\
-  } while(0)
-#endif
-
-#endif  // __INC_ONYXD_INT_H
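
CHECK_MEM_ERROR expands against a VP9D_COMP *pbi that must be in scope at the
call site, and aborts decoding through vpx_internal_error (which longjmps to
the decoder's error handler) on allocation failure. A hedged usage sketch
(the buffer name and size are illustrative, not from the codec):

    /* Inside a decoder function with VP9D_COMP *pbi in scope: */
    short *row_coeffs;
    CHECK_MEM_ERROR(row_coeffs,
                    vpx_memalign(16, mb_cols * 64 * sizeof(*row_coeffs)));
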
--- a/vp8/decoder/reconintra_mt.h
+++ /dev/null
@@ -1,15 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_RECONINTRA_MT_H
-#define __INC_RECONINTRA_MT_H
-
-#endif
--- a/vp8/decoder/treereader.h
+++ /dev/null
@@ -1,37 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef tree_reader_h
-#define tree_reader_h 1
-
-#include "vp8/common/treecoder.h"
-
-#include "dboolhuff.h"
-
-typedef BOOL_DECODER vp9_reader;
-
-#define vp9_read decode_bool
-#define vp9_read_literal decode_value
-#define vp9_read_bit(R) vp9_read(R, vp9_prob_half)
-
-/* Intent of tree data structure is to make decoding trivial. */
-
-static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
-                      vp9_tree t,
-                      const vp9_prob *const p) {
-  register vp9_tree_index i = 0;
-
-  while ((i = t[i + vp9_read(r, p[i >> 1])]) > 0);
-
-  return -i;
-}
-
-#endif /* tree_reader_h */
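
Here a vp9_tree is a flat array of signed indices: a non-negative entry is the
offset of the next left/right child pair, and a negative entry is the negated
symbol value, which is why treed_read loops while the index stays positive. A
minimal sketch for a hypothetical 3-symbol alphabet:

    typedef signed char tree_index_t;   /* stands in for vp9_tree_index */

    /* Symbol 0 needs one bit; symbols 1 and 2 need a second bit. */
    static const tree_index_t toy_tree[4] = { 0 /* -0: symbol 0 */, 2, -1, -2 };

    /* Scalar mirror of treed_read; read_bit(prob) stands in for vp9_read. */
    static int toy_treed_read(int (*read_bit)(unsigned char),
                              const unsigned char *probs) {
      tree_index_t i = 0;
      while ((i = toy_tree[i + read_bit(probs[i >> 1])]) > 0)
        ;
      return -i;   /* leaves store the negated symbol */
    }
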
--- a/vp8/decoder/x86/dequantize_mmx.asm
+++ /dev/null
@@ -1,406 +1,0 @@
-;
-;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-align 16
-x_s1sqr2:      times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1: times 4 dw 0x4E7B
-align 16
-pw_16:         times 4 dw 16
-
-SECTION .text
-
-INIT_MMX
-
-
-;void dequantize_b_impl_mmx(short *sq, short *dq, short *q)
-cglobal dequantize_b_impl_mmx, 3,3,0,sq,dq,arg3
-    mova       m1, [sqq]
-    pmullw     m1, [arg3q+0]            ; multiply coeffs 0..3 by dequant factors
-    mova [dqq+ 0], m1
-
-    mova       m1, [sqq+8]
-    pmullw     m1, [arg3q+8]            ; multiply coeffs 4..7 by dequant factors
-    mova [dqq+ 8], m1
-
-    mova       m1, [sqq+16]
-    pmullw     m1, [arg3q+16]           ; multiply coeffs 8..11 by dequant factors
-    mova [dqq+16], m1
-
-    mova       m1, [sqq+24]
-    pmullw     m1, [arg3q+24]           ; multiply coeffs 12..15 by dequant factors
-    mova [dqq+24], m1
-    RET
-
-
-;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
-cglobal dequant_idct_add_mmx, 4,6,0,inp,dq,pred,dest,pit,stride
-
-%if ARCH_X86_64
-    movsxd              strideq,  dword stridem
-    movsxd              pitq,     dword pitm
-%else
-    mov                 strideq,  stridem
-    mov                 pitq,     pitm
-%endif
-
-    mova                m0,       [inpq+ 0]
-    pmullw              m0,       [dqq]
-
-    mova                m1,       [inpq+ 8]
-    pmullw              m1,       [dqq+ 8]
-
-    mova                m2,       [inpq+16]
-    pmullw              m2,       [dqq+16]
-
-    mova                m3,       [inpq+24]
-    pmullw              m3,       [dqq+24]
-
-    pxor                m7,        m7
-    mova            [inpq],        m7
-    mova          [inpq+8],        m7
-    mova         [inpq+16],        m7
-    mova         [inpq+24],        m7
-
-
-    psubw               m0,        m2             ; b1= 0-2
-    paddw               m2,        m2             ;
-
-    mova                m5,        m1
-    paddw               m2,        m0             ; a1 =0+2
-
-    pmulhw              m5,       [x_s1sqr2];
-    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova                m7,        m3             ;
-    pmulhw              m7,       [x_c1sqr2less1];
-
-    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw               m7,        m5             ; c1
-
-    mova                m5,        m1
-    mova                m4,        m3
-
-    pmulhw              m5,       [x_c1sqr2less1]
-    paddw               m5,        m1
-
-    pmulhw              m3,       [x_s1sqr2]
-    paddw               m3,        m4
-
-    paddw               m3,        m5             ; d1
-    mova                m6,        m2             ; a1
-
-    mova                m4,        m0             ; b1
-    paddw               m2,        m3             ;0
-
-    paddw               m4,        m7             ;1
-    psubw               m0,        m7             ;2
-
-    psubw               m6,        m3             ;3
-
-    mova                m1,        m2             ; 03 02 01 00
-    mova                m3,        m4             ; 23 22 21 20
-
-    punpcklwd           m1,        m0             ; 11 01 10 00
-    punpckhwd           m2,        m0             ; 13 03 12 02
-
-    punpcklwd           m3,        m6             ; 31 21 30 20
-    punpckhwd           m4,        m6             ; 33 23 32 22
-
-    mova                m0,        m1             ; 11 01 10 00
-    mova                m5,        m2             ; 13 03 12 02
-
-    punpckldq           m0,        m3             ; 30 20 10 00
-    punpckhdq           m1,        m3             ; 31 21 11 01
-
-    punpckldq           m2,        m4             ; 32 22 12 02
-    punpckhdq           m5,        m4             ; 33 23 13 03
-
-    mova                m3,        m5             ; 33 23 13 03
-
-    psubw               m0,        m2             ; b1= 0-2
-    paddw               m2,        m2             ;
-
-    mova                m5,        m1
-    paddw               m2,        m0             ; a1 =0+2
-
-    pmulhw              m5,       [x_s1sqr2];
-    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova                m7,        m3             ;
-    pmulhw              m7,       [x_c1sqr2less1];
-
-    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw               m7,        m5             ; c1
-
-    mova                m5,        m1
-    mova                m4,        m3
-
-    pmulhw              m5,       [x_c1sqr2less1]
-    paddw               m5,        m1
-
-    pmulhw              m3,       [x_s1sqr2]
-    paddw               m3,        m4
-
-    paddw               m3,        m5             ; d1
-    paddw               m0,       [pw_16]
-
-    paddw               m2,       [pw_16]
-    mova                m6,        m2             ; a1
-
-    mova                m4,        m0             ; b1
-    paddw               m2,        m3             ;0
-
-    paddw               m4,        m7             ;1
-    psubw               m0,        m7             ;2
-
-    psubw               m6,        m3             ;3
-    psraw               m2,        5
-
-    psraw               m0,        5
-    psraw               m4,        5
-
-    psraw               m6,        5
-
-    mova                m1,        m2             ; 03 02 01 00
-    mova                m3,        m4             ; 23 22 21 20
-
-    punpcklwd           m1,        m0             ; 11 01 10 00
-    punpckhwd           m2,        m0             ; 13 03 12 02
-
-    punpcklwd           m3,        m6             ; 31 21 30 20
-    punpckhwd           m4,        m6             ; 33 23 32 22
-
-    mova                m0,        m1             ; 11 01 10 00
-    mova                m5,        m2             ; 13 03 12 02
-
-    punpckldq           m0,        m3             ; 30 20 10 00
-    punpckhdq           m1,        m3             ; 31 21 11 01
-
-    punpckldq           m2,        m4             ; 32 22 12 02
-    punpckhdq           m5,        m4             ; 33 23 13 03
-
-    pxor                m7,        m7
-
-    movh                m4,       [predq]
-    punpcklbw           m4,        m7
-    paddsw              m0,        m4
-    packuswb            m0,        m7
-    movh           [destq],      m0
-
-    movh                m4,       [predq+pitq]
-    punpcklbw           m4,        m7
-    paddsw              m1,        m4
-    packuswb            m1,        m7
-    movh   [destq+strideq],        m1
-
-    movh                m4,       [predq+2*pitq]
-    punpcklbw           m4,        m7
-    paddsw              m2,        m4
-    packuswb            m2,        m7
-    movh [destq+strideq*2],        m2
-
-    add              destq,        strideq
-    add              predq,        pitq
-
-    movh                m4,       [predq+2*pitq]
-    punpcklbw           m4,        m7
-    paddsw              m5,        m4
-    packuswb            m5,        m7
-    movh [destq+strideq*2],        m5
-    RET
-
-
-;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
-cglobal dequant_dc_idct_add_mmx, 4,7,0,inp,dq,pred,dest,pit,stride,Dc
-
-%if ARCH_X86_64
-    movsxd              strideq,   dword stridem
-    movsxd              pitq,      dword pitm
-%else
-    mov                 strideq,   stridem
-    mov                 pitq,      pitm
-%endif
-
-    mov                 Dcq, Dcm
-    mova                m0,       [inpq+ 0]
-    pmullw              m0,       [dqq+ 0]
-
-    mova                m1,       [inpq+ 8]
-    pmullw              m1,       [dqq+ 8]
-
-    mova                m2,       [inpq+16]
-    pmullw              m2,       [dqq+16]
-
-    mova                m3,       [inpq+24]
-    pmullw              m3,       [dqq+24]
-
-    pxor                m7,        m7
-    mova         [inpq+ 0],        m7
-    mova         [inpq+ 8],        m7
-    mova         [inpq+16],        m7
-    mova         [inpq+24],        m7
-
-    ; move lower word of Dc to lower word of m0
-    psrlq               m0,        16
-    psllq               m0,        16
-    and                Dcq,        0xFFFF         ; If Dc < 0, we don't want the full dword precision.
-    movh                m7,        Dcq
-    por                 m0,        m7
-    psubw               m0,        m2             ; b1= 0-2
-    paddw               m2,        m2             ;
-
-    mova                m5,        m1
-    paddw               m2,        m0             ; a1 =0+2
-
-    pmulhw              m5,       [x_s1sqr2];
-    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova                m7,        m3             ;
-    pmulhw              m7,       [x_c1sqr2less1];
-
-    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw               m7,        m5             ; c1
-
-    mova                m5,        m1
-    mova                m4,        m3
-
-    pmulhw              m5,       [x_c1sqr2less1]
-    paddw               m5,        m1
-
-    pmulhw              m3,       [x_s1sqr2]
-    paddw               m3,        m4
-
-    paddw               m3,        m5             ; d1
-    mova                m6,        m2             ; a1
-
-    mova                m4,        m0             ; b1
-    paddw               m2,        m3             ;0
-
-    paddw               m4,        m7             ;1
-    psubw               m0,        m7             ;2
-
-    psubw               m6,        m3             ;3
-
-    mova                m1,        m2             ; 03 02 01 00
-    mova                m3,        m4             ; 23 22 21 20
-
-    punpcklwd           m1,        m0             ; 11 01 10 00
-    punpckhwd           m2,        m0             ; 13 03 12 02
-
-    punpcklwd           m3,        m6             ; 31 21 30 20
-    punpckhwd           m4,        m6             ; 33 23 32 22
-
-    mova                m0,        m1             ; 11 01 10 00
-    mova                m5,        m2             ; 13 03 12 02
-
-    punpckldq           m0,        m3             ; 30 20 10 00
-    punpckhdq           m1,        m3             ; 31 21 11 01
-
-    punpckldq           m2,        m4             ; 32 22 12 02
-    punpckhdq           m5,        m4             ; 33 23 13 03
-
-    mova                m3,        m5             ; 33 23 13 03
-
-    psubw               m0,        m2             ; b1= 0-2
-    paddw               m2,        m2             ;
-
-    mova                m5,        m1
-    paddw               m2,        m0             ; a1 =0+2
-
-    pmulhw              m5,       [x_s1sqr2];
-    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
-
-    mova                m7,        m3             ;
-    pmulhw              m7,       [x_c1sqr2less1];
-
-    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
-    psubw               m7,        m5             ; c1
-
-    mova                m5,        m1
-    mova                m4,        m3
-
-    pmulhw              m5,       [x_c1sqr2less1]
-    paddw               m5,        m1
-
-    pmulhw              m3,       [x_s1sqr2]
-    paddw               m3,        m4
-
-    paddw               m3,        m5             ; d1
-    paddw               m0,       [pw_16]
-
-    paddw               m2,       [pw_16]
-    mova                m6,        m2             ; a1
-
-    mova                m4,        m0             ; b1
-    paddw               m2,        m3             ;0
-
-    paddw               m4,        m7             ;1
-    psubw               m0,        m7             ;2
-
-    psubw               m6,        m3             ;3
-    psraw               m2,        5
-
-    psraw               m0,        5
-    psraw               m4,        5
-
-    psraw               m6,        5
-
-    mova                m1,        m2             ; 03 02 01 00
-    mova                m3,        m4             ; 23 22 21 20
-
-    punpcklwd           m1,        m0             ; 11 01 10 00
-    punpckhwd           m2,        m0             ; 13 03 12 02
-
-    punpcklwd           m3,        m6             ; 31 21 30 20
-    punpckhwd           m4,        m6             ; 33 23 32 22
-
-    mova                m0,        m1             ; 11 01 10 00
-    mova                m5,        m2             ; 13 03 12 02
-
-    punpckldq           m0,        m3             ; 30 20 10 00
-    punpckhdq           m1,        m3             ; 31 21 11 01
-
-    punpckldq           m2,        m4             ; 32 22 12 02
-    punpckhdq           m5,        m4             ; 33 23 13 03
-
-    pxor                m7,        m7
-
-    movh                m4,       [predq]
-    punpcklbw           m4,        m7
-    paddsw              m0,        m4
-    packuswb            m0,        m7
-    movh           [destq],        m0
-
-    movh                m4,       [predq+pitq]
-    punpcklbw           m4,        m7
-    paddsw              m1,        m4
-    packuswb            m1,        m7
-    movh   [destq+strideq],        m1
-
-    movh                m4,       [predq+2*pitq]
-    punpcklbw           m4,        m7
-    paddsw              m2,        m4
-    packuswb            m2,        m7
-    movh [destq+strideq*2],        m2
-
-    add              destq,        strideq
-    add              predq,        pitq
-
-    movh                m4,       [predq+2*pitq]
-    punpcklbw           m4,        m7
-    paddsw              m5,        m4
-    packuswb            m5,        m7
-    movh [destq+strideq*2],        m5
-    RET
-
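
The scalar operation the MMX routine vectorizes is just an elementwise
multiply over the 16 coefficients of one 4x4 block. A plain-C equivalent of
dequantize_b_impl_mmx:

    /* C reference: dq[i] = sq[i] * q[i] for one 4x4 block. */
    static void dequantize_b_c(const short *sq, short *dq, const short *q) {
      int i;
      for (i = 0; i < 16; i++)
        dq[i] = sq[i] * q[i];
    }
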
--- a/vp8/decoder/x86/idct_blk_mmx.c
+++ /dev/null
@@ -1,143 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
-
-void vp9_dequant_dc_idct_add_y_block_mmx(short *q, short *dq,
-                                         unsigned char *pre,
-                                         unsigned char *dst,
-                                         int stride, char *eobs, short *dc) {
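-  /* eobs[j] is the coefficient count for 4x4 block j: 0 or 1 means at most
-   * the DC term is present, so the cheap DC-only path suffices. */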
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (eobs[0] > 1)
-      vp9_dequant_dc_idct_add_mmx(q, dq, pre, dst, 16, stride, dc[0]);
-    else
-      vp9_dc_only_idct_add_mmx(dc[0], pre, dst, 16, stride);
-
-    if (eobs[1] > 1)
-      vp9_dequant_dc_idct_add_mmx(q + 16, dq, pre + 4,
-                                  dst + 4, 16, stride, dc[1]);
-    else
-      vp9_dc_only_idct_add_mmx(dc[1], pre + 4, dst + 4, 16, stride);
-
-    if (eobs[2] > 1)
-      vp9_dequant_dc_idct_add_mmx(q + 32, dq, pre + 8,
-                                  dst + 8, 16, stride, dc[2]);
-    else
-      vp9_dc_only_idct_add_mmx(dc[2], pre + 8, dst + 8, 16, stride);
-
-    if (eobs[3] > 1)
-      vp9_dequant_dc_idct_add_mmx(q + 48, dq, pre + 12,
-                                  dst + 12, 16, stride, dc[3]);
-    else
-      vp9_dc_only_idct_add_mmx(dc[3], pre + 12, dst + 12, 16, stride);
-
-    q    += 64;
-    dc   += 4;
-    pre  += 64;
-    dst  += 4 * stride;
-    eobs += 4;
-  }
-}
-
-void vp9_dequant_idct_add_y_block_mmx(short *q, short *dq,
-                                      unsigned char *pre,
-                                      unsigned char *dst,
-                                      int stride, char *eobs) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (eobs[0] > 1)
-      vp9_dequant_idct_add_mmx(q, dq, pre, dst, 16, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dst, 16, stride);
-      ((int *)q)[0] = 0;
-    }
-
-    if (eobs[1] > 1)
-      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dst + 4, 16, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dst + 4, 16, stride);
-      ((int *)(q + 16))[0] = 0;
-    }
-
-    if (eobs[2] > 1)
-      vp9_dequant_idct_add_mmx(q + 32, dq, pre + 8, dst + 8, 16, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[32]*dq[0], pre + 8, dst + 8, 16, stride);
-      ((int *)(q + 32))[0] = 0;
-    }
-
-    if (eobs[3] > 1)
-      vp9_dequant_idct_add_mmx(q + 48, dq, pre + 12, dst + 12, 16, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[48]*dq[0], pre + 12, dst + 12, 16, stride);
-      ((int *)(q + 48))[0] = 0;
-    }
-
-    q    += 64;
-    pre  += 64;
-    dst  += 4 * stride;
-    eobs += 4;
-  }
-}
-
-void vp9_dequant_idct_add_uv_block_mmx(short *q, short *dq,
-                                       unsigned char *pre,
-                                       unsigned char *dstu,
-                                       unsigned char *dstv,
-                                       int stride, char *eobs) {
-  int i;
-
-  for (i = 0; i < 2; i++) {
-    if (eobs[0] > 1)
-      vp9_dequant_idct_add_mmx(q, dq, pre, dstu, 8, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstu, 8, stride);
-      ((int *)q)[0] = 0;
-    }
-
-    if (eobs[1] > 1)
-      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstu + 4, 8, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstu + 4, 8, stride);
-      ((int *)(q + 16))[0] = 0;
-    }
-
-    q    += 32;
-    pre  += 32;
-    dstu += 4 * stride;
-    eobs += 2;
-  }
-
-  for (i = 0; i < 2; i++) {
-    if (eobs[0] > 1)
-      vp9_dequant_idct_add_mmx(q, dq, pre, dstv, 8, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstv, 8, stride);
-      ((int *)q)[0] = 0;
-    }
-
-    if (eobs[1] > 1)
-      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstv + 4, 8, stride);
-    else {
-      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstv + 4, 8, stride);
-      ((int *)(q + 16))[0] = 0;
-    }
-
-    q    += 32;
-    pre  += 32;
-    dstv += 4 * stride;
-    eobs += 2;
-  }
-}
--- a/vp8/decoder/x86/idct_blk_sse2.c
+++ /dev/null
@@ -1,116 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
-
-void vp9_idct_dequant_dc_0_2x_sse2(short *q, short *dq,
-                                   unsigned char *pre, unsigned char *dst,
-                                   int dst_stride, short *dc);
-
-void vp9_idct_dequant_dc_full_2x_sse2(short *q, short *dq,
-                                      unsigned char *pre, unsigned char *dst,
-                                      int dst_stride, short *dc);
-
-void vp9_idct_dequant_0_2x_sse2(short *q, short *dq,
-                                unsigned char *pre, unsigned char *dst,
-                                int dst_stride, int blk_stride);
-
-void vp9_idct_dequant_full_2x_sse2(short *q, short *dq,
-                                   unsigned char *pre, unsigned char *dst,
-                                   int dst_stride, int blk_stride);
-
-void vp9_dequant_dc_idct_add_y_block_sse2(short *q, short *dq,
-                                          unsigned char *pre,
-                                          unsigned char *dst,
-                                          int stride, char *eobs, short *dc) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
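-    /* Two adjacent char eobs are read as one short below; the 0xfefe mask
-     * is non-zero iff either byte exceeds 1, i.e. either of the paired
-     * 4x4 blocks has coefficients beyond DC. */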
-    if (((short *)(eobs))[0] & 0xfefe)
-      vp9_idct_dequant_dc_full_2x_sse2(q, dq, pre, dst, stride, dc);
-    else
-      vp9_idct_dequant_dc_0_2x_sse2(q, dq, pre, dst, stride, dc);
-
-    if (((short *)(eobs))[1] & 0xfefe)
-      vp9_idct_dequant_dc_full_2x_sse2(q + 32, dq, pre + 8, dst + 8,
-                                       stride, dc + 2);
-    else
-      vp9_idct_dequant_dc_0_2x_sse2(q + 32, dq, pre + 8, dst + 8,
-                                    stride, dc + 2);
-
-    q    += 64;
-    dc   += 4;
-    pre  += 64;
-    dst  += stride * 4;
-    eobs += 4;
-  }
-}
-
-void vp9_dequant_idct_add_y_block_sse2(short *q, short *dq,
-                                       unsigned char *pre, unsigned char *dst,
-                                       int stride, char *eobs) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (((short *)(eobs))[0] & 0xfefe)
-      vp9_idct_dequant_full_2x_sse2(q, dq, pre, dst, stride, 16);
-    else
-      vp9_idct_dequant_0_2x_sse2(q, dq, pre, dst, stride, 16);
-
-    if (((short *)(eobs))[1] & 0xfefe)
-      vp9_idct_dequant_full_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16);
-    else
-      vp9_idct_dequant_0_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16);
-
-    q    += 64;
-    pre  += 64;
-    dst  += stride * 4;
-    eobs += 4;
-  }
-}
-
-void vp9_dequant_idct_add_uv_block_sse2(short *q, short *dq,
-                                        unsigned char *pre,
-                                        unsigned char *dstu,
-                                        unsigned char *dstv,
-                                        int stride, char *eobs) {
-  if (((short *)(eobs))[0] & 0xfefe)
-    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8);
-  else
-    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8);
-
-  q    += 32;
-  pre  += 32;
-  dstu += stride * 4;
-
-  if (((short *)(eobs))[1] & 0xfefe)
-    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8);
-  else
-    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8);
-
-  q    += 32;
-  pre  += 32;
-
-  if (((short *)(eobs))[2] & 0xfefe)
-    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8);
-  else
-    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8);
-
-  q    += 32;
-  pre  += 32;
-  dstv += stride * 4;
-
-  if (((short *)(eobs))[3] & 0xfefe)
-    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8);
-  else
-    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8);
-}
--- a/vp8/decoder/x86/x86_dsystemdependent.c
+++ /dev/null
@@ -1,26 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vpx_ports/x86.h"
-#include "vp8/decoder/onyxd_int.h"
-
-#if HAVE_MMX
-void vp9_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
-
-void vp9_dequantize_b_mmx(BLOCKD *d) {
-  short *sq = (short *) d->qcoeff;
-  short *dq = (short *) d->dqcoeff;
-  short *q = (short *) d->dequant;
-  vp9_dequantize_b_impl_mmx(sq, dq, q);
-}
-#endif
-
-
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ /dev/null
@@ -1,129 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_ports/arm.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-extern void (*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
-extern void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
-extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
-
-void vp9_arch_arm_encoder_init(VP9_COMP *cpi) {
-#if CONFIG_RUNTIME_CPU_DETECT
-  int flags = cpi->common.rtcd.flags;
-
-#if HAVE_ARMV5TE
-  if (flags & HAS_EDSP) {
-  }
-#endif
-
-#if HAVE_ARMV6
-  if (flags & HAS_MEDIA) {
-    cpi->rtcd.variance.sad16x16              = vp9_sad16x16_armv6;
-    /*cpi->rtcd.variance.sad16x8               = vp9_sad16x8_c;
-    cpi->rtcd.variance.sad8x16               = vp9_sad8x16_c;
-    cpi->rtcd.variance.sad8x8                = vp9_sad8x8_c;
-    cpi->rtcd.variance.sad4x4                = vp9_sad4x4_c;*/
-
-    /*cpi->rtcd.variance.var4x4                = vp9_variance4x4_c;*/
-    cpi->rtcd.variance.var8x8                = vp9_variance8x8_armv6;
-    /*cpi->rtcd.variance.var8x16               = vp9_variance8x16_c;
-    cpi->rtcd.variance.var16x8               = vp9_variance16x8_c;*/
-    cpi->rtcd.variance.var16x16              = vp9_variance16x16_armv6;
-
-    /*cpi->rtcd.variance.subpixvar4x4          = vp9_sub_pixel_variance4x4_c;*/
-    cpi->rtcd.variance.subpixvar8x8          = vp9_sub_pixel_variance8x8_armv6;
-    /*cpi->rtcd.variance.subpixvar8x16         = vp9_sub_pixel_variance8x16_c;
-    cpi->rtcd.variance.subpixvar16x8         = vp9_sub_pixel_variance16x8_c;*/
-    cpi->rtcd.variance.subpixvar16x16        = vp9_sub_pixel_variance16x16_armv6;
-    cpi->rtcd.variance.halfpixvar16x16_h     = vp9_variance_halfpixvar16x16_h_armv6;
-    cpi->rtcd.variance.halfpixvar16x16_v     = vp9_variance_halfpixvar16x16_v_armv6;
-    cpi->rtcd.variance.halfpixvar16x16_hv    = vp9_variance_halfpixvar16x16_hv_armv6;
-
-    cpi->rtcd.variance.mse16x16              = vp9_mse16x16_armv6;
-    /*cpi->rtcd.variance.getmbss               = vp9_get_mb_ss_c;*/
-
-    cpi->rtcd.fdct.short4x4                  = vp9_short_fdct4x4_armv6;
-    cpi->rtcd.fdct.short8x4                  = vp9_short_fdct8x4_armv6;
-    cpi->rtcd.fdct.fast4x4                   = vp9_short_fdct4x4_armv6;
-    cpi->rtcd.fdct.fast8x4                   = vp9_short_fdct8x4_armv6;
-    cpi->rtcd.fdct.walsh_short4x4            = vp9_short_walsh4x4_armv6;
-
-    /*cpi->rtcd.encodemb.berr                  = vp9_block_error_c;
-    cpi->rtcd.encodemb.mberr                 = vp9_mbblock_error_c;
-    cpi->rtcd.encodemb.mbuverr               = vp9_mbuverror_c;*/
-    cpi->rtcd.encodemb.subb                  = vp9_subtract_b_armv6;
-    cpi->rtcd.encodemb.submby                = vp9_subtract_mby_armv6;
-    cpi->rtcd.encodemb.submbuv               = vp9_subtract_mbuv_armv6;
-
-    /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;*/
-    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_armv6;
-  }
-#endif
-
-#if HAVE_ARMV7
-  if (flags & HAS_NEON) {
-    cpi->rtcd.variance.sad16x16              = vp9_sad16x16_neon;
-    cpi->rtcd.variance.sad16x8               = vp9_sad16x8_neon;
-    cpi->rtcd.variance.sad8x16               = vp9_sad8x16_neon;
-    cpi->rtcd.variance.sad8x8                = vp9_sad8x8_neon;
-    cpi->rtcd.variance.sad4x4                = vp9_sad4x4_neon;
-
-    /*cpi->rtcd.variance.var4x4                = vp9_variance4x4_c;*/
-    cpi->rtcd.variance.var8x8                = vp9_variance8x8_neon;
-    cpi->rtcd.variance.var8x16               = vp9_variance8x16_neon;
-    cpi->rtcd.variance.var16x8               = vp9_variance16x8_neon;
-    cpi->rtcd.variance.var16x16              = vp9_variance16x16_neon;
-
-    /*cpi->rtcd.variance.subpixvar4x4          = vp9_sub_pixel_variance4x4_c;*/
-    cpi->rtcd.variance.subpixvar8x8          = vp9_sub_pixel_variance8x8_neon;
-    /*cpi->rtcd.variance.subpixvar8x16         = vp9_sub_pixel_variance8x16_c;
-    cpi->rtcd.variance.subpixvar16x8         = vp9_sub_pixel_variance16x8_c;*/
-    cpi->rtcd.variance.subpixvar16x16        = vp9_sub_pixel_variance16x16_neon;
-    cpi->rtcd.variance.halfpixvar16x16_h     = vp9_variance_halfpixvar16x16_h_neon;
-    cpi->rtcd.variance.halfpixvar16x16_v     = vp9_variance_halfpixvar16x16_v_neon;
-    cpi->rtcd.variance.halfpixvar16x16_hv    = vp9_variance_halfpixvar16x16_hv_neon;
-
-    cpi->rtcd.variance.mse16x16              = vp9_mse16x16_neon;
-    /*cpi->rtcd.variance.getmbss               = vp9_get_mb_ss_c;*/
-
-    cpi->rtcd.fdct.short4x4                  = vp9_short_fdct4x4_neon;
-    cpi->rtcd.fdct.short8x4                  = vp9_short_fdct8x4_neon;
-    cpi->rtcd.fdct.fast4x4                   = vp9_short_fdct4x4_neon;
-    cpi->rtcd.fdct.fast8x4                   = vp9_short_fdct8x4_neon;
-    cpi->rtcd.fdct.walsh_short4x4            = vp9_short_walsh4x4_neon;
-
-    /*cpi->rtcd.encodemb.berr                  = vp9_block_error_c;
-    cpi->rtcd.encodemb.mberr                 = vp9_mbblock_error_c;
-    cpi->rtcd.encodemb.mbuverr               = vp9_mbuverror_c;*/
-    cpi->rtcd.encodemb.subb                  = vp9_subtract_b_neon;
-    cpi->rtcd.encodemb.submby                = vp9_subtract_mby_neon;
-    cpi->rtcd.encodemb.submbuv               = vp9_subtract_mbuv_neon;
-
-    /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
-    cpi->rtcd.quantize.quantb_pair           = vp8_regular_quantize_b_pair;*/
-    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_neon;
-    cpi->rtcd.quantize.fastquantb_pair       = vp8_fast_quantize_b_pair_neon;
-  }
-#endif
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (flags & HAS_NEON)
-#endif
-  {
-    vp9_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon;
-  }
-#endif
-#endif
-}
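
The pattern here is probe once, dispatch forever: CPU feature flags are read
at init time and the rtcd table is patched with the fastest available
implementation, so hot loops pay only an indirect call. A stripped-down sketch
of the same idea (the flag value and function names are illustrative):

    #define HAS_NEON 0x04   /* illustrative flag bit */

    typedef unsigned (*sad16x16_fn)(const unsigned char *a,
                                    const unsigned char *b);

    static unsigned sad16x16_c(const unsigned char *a, const unsigned char *b) {
      unsigned s = 0;
      int i;
      for (i = 0; i < 256; i++)   /* 16x16 block, laid out contiguously here */
        s += a[i] > b[i] ? a[i] - b[i] : b[i] - a[i];
      return s;
    }

    /* A real build would point this at a NEON routine; a stub keeps the
     * sketch self-contained. */
    static unsigned sad16x16_neon(const unsigned char *a,
                                  const unsigned char *b) {
      return sad16x16_c(a, b);
    }

    struct rtcd { sad16x16_fn sad16x16; };

    static void rtcd_init(struct rtcd *t, int cpu_flags) {
      t->sad16x16 = sad16x16_c;        /* safe default */
      if (cpu_flags & HAS_NEON)
        t->sad16x16 = sad16x16_neon;   /* override when the unit is present */
    }
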
--- a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
+++ /dev/null
@@ -1,286 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8_start_encode|
-    EXPORT |vp9_encode_bool|
-    EXPORT |vp8_stop_encode|
-    EXPORT |vp8_encode_value|
-
-    INCLUDE asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-
-; r0 BOOL_CODER *br
-; r1 unsigned char *source
-
-|vp8_start_encode| PROC
-    mov     r12, #0
-    mov     r3,  #255
-    mvn     r2,  #23
-    str     r12, [r0, #vp9_writer_lowvalue]
-    str     r3,  [r0, #vp9_writer_range]
-    str     r12, [r0, #vp9_writer_value]
-    str     r2,  [r0, #vp9_writer_count]
-    str     r12, [r0, #vp9_writer_pos]
-    str     r1,  [r0, #vp9_writer_buffer]
-    bx      lr
-    ENDP
-
-; r0 BOOL_CODER *br
-; r1 int bit
-; r2 int probability
-|vp9_encode_bool| PROC
-    push    {r4-r9, lr}
-
-    mov     r4, r2
-
-    ldr     r2, [r0, #vp9_writer_lowvalue]
-    ldr     r5, [r0, #vp9_writer_range]
-    ldr     r3, [r0, #vp9_writer_count]
-
-    sub     r7, r5, #1                  ; range-1
-
-    cmp     r1, #0
-    mul     r6, r4, r7                  ; ((range-1) * probability)
-
-    mov     r7, #1
-    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * probability) >> 8)
-
-    addne   r2, r2, r4                  ; if  (bit) lowvalue += split
-    subne   r4, r5, r4                  ; if  (bit) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; The adds instruction sets the flags from the new count; the
-    ; sign flag is used below to test whether count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start
-token_zero_while_loop
-    mov     r9, #0
-    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r1, [r7, r4]
-    cmpge   r1, #0xff
-    beq     token_zero_while_loop
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r9, [r7, r4]                ; w->buffer[x]
-    add     r9, r9, #1
-    strb    r9, [r7, r4]                ; w->buffer[x] + 1
-token_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r9, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r1, r4, #1                  ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r1, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
-
-token_count_lt_zero
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    str     r2, [r0, #vp9_writer_lowvalue]
-    str     r5, [r0, #vp9_writer_range]
-    str     r3, [r0, #vp9_writer_count]
-    pop     {r4-r9, pc}
-    ENDP
-
-; r0 BOOL_CODER *br
-|vp8_stop_encode| PROC
-    push    {r4-r10, lr}
-
-    ldr     r2, [r0, #vp9_writer_lowvalue]
-    ldr     r5, [r0, #vp9_writer_range]
-    ldr     r3, [r0, #vp9_writer_count]
-
-    mov     r10, #32
-
-stop_encode_loop
-    sub     r7, r5, #1                  ; range-1
-
-    mov     r4, r7, lsl #7              ; ((range-1) * 128)
-
-    mov     r7, #1
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; The adds instruction sets the flags from the new count; the
-    ; sign flag is used below to test whether count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero_se      ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set_se
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start_se
-token_zero_while_loop_se
-    mov     r9, #0
-    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start_se
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r1, [r7, r4]
-    cmpge   r1, #0xff
-    beq     token_zero_while_loop_se
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r9, [r7, r4]                ; w->buffer[x]
-    add     r9, r9, #1
-    strb    r9, [r7, r4]                ; w->buffer[x] + 1
-token_high_bit_not_set_se
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r9, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r1, r4, #1                  ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r1, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
-
-token_count_lt_zero_se
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r10, r10, #1
-    bne     stop_encode_loop
-
-    str     r2, [r0, #vp9_writer_lowvalue]
-    str     r5, [r0, #vp9_writer_range]
-    str     r3, [r0, #vp9_writer_count]
-    pop     {r4-r10, pc}
-
-    ENDP
-
-; r0 BOOL_CODER *br
-; r1 int data
-; r2 int bits
-|vp8_encode_value| PROC
-    push    {r4-r11, lr}
-
-    mov     r10, r2
-
-    ldr     r2, [r0, #vp9_writer_lowvalue]
-    ldr     r5, [r0, #vp9_writer_range]
-    ldr     r3, [r0, #vp9_writer_count]
-
-    rsb     r4, r10, #32                 ; 32-n
-
-    ; v is kept in r1 during the token pack loop
-    lsl     r1, r1, r4                  ; r1 = v << 32 - n
-
-encode_value_loop
-    sub     r7, r5, #1                  ; range-1
-
-    ; Decisions are made based on the bit value shifted
-    ; off of v, so set a flag here based on this.
-    ; This value is referred to as "bb"
-    lsls    r1, r1, #1                  ; bit = v >> n
-    mov     r4, r7, lsl #7              ; ((range-1) * 128)
-
-    mov     r7, #1
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bit) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bit) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; The adds instruction sets the flags from the new count; the
-    ; sign flag is used below to test whether count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero_ev      ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set_ev
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start_ev
-token_zero_while_loop_ev
-    mov     r9, #0
-    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start_ev
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop_ev
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r9, [r7, r4]                ; w->buffer[x]
-    add     r9, r9, #1
-    strb    r9, [r7, r4]                ; w->buffer[x] + 1
-token_high_bit_not_set_ev
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r9, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
-
-token_count_lt_zero_ev
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r10, r10, #1
-    bne     encode_value_loop
-
-    str     r2, [r0, #vp9_writer_lowvalue]
-    str     r5, [r0, #vp9_writer_range]
-    str     r3, [r0, #vp9_writer_count]
-    pop     {r4-r11, pc}
-    ENDP
-
-    END
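
Behind the register scheduling above sits a short arithmetic core: split the
current range in proportion to the bit's probability, keep one half, then
shift range back into [128, 255] (the step the clz computes in a single
instruction). A scalar sketch of the encoder, with the byte-output and carry
walk (the 0xff loops above) elided:

    typedef struct {
      unsigned lowvalue;
      unsigned range;   /* kept in [128, 255] between calls */
      int count;        /* bits until the next byte of lowvalue is flushed */
    } bool_writer_t;

    /* Encode one bit; prob/256 is the probability of the bit being 0. */
    static void encode_bool_sketch(bool_writer_t *w, int bit, int prob) {
      unsigned split = 1 + (((w->range - 1) * prob) >> 8);

      if (bit) {
        w->lowvalue += split;   /* take the upper sub-interval */
        w->range    -= split;
      } else {
        w->range = split;       /* take the lower sub-interval */
      }

      while (w->range < 128) {  /* renormalize */
        w->range    <<= 1;
        w->lowvalue <<= 1;
        if (++w->count == 0) {
          /* a byte of lowvalue would be emitted here, with carry
           * propagation into already-written bytes */
          w->count = -8;
          w->lowvalue &= 0xffffff;
        }
      }
    }
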
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
+++ /dev/null
@@ -1,291 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8cx_pack_tokens_armv5|
-
-    INCLUDE asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-
-; r0 vp9_writer *w
-; r1 const TOKENEXTRA *p
-; r2 int xcount
-; r3 vp8_coef_encodings
-; s0 vp8_extra_bits
-; s1 vp8_coef_tree
-|vp8cx_pack_tokens_armv5| PROC
-    push    {r4-r11, lr}
-
-    ; Add xcount * sizeof (TOKENEXTRA) to p to get stop
-    ;  sizeof (TOKENEXTRA) is 8
-    sub     sp, sp, #12
-    add     r2, r1, r2, lsl #3          ; stop = p + xcount*sizeof(TOKENEXTRA)
-    str     r2, [sp, #0]
-    str     r3, [sp, #8]                ; save vp8_coef_encodings
-    ldr     r2, [r0, #vp9_writer_lowvalue]
-    ldr     r5, [r0, #vp9_writer_range]
-    ldr     r3, [r0, #vp9_writer_count]
-    b       check_p_lt_stop
-
-while_p_lt_stop
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r4, [sp, #8]                ; vp8_coef_encodings
-    mov     lr, #0
-    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
-    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
-
-    ldrb    r7, [r1, #tokenextra_skip_eob_node]
-
-    ldr     r6, [r4, #vp9_token_value]  ; v
-    ldr     r8, [r4, #vp9_token_len]    ; n
-
-    ; vp8 specific skip_eob_node
-    cmp     r7, #0
-    movne   lr, #2                      ; i = 2
-    subne   r8, r8, #1                  ; --n
-
-    rsb     r4, r8, #32                 ; 32-n
-    ldr     r10, [sp, #52]              ; vp8_coef_tree
-
-    ; v is kept in r12 during the token pack loop
-    lsl     r12, r6, r4                ; r12 = v << 32 - n
-
-; loop start
-token_loop
-    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
-    sub     r7, r5, #1                  ; range-1
-
-    ; Decisions are made based on the bit value shifted
-    ; off of v, so set a flag here based on this.
-    ; This value is referred to as "bb"
-    lsls    r12, r12, #1                ; bb = v >> n
-    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
-
-    ; bb can only be 0 or 1.  So only execute this statement
-    ; if bb == 1, otherwise it will act like i + 0
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; The adds instruction sets the flags from the new count; the
-    ; sign flag is used below to test whether count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start
-token_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r10, [r7, r4]               ; w->buffer[x]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]               ; w->buffer[x] + 1
-token_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
-
-    ; r10 normally holds vp8_coef_tree, but was reused as a temp
-    ; variable above, so reload vp8_coef_tree into r10 here
-    ldr     r10, [sp, #52]              ; vp8_coef_tree
-
-token_count_lt_zero
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r8, r8, #1                  ; --n
-    bne     token_loop
-
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r7, [sp, #48]               ; vp8_extra_bits
-    ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
-    ;  element.  Here sizeof (vp9_extra_bit_struct) == 16
-    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
-
-    ldr     r4, [r12, #vp9_extra_bit_struct_base_val]
-    cmp     r4, #0
-    beq     skip_extra_bits
-
-;   if( b->base_val)
-    ldr     r8, [r12, #vp9_extra_bit_struct_len] ; L
-    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
-    cmp     r8, #0                      ; if( L)
-    beq     no_extra_bits
-
-    ldr     r9, [r12, #vp9_extra_bit_struct_prob]
-    asr     r7, lr, #1                  ; v=e>>1
-
-    ldr     r10, [r12, #vp9_extra_bit_struct_tree]
-    str     r10, [sp, #4]               ; b->tree
-
-    rsb     r4, r8, #32
-    lsl     r12, r7, r4
-
-    mov     lr, #0                      ; i = 0
-
-extra_bits_loop
-    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
-    sub     r7, r5, #1                  ; range-1
-    lsls    r12, r12, #1                ; v >> n
-    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    clz     r6, r4
-    sub     r6, r6, #24
-
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     extra_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset= shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     extra_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos - 1
-    b       extra_zero_while_start
-extra_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-extra_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     extra_zero_while_loop
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r10, [r7, r4]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]
-extra_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
-    ldr     r10, [sp, #4]               ; b->tree
-extra_count_lt_zero
-    lsl     r2, r2, r6
-
-    subs    r8, r8, #1                  ; --n
-    bne     extra_bits_loop             ; while (n)
-
-no_extra_bits
-    ldr     lr, [r1, #4]                ; e = p->Extra
-    add     r4, r5, #1                  ; range + 1
-    tst     lr, #1
-    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
-    addne   r2, r2, r4                  ; lowvalue += split
-    subne   r4, r5, r4                  ; range = range-split
-    tst     r2, #0x80000000             ; lowvalue & 0x80000000
-    lsl     r5, r4, #1                  ; range <<= 1
-    beq     end_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]
-    mov     r7, #0
-    sub     r4, r4, #1
-    b       end_zero_while_start
-end_zero_while_loop
-    strb    r7, [r6, r4]
-    sub     r4, r4, #1                  ; x--
-end_zero_while_start
-    cmp     r4, #0
-    ldrge   r6, [r0, #vp9_writer_buffer]
-    ldrb    r12, [r6, r4]
-    cmpge   r12, #0xff
-    beq     end_zero_while_loop
-
-    ldr     r6, [r0, #vp9_writer_buffer]
-    ldrb    r7, [r6, r4]
-    add     r7, r7, #1
-    strb    r7, [r6, r4]
-end_high_bit_not_set
-    adds    r3, r3, #1                  ; ++count
-    lsl     r2, r2, #1                  ; lowvalue  <<= 1
-    bne     end_count_zero
-
-    ldr     r4, [r0, #vp9_writer_pos]
-    mvn     r3, #7
-    ldr     r7, [r0, #vp9_writer_buffer]
-    lsr     r6, r2, #24                 ; lowvalue >> 24
-    add     r12, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r12, [r0, #0x10]            ; w->pos
-    strb    r6, [r7, r4]
-end_count_zero
-skip_extra_bits
-    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
-check_p_lt_stop
-    ldr     r4, [sp, #0]                ; stop
-    cmp     r1, r4                      ; while( p < stop)
-    bcc     while_p_lt_stop
-
-    str     r2, [r0, #vp9_writer_lowvalue]
-    str     r5, [r0, #vp9_writer_range]
-    str     r3, [r0, #vp9_writer_count]
-    add     sp, sp, #12
-    pop     {r4-r11, pc}
-    ENDP
-
-    END
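
The token-packing code above is an unrolled ARM implementation of VP8's
boolean (range) encoder. A rough C sketch of the per-bit step it repeats --
split computation, renormalization via a leading-zero count, byte output,
and carry propagation back through any 0xff bytes -- is given below. The
struct fields follow the vp9_writer_* offsets referenced in the assembly,
but the names and layout here are illustrative stand-ins, not the generated
ones:

    typedef struct {
        unsigned int lowvalue;   /* vp9_writer_lowvalue */
        unsigned int range;      /* vp9_writer_range */
        int count;               /* vp9_writer_count, starts at -24 */
        unsigned int pos;        /* vp9_writer_pos */
        unsigned char *buffer;   /* vp9_writer_buffer */
    } writer_sketch;

    static void encode_bool_sketch(writer_sketch *w, int bit, int prob)
    {
        unsigned int split = 1 + (((w->range - 1) * prob) >> 8);
        unsigned int lowvalue = w->lowvalue;
        unsigned int range = bit ? w->range - split : split;
        int shift = __builtin_clz(range) - 24;  /* clz r6, r4; sub #24 */

        if (bit)
            lowvalue += split;                  /* addcs r2, r2, r4 */

        range <<= shift;                        /* renormalize range */
        w->count += shift;

        if (w->count >= 0) {                    /* a byte is ready */
            int offset = shift - w->count;

            if ((lowvalue << (offset - 1)) & 0x80000000) {
                /* carry: walk back over 0xff bytes; like the assembly,
                 * this assumes the carry never runs past buffer[0] */
                int x = (int)w->pos - 1;
                while (x >= 0 && w->buffer[x] == 0xff)
                    w->buffer[x--] = 0;
                w->buffer[x] += 1;
            }

            w->buffer[w->pos++] = (unsigned char)(lowvalue >> (24 - offset));
            lowvalue = (lowvalue << offset) & 0xffffff;
            shift = w->count;
            w->count -= 8;
        }

        w->lowvalue = lowvalue << shift;        /* lsl r2, r2, r6 */
        w->range = range;
    }

The unrolled copies in the file (token bits, extra bits, and the final
even/odd bit at probability 128) all specialize this one step.
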
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
+++ /dev/null
@@ -1,327 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8cx_pack_mb_row_tokens_armv5|
-
-    INCLUDE asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-
-; r0 VP8_COMP *cpi
-; r1 vp9_writer *w
-; r2 vp8_coef_encodings
-; r3 vp8_extra_bits
-; s0 vp8_coef_tree
-
-|vp8cx_pack_mb_row_tokens_armv5| PROC
-    push    {r4-r11, lr}
-    sub     sp, sp, #24
-
-    ; Compute address of cpi->common.mb_rows
-    ldr     r4, _VP8_COMP_common_
-    ldr     r6, _VP8_COMMON_MBrows_
-    add     r4, r0, r4
-
-    ldr     r5, [r4, r6]                ; load up mb_rows
-
-    str     r2, [sp, #20]               ; save vp8_coef_encodings
-    str     r5, [sp, #12]               ; save mb_rows
-    str     r3, [sp, #8]                ; save vp8_extra_bits
-
-    ldr     r4, _VP8_COMP_tplist_
-    add     r4, r0, r4
-    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
-
-    mov     r0, r1                      ; keep same as other loops
-
-    ldr     r2, [r0, #vp9_writer_lowvalue]
-    ldr     r5, [r0, #vp9_writer_range]
-    ldr     r3, [r0, #vp9_writer_count]
-
-mb_row_loop
-
-    ldr     r1, [r7, #tokenlist_start]
-    ldr     r9, [r7, #tokenlist_stop]
-    str     r9, [sp, #0]                ; save stop for later comparison
-    str     r7, [sp, #16]               ; tokenlist address for next time
-
-    b       check_p_lt_stop
-
-    ; actual work gets done here!
-
-while_p_lt_stop
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r4, [sp, #20]               ; vp8_coef_encodings
-    mov     lr, #0
-    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
-    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
-
-    ldrb    r7, [r1, #tokenextra_skip_eob_node]
-
-    ldr     r6, [r4, #vp9_token_value]  ; v
-    ldr     r8, [r4, #vp9_token_len]    ; n
-
-    ; vp8 specific skip_eob_node
-    cmp     r7, #0
-    movne   lr, #2                      ; i = 2
-    subne   r8, r8, #1                  ; --n
-
-    rsb     r4, r8, #32                 ; 32-n
-    ldr     r10, [sp, #60]              ; vp8_coef_tree
-
-    ; v is kept in r12 during the token pack loop
-    lsl     r12, r6, r4                 ; r12 = v << 32 - n
-
-; loop start
-token_loop
-    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
-    sub     r7, r5, #1                  ; range-1
-
-    ; Decisions are made based on the bit value shifted
-    ; off of v, so set a flag here based on this.
-    ; This value is referred to as "bb"
-    lsls    r12, r12, #1                ; bb = v >> n
-    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
-
-    ; bb can only be 0 or 1.  So only execute this statement
-    ; if bb == 1, otherwise it will act like i + 0
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start
-token_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r10, [r7, r4]               ; w->buffer[x]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]               ; w->buffer[x] + 1
-token_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
-
-    ; r10 holds vp8_coef_tree earlier in the loop, but is
-    ; used as a temporary here, so reload vp8_coef_tree
-    ; into r10 afterwards
-    ldr     r10, [sp, #60]              ; vp8_coef_tree
-
-token_count_lt_zero
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r8, r8, #1                  ; --n
-    bne     token_loop
-
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r7, [sp, #8]                ; vp8_extra_bits
-    ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
-    ;  element.  Here sizeof(vp9_extra_bit_struct) == 16
-    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
-
-    ldr     r4, [r12, #vp9_extra_bit_struct_base_val]
-    cmp     r4, #0
-    beq     skip_extra_bits
-
-;   if( b->base_val)
-    ldr     r8, [r12, #vp9_extra_bit_struct_len] ; L
-    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
-    cmp     r8, #0                      ; if( L)
-    beq     no_extra_bits
-
-    ldr     r9, [r12, #vp9_extra_bit_struct_prob]
-    asr     r7, lr, #1                  ; v=e>>1
-
-    ldr     r10, [r12, #vp9_extra_bit_struct_tree]
-    str     r10, [sp, #4]               ; b->tree
-
-    rsb     r4, r8, #32
-    lsl     r12, r7, r4
-
-    mov     lr, #0                      ; i = 0
-
-extra_bits_loop
-    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
-    sub     r7, r5, #1                  ; range-1
-    lsls    r12, r12, #1                ; v >> n
-    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    clz     r6, r4
-    sub     r6, r6, #24
-
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     extra_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset= shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     extra_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos - 1
-    b       extra_zero_while_start
-extra_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-extra_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     extra_zero_while_loop
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r10, [r7, r4]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]
-extra_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
-    ldr     r10, [sp, #4]               ; b->tree
-extra_count_lt_zero
-    lsl     r2, r2, r6
-
-    subs    r8, r8, #1                  ; --n
-    bne     extra_bits_loop             ; while (n)
-
-no_extra_bits
-    ldr     lr, [r1, #4]                ; e = p->Extra
-    add     r4, r5, #1                  ; range + 1
-    tst     lr, #1
-    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
-    addne   r2, r2, r4                  ; lowvalue += split
-    subne   r4, r5, r4                  ; range = range-split
-    tst     r2, #0x80000000             ; lowvalue & 0x80000000
-    lsl     r5, r4, #1                  ; range <<= 1
-    beq     end_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]
-    mov     r7, #0
-    sub     r4, r4, #1
-    b       end_zero_while_start
-end_zero_while_loop
-    strb    r7, [r6, r4]
-    sub     r4, r4, #1                  ; x--
-end_zero_while_start
-    cmp     r4, #0
-    ldrge   r6, [r0, #vp9_writer_buffer]
-    ldrb    r12, [r6, r4]
-    cmpge   r12, #0xff
-    beq     end_zero_while_loop
-
-    ldr     r6, [r0, #vp9_writer_buffer]
-    ldrb    r7, [r6, r4]
-    add     r7, r7, #1
-    strb    r7, [r6, r4]
-end_high_bit_not_set
-    adds    r3, r3, #1                  ; ++count
-    lsl     r2, r2, #1                  ; lowvalue  <<= 1
-    bne     end_count_zero
-
-    ldr     r4, [r0, #vp9_writer_pos]
-    mvn     r3, #7
-    ldr     r7, [r0, #vp9_writer_buffer]
-    lsr     r6, r2, #24                 ; lowvalue >> 24
-    add     r12, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r12, [r0, #0x10]            ; w->pos
-    strb    r6, [r7, r4]
-end_count_zero
-skip_extra_bits
-    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
-check_p_lt_stop
-    ldr     r4, [sp, #0]                ; stop
-    cmp     r1, r4                      ; while( p < stop)
-    bcc     while_p_lt_stop
-
-    ldr     r6, [sp, #12]               ; mb_rows
-    ldr     r7, [sp, #16]               ; tokenlist address
-    subs    r6, r6, #1
-    add     r7, r7, #TOKENLIST_SZ       ; next element in the array
-    str     r6, [sp, #12]
-    bne     mb_row_loop
-
-    str     r2, [r0, #vp9_writer_lowvalue]
-    str     r5, [r0, #vp9_writer_range]
-    str     r3, [r0, #vp9_writer_count]
-    add     sp, sp, #24
-    pop     {r4-r11, pc}
-    ENDP
-
-_VP8_COMP_common_
-    DCD     vp8_comp_common
-_VP8_COMMON_MBrows_
-    DCD     vp8_common_mb_rows
-_VP8_COMP_tplist_
-    DCD     vp8_comp_tplist
-
-    END
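
The mb-row variant deleted above wraps the same per-token loop in an outer
walk over cpi->tplist. In rough C terms (stand-in types; the inner body,
elided to a comment, is the token/extra-bit packing sketched earlier):

    typedef struct {
        int token;                 /* stand-in for the real TOKENEXTRA */
        int extra;
    } TOKENEXTRA_SKETCH;

    typedef struct {
        TOKENEXTRA_SKETCH *start;  /* tokenlist_start */
        TOKENEXTRA_SKETCH *stop;   /* tokenlist_stop */
    } TOKENLIST_SKETCH;

    static void pack_mb_row_tokens_sketch(writer_sketch *w,
                                          const TOKENLIST_SKETCH *tplist,
                                          int mb_rows)
    {
        int row;

        for (row = 0; row < mb_rows; row++) {
            const TOKENEXTRA_SKETCH *p = tplist[row].start;
            const TOKENEXTRA_SKETCH *stop = tplist[row].stop;

            while (p < stop) {
                /* pack p->token and its extra bits with
                 * encode_bool_sketch(w, ...), as in the earlier sketch */
                p++;
            }
        }
    }
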
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+++ /dev/null
@@ -1,465 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
-
-    INCLUDE asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-
-; r0 VP8_COMP *cpi
-; r1 unsigned char *cx_data
-; r2 int num_part
-; r3 *size
-; s0 vp8_coef_encodings
-; s1 vp8_extra_bits,
-; s2 const vp9_tree_index *,
-
-|vp8cx_pack_tokens_into_partitions_armv5| PROC
-    push    {r4-r11, lr}
-    sub     sp, sp, #44
-
-    ; Compute address of cpi->common.mb_rows
-    ldr     r4, _VP8_COMP_common_
-    ldr     r6, _VP8_COMMON_MBrows_
-    add     r4, r0, r4
-
-    ldr     r5, [r4, r6]                ; load up mb_rows
-
-    str     r5, [sp, #36]               ; save mb_rows
-    str     r1, [sp, #24]               ; save cx_data
-    str     r2, [sp, #20]               ; save num_part
-    str     r3, [sp, #8]                ; save *size
-
-    ; *size = 3 * (num_part - 1);
-    sub     r2, r2, #1                  ; num_part - 1
-    add     r2, r2, r2, lsl #1          ; 3*(num_part - 1)
-    str     r2, [r3]
-
-    add     r2, r2, r1                  ; cx_data + *size
-    str     r2, [sp, #40]               ; ptr
-
-    ldr     r4, _VP8_COMP_tplist_
-    add     r4, r0, r4
-    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
-    str     r7, [sp, #32]               ; store start of cpi->tp_list
-
-    ldr     r11, _VP8_COMP_bc2_         ; load up vp9_writer out of cpi
-    add     r0, r0, r11
-
-    mov     r11, #0
-    str     r11, [sp, #28]              ; i
-
-numparts_loop
-    ldr     r10, [sp, #40]              ; ptr
-    ldr     r5,  [sp, #36]              ; reload mb_rows as the row counter
-    sub     r5, r5, r11                 ; partition i starts at row i, so
-                                        ; count down from mb_rows - i
-    str     r5,  [sp, #12]
-
-    ; Reset all of the VP8 Writer data for each partition that
-    ; is processed.
-    ; start_encode
-    mov     r2, #0                      ; vp9_writer_lowvalue
-    mov     r5, #255                    ; vp9_writer_range
-    mvn     r3, #23                     ; vp9_writer_count
-
-    str     r2,  [r0, #vp9_writer_value]
-    str     r2,  [r0, #vp9_writer_pos]
-    str     r10, [r0, #vp9_writer_buffer]
-
-mb_row_loop
-
-    ldr     r1, [r7, #tokenlist_start]
-    ldr     r9, [r7, #tokenlist_stop]
-    str     r9, [sp, #0]                ; save stop for later comparison
-    str     r7, [sp, #16]               ; tokenlist address for next time
-
-    b       check_p_lt_stop
-
-    ; actual work gets done here!
-
-while_p_lt_stop
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r4, [sp, #80]               ; vp8_coef_encodings
-    mov     lr, #0
-    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
-    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
-
-    ldrb    r7, [r1, #tokenextra_skip_eob_node]
-
-    ldr     r6, [r4, #vp9_token_value]  ; v
-    ldr     r8, [r4, #vp9_token_len]    ; n
-
-    ; vp8 specific skip_eob_node
-    cmp     r7, #0
-    movne   lr, #2                      ; i = 2
-    subne   r8, r8, #1                  ; --n
-
-    rsb     r4, r8, #32                 ; 32-n
-    ldr     r10, [sp, #88]              ; vp8_coef_tree
-
-    ; v is kept in r12 during the token pack loop
-    lsl     r12, r6, r4                 ; r12 = v << 32 - n
-
-; loop start
-token_loop
-    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
-    sub     r7, r5, #1                  ; range-1
-
-    ; Decisions are made based on the bit value shifted
-    ; off of v, so set a flag here based on this.
-    ; This value is referred to as "bb"
-    lsls    r12, r12, #1                ; bb = v >> n
-    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
-
-    ; bb can only be 0 or 1.  So only execute this statement
-    ; if bb == 1, otherwise it will act like i + 0
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start
-token_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r10, [r7, r4]               ; w->buffer[x]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]               ; w->buffer[x] + 1
-token_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
-
-    ; r10 holds vp8_coef_tree earlier in the loop, but is
-    ; used as a temporary here, so reload vp8_coef_tree
-    ; into r10 afterwards
-    ldr     r10, [sp, #88]              ; vp8_coef_tree
-
-token_count_lt_zero
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r8, r8, #1                  ; --n
-    bne     token_loop
-
-    ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r7, [sp, #84]                ; vp8_extra_bits
-    ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
-    ;  element.  Here sizeof(vp9_extra_bit_struct) == 16
-    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
-
-    ldr     r4, [r12, #vp9_extra_bit_struct_base_val]
-    cmp     r4, #0
-    beq     skip_extra_bits
-
-;   if( b->base_val)
-    ldr     r8, [r12, #vp9_extra_bit_struct_len] ; L
-    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
-    cmp     r8, #0                      ; if( L)
-    beq     no_extra_bits
-
-    ldr     r9, [r12, #vp9_extra_bit_struct_prob]
-    asr     r7, lr, #1                  ; v=e>>1
-
-    ldr     r10, [r12, #vp9_extra_bit_struct_tree]
-    str     r10, [sp, #4]               ; b->tree
-
-    rsb     r4, r8, #32
-    lsl     r12, r7, r4
-
-    mov     lr, #0                      ; i = 0
-
-extra_bits_loop
-    ldrb    r4, [r9, lr, asr #1]        ; pp[i>>1]
-    sub     r7, r5, #1                  ; range-1
-    lsls    r12, r12, #1                ; v >> n
-    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
-    addcs   lr, lr, #1                  ; i + bb
-
-    mov     r7, #1
-    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
-
-    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
-    subcs   r4, r5, r4                  ; if  (bb) range = range-split
-
-    clz     r6, r4
-    sub     r6, r6, #24
-
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     extra_count_lt_zero         ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset= shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     extra_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos - 1
-    b       extra_zero_while_start
-extra_zero_while_loop
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-extra_zero_while_start
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     extra_zero_while_loop
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r10, [r7, r4]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]
-extra_high_bit_not_set
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
-    ldr     r10, [sp, #4]               ; b->tree
-extra_count_lt_zero
-    lsl     r2, r2, r6
-
-    subs    r8, r8, #1                  ; --n
-    bne     extra_bits_loop             ; while (n)
-
-no_extra_bits
-    ldr     lr, [r1, #4]                ; e = p->Extra
-    add     r4, r5, #1                  ; range + 1
-    tst     lr, #1
-    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
-    addne   r2, r2, r4                  ; lowvalue += split
-    subne   r4, r5, r4                  ; range = range-split
-    tst     r2, #0x80000000             ; lowvalue & 0x80000000
-    lsl     r5, r4, #1                  ; range <<= 1
-    beq     end_high_bit_not_set
-
-    ldr     r4, [r0, #vp9_writer_pos]
-    mov     r7, #0
-    sub     r4, r4, #1
-    b       end_zero_while_start
-end_zero_while_loop
-    strb    r7, [r6, r4]
-    sub     r4, r4, #1                  ; x--
-end_zero_while_start
-    cmp     r4, #0
-    ldrge   r6, [r0, #vp9_writer_buffer]
-    ldrb    r12, [r6, r4]
-    cmpge   r12, #0xff
-    beq     end_zero_while_loop
-
-    ldr     r6, [r0, #vp9_writer_buffer]
-    ldrb    r7, [r6, r4]
-    add     r7, r7, #1
-    strb    r7, [r6, r4]
-end_high_bit_not_set
-    adds    r3, r3, #1                  ; ++count
-    lsl     r2, r2, #1                  ; lowvalue  <<= 1
-    bne     end_count_zero
-
-    ldr     r4, [r0, #vp9_writer_pos]
-    mvn     r3, #7
-    ldr     r7, [r0, #vp9_writer_buffer]
-    lsr     r6, r2, #24                 ; lowvalue >> 24
-    add     r12, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r12, [r0, #0x10]            ; w->pos
-    strb    r6, [r7, r4]
-end_count_zero
-skip_extra_bits
-    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
-check_p_lt_stop
-    ldr     r4, [sp, #0]                ; stop
-    cmp     r1, r4                      ; while( p < stop)
-    bcc     while_p_lt_stop
-
-    ldr     r10, [sp, #20]              ; num_part
-    mov     r1, #TOKENLIST_SZ
-    mul     r1, r10, r1
-
-    ldr     r6, [sp, #12]               ; mb_rows
-    ldr     r7, [sp, #16]               ; tokenlist address
-    subs    r6, r6, r10
-    add     r7, r7, r1                  ; next element in the array
-    str     r6, [sp, #12]
-    bgt     mb_row_loop
-
-    mov     r12, #32
-
-stop_encode_loop
-    sub     r7, r5, #1                  ; range-1
-
-    mov     r4, r7, lsl #7              ; ((range-1) * 128)
-
-    mov     r7, #1
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
-
-    ; Counting the leading zeros is used to normalize range.
-    clz     r6, r4
-    sub     r6, r6, #24                 ; shift
-
-    ; Flag is set on the sum of count.  This flag is used later
-    ; to determine if count >= 0
-    adds    r3, r3, r6                  ; count += shift
-    lsl     r5, r4, r6                  ; range <<= shift
-    bmi     token_count_lt_zero_se      ; if(count >= 0)
-
-    sub     r6, r6, r3                  ; offset = shift - count
-    sub     r4, r6, #1                  ; offset-1
-    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
-    bpl     token_high_bit_not_set_se
-
-    ldr     r4, [r0, #vp9_writer_pos]   ; x
-    sub     r4, r4, #1                  ; x = w->pos-1
-    b       token_zero_while_start_se
-token_zero_while_loop_se
-    mov     r10, #0
-    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
-    sub     r4, r4, #1                  ; x--
-token_zero_while_start_se
-    cmp     r4, #0
-    ldrge   r7, [r0, #vp9_writer_buffer]
-    ldrb    r11, [r7, r4]
-    cmpge   r11, #0xff
-    beq     token_zero_while_loop_se
-
-    ldr     r7, [r0, #vp9_writer_buffer]
-    ldrb    r10, [r7, r4]               ; w->buffer[x]
-    add     r10, r10, #1
-    strb    r10, [r7, r4]               ; w->buffer[x] + 1
-token_high_bit_not_set_se
-    rsb     r4, r6, #24                 ; 24-offset
-    ldr     r10, [r0, #vp9_writer_buffer]
-    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
-    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
-    lsl     r2, r2, r6                  ; lowvalue <<= offset
-    mov     r6, r3                      ; shift = count
-    add     r11, r4, #1                 ; w->pos++
-    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r11, [r0, #vp9_writer_pos]
-    sub     r3, r3, #8                  ; count -= 8
-    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
-
-token_count_lt_zero_se
-    lsl     r2, r2, r6                  ; lowvalue <<= shift
-
-    subs    r12, r12, #1
-    bne     stop_encode_loop
-
-    ldr     r10, [sp, #8]               ; *size
-    ldr     r11, [r10]
-    ldr     r4,  [r0, #vp9_writer_pos]  ; w->pos
-    add     r11, r11, r4                ; *size += w->pos
-    str     r11, [r10]
-
-    ldr     r9, [sp, #20]               ; num_part
-    sub     r9, r9, #1
-    ldr     r10, [sp, #28]              ; i
-    cmp     r10, r9                     ; if(i<(num_part - 1))
-    bge     skip_write_partition
-
-    ldr     r12, [sp, #40]              ; ptr
-    add     r12, r12, r4                ; ptr += w->pos
-    str     r12, [sp, #40]
-
-    ldr     r9, [sp, #24]               ; cx_data
-    mov     r8, r4, asr #8
-    strb    r4, [r9, #0]
-    strb    r8, [r9, #1]
-    mov     r4, r4, asr #16
-    strb    r4, [r9, #2]
-
-    add     r9, r9, #3                  ; cx_data += 3
-    str     r9, [sp, #24]
-
-skip_write_partition
-
-    ldr     r11, [sp, #28]              ; i
-    ldr     r10, [sp, #20]              ; num_part
-
-    add     r11, r11, #1                ; i++
-    str     r11, [sp, #28]
-
-    ldr     r7, [sp, #32]               ; cpi->tp_list[i]
-    mov     r1, #TOKENLIST_SZ
-    add     r7, r7, r1                  ; next element in cpi->tp_list
-    str     r7, [sp, #32]               ; cpi->tp_list[i+1]
-
-    cmp     r10, r11
-    bgt     numparts_loop
-
-
-    add     sp, sp, #44
-    pop     {r4-r11, pc}
-    ENDP
-
-_VP8_COMP_common_
-    DCD     vp8_comp_common
-_VP8_COMMON_MBrows_
-    DCD     vp8_common_mb_rows
-_VP8_COMP_tplist_
-    DCD     vp8_comp_tplist
-_VP8_COMP_bc2_
-    DCD     vp8_comp_bc2
-
-    END
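
The partition packer deleted above adds one more level: each partition gets
a freshly reset writer, packs every num_part-th macroblock row, is closed
with a 32-bit flush (the stop_encode_loop), and -- for all partitions but
the last -- records its byte size as a 3-byte little-endian field at the
start of cx_data. A C sketch, reusing the writer_sketch, encode_bool_sketch
and TOKENLIST_SKETCH stand-ins from the earlier sketches:

    static void pack_partitions_sketch(writer_sketch *w,
                                       unsigned char *cx_data,
                                       int num_part, unsigned int *size,
                                       const TOKENLIST_SKETCH *tplist,
                                       int mb_rows)
    {
        unsigned char *ptr = cx_data + 3 * (num_part - 1);
        int i, row, k;

        *size = 3 * (num_part - 1);     /* room for the size fields */

        for (i = 0; i < num_part; i++) {
            /* start_encode: point the writer at this partition */
            w->lowvalue = 0;
            w->range = 255;
            w->count = -24;
            w->pos = 0;
            w->buffer = ptr;

            /* partition i packs rows i, i + num_part, i + 2*num_part, ... */
            for (row = i; row < mb_rows; row += num_part) {
                /* token/extra-bit packing for tplist[row], as above */
            }

            /* stop_encode: flush 32 zero bits at probability 128 */
            for (k = 0; k < 32; k++)
                encode_bool_sketch(w, 0, 128);

            *size += w->pos;

            if (i < num_part - 1) {
                ptr += w->pos;                        /* next partition */
                cx_data[0] = (unsigned char)(w->pos);         /* 3-byte  */
                cx_data[1] = (unsigned char)(w->pos >> 8);    /* little- */
                cx_data[2] = (unsigned char)(w->pos >> 16);   /* endian  */
                cx_data += 3;
            }
        }
    }
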
--- a/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
+++ /dev/null
@@ -1,224 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_fast_quantize_b_armv6|
-
-    INCLUDE asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    BLOCK *b
-; r1    BLOCKD *d
-|vp8_fast_quantize_b_armv6| PROC
-    stmfd   sp!, {r1, r4-r11, lr}
-
-    ldr     r3, [r0, #vp8_block_coeff]      ; coeff
-    ldr     r4, [r0, #vp8_block_quant_fast] ; quant_fast
-    ldr     r5, [r0, #vp8_block_round]      ; round
-    ldr     r6, [r1, #vp8_blockd_qcoeff]    ; qcoeff
-    ldr     r7, [r1, #vp8_blockd_dqcoeff]   ; dqcoeff
-    ldr     r8, [r1, #vp8_blockd_dequant]   ; dequant
-
-    ldr     r2, loop_count          ; loop_count=0x1000000. 'lsls' instruction
-                                    ; is used to update the counter so that
-                                    ; it can be used to mark nonzero
-                                    ; quantized coefficient pairs.
-
-    mov     r1, #0                  ; flags for quantized coeffs
-
-    ; PART 1: quantization and dequantization loop
-loop
-    ldr     r9, [r3], #4            ; [z1 | z0]
-    ldr     r10, [r5], #4           ; [r1 | r0]
-    ldr     r11, [r4], #4           ; [q1 | q0]
-
-    ssat16  lr, #1, r9              ; [sz1 | sz0]
-    eor     r9, r9, lr              ; [z1 ^ sz1 | z0 ^ sz0]
-    ssub16  r9, r9, lr              ; x = (z ^ sz) - sz
-    sadd16  r9, r9, r10             ; [x1+r1 | x0+r0]
-
-    ldr     r12, [r3], #4           ; [z3 | z2]
-
-    smulbb  r0, r9, r11             ; [(x0+r0)*q0]
-    smultt  r9, r9, r11             ; [(x1+r1)*q1]
-
-    ldr     r10, [r5], #4           ; [r3 | r2]
-
-    ssat16  r11, #1, r12            ; [sz3 | sz2]
-    eor     r12, r12, r11           ; [z3 ^ sz3 | z2 ^ sz2]
-    pkhtb   r0, r9, r0, asr #16     ; [y1 | y0]
-    ldr     r9, [r4], #4            ; [q3 | q2]
-    ssub16  r12, r12, r11           ; x = (z ^ sz) - sz
-
-    sadd16  r12, r12, r10           ; [x3+r3 | x2+r2]
-
-    eor     r0, r0, lr              ; [(y1 ^ sz1) | (y0 ^ sz0)]
-
-    smulbb  r10, r12, r9            ; [(x2+r2)*q2]
-    smultt  r12, r12, r9            ; [(x3+r3)*q3]
-
-    ssub16  r0, r0, lr              ; x = (y ^ sz) - sz
-
-    cmp     r0, #0                  ; check if zero
-    orrne   r1, r1, r2, lsr #24     ; add flag for nonzero coeffs
-
-    str     r0, [r6], #4            ; *qcoeff++ = x
-    ldr     r9, [r8], #4            ; [dq1 | dq0]
-
-    pkhtb   r10, r12, r10, asr #16  ; [y3 | y2]
-    eor     r10, r10, r11           ; [(y3 ^ sz3) | (y2 ^ sz2)]
-    ssub16  r10, r10, r11           ; x = (y ^ sz) - sz
-
-    cmp     r10, #0                 ; check if zero
-    orrne   r1, r1, r2, lsr #23     ; add flag for nonzero coeffs
-
-    str     r10, [r6], #4           ; *qcoeff++ = x
-    ldr     r11, [r8], #4           ; [dq3 | dq2]
-
-    smulbb  r12, r0, r9             ; [x0*dq0]
-    smultt  r0, r0, r9              ; [x1*dq1]
-
-    smulbb  r9, r10, r11            ; [x2*dq2]
-    smultt  r10, r10, r11           ; [x3*dq3]
-
-    lsls    r2, r2, #2              ; update loop counter
-    strh    r12, [r7, #0]           ; dqcoeff[0] = [x0*dq0]
-    strh    r0, [r7, #2]            ; dqcoeff[1] = [x1*dq1]
-    strh    r9, [r7, #4]            ; dqcoeff[2] = [x2*dq2]
-    strh    r10, [r7, #6]           ; dqcoeff[3] = [x3*dq3]
-    add     r7, r7, #8              ; dqcoeff += 8
-    bne     loop
-
-    ; PART 2: check position for eob...
-    mov     lr, #0                  ; init eob
-    cmp     r1, #0                  ; coeffs after quantization?
-    ldr     r11, [sp, #0]           ; restore BLOCKD pointer
-    beq     end                     ; skip eob calculations if all zero
-
-    ldr     r0, [r11, #vp8_blockd_qcoeff]
-
-    ; check shortcut for nonzero qcoeffs
-    tst    r1, #0x80
-    bne    quant_coeff_15_14
-    tst    r1, #0x20
-    bne    quant_coeff_13_11
-    tst    r1, #0x8
-    bne    quant_coeff_12_7
-    tst    r1, #0x40
-    bne    quant_coeff_10_9
-    tst    r1, #0x10
-    bne    quant_coeff_8_3
-    tst    r1, #0x2
-    bne    quant_coeff_6_5
-    tst    r1, #0x4
-    bne    quant_coeff_4_2
-    b      quant_coeff_1_0
-
-quant_coeff_15_14
-    ldrh    r2, [r0, #30]       ; rc=15, i=15
-    mov     lr, #16
-    cmp     r2, #0
-    bne     end
-
-    ldrh    r3, [r0, #28]       ; rc=14, i=14
-    mov     lr, #15
-    cmp     r3, #0
-    bne     end
-
-quant_coeff_13_11
-    ldrh    r2, [r0, #22]       ; rc=11, i=13
-    mov     lr, #14
-    cmp     r2, #0
-    bne     end
-
-quant_coeff_12_7
-    ldrh    r3, [r0, #14]       ; rc=7,  i=12
-    mov     lr, #13
-    cmp     r3, #0
-    bne     end
-
-    ldrh    r2, [r0, #20]       ; rc=10, i=11
-    mov     lr, #12
-    cmp     r2, #0
-    bne     end
-
-quant_coeff_10_9
-    ldrh    r3, [r0, #26]       ; rc=13, i=10
-    mov     lr, #11
-    cmp     r3, #0
-    bne     end
-
-    ldrh    r2, [r0, #24]       ; rc=12, i=9
-    mov     lr, #10
-    cmp     r2, #0
-    bne     end
-
-quant_coeff_8_3
-    ldrh    r3, [r0, #18]       ; rc=9,  i=8
-    mov     lr, #9
-    cmp     r3, #0
-    bne     end
-
-    ldrh    r2, [r0, #12]       ; rc=6,  i=7
-    mov     lr, #8
-    cmp     r2, #0
-    bne     end
-
-quant_coeff_6_5
-    ldrh    r3, [r0, #6]        ; rc=3,  i=6
-    mov     lr, #7
-    cmp     r3, #0
-    bne     end
-
-    ldrh    r2, [r0, #4]        ; rc=2,  i=5
-    mov     lr, #6
-    cmp     r2, #0
-    bne     end
-
-quant_coeff_4_2
-    ldrh    r3, [r0, #10]       ; rc=5,  i=4
-    mov     lr, #5
-    cmp     r3, #0
-    bne     end
-
-    ldrh    r2, [r0, #16]       ; rc=8,  i=3
-    mov     lr, #4
-    cmp     r2, #0
-    bne     end
-
-    ldrh    r3, [r0, #8]        ; rc=4,  i=2
-    mov     lr, #3
-    cmp     r3, #0
-    bne     end
-
-quant_coeff_1_0
-    ldrh    r2, [r0, #2]        ; rc=1,  i=1
-    mov     lr, #2
-    cmp     r2, #0
-    bne     end
-
-    mov     lr, #1              ; rc=0,  i=0
-
-end
-    str     lr, [r11, #vp8_blockd_eob]
-    ldmfd   sp!, {r1, r4-r11, pc}
-
-    ENDP
-
-loop_count
-    DCD     0x1000000
-
-    END
-
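
In C terms, the quantizer deleted above computes x = abs(z), then
y = ((x + round) * quant_fast) >> 16, restores the sign, dequantizes, and
records the end-of-block position in zig-zag order. The assembly does the
arithmetic four coefficients at a time and finds eob with the reverse,
flag-driven search in PART 2; a straight-line sketch (the zig-zag table
matches the rc/i pairs annotated above):

    static int fast_quantize_b_sketch(const short *coeff, const short *round,
                                      const short *quant_fast,
                                      const short *dequant,
                                      short *qcoeff, short *dqcoeff)
    {
        static const int zigzag[16] = {
            0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
        };
        int i, eob = 0;

        for (i = 0; i < 16; i++) {
            int rc = zigzag[i];
            int z  = coeff[rc];
            int sz = z >> 31;               /* the ssat16 sign trick */
            int x  = (z ^ sz) - sz;         /* x = abs(z) */
            int y  = ((x + round[rc]) * quant_fast[rc]) >> 16;

            x = (y ^ sz) - sz;              /* restore the sign */
            qcoeff[rc]  = (short)x;
            dqcoeff[rc] = (short)(x * dequant[rc]);

            if (y)
                eob = i + 1;                /* one past the last nonzero */
        }

        return eob;                         /* stored to vp8_blockd_eob */
    }
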
--- a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
+++ /dev/null
@@ -1,138 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_mse16x16_armv6|
-
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-;
-; note: Based on vp9_variance16x16_armv6. In this function the sum is
-;       never used, so that part of the calculation is removed.
-
-|vp8_mse16x16_armv6| PROC
-
-    push    {r4-r9, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r12, #16            ; set loop counter to 16 (=block height)
-    mov     r4, #0              ; initialize sse = 0
-
-loop
-    ; 1st 4 pixels
-    ldr     r5, [r0, #0x0]      ; load 4 src pixels
-    ldr     r6, [r2, #0x0]      ; load 4 ref pixels
-
-    mov     lr, #0              ; constant zero
-
-    usub8   r8, r5, r6          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-
-    ldr     r5, [r0, #0x4]      ; load 4 src pixels
-
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r5, r6          ; calculate difference
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-    ldr     r5, [r0, #0x8]      ; load 4 src pixels
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-
-    ; 3rd 4 pixels
-    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r5, r6          ; calculate difference
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-
-    ldr     r5, [r0, #0xc]      ; load 4 src pixels
-
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-
-    ; 4th 4 pixels
-    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r5, r6          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set dst_ptr to next row
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-
-    subs    r12, r12, #1        ; next row
-
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    bne     loop
-
-    ; store sse and return it in r0
-    ldr     r1, [sp, #28]       ; get address of sse
-    mov     r0, r4              ; return sse
-    str     r4, [r1]            ; store sse
-
-    pop     {r4-r9, pc}
-
-    ENDP
-
-    END
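
What the loop above computes, in plain C: the sum of squared differences
over a 16x16 block, returned both in r0 and through the sse pointer (the
assembly works on four pixels per instruction group):

    static unsigned int mse16x16_sketch(const unsigned char *src,
                                        int src_stride,
                                        const unsigned char *ref,
                                        int ref_stride, unsigned int *sse)
    {
        unsigned int total = 0;
        int r, c;

        for (r = 0; r < 16; r++) {
            for (c = 0; c < 16; c++) {
                int d = src[c] - ref[c];
                total += (unsigned int)(d * d);
            }
            src += src_stride;
            ref += ref_stride;
        }

        *sse = total;
        return total;
    }
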
--- a/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm
+++ /dev/null
@@ -1,96 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sad16x16_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    const unsigned char *src_ptr
-; r1    int  src_stride
-; r2    const unsigned char *ref_ptr
-; r3    int  ref_stride
-; stack max_sad (not used)
-|vp8_sad16x16_armv6| PROC
-    stmfd   sp!, {r4-r12, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-    pld     [r0, r1, lsl #1]
-    pld     [r2, r3, lsl #1]
-
-    mov     r4, #0              ; sad = 0;
-    mov     r5, #8              ; loop count
-
-loop
-    ; 1st row
-    ldr     r6, [r0, #0x0]      ; load 4 src pixels (1A)
-    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (1A)
-    ldr     r7, [r0, #0x4]      ; load 4 src pixels (1A)
-    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (1A)
-    ldr     r10, [r0, #0x8]     ; load 4 src pixels (1B)
-    ldr     r11, [r0, #0xC]     ; load 4 src pixels (1B)
-
-    usada8  r4, r8, r6, r4      ; calculate sad for 4 pixels
-    usad8   r8, r7, r9          ; calculate sad for 4 pixels
-
-    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (1B)
-    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (1B)
-
-    add     r0, r0, r1          ; set src pointer to next row
-    add     r2, r2, r3          ; set dst pointer to next row
-
-    pld     [r0, r1, lsl #1]
-    pld     [r2, r3, lsl #1]
-
-    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
-    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
-
-    ldr     r6, [r0, #0x0]      ; load 4 src pixels (2A)
-    ldr     r7, [r0, #0x4]      ; load 4 src pixels (2A)
-    add     r4, r4, r8          ; add partial sad values
-
-    ; 2nd row
-    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (2A)
-    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (2A)
-    ldr     r10, [r0, #0x8]     ; load 4 src pixels (2B)
-    ldr     r11, [r0, #0xC]     ; load 4 src pixels (2B)
-
-    usada8  r4, r6, r8, r4      ; calculate sad for 4 pixels
-    usad8   r8, r7, r9          ; calculate sad for 4 pixels
-
-    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (2B)
-    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (2B)
-
-    add     r0, r0, r1          ; set src pointer to next row
-    add     r2, r2, r3          ; set dst pointer to next row
-
-    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
-    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
-
-    pld     [r0, r1, lsl #1]
-    pld     [r2, r3, lsl #1]
-
-    subs    r5, r5, #1          ; decrement loop counter
-    add     r4, r4, r8          ; add partial sad values
-
-    bne     loop
-
-    mov     r0, r4              ; return sad
-    ldmfd   sp!, {r4-r12, pc}
-
-    ENDP
-
-    END
-
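
The C equivalent is a plain sum of absolute differences over the 16x16
block; the usada8-based assembly accumulates four byte differences per
instruction and two rows per loop iteration, and ignores the max_sad
stack argument:

    static unsigned int sad16x16_sketch(const unsigned char *src,
                                        int src_stride,
                                        const unsigned char *ref,
                                        int ref_stride)
    {
        unsigned int sad = 0;
        int r, c;

        for (r = 0; r < 16; r++) {
            for (c = 0; c < 16; c++) {
                int d = src[c] - ref[c];
                sad += (unsigned int)(d < 0 ? -d : d);
            }
            src += src_stride;
            ref += ref_stride;
        }

        return sad;
    }
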
--- a/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm
+++ /dev/null
@@ -1,262 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT |vp8_short_fdct4x4_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY
-; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct4x4_armv6| PROC
-
-    stmfd       sp!, {r4 - r12, lr}
-
-    ; PART 1
-
-    ; coeffs 0-3
-    ldrd        r4, r5, [r0]        ; [i1 | i0] [i3 | i2]
-
-    ldr         r10, c7500
-    ldr         r11, c14500
-    ldr         r12, c0x22a453a0    ; [2217*4 | 5352*4]
-    ldr         lr, c0x00080008
-    ror         r5, r5, #16         ; [i2 | i3]
-
-    qadd16      r6, r4, r5          ; [i1+i2 | i0+i3] = [b1 | a1] without shift
-    qsub16      r7, r4, r5          ; [i1-i2 | i0-i3] = [c1 | d1] without shift
-
-    add         r0, r0, r2          ; update input pointer
-
-    qadd16      r7, r7, r7          ; 2*[c1|d1] --> we can use smlad and smlsd
-                                    ; with 2217*4 and 5352*4 without losing the
-                                    ; sign bit (overflow)
-
-    smuad       r4, r6, lr          ; o0 = (i1+i2)*8 + (i0+i3)*8
-    smusd       r5, r6, lr          ; o2 = (i0+i3)*8 - (i1+i2)*8
-
-    smlad       r6, r7, r12, r11    ; o1 = (c1 * 2217 + d1 * 5352 +  14500)
-    smlsdx      r7, r7, r12, r10    ; o3 = (d1 * 2217 - c1 * 5352 +   7500)
-
-    ldrd        r8, r9, [r0]        ; [i5 | i4] [i7 | i6]
-
-    pkhbt       r3, r4, r6, lsl #4  ; [o1 | o0], keep in register for PART 2
-    pkhbt       r6, r5, r7, lsl #4  ; [o3 | o2]
-
-    str         r6, [r1, #4]
-
-    ; coeffs 4-7
-    ror         r9, r9, #16         ; [i6 | i7]
-
-    qadd16      r6, r8, r9          ; [i5+i6 | i4+i7] = [b1 | a1] without shift
-    qsub16      r7, r8, r9          ; [i5-i6 | i4-i7] = [c1 | d1] without shift
-
-    add         r0, r0, r2          ; update input pointer
-
-    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
-                                    ; with 2217*4 and 5352*4 without losing the
-                                    ; sign bit (overflow)
-
-    smuad       r9, r6, lr          ; o4 = (i5+i6)*8 + (i4+i7)*8
-    smusd       r8, r6, lr          ; o6 = (i4+i7)*8 - (i5+i6)*8
-
-    smlad       r6, r7, r12, r11    ; o5 = (c1 * 2217 + d1 * 5352 +  14500)
-    smlsdx      r7, r7, r12, r10    ; o7 = (d1 * 2217 - c1 * 5352 +   7500)
-
-    ldrd        r4, r5, [r0]        ; [i9 | i8] [i11 | i10]
-
-    pkhbt       r9, r9, r6, lsl #4  ; [o5 | o4], keep in register for PART 2
-    pkhbt       r6, r8, r7, lsl #4  ; [o7 | o6]
-
-    str         r6, [r1, #12]
-
-    ; coeffs 8-11
-    ror         r5, r5, #16         ; [i10 | i11]
-
-    qadd16      r6, r4, r5          ; [i9+i10 | i8+i11]=[b1 | a1] without shift
-    qsub16      r7, r4, r5          ; [i9-i10 | i8-i11]=[c1 | d1] without shift
-
-    add         r0, r0, r2          ; update input pointer
-
-    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
-                                    ; with 2217*4 and 5352*4 without losing the
-                                    ; sign bit (overflow)
-
-    smuad       r2, r6, lr          ; o8 = (i9+i10)*8 + (i8+i11)*8
-    smusd       r8, r6, lr          ; o10 = (i8+i11)*8 - (i9+i10)*8
-
-    smlad       r6, r7, r12, r11    ; o9 = (c1 * 2217 + d1 * 5352 +  14500)
-    smlsdx      r7, r7, r12, r10    ; o11 = (d1 * 2217 - c1 * 5352 +   7500)
-
-    ldrd        r4, r5, [r0]        ; [i13 | i12] [i15 | i14]
-
-    pkhbt       r2, r2, r6, lsl #4  ; [o9 | o8], keep in register for PART 2
-    pkhbt       r6, r8, r7, lsl #4  ; [o11 | o10]
-
-    str         r6, [r1, #20]
-
-    ; coeffs 12-15
-    ror         r5, r5, #16         ; [i14 | i15]
-
-    qadd16      r6, r4, r5          ; [i13+i14 | i12+i15]=[b1|a1] without shift
-    qsub16      r7, r4, r5          ; [i13-i14 | i12-i15]=[c1|d1] without shift
-
-    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
-                                    ; with 2217*4 and 5352*4 without losing the
-                                    ; sign bit (overflow)
-
-    smuad       r4, r6, lr          ; o12 = (i13+i14)*8 + (i12+i15)*8
-    smusd       r5, r6, lr          ; o14 = (i12+i15)*8 - (i13+i14)*8
-
-    smlad       r6, r7, r12, r11    ; o13 = (c1 * 2217 + d1 * 5352 +  14500)
-    smlsdx      r7, r7, r12, r10    ; o15 = (d1 * 2217 - c1 * 5352 +   7500)
-
-    pkhbt       r0, r4, r6, lsl #4  ; [o13 | o12], keep in register for PART 2
-    pkhbt       r6, r5, r7, lsl #4  ; [o15 | o14]
-
-    str         r6, [r1, #28]
-
-
-    ; PART 2 -------------------------------------------------
-    ldr         r11, c12000
-    ldr         r10, c51000
-    ldr         lr, c0x00070007
-
-    qadd16      r4, r3, r0          ; a1 = [i1+i13 | i0+i12]
-    qadd16      r5, r9, r2          ; b1 = [i5+i9  |  i4+i8]
-    qsub16      r6, r9, r2          ; c1 = [i5-i9  |  i4-i8]
-    qsub16      r7, r3, r0          ; d1 = [i1-i13 | i0-i12]
-
-    qadd16      r4, r4, lr          ; a1 + 7
-
-    add         r0, r11, #0x10000   ; add (d!=0)
-
-    qadd16      r2, r4, r5          ; a1 + b1 + 7
-    qsub16      r3, r4, r5          ; a1 - b1 + 7
-
-    ldr         r12, c0x08a914e8    ; [2217 | 5352]
-
-    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
-    asr         r2, r2, #4          ; scale top halfword
-    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
-    asr         r3, r3, #4          ; scale top halfword
-    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
-    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
-
-    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
-    str         r4, [r1, #0]        ; [     o1 |      o0]
-    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
-    str         r5, [r1, #16]       ; [     o9 |      o8]
-
-    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
-    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
-
-    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
-    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
-
-    lsls        r6, r7, #16         ; d1 != 0 ?
-    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
-    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
-    asrs        r6, r7, #16
-    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
-    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
-
-    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
-    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
-
-    pkhtb       r9, r9, r8, asr #16
-
-    sub         r4, r4, r2
-    sub         r5, r5, r3
-
-    ldr         r3, [r1, #4]        ; [i3 | i2]
-
-    pkhtb       r5, r5, r4, asr #16 ; [o13|o12]
-
-    str         r9, [r1, #8]        ; [o5 | o4]
-
-    ldr         r9, [r1, #12]       ; [i7 | i6]
-    ldr         r8, [r1, #28]       ; [i15|i14]
-    ldr         r2, [r1, #20]       ; [i11|i10]
-    str         r5, [r1, #24]       ; [o13|o12]
-
-    qadd16      r4, r3, r8          ; a1 = [i3+i15 | i2+i14]
-    qadd16      r5, r9, r2          ; b1 = [i7+i11 | i6+i10]
-
-    qadd16      r4, r4, lr          ; a1 + 7
-
-    qsub16      r6, r9, r2          ; c1 = [i7-i11 | i6-i10]
-    qadd16      r2, r4, r5          ; a1 + b1 + 7
-    qsub16      r7, r3, r8          ; d1 = [i3-i15 | i2-i14]
-    qsub16      r3, r4, r5          ; a1 - b1 + 7
-
-    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
-    asr         r2, r2, #4          ; scale top halfword
-    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
-    asr         r3, r3, #4          ; scale top halfword
-    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
-    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
-
-    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
-    str         r4, [r1, #4]        ; [     o3 |      o2]
-    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
-    str         r5, [r1, #20]       ; [    o11 |     o10]
-
-    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
-    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
-
-    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
-    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
-
-    lsls        r6, r7, #16         ; d1 != 0 ?
-    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
-    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
-
-    asrs        r6, r7, #16
-    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
-    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
-
-    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
-    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
-
-    pkhtb       r9, r9, r8, asr #16
-
-    sub         r4, r4, r2
-    sub         r5, r5, r3
-
-    str         r9, [r1, #12]       ; [o7 | o6]
-    pkhtb       r5, r5, r4, asr #16 ; [o15|o14]
-
-    str         r5, [r1, #28]       ; [o15|o14]
-
-    ldmfd       sp!, {r4 - r12, pc}
-
-    ENDP
-
-; Used constants
-c7500
-    DCD     7500
-c14500
-    DCD     14500
-c0x22a453a0
-    DCD     0x22a453a0
-c0x00080008
-    DCD     0x00080008
-c12000
-    DCD     12000
-c51000
-    DCD     51000
-c0x00070007
-    DCD     0x00070007
-c0x08a914e8
-    DCD     0x08a914e8
-
-    END
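
The constant pool above (5352 and 2217 scaled by 4, the 14500/7500
first-pass rounders, the 12000/51000 second-pass rounders, and the
(d1 != 0) correction) matches the C form of the VP8 4x4 forward DCT that
the assembly mirrors; roughly (pitch is in bytes, as in the asm):

    static void short_fdct4x4_sketch(short *input, short *output, int pitch)
    {
        int i, a1, b1, c1, d1;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++) {           /* PART 1: rows */
            a1 = (ip[0] + ip[3]) * 8;
            b1 = (ip[1] + ip[2]) * 8;
            c1 = (ip[1] - ip[2]) * 8;
            d1 = (ip[0] - ip[3]) * 8;

            op[0] = (short)(a1 + b1);
            op[2] = (short)(a1 - b1);
            op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
            op[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);

            ip += pitch / 2;                /* pitch bytes = pitch/2 shorts */
            op += 4;
        }

        ip = output;
        op = output;

        for (i = 0; i < 4; i++) {           /* PART 2: columns */
            a1 = ip[0] + ip[12];
            b1 = ip[4] + ip[8];
            c1 = ip[4] - ip[8];
            d1 = ip[0] - ip[12];

            op[0]  = (short)((a1 + b1 + 7) >> 4);
            op[8]  = (short)((a1 - b1 + 7) >> 4);
            op[4]  = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16)
                             + (d1 != 0));
            op[12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);

            ip++;
            op++;
        }
    }
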
--- a/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
+++ /dev/null
@@ -1,265 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_subtract_mby_armv6|
-    EXPORT  |vp8_subtract_mbuv_armv6|
-    EXPORT  |vp8_subtract_b_armv6|
-
-    INCLUDE asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    BLOCK *be
-; r1    BLOCKD *bd
-; r2    int pitch
-|vp8_subtract_b_armv6| PROC
-
-    stmfd   sp!, {r4-r9}
-
-    ldr     r4, [r0, #vp8_block_base_src]
-    ldr     r5, [r0, #vp8_block_src]
-    ldr     r6, [r0, #vp8_block_src_diff]
-
-    ldr     r3, [r4]
-    ldr     r7, [r0, #vp8_block_src_stride]
-    add     r3, r3, r5          ; src = *base_src + src
-    ldr     r8, [r1, #vp8_blockd_predictor]
-
-    mov     r9, #4              ; loop count
-
-loop_block
-
-    ldr     r0, [r3], r7        ; src
-    ldr     r1, [r8], r2        ; pred
-
-    uxtb16  r4, r0              ; [s2 | s0]
-    uxtb16  r5, r1              ; [p2 | p0]
-    uxtb16  r0, r0, ror #8      ; [s3 | s1]
-    uxtb16  r1, r1, ror #8      ; [p3 | p1]
-
-    usub16  r4, r4, r5          ; [d2 | d0]
-    usub16  r5, r0, r1          ; [d3 | d1]
-
-    subs    r9, r9, #1          ; decrement loop counter
-
-    pkhbt   r0, r4, r5, lsl #16 ; [d1 | d0]
-    pkhtb   r1, r5, r4, asr #16 ; [d3 | d2]
-
-    str     r0, [r6, #0]        ; diff
-    str     r1, [r6, #4]        ; diff
-
-    add     r6, r6, r2, lsl #1  ; update diff pointer
-    bne     loop_block
-
-    ldmfd   sp!, {r4-r9}
-    mov     pc, lr
-
-    ENDP
-
-
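
vp8_subtract_b above is, in C, a 4x4 residual: diff = src - pred. The
mbuv/mby routines that follow do the same over the 8x8 chroma and 16x16
luma planes, with the uxtb16/usub16 pairs handling four pixels at a time.
A sketch (diff advances by pitch shorts and pred by pitch bytes, matching
the pointer updates above):

    static void subtract_b_sketch(short *diff, int pitch,
                                  const unsigned char *src, int src_stride,
                                  const unsigned char *pred)
    {
        int r, c;

        for (r = 0; r < 4; r++) {
            for (c = 0; c < 4; c++)
                diff[c] = (short)(src[c] - pred[c]);
            diff += pitch;
            src += src_stride;
            pred += pitch;
        }
    }
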
-; r0    short *diff
-; r1    unsigned char *usrc
-; r2    unsigned char *vsrc
-; r3    unsigned char *pred
-; stack int stride
-|vp8_subtract_mbuv_armv6| PROC
-
-    stmfd   sp!, {r4-r12, lr}
-
-    add     r0, r0, #512        ; set *diff pointer to Cb
-    add     r3, r3, #256        ; set *pred pointer to Cb
-
-    mov     r4, #8              ; loop count
-    ldr     r5, [sp, #40]       ; stride
-
-    ; Subtract U block
-loop_u
-    ldr     r6, [r1]            ; src       (A)
-    ldr     r7, [r3], #4        ; pred      (A)
-
-    uxtb16  r8, r6              ; [s2 | s0] (A)
-    uxtb16  r9, r7              ; [p2 | p0] (A)
-    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
-    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (A)
-    usub16  r7, r10, r11        ; [d3 | d1] (A)
-
-    ldr     r10, [r1, #4]       ; src       (B)
-    ldr     r11, [r3], #4       ; pred      (B)
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
-    str     r8, [r0], #4        ; diff      (A)
-    uxtb16  r8, r10             ; [s2 | s0] (B)
-    str     r9, [r0], #4        ; diff      (A)
-
-    uxtb16  r9, r11             ; [p2 | p0] (B)
-    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
-    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (B)
-    usub16  r7, r10, r11        ; [d3 | d1] (B)
-
-    add     r1, r1, r5          ; update usrc pointer
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
-    str     r8, [r0], #4        ; diff      (B)
-    subs    r4, r4, #1          ; update loop counter
-    str     r9, [r0], #4        ; diff      (B)
-
-    bne     loop_u
-
-    mov     r4, #8              ; loop count
-
-    ; Subtract V block
-loop_v
-    ldr     r6, [r2]            ; src       (A)
-    ldr     r7, [r3], #4        ; pred      (A)
-
-    uxtb16  r8, r6              ; [s2 | s0] (A)
-    uxtb16  r9, r7              ; [p2 | p0] (A)
-    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
-    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (A)
-    usub16  r7, r10, r11        ; [d3 | d1] (A)
-
-    ldr     r10, [r2, #4]       ; src       (B)
-    ldr     r11, [r3], #4       ; pred      (B)
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
-    str     r8, [r0], #4        ; diff      (A)
-    uxtb16  r8, r10             ; [s2 | s0] (B)
-    str     r9, [r0], #4        ; diff      (A)
-
-    uxtb16  r9, r11             ; [p2 | p0] (B)
-    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
-    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (B)
-    usub16  r7, r10, r11        ; [d3 | d1] (B)
-
-    add     r2, r2, r5          ; update vsrc pointer
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
-    str     r8, [r0], #4        ; diff      (B)
-    subs    r4, r4, #1          ; update loop counter
-    str     r9, [r0], #4        ; diff      (B)
-
-    bne     loop_v
-
-    ldmfd   sp!, {r4-r12, pc}
-
-    ENDP
-
-
-; r0    short *diff
-; r1    unsigned char *src
-; r2    unsigned char *pred
-; r3    int stride
-|vp8_subtract_mby_armv6| PROC
-
-    stmfd   sp!, {r4-r11}
-
-    mov     r4, #16
-loop
-    ldr     r6, [r1]            ; src       (A)
-    ldr     r7, [r2], #4        ; pred      (A)
-
-    uxtb16  r8, r6              ; [s2 | s0] (A)
-    uxtb16  r9, r7              ; [p2 | p0] (A)
-    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
-    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (A)
-    usub16  r7, r10, r11        ; [d3 | d1] (A)
-
-    ldr     r10, [r1, #4]       ; src       (B)
-    ldr     r11, [r2], #4       ; pred      (B)
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
-    str     r8, [r0], #4        ; diff      (A)
-    uxtb16  r8, r10             ; [s2 | s0] (B)
-    str     r9, [r0], #4        ; diff      (A)
-
-    uxtb16  r9, r11             ; [p2 | p0] (B)
-    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
-    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (B)
-    usub16  r7, r10, r11        ; [d3 | d1] (B)
-
-    ldr     r10, [r1, #8]       ; src       (C)
-    ldr     r11, [r2], #4       ; pred      (C)
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
-    str     r8, [r0], #4        ; diff      (B)
-    uxtb16  r8, r10             ; [s2 | s0] (C)
-    str     r9, [r0], #4        ; diff      (B)
-
-    uxtb16  r9, r11             ; [p2 | p0] (C)
-    uxtb16  r10, r10, ror #8    ; [s3 | s1] (C)
-    uxtb16  r11, r11, ror #8    ; [p3 | p1] (C)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (C)
-    usub16  r7, r10, r11        ; [d3 | d1] (C)
-
-    ldr     r10, [r1, #12]      ; src       (D)
-    ldr     r11, [r2], #4       ; pred      (D)
-
-    pkhbt   r8, r6, r7, lsl #16  ; [d1 | d0] (C)
-    pkhtb   r9, r7, r6, asr #16  ; [d3 | d2] (C)
-
-    str     r8, [r0], #4        ; diff      (C)
-    uxtb16  r8, r10             ; [s2 | s0] (D)
-    str     r9, [r0], #4        ; diff      (C)
-
-    uxtb16  r9, r11             ; [p2 | p0] (D)
-    uxtb16  r10, r10, ror #8    ; [s3 | s1] (D)
-    uxtb16  r11, r11, ror #8    ; [p3 | p1] (D)
-
-    usub16  r6, r8, r9          ; [d2 | d0] (D)
-    usub16  r7, r10, r11        ; [d3 | d1] (D)
-
-    add     r1, r1, r3          ; update src pointer
-
-    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (D)
-    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (D)
-
-    str     r8, [r0], #4        ; diff      (D)
-    subs    r4, r4, #1          ; update loop counter
-    str     r9, [r0], #4        ; diff      (D)
-
-    bne     loop
-
-    ldmfd   sp!, {r4-r11}
-    mov     pc, lr
-
-    ENDP
-
-    END
-
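
All three subtract kernels above compute the same scalar operation:
byte-wise src - pred over a block, widened to 16-bit residuals. A plain-C
sketch under that reading (generic strides; parameter names are
illustrative):

    /* diff[r][c] = src[r][c] - pred[r][c] for a w x h block of bytes;
       the ARMv6 code above does this four pixels at a time with usub16. */
    static void subtract_block(short *diff, int diff_stride,
                               const unsigned char *src, int src_stride,
                               const unsigned char *pred, int pred_stride,
                               int w, int h) {
      int r, c;
      for (r = 0; r < h; r++) {
        for (c = 0; c < w; c++)
          diff[c] = (short)(src[c] - pred[c]);
        diff += diff_stride;
        src  += src_stride;
        pred += pred_stride;
      }
    }
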
--- a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
+++ /dev/null
@@ -1,154 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_variance16x16_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp9_variance16x16_armv6| PROC
-
-    stmfd   sp!, {r4-r12, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r8, #0              ; initialize sum = 0
-    mov     r11, #0             ; initialize sse = 0
-    mov     r12, #16            ; set loop counter to 16 (=block height)
-
-loop
-    ; 1st 4 pixels
-    ldr     r4, [r0, #0]        ; load 4 src pixels
-    ldr     r5, [r2, #0]        ; load 4 ref pixels
-
-    mov     lr, #0              ; constant zero
-
-    usub8   r6, r4, r5          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-    ; calculate total sum
-    adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r4, [r0, #4]        ; load 4 src pixels
-    ldr     r5, [r2, #4]        ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 3rd 4 pixels
-    ldr     r4, [r0, #8]        ; load 4 src pixels
-    ldr     r5, [r2, #8]        ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 4th 4 pixels
-    ldr     r4, [r0, #12]       ; load 4 src pixels
-    ldr     r5, [r2, #12]       ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set ref_ptr to next row
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-
-    subs    r12, r12, #1
-
-    bne     loop
-
-    ; return stuff
-    ldr     r6, [sp, #40]       ; get address of sse
-    mul     r0, r8, r8          ; sum * sum
-    str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
-
-    ldmfd   sp!, {r4-r12, pc}
-
-    ENDP
-
-    END
-
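
The return sequence above implements the standard block-variance identity
variance = sse - sum^2 / N, with the division folded into a shift. A scalar
C model of the 16x16 kernel (the 8x8 variant below differs only in using
asr #6 instead of asr #8):

    static unsigned int variance16x16_model(const unsigned char *src,
                                            int src_stride,
                                            const unsigned char *ref,
                                            int ref_stride,
                                            unsigned int *sse) {
      int i, j, sum = 0;
      unsigned int s = 0;
      for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++) {
          int d = src[j] - ref[j];       /* the usub8/sel/usad8 steps */
          sum += d;
          s   += (unsigned int)(d * d);  /* the smlad accumulation    */
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = s;
      return s - (unsigned int)((sum * sum) >> 8);  /* N = 256, so >> 8 */
    }
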
--- a/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm
+++ /dev/null
@@ -1,101 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_variance8x8_armv6|
-
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp9_variance8x8_armv6| PROC
-
-    push    {r4-r10, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r12, #8             ; set loop counter to 8 (=block height)
-    mov     r4, #0              ; initialize sum = 0
-    mov     r5, #0              ; initialize sse = 0
-
-loop
-    ; 1st 4 pixels
-    ldr     r6, [r0, #0x0]      ; load 4 src pixels
-    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
-
-    mov     lr, #0              ; constant zero
-
-    usub8   r8, r6, r7          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r10, r8, lr         ; select bytes with positive difference
-    usub8   r9, r7, r6          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r6, r10, lr         ; calculate sum of positive differences
-    usad8   r7, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r10         ; differences of all 4 pixels
-    ; calculate total sum
-    add     r4, r4, r6          ; add positive differences to sum
-    sub     r4, r4, r7          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r7, r8              ; byte (two pixels) to halfwords
-    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
-    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r6, [r0, #0x4]      ; load 4 src pixels
-    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
-    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r6, r7          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r10, r8, lr         ; select bytes with positive difference
-    usub8   r9, r7, r6          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set ref_ptr to next row
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r6, r10, lr         ; calculate sum of positive differences
-    usad8   r7, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r10         ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r4, r4, r6          ; add positive differences to sum
-    sub     r4, r4, r7          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r7, r8              ; byte (two pixels) to halfwords
-    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
-    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
-    subs    r12, r12, #1        ; next row
-    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
-
-    bne     loop
-
-    ; return stuff
-    ldr     r8, [sp, #32]       ; get address of sse
-    mul     r1, r4, r4          ; sum * sum
-    str     r5, [r8]            ; store sse
-    sub     r0, r5, r1, asr #6  ; return (sse - ((sum * sum) >> 6))
-
-    pop     {r4-r10, pc}
-
-    ENDP
-
-    END
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ /dev/null
@@ -1,182 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_variance_halfpixvar16x16_h_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp9_variance_halfpixvar16x16_h_armv6| PROC
-
-    stmfd   sp!, {r4-r12, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r8, #0              ; initialize sum = 0
-    ldr     r10, c80808080
-    mov     r11, #0             ; initialize sse = 0
-    mov     r12, #16            ; set loop counter to 16 (=block height)
-    mov     lr, #0              ; constant zero
-loop
-    ; 1st 4 pixels
-    ldr     r4, [r0, #0]        ; load 4 src pixels
-    ldr     r6, [r0, #1]        ; load 4 src pixels with 1 byte offset
-    ldr     r5, [r2, #0]        ; load 4 ref pixels
-
-    ; bilinear interpolation
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-
-    usub8   r6, r4, r5          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-    ; calculate total sum
-    adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r4, [r0, #4]        ; load 4 src pixels
-    ldr     r6, [r0, #5]        ; load 4 src pixels with 1 byte offset
-    ldr     r5, [r2, #4]        ; load 4 ref pixels
-
-    ; bilinear interpolation
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 3rd 4 pixels
-    ldr     r4, [r0, #8]        ; load 4 src pixels
-    ldr     r6, [r0, #9]        ; load 4 src pixels with 1 byte offset
-    ldr     r5, [r2, #8]        ; load 4 ref pixels
-
-    ; bilinear interpolation
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-
-    smlad   r11, r7, r7, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 4th 4 pixels
-    ldr     r4, [r0, #12]       ; load 4 src pixels
-    ldr     r6, [r0, #13]       ; load 4 src pixels with 1 byte offset
-    ldr     r5, [r2, #12]       ; load 4 ref pixels
-
-    ; bilinear interpolation
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set ref_ptr to next row
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    subs    r12, r12, #1
-
-    bne     loop
-
-    ; return stuff
-    ldr     r6, [sp, #40]       ; get address of sse
-    mul     r0, r8, r8          ; sum * sum
-    str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
-
-    ldmfd   sp!, {r4-r12, pc}
-
-    ENDP
-
-c80808080
-    DCD     0x80808080
-
-    END
-
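
The mvn/uhsub8/eor sequence in the half-pel kernels computes a rounded
byte-wise average without widening: per byte, ((a - (255 - b)) >> 1) ^ 0x80
equals (a + b + 1) >> 1, because the eor with 0x80808080 adds back the 128
bias introduced by subtracting the complement. A one-lane scalar model
(arithmetic right shift assumed):

    /* Model of mvn + uhsub8 + eor for a single byte lane. */
    static unsigned char halfpel_avg(unsigned char a, unsigned char b) {
      int t = (a - (255 - b)) >> 1;     /* uhsub8 against the complement */
      return (unsigned char)(t ^ 0x80); /* undo the bias: (a + b + 1) >> 1 */
    }
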
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ /dev/null
@@ -1,222 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_variance_halfpixvar16x16_hv_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp9_variance_halfpixvar16x16_hv_armv6| PROC
-
-    stmfd   sp!, {r4-r12, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r8, #0              ; initialize sum = 0
-    ldr     r10, c80808080
-    mov     r11, #0             ; initialize sse = 0
-    mov     r12, #16            ; set loop counter to 16 (=block height)
-    mov     lr, #0              ; constant zero
-loop
-    add     r9, r0, r1          ; pointer to pixels on the next row
-    ; 1st 4 pixels
-    ldr     r4, [r0, #0]        ; load source pixels a, row N
-    ldr     r6, [r0, #1]        ; load source pixels b, row N
-    ldr     r5, [r9, #0]        ; load source pixels c, row N+1
-    ldr     r7, [r9, #1]        ; load source pixels d, row N+1
-
-    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
-    mvn     r7, r7
-    uhsub8  r5, r5, r7
-    eor     r5, r5, r10
-    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
-    mvn     r5, r5
-    uhsub8  r4, r4, r5
-    ldr     r5, [r2, #0]        ; load 4 ref pixels
-    eor     r4, r4, r10
-
-    usub8   r6, r4, r5          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-    ; calculate total sum
-    adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r4, [r0, #4]        ; load source pixels a, row N
-    ldr     r6, [r0, #5]        ; load source pixels b, row N
-    ldr     r5, [r9, #4]        ; load source pixels c, row N+1
-
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    ldr     r7, [r9, #5]        ; load source pixels d, row N+1
-
-    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
-    mvn     r7, r7
-    uhsub8  r5, r5, r7
-    eor     r5, r5, r10
-    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
-    mvn     r5, r5
-    uhsub8  r4, r4, r5
-    ldr     r5, [r2, #4]        ; load 4 ref pixels
-    eor     r4, r4, r10
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 3rd 4 pixels
-    ldr     r4, [r0, #8]        ; load source pixels a, row N
-    ldr     r6, [r0, #9]        ; load source pixels b, row N
-    ldr     r5, [r9, #8]        ; load source pixels c, row N+1
-
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    ldr     r7, [r9, #9]        ; load source pixels d, row N+1
-
-    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
-    mvn     r7, r7
-    uhsub8  r5, r5, r7
-    eor     r5, r5, r10
-    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
-    mvn     r5, r5
-    uhsub8  r4, r4, r5
-    ldr     r5, [r2, #8]        ; load 4 ref pixels
-    eor     r4, r4, r10
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 4th 4 pixels
-    ldr     r4, [r0, #12]       ; load source pixels a, row N
-    ldr     r6, [r0, #13]       ; load source pixels b, row N
-    ldr     r5, [r9, #12]       ; load source pixels c, row N+1
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-    ldr     r7, [r9, #13]       ; load source pixels d, row N+1
-
-    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
-    mvn     r7, r7
-    uhsub8  r5, r5, r7
-    eor     r5, r5, r10
-    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
-    mvn     r5, r5
-    uhsub8  r4, r4, r5
-    ldr     r5, [r2, #12]       ; load 4 ref pixels
-    eor     r4, r4, r10
-
-    usub8   r6, r4, r5          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set ref_ptr to next row
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-    subs    r12, r12, #1
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    bne     loop
-
-    ; return stuff
-    ldr     r6, [sp, #40]       ; get address of sse
-    mul     r0, r8, r8          ; sum * sum
-    str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
-
-    ldmfd   sp!, {r4-r12, pc}
-
-    ENDP
-
-c80808080
-    DCD     0x80808080
-
-    END
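
The _hv variant above chains that rounded average three times per pixel,
exactly as its x/y/z comments describe; in scalar form:

    /* z = avg(avg(a, b), avg(c, d)); a,b from row N, c,d from row N+1,
       each avg being (p + q + 1) >> 1. */
    static unsigned char halfpel_hv(unsigned char a, unsigned char b,
                                    unsigned char c, unsigned char d) {
      unsigned char x = (unsigned char)((a + b + 1) >> 1);  /* row N    */
      unsigned char y = (unsigned char)((c + d + 1) >> 1);  /* row N+1  */
      return (unsigned char)((x + y + 1) >> 1);             /* vertical */
    }
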
--- a/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ /dev/null
@@ -1,184 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_variance_halfpixvar16x16_v_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp9_variance_halfpixvar16x16_v_armv6| PROC
-
-    stmfd   sp!, {r4-r12, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r8, #0              ; initialize sum = 0
-    ldr     r10, c80808080
-    mov     r11, #0             ; initialize sse = 0
-    mov     r12, #16            ; set loop counter to 16 (=block height)
-    mov     lr, #0              ; constant zero
-loop
-    add     r9, r0, r1          ; set src pointer to next row
-    ; 1st 4 pixels
-    ldr     r4, [r0, #0]        ; load 4 src pixels
-    ldr     r6, [r9, #0]        ; load 4 src pixels from next row
-    ldr     r5, [r2, #0]        ; load 4 ref pixels
-
-    ; bilinear interpolation
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-
-    usub8   r6, r4, r5          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-    ; calculate total sum
-    adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r4, [r0, #4]        ; load 4 src pixels
-    ldr     r6, [r9, #4]        ; load 4 src pixels from next row
-    ldr     r5, [r2, #4]        ; load 4 ref pixels
-
-    ; bilinear interpolation
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 3rd 4 pixels
-    ldr     r4, [r0, #8]        ; load 4 src pixels
-    ldr     r6, [r9, #8]        ; load 4 src pixels from next row
-    ldr     r5, [r2, #8]        ; load 4 ref pixels
-
-    ; bilinear interpolation
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 4th 4 pixels
-    ldr     r4, [r0, #12]       ; load 4 src pixels
-    ldr     r6, [r9, #12]       ; load 4 src pixels from next row
-    ldr     r5, [r2, #12]       ; load 4 ref pixels
-
-    ; bilinear interpolation
-    mvn     r6, r6
-    uhsub8  r4, r4, r6
-    eor     r4, r4, r10
-
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r6, r5, r4          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set ref_ptr to next row
-    sel     r6, r6, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
-
-
-    subs    r12, r12, #1
-
-    bne     loop
-
-    ; return stuff
-    ldr     r6, [sp, #40]       ; get address of sse
-    mul     r0, r8, r8          ; sum * sum
-    str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
-
-    ldmfd   sp!, {r4-r12, pc}
-
-    ENDP
-
-c80808080
-    DCD     0x80808080
-
-    END
-
--- a/vp8/encoder/arm/armv6/walsh_v6.asm
+++ /dev/null
@@ -1,212 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT |vp8_short_walsh4x4_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
-; r0    short *input,
-; r1    short *output,
-; r2    int pitch
-|vp8_short_walsh4x4_armv6| PROC
-
-    stmdb       sp!, {r4 - r11, lr}
-
-    ldrd        r4, r5, [r0], r2
-    ldr         lr, c00040004
-    ldrd        r6, r7, [r0], r2
-
-    ; 0-3
-    qadd16      r3, r4, r5          ; [d1|a1] [1+3   |   0+2]
-    qsub16      r4, r4, r5          ; [c1|b1] [1-3   |   0-2]
-
-    ldrd        r8, r9, [r0], r2
-    ; 4-7
-    qadd16      r5, r6, r7          ; [d1|a1] [5+7   |   4+6]
-    qsub16      r6, r6, r7          ; [c1|b1] [5-7   |   4-6]
-
-    ldrd        r10, r11, [r0]
-    ; 8-11
-    qadd16      r7, r8, r9          ; [d1|a1] [9+11  |  8+10]
-    qsub16      r8, r8, r9          ; [c1|b1] [9-11  |  8-10]
-
-    ; 12-15
-    qadd16      r9, r10, r11        ; [d1|a1] [13+15 | 12+14]
-    qsub16      r10, r10, r11       ; [c1|b1] [13-15 | 12-14]
-
-
-    lsls        r2, r3, #16
-    smuad       r11, r3, lr         ; A0 = a1<<2 + d1<<2
-    addne       r11, r11, #1        ; A0 += (a1!=0)
-
-    lsls        r2, r7, #16
-    smuad       r12, r7, lr         ; C0 = a1<<2 + d1<<2
-    addne       r12, r12, #1        ; C0 += (a1!=0)
-
-    add         r0, r11, r12        ; a1_0 = A0 + C0
-    sub         r11, r11, r12       ; b1_0 = A0 - C0
-
-    lsls        r2, r5, #16
-    smuad       r12, r5, lr         ; B0 = a1<<2 + d1<<2
-    addne       r12, r12, #1        ; B0 += (a1!=0)
-
-    lsls        r2, r9, #16
-    smuad       r2, r9, lr          ; D0 = a1<<2 + d1<<2
-    addne       r2, r2, #1          ; D0 += (a1!=0)
-
-    add         lr, r12, r2         ; d1_0 = B0 + D0
-    sub         r12, r12, r2        ; c1_0 = B0 - D0
-
-    ; op[0,4,8,12]
-    adds        r2, r0, lr          ; a2 = a1_0 + d1_0
-    addmi       r2, r2, #1          ; += a2 < 0
-    add         r2, r2, #3          ; += 3
-    subs        r0, r0, lr          ; d2 = a1_0 - d1_0
-    mov         r2, r2, asr #3      ; >> 3
-    strh        r2, [r1]            ; op[0]
-
-    addmi       r0, r0, #1          ; += a2 < 0
-    add         r0, r0, #3          ; += 3
-    ldr         lr, c00040004
-    mov         r0, r0, asr #3      ; >> 3
-    strh        r0, [r1, #24]       ; op[12]
-
-    adds        r2, r11, r12        ; b2 = b1_0 + c1_0
-    addmi       r2, r2, #1          ; += a2 < 0
-    add         r2, r2, #3          ; += 3
-    subs        r0, r11, r12        ; c2 = b1_0 - c1_0
-    mov         r2, r2, asr #3      ; >> 3
-    strh        r2, [r1, #8]        ; op[4]
-
-    addmi       r0, r0, #1          ; += a2 < 0
-    add         r0, r0, #3          ; += 3
-    smusd       r3, r3, lr          ; A3 = a1<<2 - d1<<2
-    smusd       r7, r7, lr          ; C3 = a1<<2 - d1<<2
-    mov         r0, r0, asr #3      ; >> 3
-    strh        r0, [r1, #16]       ; op[8]
-
-
-    ; op[3,7,11,15]
-    add         r0, r3, r7          ; a1_3 = A3 + C3
-    sub         r3, r3, r7          ; b1_3 = A3 - C3
-
-    smusd       r5, r5, lr          ; B3 = a1<<2 - d1<<2
-    smusd       r9, r9, lr          ; D3 = a1<<2 - d1<<2
-    add         r7, r5, r9          ; d1_3 = B3 + D3
-    sub         r5, r5, r9          ; c1_3 = B3 - D3
-
-    adds        r2, r0, r7          ; a2 = a1_3 + d1_3
-    addmi       r2, r2, #1          ; += a2 < 0
-    add         r2, r2, #3          ; += 3
-    adds        r9, r3, r5          ; b2 = b1_3 + c1_3
-    mov         r2, r2, asr #3      ; >> 3
-    strh        r2, [r1, #6]        ; op[3]
-
-    addmi       r9, r9, #1          ; += a2 < 0
-    add         r9, r9, #3          ; += 3
-    subs        r2, r3, r5          ; c2 = b1_3 - c1_3
-    mov         r9, r9, asr #3      ; >> 3
-    strh        r9, [r1, #14]       ; op[7]
-
-    addmi       r2, r2, #1          ; += a2 < 0
-    add         r2, r2, #3          ; += 3
-    subs        r9, r0, r7          ; d2 = a1_3 - d1_3
-    mov         r2, r2, asr #3      ; >> 3
-    strh        r2, [r1, #22]       ; op[11]
-
-    addmi       r9, r9, #1          ; += a2 < 0
-    add         r9, r9, #3          ; += 3
-    smuad       r3, r4, lr          ; A1 = b1<<2 + c1<<2
-    smuad       r5, r8, lr          ; C1 = b1<<2 + c1<<2
-    mov         r9, r9, asr #3      ; >> 3
-    strh        r9, [r1, #30]       ; op[15]
-
-    ; op[1,5,9,13]
-    add         r0, r3, r5          ; a1_1 = A1 + C1
-    sub         r3, r3, r5          ; b1_1 = A1 - C1
-
-    smuad       r7, r6, lr          ; B1 = b1<<2 + c1<<2
-    smuad       r9, r10, lr         ; D1 = b1<<2 + c1<<2
-    add         r5, r7, r9          ; d1_1 = B1 + D1
-    sub         r7, r7, r9          ; c1_1 = B1 - D1
-
-    adds        r2, r0, r5          ; a2 = a1_1 + d1_1
-    addmi       r2, r2, #1          ; += a2 < 0
-    add         r2, r2, #3          ; += 3
-    adds        r9, r3, r7          ; b2 = b1_1 + c1_1
-    mov         r2, r2, asr #3      ; >> 3
-    strh        r2, [r1, #2]        ; op[1]
-
-    addmi       r9, r9, #1          ; += a2 < 0
-    add         r9, r9, #3          ; += 3
-    subs        r2, r3, r7          ; c2 = b1_1 - c1_1
-    mov         r9, r9, asr #3      ; >> 3
-    strh        r9, [r1, #10]       ; op[5]
-
-    addmi       r2, r2, #1          ; += a2 < 0
-    add         r2, r2, #3          ; += 3
-    subs        r9, r0, r5          ; d2 = a1_1 - d1_1
-    mov         r2, r2, asr #3      ; >> 3
-    strh        r2, [r1, #18]       ; op[9]
-
-    addmi       r9, r9, #1          ; += a2 < 0
-    add         r9, r9, #3          ; += 3
-    smusd       r4, r4, lr          ; A2 = b1<<2 - c1<<2
-    smusd       r8, r8, lr          ; C2 = b1<<2 - c1<<2
-    mov         r9, r9, asr #3      ; >> 3
-    strh        r9, [r1, #26]       ; op[13]
-
-
-    ; op[2,6,10,14]
-    add         r11, r4, r8         ; a1_2 = A2 + C2
-    sub         r12, r4, r8         ; b1_2 = A2 - C2
-
-    smusd       r6, r6, lr          ; B2 = b1<<2 - c1<<2
-    smusd       r10, r10, lr        ; D2 = b1<<2 - c1<<2
-    add         r4, r6, r10         ; d1_2 = B2 + D2
-    sub         r8, r6, r10         ; c1_2 = B2 - D2
-
-    adds        r2, r11, r4         ; a2 = a1_2 + d1_2
-    addmi       r2, r2, #1          ; += a2 < 0
-    add         r2, r2, #3          ; += 3
-    adds        r9, r12, r8         ; b2 = b1_2 + c1_2
-    mov         r2, r2, asr #3      ; >> 3
-    strh        r2, [r1, #4]        ; op[2]
-
-    addmi       r9, r9, #1          ; += a2 < 0
-    add         r9, r9, #3          ; += 3
-    subs        r2, r12, r8         ; c2 = b1_2 - c1_2
-    mov         r9, r9, asr #3      ; >> 3
-    strh        r9, [r1, #12]       ; op[6]
-
-    addmi       r2, r2, #1          ; += a2 < 0
-    add         r2, r2, #3          ; += 3
-    subs        r9, r11, r4         ; d2 = a1_2 - d1_2
-    mov         r2, r2, asr #3      ; >> 3
-    strh        r2, [r1, #20]       ; op[10]
-
-    addmi       r9, r9, #1          ; += a2 < 0
-    add         r9, r9, #3          ; += 3
-    mov         r9, r9, asr #3      ; >> 3
-    strh        r9, [r1, #28]       ; op[14]
-
-
-    ldmia       sp!, {r4 - r11, pc}
-    ENDP        ; |vp8_short_walsh4x4_armv6|
-
-c00040004
-    DCD         0x00040004
-
-    END
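
A C model of the full 4x4 Walsh-Hadamard transform above, reconstructed
from its instruction comments (the pitch/2 element stride and the
arithmetic right shift are assumptions of this sketch, and names are
illustrative):

    static short rnd3(int v) {            /* the addmi / add #3 / asr #3 */
      return (short)((v + (v < 0) + 3) >> 3);
    }

    static void walsh4x4_model(const short *in, short *out, int pitch) {
      int col[4][4];                      /* first-pass results, [column][row] */
      int i, j;
      const int stride = pitch / 2;       /* pitch is in bytes, data is 16-bit */

      for (i = 0; i < 4; i++) {           /* horizontal butterflies per row */
        const short *ip = in + i * stride;
        int a1 = ip[0] + ip[2], d1 = ip[1] + ip[3];
        int b1 = ip[0] - ip[2], c1 = ip[1] - ip[3];
        col[0][i] = 4 * (a1 + d1) + (a1 != 0);  /* A_i, with the +1 bias */
        col[1][i] = 4 * (b1 + c1);              /* feeds op[1,5,9,13]    */
        col[2][i] = 4 * (b1 - c1);              /* feeds op[2,6,10,14]   */
        col[3][i] = 4 * (a1 - d1);              /* feeds op[3,7,11,15]   */
      }
      for (j = 0; j < 4; j++) {           /* vertical butterflies per column */
        int a1 = col[j][0] + col[j][2], b1 = col[j][0] - col[j][2];
        int d1 = col[j][1] + col[j][3], c1 = col[j][1] - col[j][3];
        out[j]      = rnd3(a1 + d1);
        out[j + 4]  = rnd3(b1 + c1);
        out[j + 8]  = rnd3(b1 - c1);
        out[j + 12] = rnd3(a1 - d1);
      }
    }
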
--- a/vp8/encoder/arm/boolhuff_arm.c
+++ /dev/null
@@ -1,33 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/boolhuff.h"
-#include "vp8/common/blockd.h"
-
-const unsigned int vp9_prob_cost[256] = {
-  2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
-  1023, 1000,  979,  959,  940,  922,  905,  889,  873,  858,  843,  829,  816,  803,  790,  778,
-  767,  755,  744,  733,  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
-  617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,  534,  528,  522,  516,
-  511,  505,  499,  494,  488,  483,  477,  472,  467,  462,  457,  452,  447,  442,  437,  433,
-  428,  424,  419,  415,  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
-  361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,  317,  314,  311,  307,
-  304,  301,  297,  294,  291,  288,  285,  281,  278,  275,  272,  269,  266,  263,  260,  257,
-  255,  252,  249,  246,  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
-  211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,  181,  179,  177,  174,
-  172,  170,  168,  165,  163,  161,  159,  156,  154,  152,  150,  148,  145,  143,  141,  139,
-  137,  135,  133,  131,  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
-  105,  103,  101,   99,   97,   95,   93,   92,   90,   88,   86,   84,   82,   81,   79,   77,
-  75,   73,   72,   70,   68,   66,   65,   63,   61,   60,   58,   56,   55,   53,   51,   50,
-  48,   46,   45,   43,   41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
-  22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,   1
-};
-
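
Each entry p in this table is, to the nearest unit, the cost in 1/256ths
of a bit of coding a zero against probability p/256 (roughly
-256*log2(p/256), clamped to 2047). A usage sketch; the 255 - prob
complement rule for a one bit is an assumption here, not something this
diff shows:

    /* Approximate cost of coding bit b against 8-bit probability prob. */
    static unsigned int cost_bit(const unsigned int tbl[256],
                                 int prob, int b) {
      return tbl[b ? 255 - prob : prob];  /* assumed complement rule */
    }
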
--- a/vp8/encoder/arm/dct_arm.c
+++ /dev/null
@@ -1,21 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "./vpx_rtcd.h"
-
-#if HAVE_ARMV6
-
-void vp9_short_fdct8x4_armv6(short *input, short *output, int pitch) {
-  vp9_short_fdct4x4_armv6(input,   output,    pitch);
-  vp9_short_fdct4x4_armv6(input + 4, output + 16, pitch);
-}
-
-#endif /* HAVE_ARMV6 */
--- a/vp8/encoder/arm/dct_arm.h
+++ /dev/null
@@ -1,65 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef DCT_ARM_H
-#define DCT_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_fdct(vp9_short_walsh4x4_armv6);
-extern prototype_fdct(vp9_short_fdct4x4_armv6);
-extern prototype_fdct(vp9_short_fdct8x4_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_fdct_walsh_short4x4
-#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_armv6
-
-#undef  vp8_fdct_short4x4
-#define vp8_fdct_short4x4 vp9_short_fdct4x4_armv6
-
-#undef  vp8_fdct_short8x4
-#define vp8_fdct_short8x4 vp9_short_fdct8x4_armv6
-
-#undef  vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp9_short_fdct4x4_armv6
-
-#undef  vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp9_short_fdct8x4_armv6
-#endif
-
-#endif /* HAVE_ARMV6 */
-
-#if HAVE_ARMV7
-extern prototype_fdct(vp9_short_fdct4x4_neon);
-extern prototype_fdct(vp9_short_fdct8x4_neon);
-extern prototype_fdct(vp8_fast_fdct4x4_neon);
-extern prototype_fdct(vp8_fast_fdct8x4_neon);
-extern prototype_fdct(vp9_short_walsh4x4_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_fdct_short4x4
-#define vp8_fdct_short4x4 vp9_short_fdct4x4_neon
-
-#undef  vp8_fdct_short8x4
-#define vp8_fdct_short8x4 vp9_short_fdct8x4_neon
-
-#undef  vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp9_short_fdct4x4_neon
-
-#undef  vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp9_short_fdct8x4_neon
-
-#undef  vp8_fdct_walsh_short4x4
-#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_neon
-#endif
-
-#endif
-
-#endif
--- a/vp8/encoder/arm/encodemb_arm.h
+++ /dev/null
@@ -1,64 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef ENCODEMB_ARM_H
-#define ENCODEMB_ARM_H
-
-#if HAVE_ARMV6
-extern prototype_subb(vp9_subtract_b_armv6);
-extern prototype_submby(vp9_subtract_mby_armv6);
-extern prototype_submbuv(vp9_subtract_mbuv_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_encodemb_subb
-#define vp8_encodemb_subb vp9_subtract_b_armv6
-
-#undef  vp8_encodemb_submby
-#define vp8_encodemb_submby vp9_subtract_mby_armv6
-
-#undef  vp8_encodemb_submbuv
-#define vp8_encodemb_submbuv vp9_subtract_mbuv_armv6
-#endif
-
-#endif /* HAVE_ARMV6 */
-
-#if HAVE_ARMV7
-// extern prototype_berr(vp9_block_error_c);
-// extern prototype_mberr(vp9_mbblock_error_c);
-// extern prototype_mbuverr(vp9_mbuverror_c);
-
-extern prototype_subb(vp9_subtract_b_neon);
-extern prototype_submby(vp9_subtract_mby_neon);
-extern prototype_submbuv(vp9_subtract_mbuv_neon);
-
-// #undef  vp8_encodemb_berr
-// #define vp8_encodemb_berr vp9_block_error_c
-
-// #undef  vp8_encodemb_mberr
-// #define vp8_encodemb_mberr vp9_mbblock_error_c
-
-// #undef  vp8_encodemb_mbuverr
-// #define vp8_encodemb_mbuverr vp9_mbuverror_c
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_encodemb_subb
-#define vp8_encodemb_subb vp9_subtract_b_neon
-
-#undef  vp8_encodemb_submby
-#define vp8_encodemb_submby vp9_subtract_mby_neon
-
-#undef  vp8_encodemb_submbuv
-#define vp8_encodemb_submbuv vp9_subtract_mbuv_neon
-#endif
-
-#endif
-
-#endif
--- a/vp8/encoder/arm/neon/fastquantizeb_neon.asm
+++ /dev/null
@@ -1,261 +1,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_fast_quantize_b_neon|
-    EXPORT  |vp8_fast_quantize_b_pair_neon|
-
-    INCLUDE asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=4
-
-;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
-|vp8_fast_quantize_b_pair_neon| PROC
-
-    stmfd           sp!, {r4-r9}
-    vstmdb          sp!, {q4-q7}
-
-    ldr             r4, [r0, #vp8_block_coeff]
-    ldr             r5, [r0, #vp8_block_quant_fast]
-    ldr             r6, [r0, #vp8_block_round]
-
-    vld1.16         {q0, q1}, [r4@128]  ; load z
-
-    ldr             r7, [r2, #vp8_blockd_qcoeff]
-
-    vabs.s16        q4, q0              ; calculate x = abs(z)
-    vabs.s16        q5, q1
-
-    ; right shift 15 to get the sign: all 0s if positive, all 1s if negative
-    vshr.s16        q2, q0, #15         ; sz
-    vshr.s16        q3, q1, #15
-
-    vld1.s16        {q6, q7}, [r6@128]  ; load round_ptr [0-15]
-    vld1.s16        {q8, q9}, [r5@128]  ; load quant_ptr [0-15]
-
-    ldr             r4, [r1, #vp8_block_coeff]
-
-    vadd.s16        q4, q6              ; x + Round
-    vadd.s16        q5, q7
-
-    vld1.16         {q0, q1}, [r4@128]  ; load z2
-
-    vqdmulh.s16     q4, q8              ; y = ((Round+abs(z)) * Quant) >> 16
-    vqdmulh.s16     q5, q9
-
-    vabs.s16        q10, q0             ; calculate x2 = abs(z_2)
-    vabs.s16        q11, q1
-    vshr.s16        q12, q0, #15        ; sz2
-    vshr.s16        q13, q1, #15
-
-    ; restore the original sign of the data
-    veor.s16        q4, q2              ; y^sz
-    veor.s16        q5, q3
-
-    vadd.s16        q10, q6             ; x2 + Round
-    vadd.s16        q11, q7
-
-    ldr             r8, [r2, #vp8_blockd_dequant]
-
-    vqdmulh.s16     q10, q8             ; y2 = ((Round+abs(z)) * Quant) >> 16
-    vqdmulh.s16     q11, q9
-
-    vshr.s16        q4, #1              ; right shift 1 after vqdmulh
-    vshr.s16        q5, #1
-
-    vld1.s16        {q6, q7}, [r8@128]  ; load dequant_ptr[i]
-
-    vsub.s16        q4, q2              ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
-    vsub.s16        q5, q3
-
-    vshr.s16        q10, #1             ; right shift 1 after vqdmulh
-    vshr.s16        q11, #1
-
-    ldr             r9, [r2, #vp8_blockd_dqcoeff]
-
-    veor.s16        q10, q12            ; y2^sz2
-    veor.s16        q11, q13
-
-    vst1.s16        {q4, q5}, [r7]      ; store: qcoeff = x1
-
-
-    vsub.s16        q10, q12            ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement)
-    vsub.s16        q11, q13
-
-    ldr             r6, [r3, #vp8_blockd_qcoeff]
-
-    vmul.s16        q2, q6, q4          ; x * Dequant
-    vmul.s16        q3, q7, q5
-
-    ldr             r0, _inv_zig_zag_   ; load ptr of inverse zigzag table
-
-    vceq.s16        q8, q8              ; set q8 to all 1
-
-    vst1.s16        {q10, q11}, [r6]    ; store: qcoeff = x2
-
-    vmul.s16        q12, q6, q10        ; x2 * Dequant
-    vmul.s16        q13, q7, q11
-
-    vld1.16         {q6, q7}, [r0@128]  ; load inverse scan order
-
-    vtst.16         q14, q4, q8         ; now find eob
-    vtst.16         q15, q5, q8         ; non-zero element is set to all 1
-
-    vst1.s16        {q2, q3}, [r9]      ; store dqcoeff = x * Dequant
-
-    ldr             r7, [r3, #vp8_blockd_dqcoeff]
-
-    vand            q0, q6, q14         ; get all valid numbers from scan array
-    vand            q1, q7, q15
-
-    vst1.s16        {q12, q13}, [r7]    ; store dqcoeff = x * Dequant
-
-    vtst.16         q2, q10, q8         ; now find eob
-    vtst.16         q3, q11, q8         ; non-zero element is set to all 1
-
-    vmax.u16        q0, q0, q1          ; find maximum value in q0, q1
-
-    vand            q10, q6, q2         ; get all valid numbers from scan array
-    vand            q11, q7, q3
-    vmax.u16        q10, q10, q11       ; find maximum value in q10, q11
-
-    vmax.u16        d0, d0, d1
-    vmax.u16        d20, d20, d21
-    vmovl.u16       q0, d0
-    vmovl.u16       q10, d20
-
-
-    vmax.u32        d0, d0, d1
-    vmax.u32        d20, d20, d21
-    vpmax.u32       d0, d0, d0
-    vpmax.u32       d20, d20, d20
-
-    add             r4, r2, #vp8_blockd_eob
-    add             r5, r3, #vp8_blockd_eob
-
-    vst1.32         {d0[0]}, [r4@32]
-    vst1.32         {d20[0]}, [r5@32]
-
-    vldmia          sp!, {q4-q7}
-    ldmfd           sp!, {r4-r9}
-    bx              lr
-
-    ENDP
-
-;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
-|vp8_fast_quantize_b_neon| PROC
-
-    stmfd           sp!, {r4-r7}
-
-    ldr             r3, [r0, #vp8_block_coeff]
-    ldr             r4, [r0, #vp8_block_quant_fast]
-    ldr             r5, [r0, #vp8_block_round]
-
-    vld1.16         {q0, q1}, [r3@128]  ; load z
-    vorr.s16        q14, q0, q1         ; check if all zero (step 1)
-    ldr             r6, [r1, #vp8_blockd_qcoeff]
-    ldr             r7, [r1, #vp8_blockd_dqcoeff]
-    vorr.s16        d28, d28, d29       ; check if all zero (step 2)
-
-    vabs.s16        q12, q0             ; calculate x = abs(z)
-    vabs.s16        q13, q1
-
-    ;right shift 15 to get the sign: all 0s if positive, all 1s if negative
-    vshr.s16        q2, q0, #15         ; sz
-    vmov            r2, r3, d28         ; check if all zero (step 3)
-    vshr.s16        q3, q1, #15
-
-    vld1.s16        {q14, q15}, [r5@128]; load round_ptr [0-15]
-    vld1.s16        {q8, q9}, [r4@128]  ; load quant_ptr [0-15]
-
-    vadd.s16        q12, q14            ; x + Round
-    vadd.s16        q13, q15
-
-    ldr             r0, _inv_zig_zag_   ; load ptr of inverse zigzag table
-
-    vqdmulh.s16     q12, q8             ; y = ((Round+abs(z)) * Quant) >> 16
-    vqdmulh.s16     q13, q9
-
-    vld1.16         {q10, q11}, [r0@128]; load inverse scan order
-
-    vceq.s16        q8, q8              ; set q8 to all 1
-
-    ldr             r4, [r1, #vp8_blockd_dequant]
-
-    vshr.s16        q12, #1             ; right shift 1 after vqdmulh
-    vshr.s16        q13, #1
-
-    orr             r2, r2, r3          ; check if all zero (step 4)
-    cmp             r2, #0              ; check if all zero (step 5)
-    beq             zero_output         ; check if all zero (step 6)
-
-    ;modify data to have its original sign
-    veor.s16        q12, q2             ; y^sz
-    veor.s16        q13, q3
-
-    vsub.s16        q12, q2             ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
-    vsub.s16        q13, q3
-
-    vld1.s16        {q2, q3}, [r4@128]  ; load dequant_ptr[i]
-
-    vtst.16         q14, q12, q8        ; now find eob
-    vtst.16         q15, q13, q8        ; non-zero element is set to all 1
-
-    vst1.s16        {q12, q13}, [r6@128]; store: qcoeff = x1
-
-    vand            q10, q10, q14       ; get all valid numbers from scan array
-    vand            q11, q11, q15
-
-
-    vmax.u16        q0, q10, q11        ; find maximum value in q0, q1
-    vmax.u16        d0, d0, d1
-    vmovl.u16       q0, d0
-
-    vmul.s16        q2, q12             ; x * Dequant
-    vmul.s16        q3, q13
-
-    vmax.u32        d0, d0, d1
-    vpmax.u32       d0, d0, d0
-
-    vst1.s16        {q2, q3}, [r7@128]  ; store dqcoeff = x * Dequant
-
-    add             r4, r1, #vp8_blockd_eob
-    vst1.32         {d0[0]}, [r4@32]
-
-    ldmfd           sp!, {r4-r7}
-    bx              lr
-
-zero_output
-    str             r2, [r1, #vp8_blockd_eob]
-    vst1.s16        {q0, q1}, [r6@128]  ; qcoeff = 0
-    vst1.s16        {q0, q1}, [r7@128]  ; dqcoeff = 0
-
-    ldmfd           sp!, {r4-r7}
-    bx              lr
-
-    ENDP
-
-; default inverse zigzag table is defined in vp8/common/entropy.c
-_inv_zig_zag_
-    DCD inv_zig_zag
-
-    ALIGN 16    ; enable use of @128 (128-bit aligned) loads
-inv_zig_zag
-    DCW 0x0001, 0x0002, 0x0006, 0x0007
-    DCW 0x0003, 0x0005, 0x0008, 0x000d
-    DCW 0x0004, 0x0009, 0x000c, 0x000e
-    DCW 0x000a, 0x000b, 0x000f, 0x0010
-
-    END
-
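For reference, the per-coefficient arithmetic both deleted entry points implement reduces to the scalar C below. This is a sketch, not code from the tree; the _sketch suffix marks illustrative names. Note that vqdmulh doubles the product and the following vshr #1 undoes it, so the net effect is a plain high multiply, ((x + round) * quant) >> 16. The table contents match the inv_zig_zag data above, and eob ends up as the scan position of the last nonzero coefficient.

    #include <stdlib.h>

    static const int inv_zig_zag_sketch[16] = {
      1, 2, 6,  7,  3,  5,  8, 13,
      4, 9, 12, 14, 10, 11, 15, 16
    };

    void fast_quantize_b_sketch(const short *z, const short *round,
                                const short *quant, const short *dequant,
                                short *qcoeff, short *dqcoeff, int *eob) {
      int i, last = 0;
      for (i = 0; i < 16; i++) {
        int sz = z[i] < 0 ? -1 : 0;             /* the asm's vshr.s16 #15 */
        int x  = abs(z[i]);
        int y  = ((x + round[i]) * quant[i]) >> 16;
        int x1 = (y ^ sz) - sz;                 /* give y the sign of z */
        qcoeff[i]  = (short)x1;
        dqcoeff[i] = (short)(x1 * dequant[i]);
        if (x1 != 0 && inv_zig_zag_sketch[i] > last)
          last = inv_zig_zag_sketch[i];         /* track end-of-block */
      }
      *eob = last;
    }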
--- a/vp8/encoder/arm/neon/picklpf_arm.c
+++ /dev/null
@@ -1,49 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/onyxc_int.h"
-#include "vp8/encoder/onyx_int.h"
-#include "vp8/encoder/quantize.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/yv12extend.h"
-#include "vpx_scale/vpxscale.h"
-#include "vp8/common/alloccommon.h"
-
-extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
-
-
-void
-vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
-  unsigned char *src_y, *dst_y;
-  int yheight;
-  int ystride;
-  int border;
-  int yoffset;
-  int linestocopy;
-
-  border   = src_ybc->border;
-  yheight  = src_ybc->y_height;
-  ystride  = src_ybc->y_stride;
-
-  linestocopy = (yheight >> (Fraction + 4));
-
-  if (linestocopy < 1)
-    linestocopy = 1;
-
-  linestocopy <<= 4;
-
-  yoffset  = ystride * ((yheight >> 5) * 16 - 8);
-  src_y = src_ybc->y_buffer + yoffset;
-  dst_y = dst_ybc->y_buffer + yoffset;
-
-  // vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16));
-  vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride * (linestocopy + 16)));
-}
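To make the band arithmetic concrete (illustrative numbers, not from the source): with yheight = 720 and Fraction = 3, linestocopy = (720 >> 7) << 4 = 80 and yoffset = ystride * ((720 >> 5) * 16 - 8) = ystride * 344, so the copy starts 344 rows into the plane, just above mid-frame, and vp8_memcpy_neon moves ystride * (80 + 16) bytes.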
--- a/vp8/encoder/arm/neon/sad16_neon.asm
+++ /dev/null
@@ -1,207 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sad16x16_neon|
-    EXPORT  |vp8_sad16x8_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int  src_stride
-; r2    unsigned char *ref_ptr
-; r3    int  ref_stride
-|vp8_sad16x16_neon| PROC
-;;
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabdl.u8        q12, d0, d8
-    vabdl.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0], r1
-    vld1.8          {q7}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-;;
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-    vabal.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0], r1
-    vld1.8          {q7}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-;;
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-    vabal.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0], r1
-    vld1.8          {q7}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-;;
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-    vabal.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0]
-    vld1.8          {q7}, [r2]
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vadd.u16        q0, q12, q13
-
-    vpaddl.u16      q1, q0
-    vpaddl.u32      q0, q1
-
-    vadd.u32        d0, d0, d1
-
-    vmov.32         r0, d0[0]
-
-    bx              lr
-
-    ENDP
-
-;==============================
-;unsigned int vp8_sad16x8_c(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-|vp8_sad16x8_neon| PROC
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabdl.u8        q12, d0, d8
-    vabdl.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0], r1
-    vld1.8          {q7}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-    vld1.8          {q0}, [r0], r1
-    vld1.8          {q4}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-    vabal.u8        q13, d1, d9
-
-    vld1.8          {q2}, [r0], r1
-    vld1.8          {q6}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-    vabal.u8        q13, d3, d11
-
-    vld1.8          {q3}, [r0], r1
-    vld1.8          {q7}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q13, d5, d13
-
-    vabal.u8        q12, d6, d14
-    vabal.u8        q13, d7, d15
-
-    vadd.u16        q0, q12, q13
-
-    vpaddl.u16      q1, q0
-    vpaddl.u32      q0, q1
-
-    vadd.u32        d0, d0, d1
-
-    vmov.32         r0, d0[0]
-
-    bx              lr
-
-    ENDP
-
-    END
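Both exports compute a plain sum of absolute differences over 16-byte rows; only the row count differs. A scalar sketch of what the vabdl/vabal chains accumulate (illustrative name; call with height 16 or 8):

    #include <stdlib.h>

    unsigned int sad16xh_sketch(const unsigned char *src, int src_stride,
                                const unsigned char *ref, int ref_stride,
                                int height) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < height; r++) {
        for (c = 0; c < 16; c++)
          sad += abs(src[c] - ref[c]);   /* what vabdl/vabal accumulate */
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }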
--- a/vp8/encoder/arm/neon/sad8_neon.asm
+++ /dev/null
@@ -1,209 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_sad8x8_neon|
-    EXPORT  |vp8_sad8x16_neon|
-    EXPORT  |vp8_sad4x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; unsigned int vp8_sad8x8_c(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-
-|vp8_sad8x8_neon| PROC
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabdl.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q12, d6, d14
-
-    vpaddl.u16      q1, q12
-    vpaddl.u32      q0, q1
-    vadd.u32        d0, d0, d1
-
-    vmov.32         r0, d0[0]
-
-    bx              lr
-
-    ENDP
-
-;============================
-;unsigned int vp8_sad8x16_c(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-
-|vp8_sad8x16_neon| PROC
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabdl.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vabal.u8        q12, d6, d14
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabal.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q12, d6, d14
-
-    vpaddl.u16      q1, q12
-    vpaddl.u32      q0, q1
-    vadd.u32        d0, d0, d1
-
-    vmov.32         r0, d0[0]
-
-    bx              lr
-
-    ENDP
-
-;===========================
-;unsigned int vp8_sad4x4_c(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-
-|vp8_sad4x4_neon| PROC
-    vld1.8          {d0}, [r0], r1
-    vld1.8          {d8}, [r2], r3
-
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d10}, [r2], r3
-
-    vabdl.u8        q12, d0, d8
-
-    vld1.8          {d4}, [r0], r1
-    vld1.8          {d12}, [r2], r3
-
-    vabal.u8        q12, d2, d10
-
-    vld1.8          {d6}, [r0], r1
-    vld1.8          {d14}, [r2], r3
-
-    vabal.u8        q12, d4, d12
-    vabal.u8        q12, d6, d14
-
-    vpaddl.u16      d1, d24
-    vpaddl.u32      d0, d1
-    vmov.32         r0, d0[0]
-
-    bx              lr
-
-    ENDP
-
-    END
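The 8-wide and 4-wide variants follow the same pattern with narrower rows; the 4x4 kernel loads a full 8-byte d register per row but only folds the low four differences in the final vpaddl. A generic scalar sketch (illustrative name; width/height pairs (8,8), (8,16) and (4,4) cover the three exports):

    #include <stdlib.h>

    unsigned int sad_wxh_sketch(const unsigned char *src, int src_stride,
                                const unsigned char *ref, int ref_stride,
                                int width, int height) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < height; r++) {
        for (c = 0; c < width; c++)
          sad += abs(src[c] - ref[c]);
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }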
--- a/vp8/encoder/arm/neon/shortfdct_neon.asm
+++ /dev/null
@@ -1,221 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_short_fdct4x4_neon|
-    EXPORT  |vp8_short_fdct8x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=4
-
-
-    ALIGN 16    ; enable use of @128 (128-bit aligned) loads
-coeff
-    DCW      5352,  5352,  5352, 5352
-    DCW      2217,  2217,  2217, 2217
-    DCD     14500, 14500, 14500, 14500
-    DCD      7500,  7500,  7500, 7500
-    DCD     12000, 12000, 12000, 12000
-    DCD     51000, 51000, 51000, 51000
-
-;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct4x4_neon| PROC
-
-    ; Part one
-    vld1.16         {d0}, [r0@64], r2
-    adr             r12, coeff
-    vld1.16         {d1}, [r0@64], r2
-    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
-    vld1.16         {d2}, [r0@64], r2
-    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
-    vld1.16         {d3}, [r0@64], r2
-
-    ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-    vld1.32         {q11,q12}, [r12@128]    ; q11=12000, q12=51000
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[3]
-    vadd.s16        d5, d1, d2      ; b1 = ip[1] + ip[2]
-    vsub.s16        d6, d1, d2      ; c1 = ip[1] - ip[2]
-    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[3]
-
-    vshl.s16        q2, q2, #3      ; (a1, b1) << 3
-    vshl.s16        q3, q3, #3      ; (c1, d1) << 3
-
-    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1
-    vsub.s16        d2, d4, d5      ; op[2] = a1 - b1
-
-    vmlal.s16       q9, d7, d16     ; d1*5352 + 14500
-    vmlal.s16       q10, d7, d17    ; d1*2217 + 7500
-    vmlal.s16       q9, d6, d17     ; c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q10, d6, d16    ; d1*2217 - c1*5352 + 7500
-
-    vshrn.s32       d1, q9, #12     ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d3, q10, #12    ; op[3] = (d1*2217 - c1*5352 +  7500)>>12
-
-
-    ; Part two
-
-    ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vmov.s16        d26, #7
-
-    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[12]
-    vadd.s16        d5, d1, d2      ; b1 = ip[4] + ip[8]
-    vsub.s16        d6, d1, d2      ; c1 = ip[4] - ip[8]
-    vadd.s16        d4, d4, d26     ; a1 + 7
-    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[12]
-
-    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1 + 7
-    vsub.s16        d2, d4, d5      ; op[8] = a1 - b1 + 7
-
-    vmlal.s16       q11, d7, d16    ; d1*5352 + 12000
-    vmlal.s16       q12, d7, d17    ; d1*2217 + 51000
-
-    vceq.s16        d4, d7, #0
-
-    vshr.s16        d0, d0, #4
-    vshr.s16        d2, d2, #4
-
-    vmlal.s16       q11, d6, d17    ; c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q12, d6, d16    ; d1*2217 - c1*5352 + 51000
-
-    vmvn.s16        d4, d4
-    vshrn.s32       d1, q11, #16    ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vsub.s16        d1, d1, d4      ; op[4] += (d1!=0)
-    vshrn.s32       d3, q12, #16    ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
-
-    vst1.16         {q0, q1}, [r1@128]
-
-    bx              lr
-
-    ENDP
-
-;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct8x4_neon| PROC
-
-    ; Part one
-
-    vld1.16         {q0}, [r0@128], r2
-    adr             r12, coeff
-    vld1.16         {q1}, [r0@128], r2
-    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
-    vld1.16         {q2}, [r0@128], r2
-    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
-    vld1.16         {q3}, [r0@128], r2
-
-    ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
-    vtrn.32         q0, q2          ; [A0|B0]
-    vtrn.32         q1, q3          ; [A1|B1]
-    vtrn.16         q0, q1          ; [A2|B2]
-    vtrn.16         q2, q3          ; [A3|B3]
-
-    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[3]
-    vadd.s16        q12, q1, q2     ; b1 = ip[1] + ip[2]
-    vsub.s16        q13, q1, q2     ; c1 = ip[1] - ip[2]
-    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[3]
-
-    vshl.s16        q11, q11, #3    ; a1 << 3
-    vshl.s16        q12, q12, #3    ; b1 << 3
-    vshl.s16        q13, q13, #3    ; c1 << 3
-    vshl.s16        q14, q14, #3    ; d1 << 3
-
-    vadd.s16        q0, q11, q12    ; [A0 | B0] = a1 + b1
-    vsub.s16        q2, q11, q12    ; [A2 | B2] = a1 - b1
-
-    vmov.s16        q11, q9         ; 14500
-    vmov.s16        q12, q10        ; 7500
-
-    vmlal.s16       q9, d28, d16    ; A[1] = d1*5352 + 14500
-    vmlal.s16       q10, d28, d17   ; A[3] = d1*2217 + 7500
-    vmlal.s16       q11, d29, d16   ; B[1] = d1*5352 + 14500
-    vmlal.s16       q12, d29, d17   ; B[3] = d1*2217 + 7500
-
-    vmlal.s16       q9, d26, d17    ; A[1] = c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q10, d26, d16   ; A[3] = d1*2217 - c1*5352 + 7500
-    vmlal.s16       q11, d27, d17   ; B[1] = c1*2217 + d1*5352 + 14500
-    vmlsl.s16       q12, d27, d16   ; B[3] = d1*2217 - c1*5352 + 7500
-
-    vshrn.s32       d2, q9, #12     ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d6, q10, #12    ; A[3] = (d1*2217 - c1*5352 +  7500)>>12
-    vshrn.s32       d3, q11, #12    ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
-    vshrn.s32       d7, q12, #12    ; B[3] = (d1*2217 - c1*5352 +  7500)>>12
-
-
-    ; Part two
-    vld1.32         {q9,q10}, [r12@128]    ; q9=12000, q10=51000
-
-    ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
-    vtrn.32         q0, q2          ; q0=[A0 | B0]
-    vtrn.32         q1, q3          ; q1=[A4 | B4]
-    vtrn.16         q0, q1          ; q2=[A8 | B8]
-    vtrn.16         q2, q3          ; q3=[A12|B12]
-
-    vmov.s16        q15, #7
-
-    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[12]
-    vadd.s16        q12, q1, q2     ; b1 = ip[4] + ip[8]
-    vadd.s16        q11, q11, q15   ; a1 + 7
-    vsub.s16        q13, q1, q2     ; c1 = ip[4] - ip[8]
-    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[12]
-
-    vadd.s16        q0, q11, q12    ; a1 + b1 + 7
-    vsub.s16        q1, q11, q12    ; a1 - b1 + 7
-
-    vmov.s16        q11, q9         ; 12000
-    vmov.s16        q12, q10        ; 51000
-
-    vshr.s16        d0, d0, #4      ; A[0] = (a1 + b1 + 7)>>4
-    vshr.s16        d4, d1, #4      ; B[0] = (a1 + b1 + 7)>>4
-    vshr.s16        d2, d2, #4      ; A[8] = (a1 + b1 + 7)>>4
-    vshr.s16        d6, d3, #4      ; B[8] = (a1 + b1 + 7)>>4
-
-
-    vmlal.s16       q9, d28, d16    ; A[4]  = d1*5352 + 12000
-    vmlal.s16       q10, d28, d17   ; A[12] = d1*2217 + 51000
-    vmlal.s16       q11, d29, d16   ; B[4]  = d1*5352 + 12000
-    vmlal.s16       q12, d29, d17   ; B[12] = d1*2217 + 51000
-
-    vceq.s16        q14, q14, #0
-
-    vmlal.s16       q9, d26, d17    ; A[4]  = c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q10, d26, d16   ; A[12] = d1*2217 - c1*5352 + 51000
-    vmlal.s16       q11, d27, d17   ; B[4]  = c1*2217 + d1*5352 + 12000
-    vmlsl.s16       q12, d27, d16   ; B[12] = d1*2217 - c1*5352 + 51000
-
-    vmvn.s16        q14, q14
-
-    vshrn.s32       d1, q9, #16     ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vshrn.s32       d3, q10, #16    ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
-    vsub.s16        d1, d1, d28     ; A[4] += (d1!=0)
-
-    vshrn.s32       d5, q11, #16    ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
-    vshrn.s32       d7, q12, #16    ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
-    vsub.s16        d5, d5, d29     ; B[4] += (d1!=0)
-
-    vst1.16         {q0, q1}, [r1@128]! ; block A
-    vst1.16         {q2, q3}, [r1@128]! ; block B
-
-    bx              lr
-
-    ENDP
-
-    END
-
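Writing the comments above out as scalar C gives the following sketch of the 4x4 transform (the 8x4 version runs the same math on two blocks side by side). The name is illustrative; pitch is in bytes, matching the r2 post-increment on 16-bit loads:

    void short_fdct4x4_sketch(const short *input, short *output, int pitch) {
      int i, blk[16], tmp[16];
      const short *ip = input;
      for (i = 0; i < 4; i++) {          /* gather the strided 4x4 input */
        blk[i * 4 + 0] = ip[0];
        blk[i * 4 + 1] = ip[1];
        blk[i * 4 + 2] = ip[2];
        blk[i * 4 + 3] = ip[3];
        ip += pitch / 2;                 /* pitch counts bytes */
      }
      for (i = 0; i < 4; i++) {          /* part one: rows */
        int a1 = (blk[i * 4 + 0] + blk[i * 4 + 3]) << 3;
        int b1 = (blk[i * 4 + 1] + blk[i * 4 + 2]) << 3;
        int c1 = (blk[i * 4 + 1] - blk[i * 4 + 2]) << 3;
        int d1 = (blk[i * 4 + 0] - blk[i * 4 + 3]) << 3;
        tmp[i * 4 + 0] = a1 + b1;
        tmp[i * 4 + 2] = a1 - b1;
        tmp[i * 4 + 1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
        tmp[i * 4 + 3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;
      }
      for (i = 0; i < 4; i++) {          /* part two: columns */
        int a1 = tmp[i + 0] + tmp[i + 12];
        int b1 = tmp[i + 4] + tmp[i + 8];
        int c1 = tmp[i + 4] - tmp[i + 8];
        int d1 = tmp[i + 0] - tmp[i + 12];
        output[i + 0]  = (short)((a1 + b1 + 7) >> 4);
        output[i + 8]  = (short)((a1 - b1 + 7) >> 4);
        output[i + 4]  = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
        output[i + 12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);
      }
    }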
--- a/vp8/encoder/arm/neon/subtract_neon.asm
+++ /dev/null
@@ -1,185 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT |vp8_subtract_b_neon|
-    EXPORT |vp8_subtract_mby_neon|
-    EXPORT |vp8_subtract_mbuv_neon|
-
-    INCLUDE asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
-|vp8_subtract_b_neon| PROC
-
-    stmfd   sp!, {r4-r7}
-
-    ldr     r3, [r0, #vp8_block_base_src]
-    ldr     r4, [r0, #vp8_block_src]
-    ldr     r5, [r0, #vp8_block_src_diff]
-    ldr     r3, [r3]
-    ldr     r6, [r0, #vp8_block_src_stride]
-    add     r3, r3, r4                      ; src = *base_src + src
-    ldr     r7, [r1, #vp8_blockd_predictor]
-
-    vld1.8          {d0}, [r3], r6          ;load src
-    vld1.8          {d1}, [r7], r2          ;load pred
-    vld1.8          {d2}, [r3], r6
-    vld1.8          {d3}, [r7], r2
-    vld1.8          {d4}, [r3], r6
-    vld1.8          {d5}, [r7], r2
-    vld1.8          {d6}, [r3], r6
-    vld1.8          {d7}, [r7], r2
-
-    vsubl.u8        q10, d0, d1
-    vsubl.u8        q11, d2, d3
-    vsubl.u8        q12, d4, d5
-    vsubl.u8        q13, d6, d7
-
-    mov             r2, r2, lsl #1
-
-    vst1.16         {d20}, [r5], r2         ;store diff
-    vst1.16         {d22}, [r5], r2
-    vst1.16         {d24}, [r5], r2
-    vst1.16         {d26}, [r5], r2
-
-    ldmfd   sp!, {r4-r7}
-    bx              lr
-
-    ENDP
-
-
-;==========================================
-;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
-|vp8_subtract_mby_neon| PROC
-    mov             r12, #4
-
-subtract_mby_loop
-    vld1.8          {q0}, [r1], r3          ;load src
-    vld1.8          {q1}, [r2]!             ;load pred
-    vld1.8          {q2}, [r1], r3
-    vld1.8          {q3}, [r2]!
-    vld1.8          {q4}, [r1], r3
-    vld1.8          {q5}, [r2]!
-    vld1.8          {q6}, [r1], r3
-    vld1.8          {q7}, [r2]!
-
-    vsubl.u8        q8, d0, d2
-    vsubl.u8        q9, d1, d3
-    vsubl.u8        q10, d4, d6
-    vsubl.u8        q11, d5, d7
-    vsubl.u8        q12, d8, d10
-    vsubl.u8        q13, d9, d11
-    vsubl.u8        q14, d12, d14
-    vsubl.u8        q15, d13, d15
-
-    vst1.16         {q8}, [r0]!             ;store diff
-    vst1.16         {q9}, [r0]!
-    vst1.16         {q10}, [r0]!
-    vst1.16         {q11}, [r0]!
-    vst1.16         {q12}, [r0]!
-    vst1.16         {q13}, [r0]!
-    vst1.16         {q14}, [r0]!
-    vst1.16         {q15}, [r0]!
-
-    subs            r12, r12, #1
-    bne             subtract_mby_loop
-
-    bx              lr
-    ENDP
-
-;=================================
-;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-|vp8_subtract_mbuv_neon| PROC
-    ldr             r12, [sp]
-
-;u
-    add             r0, r0, #512        ;   short *udiff = diff + 256;
-    add             r3, r3, #256        ;   unsigned char *upred = pred + 256;
-
-    vld1.8          {d0}, [r1], r12         ;load src
-    vld1.8          {d1}, [r3]!             ;load pred
-    vld1.8          {d2}, [r1], r12
-    vld1.8          {d3}, [r3]!
-    vld1.8          {d4}, [r1], r12
-    vld1.8          {d5}, [r3]!
-    vld1.8          {d6}, [r1], r12
-    vld1.8          {d7}, [r3]!
-    vld1.8          {d8}, [r1], r12
-    vld1.8          {d9}, [r3]!
-    vld1.8          {d10}, [r1], r12
-    vld1.8          {d11}, [r3]!
-    vld1.8          {d12}, [r1], r12
-    vld1.8          {d13}, [r3]!
-    vld1.8          {d14}, [r1], r12
-    vld1.8          {d15}, [r3]!
-
-    vsubl.u8        q8, d0, d1
-    vsubl.u8        q9, d2, d3
-    vsubl.u8        q10, d4, d5
-    vsubl.u8        q11, d6, d7
-    vsubl.u8        q12, d8, d9
-    vsubl.u8        q13, d10, d11
-    vsubl.u8        q14, d12, d13
-    vsubl.u8        q15, d14, d15
-
-    vst1.16         {q8}, [r0]!             ;store diff
-    vst1.16         {q9}, [r0]!
-    vst1.16         {q10}, [r0]!
-    vst1.16         {q11}, [r0]!
-    vst1.16         {q12}, [r0]!
-    vst1.16         {q13}, [r0]!
-    vst1.16         {q14}, [r0]!
-    vst1.16         {q15}, [r0]!
-
-;v
-    vld1.8          {d0}, [r2], r12         ;load src
-    vld1.8          {d1}, [r3]!             ;load pred
-    vld1.8          {d2}, [r2], r12
-    vld1.8          {d3}, [r3]!
-    vld1.8          {d4}, [r2], r12
-    vld1.8          {d5}, [r3]!
-    vld1.8          {d6}, [r2], r12
-    vld1.8          {d7}, [r3]!
-    vld1.8          {d8}, [r2], r12
-    vld1.8          {d9}, [r3]!
-    vld1.8          {d10}, [r2], r12
-    vld1.8          {d11}, [r3]!
-    vld1.8          {d12}, [r2], r12
-    vld1.8          {d13}, [r3]!
-    vld1.8          {d14}, [r2], r12
-    vld1.8          {d15}, [r3]!
-
-    vsubl.u8        q8, d0, d1
-    vsubl.u8        q9, d2, d3
-    vsubl.u8        q10, d4, d5
-    vsubl.u8        q11, d6, d7
-    vsubl.u8        q12, d8, d9
-    vsubl.u8        q13, d10, d11
-    vsubl.u8        q14, d12, d13
-    vsubl.u8        q15, d14, d15
-
-    vst1.16         {q8}, [r0]!             ;store diff
-    vst1.16         {q9}, [r0]!
-    vst1.16         {q10}, [r0]!
-    vst1.16         {q11}, [r0]!
-    vst1.16         {q12}, [r0]!
-    vst1.16         {q13}, [r0]!
-    vst1.16         {q14}, [r0]!
-    vst1.16         {q15}, [r0]!
-
-    bx              lr
-    ENDP
-
-    END
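All three routines compute diff = src - pred, widening with vsubl.u8; the luma case reduces to the sketch below (illustrative name). Note that the predictor rows are contiguous, 16 bytes apart, while the source is strided, matching the addressing modes above:

    void subtract_mby_sketch(short *diff, const unsigned char *src,
                             const unsigned char *pred, int stride) {
      int r, c;
      for (r = 0; r < 16; r++) {
        for (c = 0; c < 16; c++)
          diff[c] = (short)(src[c] - pred[c]);
        diff += 16;
        pred += 16;      /* predictor is packed, no stride */
        src  += stride;
      }
    }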
--- a/vp8/encoder/arm/neon/variance_neon.asm
+++ /dev/null
@@ -1,276 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_variance16x16_neon|
-    EXPORT  |vp9_variance16x8_neon|
-    EXPORT  |vp9_variance8x16_neon|
-    EXPORT  |vp9_variance8x8_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp9_variance16x16_neon| PROC
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    mov             r12, #8
-
-variance16x16_neon_loop
-    vld1.8          {q0}, [r0], r1              ;Load up source and reference
-    vld1.8          {q2}, [r2], r3
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q3}, [r2], r3
-
-    vsubl.u8        q11, d0, d4                 ;calculate diff
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    ;VPADAL adds adjacent pairs of elements of a vector and accumulates
-    ;the results into the elements of the destination vector. (The explanation
-    ;in the ARM guide is wrong.)
-    vpadal.s16      q8, q11                     ;calculate sum
-    vmlal.s16       q9, d22, d22                ;calculate sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             variance16x16_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    ldr             r12, [sp]                   ;load *sse from stack
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
-    ;vmov.32        r1, d1[0]
-    ;mul            r0, r0, r0
-    ;str            r1, [r12]
-    ;sub            r0, r1, r0, asr #8
-
-    ;sum is in [-255x256, 255x256]. sum*sum is 32-bit. The right shift must
-    ;sign-extend, which is what vshr.s does; s32 operands are needed to get this right.
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    bx              lr
-
-    ENDP
-
-;================================
-;unsigned int vp9_variance16x8_c(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;   unsigned int *sse)
-|vp9_variance16x8_neon| PROC
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    mov             r12, #4
-
-variance16x8_neon_loop
-    vld1.8          {q0}, [r0], r1              ;Load up source and reference
-    vld1.8          {q2}, [r2], r3
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q3}, [r2], r3
-
-    vsubl.u8        q11, d0, d4                 ;calculate diff
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vpadal.s16      q8, q11                     ;calculate sum
-    vmlal.s16       q9, d22, d22                ;calculate sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             variance16x8_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    ldr             r12, [sp]                   ;load *sse from stack
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.s32        d10, d10, #7
-    vsub.s32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    bx              lr
-
-    ENDP
-
-;=================================
-;unsigned int vp9_variance8x16_c(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;   unsigned int *sse)
-
-|vp9_variance8x16_neon| PROC
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    mov             r12, #8
-
-variance8x16_neon_loop
-    vld1.8          {d0}, [r0], r1              ;Load up source and reference
-    vld1.8          {d4}, [r2], r3
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d6}, [r2], r3
-
-    vsubl.u8        q11, d0, d4                 ;calculate diff
-    vsubl.u8        q12, d2, d6
-
-    vpadal.s16      q8, q11                     ;calculate sum
-    vmlal.s16       q9, d22, d22                ;calculate sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-
-    bne             variance8x16_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    ldr             r12, [sp]                   ;load *sse from stack
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.s32        d10, d10, #7
-    vsub.s32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    bx              lr
-
-    ENDP
-
-;==================================
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp9_variance8x8_neon| PROC
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    mov             r12, #2
-
-variance8x8_neon_loop
-    vld1.8          {d0}, [r0], r1              ;Load up source and reference
-    vld1.8          {d4}, [r2], r3
-    vld1.8          {d1}, [r0], r1
-    vld1.8          {d5}, [r2], r3
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d6}, [r2], r3
-    vld1.8          {d3}, [r0], r1
-    vld1.8          {d7}, [r2], r3
-
-    vsubl.u8        q11, d0, d4                 ;calculate diff
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vpadal.s16      q8, q11                     ;calculate sum
-    vmlal.s16       q9, d22, d22                ;calculate sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             variance8x8_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    ldr             r12, [sp]                   ;load *sse from stack
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [r12]              ;store sse
-    vshr.s32        d10, d10, #6
-    vsub.s32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    bx              lr
-
-    ENDP
-
-    END
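Each variant returns sse - sum*sum/N, with N encoded in the final shift (#8 for 256 pixels, #7 for 128, #6 for 64). A scalar sketch of the 16x16 case (illustrative name; the 64-bit widening plays the role of the vmull.s32 above):

    unsigned int variance16x16_sketch(const unsigned char *src, int src_stride,
                                      const unsigned char *ref, int ref_stride,
                                      unsigned int *sse) {
      int r, c, sum = 0;
      unsigned int sse_acc = 0;
      for (r = 0; r < 16; r++) {
        for (c = 0; c < 16; c++) {
          int d = src[c] - ref[c];
          sum += d;
          sse_acc += (unsigned int)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = sse_acc;
      /* widen before squaring, as vmull.s32 does; >> 8 divides by 256 */
      return sse_acc - (unsigned int)(((long long)sum * sum) >> 8);
    }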
--- a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
+++ /dev/null
@@ -1,68 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8_memcpy_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;=========================================
-;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
-|vp8_memcpy_neon| PROC
-    ;pld                [r1]                        ;preload pred data
-    ;pld                [r1, #128]
-    ;pld                [r1, #256]
-    ;pld                [r1, #384]
-
-    mov             r12, r2, lsr #8                 ;copy 256 bytes of data at a time
-
-memcpy_neon_loop
-    vld1.8          {q0, q1}, [r1]!                 ;load src data
-    subs            r12, r12, #1
-    vld1.8          {q2, q3}, [r1]!
-    vst1.8          {q0, q1}, [r0]!                 ;copy to dst_ptr
-    vld1.8          {q4, q5}, [r1]!
-    vst1.8          {q2, q3}, [r0]!
-    vld1.8          {q6, q7}, [r1]!
-    vst1.8          {q4, q5}, [r0]!
-    vld1.8          {q8, q9}, [r1]!
-    vst1.8          {q6, q7}, [r0]!
-    vld1.8          {q10, q11}, [r1]!
-    vst1.8          {q8, q9}, [r0]!
-    vld1.8          {q12, q13}, [r1]!
-    vst1.8          {q10, q11}, [r0]!
-    vld1.8          {q14, q15}, [r1]!
-    vst1.8          {q12, q13}, [r0]!
-    vst1.8          {q14, q15}, [r0]!
-
-    ;pld                [r1]                        ;preload pred data -- need to adjust for real device
-    ;pld                [r1, #128]
-    ;pld                [r1, #256]
-    ;pld                [r1, #384]
-
-    bne             memcpy_neon_loop
-
-    ands            r3, r2, #0xff                   ;extra copy
-    beq             done_copy_neon_loop
-
-extra_copy_neon_loop
-    vld1.8          {q0}, [r1]!                 ;load src data
-    subs            r3, r3, #16
-    vst1.8          {q0}, [r0]!
-    bne             extra_copy_neon_loop
-
-done_copy_neon_loop
-    bx              lr
-    ENDP
-
-    END
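Structurally, the copy moves 256 bytes per main-loop iteration and then 16 bytes at a time for the remainder, so sz is implicitly assumed to be a multiple of 16. A sketch of the control flow (illustrative name):

    #include <string.h>

    void memcpy_neon_sketch(unsigned char *dst, const unsigned char *src, int sz) {
      int chunks = sz >> 8;     /* 256-byte main-loop iterations */
      int rem = sz & 0xff;      /* leftover, copied 16 bytes at a time */
      while (chunks-- > 0) {
        memcpy(dst, src, 256);
        dst += 256;
        src += 256;
      }
      while (rem > 0) {         /* assumes sz % 16 == 0, as the asm does */
        memcpy(dst, src, 16);
        dst += 16;
        src += 16;
        rem -= 16;
      }
    }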
--- a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
+++ /dev/null
@@ -1,116 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_mse16x16_neon|
-    EXPORT  |vp8_get4x4sse_cs_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;============================
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-;note: in this function, sum is never used, so that part of the calculation
-;from vp9_variance() can be dropped.
-
-|vp8_mse16x16_neon| PROC
-    vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse
-    vmov.i8         q8, #0
-    vmov.i8         q9, #0
-    vmov.i8         q10, #0
-
-    mov             r12, #8
-
-mse16x16_neon_loop
-    vld1.8          {q0}, [r0], r1              ;Load up source and reference
-    vld1.8          {q2}, [r2], r3
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q3}, [r2], r3
-
-    vsubl.u8        q11, d0, d4
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vmlal.s16       q7, d22, d22
-    vmlal.s16       q8, d23, d23
-
-    subs            r12, r12, #1
-
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vmlal.s16       q7, d26, d26
-    vmlal.s16       q8, d27, d27
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             mse16x16_neon_loop
-
-    vadd.u32        q7, q7, q8
-    vadd.u32        q9, q9, q10
-
-    ldr             r12, [sp]               ;load *sse from stack
-
-    vadd.u32        q10, q7, q9
-    vpaddl.u32      q1, q10
-    vadd.u64        d0, d2, d3
-
-    vst1.32         {d0[0]}, [r12]
-    vmov.32         r0, d0[0]
-
-    bx              lr
-
-    ENDP
-
-
-;=============================
-; r0    unsigned char *src_ptr,
-; r1    int  source_stride,
-; r2    unsigned char *ref_ptr,
-; r3    int  recon_stride
-|vp8_get4x4sse_cs_neon| PROC
-    vld1.8          {d0}, [r0], r1              ;Load up source and reference
-    vld1.8          {d4}, [r2], r3
-    vld1.8          {d1}, [r0], r1
-    vld1.8          {d5}, [r2], r3
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d6}, [r2], r3
-    vld1.8          {d3}, [r0], r1
-    vld1.8          {d7}, [r2], r3
-
-    vsubl.u8        q11, d0, d4
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vmull.s16       q7, d22, d22
-    vmull.s16       q8, d24, d24
-    vmull.s16       q9, d26, d26
-    vmull.s16       q10, d28, d28
-
-    vadd.u32        q7, q7, q8
-    vadd.u32        q9, q9, q10
-    vadd.u32        q9, q7, q9
-
-    vpaddl.u32      q1, q9
-    vadd.u64        d0, d2, d3
-
-    vmov.32         r0, d0[0]
-    bx              lr
-
-    ENDP
-
-    END
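As the note above says, this is the variance kernel with the sum term dropped: the return value and *sse are the same accumulated squared error. A scalar sketch (illustrative name):

    unsigned int mse16x16_sketch(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride,
                                 unsigned int *sse) {
      int r, c;
      unsigned int acc = 0;
      for (r = 0; r < 16; r++) {
        for (c = 0; c < 16; c++) {
          int d = src[c] - ref[c];
          acc += (unsigned int)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = acc;
      return acc;   /* no sum*sum/N correction, unlike the variance kernels */
    }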
--- a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
+++ /dev/null
@@ -1,103 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_short_walsh4x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
-; r0   short *input,
-; r1   short *output,
-; r2   int pitch
-|vp8_short_walsh4x4_neon| PROC
-
-    vld1.16         {d0}, [r0@64], r2   ; load input
-    vld1.16         {d1}, [r0@64], r2
-    vld1.16         {d2}, [r0@64], r2
-    vld1.16         {d3}, [r0@64]
-
-    ;First for-loop
-    ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-
-    vmov.s32        q15, #3             ; add 3 to all values
-
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vadd.s16        d4, d0, d2          ; ip[0] + ip[2]
-    vadd.s16        d5, d1, d3          ; ip[1] + ip[3]
-    vsub.s16        d6, d1, d3          ; ip[1] - ip[3]
-    vsub.s16        d7, d0, d2          ; ip[0] - ip[2]
-
-    vshl.s16        d4, d4, #2          ; a1 = (ip[0] + ip[2]) << 2
-    vshl.s16        d5, d5, #2          ; d1 = (ip[1] + ip[3]) << 2
-    vshl.s16        d6, d6, #2          ; c1 = (ip[1] - ip[3]) << 2
-    vceq.s16        d16, d4, #0         ; a1 == 0
-    vshl.s16        d7, d7, #2          ; b1 = (ip[0] - ip[2]) << 2
-
-    vadd.s16        d0, d4, d5          ; a1 + d1
-    vmvn            d16, d16            ; a1 != 0
-    vsub.s16        d3, d4, d5          ; op[3] = a1 - d1
-    vadd.s16        d1, d7, d6          ; op[1] = b1 + c1
-    vsub.s16        d2, d7, d6          ; op[2] = b1 - c1
-    vsub.s16        d0, d0, d16         ; op[0] = a1 + d1 + (a1 != 0)
-
-    ;Second for-loop
-    ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
-    vtrn.32         d1, d3
-    vtrn.32         d0, d2
-    vtrn.16         d2, d3
-    vtrn.16         d0, d1
-
-    vaddl.s16       q8, d0, d2          ; a1 = ip[0]+ip[8]
-    vaddl.s16       q9, d1, d3          ; d1 = ip[4]+ip[12]
-    vsubl.s16       q10, d1, d3         ; c1 = ip[4]-ip[12]
-    vsubl.s16       q11, d0, d2         ; b1 = ip[0]-ip[8]
-
-    vadd.s32        q0, q8, q9          ; a2 = a1 + d1
-    vadd.s32        q1, q11, q10        ; b2 = b1 + c1
-    vsub.s32        q2, q11, q10        ; c2 = b1 - c1
-    vsub.s32        q3, q8, q9          ; d2 = a1 - d1
-
-    vclt.s32        q8, q0, #0
-    vclt.s32        q9, q1, #0
-    vclt.s32        q10, q2, #0
-    vclt.s32        q11, q3, #0
-
-    ; subtract -1 (or 0)
-    vsub.s32        q0, q0, q8          ; a2 += a2 < 0
-    vsub.s32        q1, q1, q9          ; b2 += b2 < 0
-    vsub.s32        q2, q2, q10         ; c2 += c2 < 0
-    vsub.s32        q3, q3, q11         ; d2 += d2 < 0
-
-    vadd.s32        q8, q0, q15         ; a2 + 3
-    vadd.s32        q9, q1, q15         ; b2 + 3
-    vadd.s32        q10, q2, q15        ; c2 + 3
-    vadd.s32        q11, q3, q15        ; d2 + 3
-
-    ; vrshrn? it would add a rounding bias of 1 << (3-1) = 4
-    vshrn.s32       d0, q8, #3
-    vshrn.s32       d1, q9, #3
-    vshrn.s32       d2, q10, #3
-    vshrn.s32       d3, q11, #3
-
-    vst1.16         {q0, q1}, [r1@128]
-
-    bx              lr
-
-    ENDP
-
-    END
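Spelled out as scalar C, the two annotated passes come to the sketch below (illustrative name; pitch is in bytes). The (a1 != 0) and (a2 < 0) adjustments correspond to the vceq/vmvn and vclt/vsub sequences above:

    void short_walsh4x4_sketch(const short *input, short *output, int pitch) {
      int i, t[16];
      const short *ip = input;
      for (i = 0; i < 4; i++) {              /* first pass, on rows */
        int a1 = (ip[0] + ip[2]) << 2;
        int d1 = (ip[1] + ip[3]) << 2;
        int c1 = (ip[1] - ip[3]) << 2;
        int b1 = (ip[0] - ip[2]) << 2;
        t[i * 4 + 0] = a1 + d1 + (a1 != 0);
        t[i * 4 + 1] = b1 + c1;
        t[i * 4 + 2] = b1 - c1;
        t[i * 4 + 3] = a1 - d1;
        ip += pitch / 2;                     /* pitch counts bytes */
      }
      for (i = 0; i < 4; i++) {              /* second pass, on columns */
        int a1 = t[i + 0] + t[i + 8];
        int d1 = t[i + 4] + t[i + 12];
        int c1 = t[i + 4] - t[i + 12];
        int b1 = t[i + 0] - t[i + 8];
        int a2 = a1 + d1, b2 = b1 + c1;
        int c2 = b1 - c1, d2 = a1 - d1;
        a2 += a2 < 0;                        /* bias negatives before the */
        b2 += b2 < 0;                        /* (x + 3) >> 3 rounding     */
        c2 += c2 < 0;
        d2 += d2 < 0;
        output[i + 0]  = (short)((a2 + 3) >> 3);
        output[i + 4]  = (short)((b2 + 3) >> 3);
        output[i + 8]  = (short)((c2 + 3) >> 3);
        output[i + 12] = (short)((d2 + 3) >> 3);
      }
    }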
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
+++ /dev/null
@@ -1,425 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_sub_pixel_variance16x16_neon_func|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pixels_per_line,
-; stack(r6) unsigned int *sse
-;note: most of the code is copied from bilinear_predict16x16_neon and vp9_variance16x16_neon.
-
-|vp9_sub_pixel_variance16x16_neon_func| PROC
-    push            {r4-r6, lr}
-
-    ldr             r12, _BilinearTaps_coeff_
-    ldr             r4, [sp, #16]           ;load *dst_ptr from stack
-    ldr             r5, [sp, #20]           ;load dst_pixels_per_line from stack
-    ldr             r6, [sp, #24]           ;load *sse from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_bfilter16x16_only
-
-    add             r2, r12, r2, lsl #3     ;calculate filter location
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-
-    vld1.s32        {d31}, [r2]             ;load first_pass filter
-
-    beq             firstpass_bfilter16x16_only
-
-    sub             sp, sp, #272            ;reserve space on stack for temporary storage
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    mov             lr, sp
-    vld1.u8         {d5, d6, d7}, [r0], r1
-
-    mov             r2, #3                  ;loop counter
-    vld1.u8         {d8, d9, d10}, [r0], r1
-
-    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    vdup.8          d1, d31[4]
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8e_filt_blk2d_fp16x16_loop_neon
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q8, d3, d0
-    vmull.u8        q9, d5, d0
-    vmull.u8        q10, d6, d0
-    vmull.u8        q11, d8, d0
-    vmull.u8        q12, d9, d0
-    vmull.u8        q13, d11, d0
-    vmull.u8        q14, d12, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-    vext.8          d11, d11, d12, #1
-
-    vmlal.u8        q7, d2, d1              ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q9, d5, d1
-    vmlal.u8        q11, d8, d1
-    vmlal.u8        q13, d11, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-    vext.8          d12, d12, d13, #1
-
-    vmlal.u8        q8, d3, d1              ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q10, d6, d1
-    vmlal.u8        q12, d9, d1
-    vmlal.u8        q14, d12, d1
-
-    subs            r2, r2, #1
-
-    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d15, q8, #7
-    vqrshrn.u16    d16, q9, #7
-    vqrshrn.u16    d17, q10, #7
-    vqrshrn.u16    d18, q11, #7
-    vqrshrn.u16    d19, q12, #7
-    vqrshrn.u16    d20, q13, #7
-
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    vqrshrn.u16    d21, q14, #7
-    vld1.u8         {d5, d6, d7}, [r0], r1
-
-    vst1.u8         {d14, d15, d16, d17}, [lr]!     ;store result
-    vld1.u8         {d8, d9, d10}, [r0], r1
-    vst1.u8         {d18, d19, d20, d21}, [lr]!
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    bne             vp8e_filt_blk2d_fp16x16_loop_neon
-
-;First-pass filtering for the remaining 5 lines
-    vld1.u8         {d14, d15, d16}, [r0], r1
-
-    vmull.u8        q9, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q10, d3, d0
-    vmull.u8        q11, d5, d0
-    vmull.u8        q12, d6, d0
-    vmull.u8        q13, d8, d0
-    vmull.u8        q14, d9, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-
-    vmlal.u8        q9, d2, d1              ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q11, d5, d1
-    vmlal.u8        q13, d8, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-
-    vmlal.u8        q10, d3, d1             ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q12, d6, d1
-    vmlal.u8        q14, d9, d1
-
-    vmull.u8        q1, d11, d0
-    vmull.u8        q2, d12, d0
-    vmull.u8        q3, d14, d0
-    vmull.u8        q4, d15, d0
-
-    vext.8          d11, d11, d12, #1       ;construct src_ptr[1]
-    vext.8          d14, d14, d15, #1
-
-    vmlal.u8        q1, d11, d1             ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q3, d14, d1
-
-    vext.8          d12, d12, d13, #1
-    vext.8          d15, d15, d16, #1
-
-    vmlal.u8        q2, d12, d1             ;(src_ptr[0] * Filter[1])
-    vmlal.u8        q4, d15, d1
-
-    vqrshrn.u16    d10, q9, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d11, q10, #7
-    vqrshrn.u16    d12, q11, #7
-    vqrshrn.u16    d13, q12, #7
-    vqrshrn.u16    d14, q13, #7
-    vqrshrn.u16    d15, q14, #7
-    vqrshrn.u16    d16, q1, #7
-    vqrshrn.u16    d17, q2, #7
-    vqrshrn.u16    d18, q3, #7
-    vqrshrn.u16    d19, q4, #7
-
-    vst1.u8         {d10, d11, d12, d13}, [lr]!         ;store result
-    vst1.u8         {d14, d15, d16, d17}, [lr]!
-    vst1.u8         {d18, d19}, [lr]!
-
-;Second pass: 16x16
-;secondpass_filter
-    add             r3, r12, r3, lsl #3
-    sub             lr, lr, #272
-
-    vld1.u32        {d31}, [r3]             ;load second_pass filter
-
-    sub             sp, sp, #256
-    mov             r3, sp
-
-    vld1.u8         {d22, d23}, [lr]!       ;load src data
-
-    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-    mov             r12, #4                 ;loop counter
-
-vp8e_filt_blk2d_sp16x16_loop_neon
-    vld1.u8         {d24, d25}, [lr]!
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
-    vld1.u8         {d26, d27}, [lr]!
-    vmull.u8        q2, d23, d0
-    vld1.u8         {d28, d29}, [lr]!
-    vmull.u8        q3, d24, d0
-    vld1.u8         {d30, d31}, [lr]!
-
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
-    vmlal.u8        q2, d25, d1
-    vmlal.u8        q3, d26, d1
-    vmlal.u8        q4, d27, d1
-    vmlal.u8        q5, d28, d1
-    vmlal.u8        q6, d29, d1
-    vmlal.u8        q7, d30, d1
-    vmlal.u8        q8, d31, d1
-
-    subs            r12, r12, #1
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-    vqrshrn.u16    d4, q3, #7
-    vqrshrn.u16    d5, q4, #7
-    vqrshrn.u16    d6, q5, #7
-    vqrshrn.u16    d7, q6, #7
-    vqrshrn.u16    d8, q7, #7
-    vqrshrn.u16    d9, q8, #7
-
-    vst1.u8         {d2, d3}, [r3]!         ;store result
-    vst1.u8         {d4, d5}, [r3]!
-    vst1.u8         {d6, d7}, [r3]!
-    vmov            q11, q15
-    vst1.u8         {d8, d9}, [r3]!
-
-    bne             vp8e_filt_blk2d_sp16x16_loop_neon
-
-    b               sub_pixel_variance16x16_neon
-
-;--------------------
-firstpass_bfilter16x16_only
-    mov             r2, #4                      ;loop counter
-    sub             sp, sp, #528            ;reserve space on stack for temporary storage
-    vdup.8          d0, d31[0]                  ;first_pass filter (d0 d1)
-    vdup.8          d1, d31[4]
-    mov             r3, sp
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8e_filt_blk2d_fpo16x16_loop_neon
-    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
-    vld1.u8         {d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10}, [r0], r1
-    vld1.u8         {d11, d12, d13}, [r0], r1
-
-    pld             [r0]
-    pld             [r0, r1]
-    pld             [r0, r1, lsl #1]
-
-    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q8, d3, d0
-    vmull.u8        q9, d5, d0
-    vmull.u8        q10, d6, d0
-    vmull.u8        q11, d8, d0
-    vmull.u8        q12, d9, d0
-    vmull.u8        q13, d11, d0
-    vmull.u8        q14, d12, d0
-
-    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d5, d6, #1
-    vext.8          d8, d8, d9, #1
-    vext.8          d11, d11, d12, #1
-
-    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * Filter[1])
-    vmlal.u8        q9, d5, d1
-    vmlal.u8        q11, d8, d1
-    vmlal.u8        q13, d11, d1
-
-    vext.8          d3, d3, d4, #1
-    vext.8          d6, d6, d7, #1
-    vext.8          d9, d9, d10, #1
-    vext.8          d12, d12, d13, #1
-
-    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * Filter[1])
-    vmlal.u8        q10, d6, d1
-    vmlal.u8        q12, d9, d1
-    vmlal.u8        q14, d12, d1
-
-    subs            r2, r2, #1
-
-    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d15, q8, #7
-    vqrshrn.u16    d16, q9, #7
-    vqrshrn.u16    d17, q10, #7
-    vqrshrn.u16    d18, q11, #7
-    vqrshrn.u16    d19, q12, #7
-    vqrshrn.u16    d20, q13, #7
-    vst1.u8         {d14, d15}, [r3]!       ;store result
-    vqrshrn.u16    d21, q14, #7
-
-    vst1.u8         {d16, d17}, [r3]!
-    vst1.u8         {d18, d19}, [r3]!
-    vst1.u8         {d20, d21}, [r3]!
-
-    bne             vp8e_filt_blk2d_fpo16x16_loop_neon
-
-    b               sub_pixel_variance16x16_neon
-
-;---------------------
-secondpass_bfilter16x16_only
-;Second pass: 16x16
-;secondpass_filter
-    sub             sp, sp, #528            ;reserve space on stack for temporary storage
-    add             r3, r12, r3, lsl #3
-    mov             r12, #4                     ;loop counter
-    vld1.u32        {d31}, [r3]                 ;load second_pass filter
-    vld1.u8         {d22, d23}, [r0], r1        ;load src data
-    mov             r3, sp
-
-    vdup.8          d0, d31[0]                  ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-
-vp8e_filt_blk2d_spo16x16_loop_neon
-    vld1.u8         {d24, d25}, [r0], r1
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
-    vld1.u8         {d26, d27}, [r0], r1
-    vmull.u8        q2, d23, d0
-    vld1.u8         {d28, d29}, [r0], r1
-    vmull.u8        q3, d24, d0
-    vld1.u8         {d30, d31}, [r0], r1
-
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
-    vmlal.u8        q2, d25, d1
-    vmlal.u8        q3, d26, d1
-    vmlal.u8        q4, d27, d1
-    vmlal.u8        q5, d28, d1
-    vmlal.u8        q6, d29, d1
-    vmlal.u8        q7, d30, d1
-    vmlal.u8        q8, d31, d1
-
-    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
-    vqrshrn.u16    d3, q2, #7
-    vqrshrn.u16    d4, q3, #7
-    vqrshrn.u16    d5, q4, #7
-    vqrshrn.u16    d6, q5, #7
-    vqrshrn.u16    d7, q6, #7
-    vqrshrn.u16    d8, q7, #7
-    vqrshrn.u16    d9, q8, #7
-
-    vst1.u8         {d2, d3}, [r3]!         ;store result
-    subs            r12, r12, #1
-    vst1.u8         {d4, d5}, [r3]!
-    vmov            q11, q15
-    vst1.u8         {d6, d7}, [r3]!
-    vst1.u8         {d8, d9}, [r3]!
-
-    bne             vp8e_filt_blk2d_spo16x16_loop_neon
-
-    b               sub_pixel_variance16x16_neon
-
-;----------------------------
-;variance16x16
-sub_pixel_variance16x16_neon
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    sub             r3, r3, #256
-    mov             r12, #8
-
-sub_pixel_variance16x16_neon_loop
-    vld1.8          {q0}, [r3]!                 ;Load up source and reference
-    vld1.8          {q2}, [r4], r5
-    vld1.8          {q1}, [r3]!
-    vld1.8          {q3}, [r4], r5
-
-    vsubl.u8        q11, d0, d4                 ;diff
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vpadal.s16      q8, q11                     ;sum
-    vmlal.s16       q9, d22, d22                ;sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             sub_pixel_variance16x16_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [r6]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
-
-    add             sp, sp, #528
-    vmov.32         r0, d0[0]                   ;return
-
-    pop             {r4-r6,pc}
-
-    ENDP
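The epilogue above folds the NEON accumulators down to variance = SSE - sum^2 / N; for a
16x16 block N = 256, hence the vshr.s32 ..., #8 (the 8x8 routine later in this patch
shifts by 6 for its 64 pixels). A minimal scalar C sketch of the same reduction, using
illustrative names that are not part of this source:

    #include <stdint.h>

    /* Sketch of the variance reduction performed by the NEON epilogue:
     * store the accumulated SSE, then return SSE - sum^2 / 256. */
    static unsigned int variance_epilogue_16x16(int sum, unsigned int sse,
                                                unsigned int *sse_out) {
      *sse_out = sse;                                         /* vst1.32 store */
      return sse - (unsigned int)(((int64_t)sum * sum) >> 8); /* vshr #8       */
    }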
-
-;-----------------
-
-_BilinearTaps_coeff_
-    DCD     bilinear_taps_coeff
-bilinear_taps_coeff
-    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-    END
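For reference, bilinear_taps_coeff above packs eight {Filter[0], Filter[1]} pairs, one per
subpel offset, with Filter[0] + Filter[1] = 128; each filter pass computes a weighted
average that is rounded by the 7-bit shift seen in the vqrshrn.u16 ..., #7 instructions.
A hedged scalar C sketch of one tap (illustrative only, not code from this tree):

    /* Same coefficients as the DCD table above, as {Filter[0], Filter[1]} pairs. */
    static const int bilinear_taps[8][2] = {
      {128,   0}, {112,  16}, {96,  32}, {80,  48},
      { 64,  64}, { 48,  80}, {32,  96}, {16, 112}
    };

    /* One bilinear tap: (a*F0 + b*F1 + 64) >> 7, i.e. a rounded weighted mean
     * of two neighboring pixels. */
    static unsigned char bilinear_tap(unsigned char a, unsigned char b, int offset) {
      return (unsigned char)((a * bilinear_taps[offset][0] +
                              b * bilinear_taps[offset][1] + 64) >> 7);
    }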
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+++ /dev/null
@@ -1,572 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_variance_halfpixvar16x16_h_neon|
-    EXPORT  |vp9_variance_halfpixvar16x16_v_neon|
-    EXPORT  |vp9_variance_halfpixvar16x16_hv_neon|
-    EXPORT  |vp9_sub_pixel_variance16x16s_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;================================================
-;unsigned int vp9_variance_halfpixvar16x16_h_neon
-;(
-;    unsigned char  *src_ptr, r0
-;    int  src_pixels_per_line,  r1
-;    unsigned char *dst_ptr,  r2
-;    int dst_pixels_per_line,   r3
-;    unsigned int *sse
-;);
-;================================================
-|vp9_variance_halfpixvar16x16_h_neon| PROC
-    push            {lr}
-
-    mov             r12, #4                  ;loop counter
-    ldr             lr, [sp, #4]           ;load *sse from stack
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8_filt_fpo16x16s_4_0_loop_neon
-    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
-    vld1.8          {q11}, [r2], r3
-    vld1.u8         {d4, d5, d6, d7}, [r0], r1
-    vld1.8          {q12}, [r2], r3
-    vld1.u8         {d8, d9, d10, d11}, [r0], r1
-    vld1.8          {q13}, [r2], r3
-    vld1.u8         {d12, d13, d14, d15}, [r0], r1
-
-    ;pld                [r0]
-    ;pld                [r0, r1]
-    ;pld                [r0, r1, lsl #1]
-
-    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
-    vext.8          q3, q2, q3, #1
-    vext.8          q5, q4, q5, #1
-    vext.8          q7, q6, q7, #1
-
-    vrhadd.u8       q0, q0, q1              ;average with rounding: (src_ptr[0]+src_ptr[1]+1)>>1
-    vld1.8          {q14}, [r2], r3
-    vrhadd.u8       q1, q2, q3
-    vrhadd.u8       q2, q4, q5
-    vrhadd.u8       q3, q6, q7
-
-    vsubl.u8        q4, d0, d22                 ;diff
-    vsubl.u8        q5, d1, d23
-    vsubl.u8        q6, d2, d24
-    vsubl.u8        q7, d3, d25
-    vsubl.u8        q0, d4, d26
-    vsubl.u8        q1, d5, d27
-    vsubl.u8        q2, d6, d28
-    vsubl.u8        q3, d7, d29
-
-    vpadal.s16      q8, q4                     ;sum
-    vmlal.s16       q9, d8, d8                ;sse
-    vmlal.s16       q10, d9, d9
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q5
-    vmlal.s16       q9, d10, d10
-    vmlal.s16       q10, d11, d11
-    vpadal.s16      q8, q6
-    vmlal.s16       q9, d12, d12
-    vmlal.s16       q10, d13, d13
-    vpadal.s16      q8, q7
-    vmlal.s16       q9, d14, d14
-    vmlal.s16       q10, d15, d15
-
-    vpadal.s16      q8, q0                     ;sum
-    vmlal.s16       q9, d0, d0                ;sse
-    vmlal.s16       q10, d1, d1
-    vpadal.s16      q8, q1
-    vmlal.s16       q9, d2, d2
-    vmlal.s16       q10, d3, d3
-    vpadal.s16      q8, q2
-    vmlal.s16       q9, d4, d4
-    vmlal.s16       q10, d5, d5
-    vpadal.s16      q8, q3
-    vmlal.s16       q9, d6, d6
-    vmlal.s16       q10, d7, d7
-
-    bne             vp8_filt_fpo16x16s_4_0_loop_neon
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    pop             {pc}
-    ENDP
-
-;================================================
-;unsigned int vp9_variance_halfpixvar16x16_v_neon
-;(
-;    unsigned char  *src_ptr, r0
-;    int  src_pixels_per_line,  r1
-;    unsigned char *dst_ptr,  r2
-;    int dst_pixels_per_line,   r3
-;    unsigned int *sse
-;);
-;================================================
-|vp9_variance_halfpixvar16x16_v_neon| PROC
-    push            {lr}
-
-    mov             r12, #4                     ;loop counter
-
-    vld1.u8         {q0}, [r0], r1              ;load src data
-    ldr             lr, [sp, #4]                ;load *sse from stack
-
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-vp8_filt_spo16x16s_0_4_loop_neon
-    vld1.u8         {q2}, [r0], r1
-    vld1.8          {q1}, [r2], r3
-    vld1.u8         {q4}, [r0], r1
-    vld1.8          {q3}, [r2], r3
-    vld1.u8         {q6}, [r0], r1
-    vld1.8          {q5}, [r2], r3
-    vld1.u8         {q15}, [r0], r1
-
-    vrhadd.u8       q0, q0, q2
-    vld1.8          {q7}, [r2], r3
-    vrhadd.u8       q2, q2, q4
-    vrhadd.u8       q4, q4, q6
-    vrhadd.u8       q6, q6, q15
-
-    vsubl.u8        q11, d0, d2                 ;diff
-    vsubl.u8        q12, d1, d3
-    vsubl.u8        q13, d4, d6
-    vsubl.u8        q14, d5, d7
-    vsubl.u8        q0, d8, d10
-    vsubl.u8        q1, d9, d11
-    vsubl.u8        q2, d12, d14
-    vsubl.u8        q3, d13, d15
-
-    vpadal.s16      q8, q11                     ;sum
-    vmlal.s16       q9, d22, d22                ;sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    vpadal.s16      q8, q0                     ;sum
-    vmlal.s16       q9, d0, d0                 ;sse
-    vmlal.s16       q10, d1, d1
-    vpadal.s16      q8, q1
-    vmlal.s16       q9, d2, d2
-    vmlal.s16       q10, d3, d3
-    vpadal.s16      q8, q2
-    vmlal.s16       q9, d4, d4
-    vmlal.s16       q10, d5, d5
-
-    vmov            q0, q15
-
-    vpadal.s16      q8, q3
-    vmlal.s16       q9, d6, d6
-    vmlal.s16       q10, d7, d7
-
-    bne             vp8_filt_spo16x16s_0_4_loop_neon
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    pop             {pc}
-    ENDP
-
-;================================================
-;unsigned int vp9_variance_halfpixvar16x16_hv_neon
-;(
-;    unsigned char  *src_ptr, r0
-;    int  src_pixels_per_line,  r1
-;    unsigned char *dst_ptr,  r2
-;    int dst_pixels_per_line,   r3
-;    unsigned int *sse
-;);
-;================================================
-|vp9_variance_halfpixvar16x16_hv_neon| PROC
-    push            {lr}
-
-    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
-
-    ldr             lr, [sp, #4]           ;load *sse from stack
-    vmov.i8         q13, #0                      ;q13 - sum
-    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
-
-    vmov.i8         q14, #0                      ;q14, q15 - sse
-    vmov.i8         q15, #0
-
-    mov             r12, #4                  ;loop counter
-    vrhadd.u8       q0, q0, q1              ;average with rounding: (src_ptr[0]+src_ptr[1]+1)>>1
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8_filt16x16s_4_4_loop_neon
-    vld1.u8         {d4, d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10, d11}, [r0], r1
-    vld1.u8         {d12, d13, d14, d15}, [r0], r1
-    vld1.u8         {d16, d17, d18, d19}, [r0], r1
-
-    ;pld                [r0]
-    ;pld                [r0, r1]
-    ;pld                [r0, r1, lsl #1]
-
-    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
-    vext.8          q5, q4, q5, #1
-    vext.8          q7, q6, q7, #1
-    vext.8          q9, q8, q9, #1
-
-    vrhadd.u8       q1, q2, q3              ;average with rounding: (src_ptr[0]+src_ptr[1]+1)>>1
-    vrhadd.u8       q2, q4, q5
-    vrhadd.u8       q3, q6, q7
-    vrhadd.u8       q4, q8, q9
-
-    vld1.8          {q5}, [r2], r3
-    vrhadd.u8       q0, q0, q1
-    vld1.8          {q6}, [r2], r3
-    vrhadd.u8       q1, q1, q2
-    vld1.8          {q7}, [r2], r3
-    vrhadd.u8       q2, q2, q3
-    vld1.8          {q8}, [r2], r3
-    vrhadd.u8       q3, q3, q4
-
-    vsubl.u8        q9, d0, d10                 ;diff
-    vsubl.u8        q10, d1, d11
-    vsubl.u8        q11, d2, d12
-    vsubl.u8        q12, d3, d13
-
-    vsubl.u8        q0, d4, d14                 ;diff
-    vsubl.u8        q1, d5, d15
-    vsubl.u8        q5, d6, d16
-    vsubl.u8        q6, d7, d17
-
-    vpadal.s16      q13, q9                     ;sum
-    vmlal.s16       q14, d18, d18                ;sse
-    vmlal.s16       q15, d19, d19
-
-    vpadal.s16      q13, q10                     ;sum
-    vmlal.s16       q14, d20, d20                ;sse
-    vmlal.s16       q15, d21, d21
-
-    vpadal.s16      q13, q11                     ;sum
-    vmlal.s16       q14, d22, d22                ;sse
-    vmlal.s16       q15, d23, d23
-
-    vpadal.s16      q13, q12                     ;sum
-    vmlal.s16       q14, d24, d24                ;sse
-    vmlal.s16       q15, d25, d25
-
-    subs            r12, r12, #1
-
-    vpadal.s16      q13, q0                     ;sum
-    vmlal.s16       q14, d0, d0                ;sse
-    vmlal.s16       q15, d1, d1
-
-    vpadal.s16      q13, q1                     ;sum
-    vmlal.s16       q14, d2, d2                ;sse
-    vmlal.s16       q15, d3, d3
-
-    vpadal.s16      q13, q5                     ;sum
-    vmlal.s16       q14, d10, d10                ;sse
-    vmlal.s16       q15, d11, d11
-
-    vmov            q0, q4
-
-    vpadal.s16      q13, q6                     ;sum
-    vmlal.s16       q14, d12, d12                ;sse
-    vmlal.s16       q15, d13, d13
-
-    bne             vp8_filt16x16s_4_4_loop_neon
-
-    vadd.u32        q15, q14, q15                ;accumulate sse
-    vpaddl.s32      q0, q13                      ;accumulate sum
-
-    vpaddl.u32      q1, q15
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    pop             {pc}
-    ENDP
-
-;==============================
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack unsigned char *dst_ptr,
-; stack int dst_pixels_per_line,
-; stack unsigned int *sse
-;note: used by vp8_find_best_half_pixel_step() (called when 8<Speed<15) and by the first
-;call of vp8_find_best_sub_pixel_step() (called when speed<=8). In those cases xoffset and
-;yoffset can only be 4 or 0, so the filter is either bypassed or its coefficients are
-;{64, 64}; this simplified routine only works under that restriction.
-;note: both xoffset and yoffset being zero does happen; that case can be handled in c code later.
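Because the coefficients are restricted to {64, 64}, the two-tap filter collapses to a
rounded average, which is why the code below uses vrhadd.u8 in place of the
multiply-accumulate sequence. A one-line C sketch of the equivalence (illustrative only):

    /* (a*64 + b*64 + 64) >> 7  ==  (a + b + 1) >> 1, which is what vrhadd.u8
     * computes per byte lane. */
    static unsigned char half_pel_avg(unsigned char a, unsigned char b) {
      return (unsigned char)((a + b + 1) >> 1);
    }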
-
-|vp9_sub_pixel_variance16x16s_neon| PROC
-    push            {r4, lr}
-
-    ldr             r4, [sp, #8]            ;load *dst_ptr from stack
-    ldr             r12, [sp, #12]          ;load dst_pixels_per_line from stack
-    ldr             lr, [sp, #16]           ;load *sse from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             secondpass_bfilter16x16s_only
-
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-    beq             firstpass_bfilter16x16s_only
-
-    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
-    sub             sp, sp, #256            ;reserve space on stack for temporary storage
-    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
-    mov             r3, sp
-    mov             r2, #4                  ;loop counter
-    vrhadd.u8       q0, q0, q1              ;average with rounding: (src_ptr[0]+src_ptr[1]+1)>>1
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8e_filt_blk2d_fp16x16s_loop_neon
-    vld1.u8         {d4, d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10, d11}, [r0], r1
-    vld1.u8         {d12, d13, d14, d15}, [r0], r1
-    vld1.u8         {d16, d17, d18, d19}, [r0], r1
-
-    ;pld                [r0]
-    ;pld                [r0, r1]
-    ;pld                [r0, r1, lsl #1]
-
-    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
-    vext.8          q5, q4, q5, #1
-    vext.8          q7, q6, q7, #1
-    vext.8          q9, q8, q9, #1
-
-    vrhadd.u8       q1, q2, q3              ;average with rounding: (src_ptr[0]+src_ptr[1]+1)>>1
-    vrhadd.u8       q2, q4, q5
-    vrhadd.u8       q3, q6, q7
-    vrhadd.u8       q4, q8, q9
-
-    vrhadd.u8       q0, q0, q1
-    vrhadd.u8       q1, q1, q2
-    vrhadd.u8       q2, q2, q3
-    vrhadd.u8       q3, q3, q4
-
-    subs            r2, r2, #1
-    vst1.u8         {d0, d1 ,d2, d3}, [r3]!         ;store result
-    vmov            q0, q4
-    vst1.u8         {d4, d5, d6, d7}, [r3]!
-
-    bne             vp8e_filt_blk2d_fp16x16s_loop_neon
-
-    b               sub_pixel_variance16x16s_neon
-
-;--------------------
-firstpass_bfilter16x16s_only
-    mov             r2, #2                  ;loop counter
-    sub             sp, sp, #256            ;reserve space on stack for temporary storage
-    mov             r3, sp
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8e_filt_blk2d_fpo16x16s_loop_neon
-    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
-    vld1.u8         {d4, d5, d6, d7}, [r0], r1
-    vld1.u8         {d8, d9, d10, d11}, [r0], r1
-    vld1.u8         {d12, d13, d14, d15}, [r0], r1
-
-    ;pld                [r0]
-    ;pld                [r0, r1]
-    ;pld                [r0, r1, lsl #1]
-
-    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
-    vld1.u8         {d16, d17, d18, d19}, [r0], r1
-    vext.8          q3, q2, q3, #1
-    vld1.u8         {d20, d21, d22, d23}, [r0], r1
-    vext.8          q5, q4, q5, #1
-    vld1.u8         {d24, d25, d26, d27}, [r0], r1
-    vext.8          q7, q6, q7, #1
-    vld1.u8         {d28, d29, d30, d31}, [r0], r1
-    vext.8          q9, q8, q9, #1
-    vext.8          q11, q10, q11, #1
-    vext.8          q13, q12, q13, #1
-    vext.8          q15, q14, q15, #1
-
-    vrhadd.u8       q0, q0, q1              ;average with rounding: (src_ptr[0]+src_ptr[1]+1)>>1
-    vrhadd.u8       q1, q2, q3
-    vrhadd.u8       q2, q4, q5
-    vrhadd.u8       q3, q6, q7
-    vrhadd.u8       q4, q8, q9
-    vrhadd.u8       q5, q10, q11
-    vrhadd.u8       q6, q12, q13
-    vrhadd.u8       q7, q14, q15
-
-    subs            r2, r2, #1
-
-    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
-    vst1.u8         {d4, d5, d6, d7}, [r3]!
-    vst1.u8         {d8, d9, d10, d11}, [r3]!
-    vst1.u8         {d12, d13, d14, d15}, [r3]!
-
-    bne             vp8e_filt_blk2d_fpo16x16s_loop_neon
-
-    b               sub_pixel_variance16x16s_neon
-
-;---------------------
-secondpass_bfilter16x16s_only
-    sub             sp, sp, #256            ;reserve space on stack for temporary storage
-
-    mov             r2, #2                  ;loop counter
-    vld1.u8         {d0, d1}, [r0], r1      ;load src data
-    mov             r3, sp
-
-vp8e_filt_blk2d_spo16x16s_loop_neon
-    vld1.u8         {d2, d3}, [r0], r1
-    vld1.u8         {d4, d5}, [r0], r1
-    vld1.u8         {d6, d7}, [r0], r1
-    vld1.u8         {d8, d9}, [r0], r1
-
-    vrhadd.u8       q0, q0, q1
-    vld1.u8         {d10, d11}, [r0], r1
-    vrhadd.u8       q1, q1, q2
-    vld1.u8         {d12, d13}, [r0], r1
-    vrhadd.u8       q2, q2, q3
-    vld1.u8         {d14, d15}, [r0], r1
-    vrhadd.u8       q3, q3, q4
-    vld1.u8         {d16, d17}, [r0], r1
-    vrhadd.u8       q4, q4, q5
-    vrhadd.u8       q5, q5, q6
-    vrhadd.u8       q6, q6, q7
-    vrhadd.u8       q7, q7, q8
-
-    subs            r2, r2, #1
-
-    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
-    vmov            q0, q8
-    vst1.u8         {d4, d5, d6, d7}, [r3]!
-    vst1.u8         {d8, d9, d10, d11}, [r3]!           ;store result
-    vst1.u8         {d12, d13, d14, d15}, [r3]!
-
-    bne             vp8e_filt_blk2d_spo16x16s_loop_neon
-
-    b               sub_pixel_variance16x16s_neon
-
-;----------------------------
-;variance16x16
-sub_pixel_variance16x16s_neon
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    sub             r3, r3, #256
-    mov             r2, #4
-
-sub_pixel_variance16x16s_neon_loop
-    vld1.8          {q0}, [r3]!                 ;Load up source and reference
-    vld1.8          {q1}, [r4], r12
-    vld1.8          {q2}, [r3]!
-    vld1.8          {q3}, [r4], r12
-    vld1.8          {q4}, [r3]!
-    vld1.8          {q5}, [r4], r12
-    vld1.8          {q6}, [r3]!
-    vld1.8          {q7}, [r4], r12
-
-    vsubl.u8        q11, d0, d2                 ;diff
-    vsubl.u8        q12, d1, d3
-    vsubl.u8        q13, d4, d6
-    vsubl.u8        q14, d5, d7
-    vsubl.u8        q0, d8, d10
-    vsubl.u8        q1, d9, d11
-    vsubl.u8        q2, d12, d14
-    vsubl.u8        q3, d13, d15
-
-    vpadal.s16      q8, q11                     ;sum
-    vmlal.s16       q9, d22, d22                ;sse
-    vmlal.s16       q10, d23, d23
-
-    subs            r2, r2, #1
-
-    vpadal.s16      q8, q12
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vpadal.s16      q8, q13
-    vmlal.s16       q9, d26, d26
-    vmlal.s16       q10, d27, d27
-    vpadal.s16      q8, q14
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    vpadal.s16      q8, q0                     ;sum
-    vmlal.s16       q9, d0, d0                ;sse
-    vmlal.s16       q10, d1, d1
-    vpadal.s16      q8, q1
-    vmlal.s16       q9, d2, d2
-    vmlal.s16       q10, d3, d3
-    vpadal.s16      q8, q2
-    vmlal.s16       q9, d4, d4
-    vmlal.s16       q10, d5, d5
-    vpadal.s16      q8, q3
-    vmlal.s16       q9, d6, d6
-    vmlal.s16       q10, d7, d7
-
-    bne             sub_pixel_variance16x16s_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #8
-    vsub.s32        d0, d1, d10
-
-    add             sp, sp, #256
-    vmov.32         r0, d0[0]                   ;return
-
-    pop             {r4, pc}
-    ENDP
-
-    END
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
+++ /dev/null
@@ -1,224 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp9_sub_pixel_variance8x8_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    unsigned char  *src_ptr,
-; r1    int  src_pixels_per_line,
-; r2    int  xoffset,
-; r3    int  yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pixels_per_line,
-; stack(r6) unsigned int *sse
-;note: most of the code is copied from bilinear_predict8x8_neon and vp9_variance8x8_neon.
-
-|vp9_sub_pixel_variance8x8_neon| PROC
-    push            {r4-r5, lr}
-
-    ldr             r12, _BilinearTaps_coeff_
-    ldr             r4, [sp, #12]           ;load *dst_ptr from stack
-    ldr             r5, [sp, #16]           ;load dst_pixels_per_line from stack
-    ldr             lr, [sp, #20]           ;load *sse from stack
-
-    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
-    beq             skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (9x8)
-    add             r2, r12, r2, lsl #3     ;calculate filter location
-
-    vld1.u8         {q1}, [r0], r1          ;load src data
-    vld1.u32        {d31}, [r2]             ;load first_pass filter
-    vld1.u8         {q2}, [r0], r1
-    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
-    vld1.u8         {q3}, [r0], r1
-    vdup.8          d1, d31[4]
-    vld1.u8         {q4}, [r0], r1
-
-    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q7, d4, d0
-    vmull.u8        q8, d6, d0
-    vmull.u8        q9, d8, d0
-
-    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d4, d5, #1
-    vext.8          d7, d6, d7, #1
-    vext.8          d9, d8, d9, #1
-
-    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
-    vmlal.u8        q7, d5, d1
-    vmlal.u8        q8, d7, d1
-    vmlal.u8        q9, d9, d1
-
-    vld1.u8         {q1}, [r0], r1          ;load src data
-    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
-    vld1.u8         {q2}, [r0], r1
-    vqrshrn.u16    d23, q7, #7
-    vld1.u8         {q3}, [r0], r1
-    vqrshrn.u16    d24, q8, #7
-    vld1.u8         {q4}, [r0], r1
-    vqrshrn.u16    d25, q9, #7
-
-    ;first_pass filtering on the remaining 5 lines of data
-    vld1.u8         {q5}, [r0], r1
-
-    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
-    vmull.u8        q7, d4, d0
-    vmull.u8        q8, d6, d0
-    vmull.u8        q9, d8, d0
-    vmull.u8        q10, d10, d0
-
-    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
-    vext.8          d5, d4, d5, #1
-    vext.8          d7, d6, d7, #1
-    vext.8          d9, d8, d9, #1
-    vext.8          d11, d10, d11, #1
-
-    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
-    vmlal.u8        q7, d5, d1
-    vmlal.u8        q8, d7, d1
-    vmlal.u8        q9, d9, d1
-    vmlal.u8        q10, d11, d1
-
-    vqrshrn.u16    d26, q6, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d27, q7, #7
-    vqrshrn.u16    d28, q8, #7
-    vqrshrn.u16    d29, q9, #7
-    vqrshrn.u16    d30, q10, #7
-
-;Second pass: 8x8
-secondpass_filter
-    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
-    ;skip second pass: branch straight to the variance computation
-    beq             sub_pixel_variance8x8_neon
-
-    add             r3, r12, r3, lsl #3
-
-    vld1.u32        {d31}, [r3]             ;load second_pass filter
-
-    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
-    vdup.8          d1, d31[4]
-
-    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
-    vmull.u8        q2, d23, d0
-    vmull.u8        q3, d24, d0
-    vmull.u8        q4, d25, d0
-    vmull.u8        q5, d26, d0
-    vmull.u8        q6, d27, d0
-    vmull.u8        q7, d28, d0
-    vmull.u8        q8, d29, d0
-
-    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * Filter[1])
-    vmlal.u8        q2, d24, d1
-    vmlal.u8        q3, d25, d1
-    vmlal.u8        q4, d26, d1
-    vmlal.u8        q5, d27, d1
-    vmlal.u8        q6, d28, d1
-    vmlal.u8        q7, d29, d1
-    vmlal.u8        q8, d30, d1
-
-    vqrshrn.u16    d22, q1, #7              ;shift/round/saturate to u8
-    vqrshrn.u16    d23, q2, #7
-    vqrshrn.u16    d24, q3, #7
-    vqrshrn.u16    d25, q4, #7
-    vqrshrn.u16    d26, q5, #7
-    vqrshrn.u16    d27, q6, #7
-    vqrshrn.u16    d28, q7, #7
-    vqrshrn.u16    d29, q8, #7
-
-    b               sub_pixel_variance8x8_neon
-
-;--------------------
-skip_firstpass_filter
-    vld1.u8         {d22}, [r0], r1         ;load src data
-    vld1.u8         {d23}, [r0], r1
-    vld1.u8         {d24}, [r0], r1
-    vld1.u8         {d25}, [r0], r1
-    vld1.u8         {d26}, [r0], r1
-    vld1.u8         {d27}, [r0], r1
-    vld1.u8         {d28}, [r0], r1
-    vld1.u8         {d29}, [r0], r1
-    vld1.u8         {d30}, [r0], r1
-
-    b               secondpass_filter
-
-;----------------------
-;vp9_variance8x8_neon
-sub_pixel_variance8x8_neon
-    vmov.i8         q8, #0                      ;q8 - sum
-    vmov.i8         q9, #0                      ;q9, q10 - sse
-    vmov.i8         q10, #0
-
-    mov             r12, #2
-
-sub_pixel_variance8x8_neon_loop
-    vld1.8          {d0}, [r4], r5              ;load dst data
-    subs            r12, r12, #1
-    vld1.8          {d1}, [r4], r5
-    vld1.8          {d2}, [r4], r5
-    vsubl.u8        q4, d22, d0                 ;calculate diff
-    vld1.8          {d3}, [r4], r5
-
-    vsubl.u8        q5, d23, d1
-    vsubl.u8        q6, d24, d2
-
-    vpadal.s16      q8, q4                      ;sum
-    vmlal.s16       q9, d8, d8                  ;sse
-    vmlal.s16       q10, d9, d9
-
-    vsubl.u8        q7, d25, d3
-
-    vpadal.s16      q8, q5
-    vmlal.s16       q9, d10, d10
-    vmlal.s16       q10, d11, d11
-
-    vmov            q11, q13
-
-    vpadal.s16      q8, q6
-    vmlal.s16       q9, d12, d12
-    vmlal.s16       q10, d13, d13
-
-    vmov            q12, q14
-
-    vpadal.s16      q8, q7
-    vmlal.s16       q9, d14, d14
-    vmlal.s16       q10, d15, d15
-
-    bne             sub_pixel_variance8x8_neon_loop
-
-    vadd.u32        q10, q9, q10                ;accumulate sse
-    vpaddl.s32      q0, q8                      ;accumulate sum
-
-    vpaddl.u32      q1, q10
-    vadd.s64        d0, d0, d1
-    vadd.u64        d1, d2, d3
-
-    vmull.s32       q5, d0, d0
-    vst1.32         {d1[0]}, [lr]               ;store sse
-    vshr.s32        d10, d10, #6
-    vsub.s32        d0, d1, d10
-
-    vmov.32         r0, d0[0]                   ;return
-    pop             {r4-r5, pc}
-
-    ENDP
-
-;-----------------
-
-_BilinearTaps_coeff_
-    DCD     bilinear_taps_coeff
-bilinear_taps_coeff
-    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-    END
--- a/vp8/encoder/arm/quantize_arm.c
+++ /dev/null
@@ -1,59 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <math.h>
-#include "vpx_mem/vpx_mem.h"
-
-#include "vp8/encoder/quantize.h"
-#include "vp8/common/entropy.h"
-
-
-#if HAVE_ARMV7
-
-/* The vp8_quantize_mbX functions here differ from the corresponding ones in
- * quantize.c only in that they use the quantize_b_pair function pointer
- * instead of the regular quantize_b function pointer. */
-void vp8_quantize_mby_neon(MACROBLOCK *x) {
-  int i;
-  int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
-                       && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
-
-  for (i = 0; i < 16; i += 2)
-    x->quantize_b_pair(&x->block[i], &x->block[i + 1],
-                       &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
-
-  if (has_2nd_order)
-    x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
-}
-
-void vp8_quantize_mb_neon(MACROBLOCK *x) {
-  int i;
-  int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
-                       && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
-
-  for (i = 0; i < 24; i += 2)
-    x->quantize_b_pair(&x->block[i], &x->block[i + 1],
-                       &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
-
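-  /* i == 24 after the loop, so this quantizes the second-order (Y2) block */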
-  if (has_2nd_order)
-    x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
-}
-
-
-void vp8_quantize_mbuv_neon(MACROBLOCK *x) {
-  int i;
-
-  for (i = 16; i < 24; i += 2)
-    x->quantize_b_pair(&x->block[i], &x->block[i + 1],
-                       &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
-}
-
-#endif /* HAVE_ARMV7 */
--- a/vp8/encoder/arm/quantize_arm.h
+++ /dev/null
@@ -1,52 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef QUANTIZE_ARM_H
-#define QUANTIZE_ARM_H
-
-#if HAVE_ARMV6
-
-extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_quantize_fastquantb
-#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6
-#endif
-
-#endif /* HAVE_ARMV6 */
-
-
-#if HAVE_ARMV7
-
-extern prototype_quantize_block(vp8_fast_quantize_b_neon);
-extern prototype_quantize_block_pair(vp8_fast_quantize_b_pair_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp8_quantize_fastquantb
-#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon
-
-#undef  vp8_quantize_fastquantb_pair
-#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_neon
-
-#undef vp8_quantize_mb
-#define vp8_quantize_mb vp8_quantize_mb_neon
-
-#undef vp8_quantize_mbuv
-#define vp8_quantize_mbuv vp8_quantize_mbuv_neon
-
-#undef vp8_quantize_mby
-#define vp8_quantize_mby vp8_quantize_mby_neon
-#endif
-
-#endif /* HAVE_ARMV7 */
-
-#endif
-
--- a/vp8/encoder/arm/variance_arm.c
+++ /dev/null
@@ -1,112 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/common/filter.h"
-#include "vp8/common/arm/bilinearfilter_arm.h"
-
-#define HALFNDX 8
-
-#if HAVE_ARMV6
-
-unsigned int vp9_sub_pixel_variance8x8_armv6
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  unsigned short first_pass[10 * 8];
-  unsigned char  second_pass[8 * 8];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp8_bilinear_filters[xoffset];
-  VFilter = vp8_bilinear_filters[yoffset];
-
-  vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
-                                          src_pixels_per_line,
-                                          9, 8, HFilter);
-  vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
-                                           8, 8, 8, VFilter);
-
-  return vp9_variance8x8_armv6(second_pass, 8, dst_ptr,
-                               dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance16x16_armv6
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  unsigned short first_pass[36 * 16];
-  unsigned char  second_pass[20 * 16];
-  const short *HFilter, *VFilter;
-  unsigned int var;
-
-  if (xoffset == HALFNDX && yoffset == 0) {
-    var = vp9_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
-                                               dst_ptr, dst_pixels_per_line, sse);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    var = vp9_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
-                                               dst_ptr, dst_pixels_per_line, sse);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    var = vp9_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
-                                                dst_ptr, dst_pixels_per_line, sse);
-  } else {
-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
-
-    vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
-                                            src_pixels_per_line,
-                                            17, 16, HFilter);
-    vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
-                                             16, 16, 16, VFilter);
-
-    var = vp9_variance16x16_armv6(second_pass, 16, dst_ptr,
-                                  dst_pixels_per_line, sse);
-  }
-  return var;
-}
-
-#endif /* HAVE_ARMV6 */
-
-
-#if HAVE_ARMV7
-
-unsigned int vp9_sub_pixel_variance16x16_neon
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  if (xoffset == HALFNDX && yoffset == 0)
-    return vp9_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
-  else if (xoffset == 0 && yoffset == HALFNDX)
-    return vp9_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
-  else if (xoffset == HALFNDX && yoffset == HALFNDX)
-    return vp9_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
-  else
-    return vp9_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
-}
-
-#endif
--- a/vp8/encoder/arm/variance_arm.h
+++ /dev/null
@@ -1,132 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VARIANCE_ARM_H
-#define VARIANCE_ARM_H
-
-#if HAVE_ARMV6
-
-extern prototype_sad(vp9_sad16x16_armv6);
-extern prototype_variance(vp9_variance16x16_armv6);
-extern prototype_variance(vp9_variance8x8_armv6);
-extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_armv6);
-extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_armv6);
-extern prototype_variance(vp9_variance_halfpixvar16x16_h_armv6);
-extern prototype_variance(vp9_variance_halfpixvar16x16_v_armv6);
-extern prototype_variance(vp9_variance_halfpixvar16x16_hv_armv6);
-extern prototype_variance(vp9_mse16x16_armv6);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef  vp9_variance_sad16x16
-#define vp9_variance_sad16x16 vp9_sad16x16_armv6
-
-#undef  vp9_variance_subpixvar16x16
-#define vp9_variance_subpixvar16x16 vp9_sub_pixel_variance16x16_armv6
-
-#undef  vp9_variance_subpixvar8x8
-#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_armv6
-
-#undef  vp9_variance_var16x16
-#define vp9_variance_var16x16 vp9_variance16x16_armv6
-
-#undef  vp9_variance_mse16x16
-#define vp9_variance_mse16x16 vp9_mse16x16_armv6
-
-#undef  vp9_variance_var8x8
-#define vp9_variance_var8x8 vp9_variance8x8_armv6
-
-#undef  vp9_variance_halfpixvar16x16_h
-#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_armv6
-
-#undef  vp9_variance_halfpixvar16x16_v
-#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_armv6
-
-#undef  vp9_variance_halfpixvar16x16_hv
-#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_armv6
-
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_ARMV6 */
-
-
-#if HAVE_ARMV7
-extern prototype_sad(vp9_sad4x4_neon);
-extern prototype_sad(vp9_sad8x8_neon);
-extern prototype_sad(vp9_sad8x16_neon);
-extern prototype_sad(vp9_sad16x8_neon);
-extern prototype_sad(vp9_sad16x16_neon);
-
-extern prototype_variance(vp9_variance8x8_neon);
-extern prototype_variance(vp9_variance8x16_neon);
-extern prototype_variance(vp9_variance16x8_neon);
-extern prototype_variance(vp9_variance16x16_neon);
-
-extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_neon);
-extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon);
-extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon_func);
-extern prototype_variance(vp9_variance_halfpixvar16x16_h_neon);
-extern prototype_variance(vp9_variance_halfpixvar16x16_v_neon);
-extern prototype_variance(vp9_variance_halfpixvar16x16_hv_neon);
-
-extern prototype_variance(vp9_mse16x16_neon);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef  vp9_variance_sad4x4
-#define vp9_variance_sad4x4 vp9_sad4x4_neon
-
-#undef  vp9_variance_sad8x8
-#define vp9_variance_sad8x8 vp9_sad8x8_neon
-
-#undef  vp9_variance_sad8x16
-#define vp9_variance_sad8x16 vp9_sad8x16_neon
-
-#undef  vp9_variance_sad16x8
-#define vp9_variance_sad16x8 vp9_sad16x8_neon
-
-#undef  vp9_variance_sad16x16
-#define vp9_variance_sad16x16 vp9_sad16x16_neon
-
-#undef  vp9_variance_var8x8
-#define vp9_variance_var8x8 vp9_variance8x8_neon
-
-#undef  vp9_variance_var8x16
-#define vp9_variance_var8x16 vp9_variance8x16_neon
-
-#undef  vp9_variance_var16x8
-#define vp9_variance_var16x8 vp9_variance16x8_neon
-
-#undef  vp9_variance_var16x16
-#define vp9_variance_var16x16 vp9_variance16x16_neon
-
-#undef  vp9_variance_subpixvar8x8
-#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_neon
-
-#undef  vp9_variance_subpixvar16x16
-#define vp9_variance_subpixvar16x16 vp9_sub_pixel_variance16x16_neon
-
-#undef  vp9_variance_halfpixvar16x16_h
-#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_neon
-
-#undef  vp9_variance_halfpixvar16x16_v
-#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_neon
-
-#undef  vp9_variance_halfpixvar16x16_hv
-#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_neon
-
-#undef  vp9_variance_mse16x16
-#define vp9_variance_mse16x16 vp9_mse16x16_neon
-
-#endif
-
-#endif
-
-#endif
--- a/vp8/encoder/asm_enc_offsets.c
+++ /dev/null
@@ -1,90 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/asm_offsets.h"
-#include "vpx_config.h"
-#include "block.h"
-#include "vp8/common/blockd.h"
-#include "onyx_int.h"
-#include "treewriter.h"
-#include "tokenize.h"
-
-BEGIN
-
-/* regular quantize */
-DEFINE(vp9_block_coeff,                         offsetof(BLOCK, coeff));
-DEFINE(vp9_block_zbin,                          offsetof(BLOCK, zbin));
-DEFINE(vp9_block_round,                         offsetof(BLOCK, round));
-DEFINE(vp9_block_quant,                         offsetof(BLOCK, quant));
-DEFINE(vp9_block_quant_fast,                    offsetof(BLOCK, quant_fast));
-DEFINE(vp9_block_zbin_extra,                    offsetof(BLOCK, zbin_extra));
-DEFINE(vp9_block_zrun_zbin_boost,               offsetof(BLOCK, zrun_zbin_boost));
-DEFINE(vp9_block_quant_shift,                   offsetof(BLOCK, quant_shift));
-
-DEFINE(vp9_blockd_qcoeff,                       offsetof(BLOCKD, qcoeff));
-DEFINE(vp9_blockd_dequant,                      offsetof(BLOCKD, dequant));
-DEFINE(vp9_blockd_dqcoeff,                      offsetof(BLOCKD, dqcoeff));
-DEFINE(vp9_blockd_eob,                          offsetof(BLOCKD, eob));
-
-/* subtract */
-DEFINE(vp9_block_base_src,                      offsetof(BLOCK, base_src));
-DEFINE(vp9_block_src,                           offsetof(BLOCK, src));
-DEFINE(vp9_block_src_diff,                      offsetof(BLOCK, src_diff));
-DEFINE(vp9_block_src_stride,                    offsetof(BLOCK, src_stride));
-
-DEFINE(vp9_blockd_predictor,                    offsetof(BLOCKD, predictor));
-
-/* pack tokens */
-DEFINE(vp9_writer_lowvalue,                     offsetof(vp9_writer, lowvalue));
-DEFINE(vp9_writer_range,                        offsetof(vp9_writer, range));
-DEFINE(vp9_writer_value,                        offsetof(vp9_writer, value));
-DEFINE(vp9_writer_count,                        offsetof(vp9_writer, count));
-DEFINE(vp9_writer_pos,                          offsetof(vp9_writer, pos));
-DEFINE(vp9_writer_buffer,                       offsetof(vp9_writer, buffer));
-
-DEFINE(tokenextra_token,                        offsetof(TOKENEXTRA, Token));
-DEFINE(tokenextra_extra,                        offsetof(TOKENEXTRA, Extra));
-DEFINE(tokenextra_context_tree,                 offsetof(TOKENEXTRA, context_tree));
-DEFINE(tokenextra_skip_eob_node,                offsetof(TOKENEXTRA, skip_eob_node));
-DEFINE(TOKENEXTRA_SZ,                           sizeof(TOKENEXTRA));
-
-DEFINE(vp9_extra_bit_struct_sz,                 sizeof(vp9_extra_bit_struct));
-
-DEFINE(vp9_token_value,                         offsetof(vp9_token, value));
-DEFINE(vp9_token_len,                           offsetof(vp9_token, Len));
-
-DEFINE(vp9_extra_bit_struct_tree,               offsetof(vp9_extra_bit_struct, tree));
-DEFINE(vp9_extra_bit_struct_prob,               offsetof(vp9_extra_bit_struct, prob));
-DEFINE(vp9_extra_bit_struct_len,                offsetof(vp9_extra_bit_struct, Len));
-DEFINE(vp9_extra_bit_struct_base_val,           offsetof(vp9_extra_bit_struct, base_val));
-
-DEFINE(vp9_comp_tplist,                         offsetof(VP9_COMP, tplist));
-DEFINE(vp9_comp_common,                         offsetof(VP9_COMP, common));
-
-DEFINE(tokenlist_start,                         offsetof(TOKENLIST, start));
-DEFINE(tokenlist_stop,                          offsetof(TOKENLIST, stop));
-DEFINE(TOKENLIST_SZ,                            sizeof(TOKENLIST));
-
-DEFINE(vp9_common_mb_rows,                      offsetof(VP9_COMMON, mb_rows));
-
-END
-
-/* Add asserts for any offset or size that is not supported by the assembly
- * code.  These are used in vp8cx_pack_tokens; they are hard coded, so if the
- * sizes change they will have to be adjusted.
- */
-
-#if HAVE_ARMV5TE
-ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
-ct_assert(vp9_extra_bit_struct_sz, sizeof(vp9_extra_bit_struct) == 16)
-#endif
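ct_assert provides a compile-time check that the hard-coded sizes still match the structs.
The actual macro lives in vpx_ports/asm_offsets.h; a typical idiom it could be built on
(shown purely as a sketch, not the project's definition) is:

    /* Compile-time assert sketch: when cond is false, both case labels
     * evaluate to 0 and the duplicate label fails to compile. */
    #define CT_ASSERT_SKETCH(name, cond) \
      static void assert_##name(void) { switch (0) { case 0: case !!(cond):; } }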
--- a/vp8/encoder/bitstream.c
+++ /dev/null
@@ -1,2394 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/header.h"
-#include "encodemv.h"
-#include "vp8/common/entropymode.h"
-#include "vp8/common/findnearmv.h"
-#include "mcomp.h"
-#include "vp8/common/systemdependent.h"
-#include <assert.h>
-#include <stdio.h>
-#include <limits.h>
-#include "vp8/common/pragmas.h"
-#include "vpx/vpx_encoder.h"
-#include "vpx_mem/vpx_mem.h"
-#include "bitstream.h"
-#include "segmentation.h"
-
-#include "vp8/common/seg_common.h"
-#include "vp8/common/pred_common.h"
-#include "vp8/common/entropy.h"
-#include "vp8/encoder/encodemv.h"
-#include "vp8/common/entropymv.h"
-
-#if CONFIG_NEWBESTREFMV
-#include "vp8/common/mvref_common.h"
-#endif
-
-#if defined(SECTIONBITS_OUTPUT)
-unsigned __int64 Sectionbits[500];
-#endif
-
-#ifdef ENTROPY_STATS
-int intra_mode_stats [VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES];
-unsigned int tree_update_hist [BLOCK_TYPES]
-                              [COEF_BANDS]
-                              [PREV_COEF_CONTEXTS]
-                              [ENTROPY_NODES][2];
-unsigned int hybrid_tree_update_hist [BLOCK_TYPES]
-                                     [COEF_BANDS]
-                                     [PREV_COEF_CONTEXTS]
-                                     [ENTROPY_NODES][2];
-unsigned int tree_update_hist_8x8 [BLOCK_TYPES_8X8]
-                                  [COEF_BANDS]
-                                  [PREV_COEF_CONTEXTS]
-                                  [ENTROPY_NODES] [2];
-unsigned int hybrid_tree_update_hist_8x8 [BLOCK_TYPES_8X8]
-                                         [COEF_BANDS]
-                                         [PREV_COEF_CONTEXTS]
-                                         [ENTROPY_NODES] [2];
-unsigned int tree_update_hist_16x16 [BLOCK_TYPES_16X16]
-                                    [COEF_BANDS]
-                                    [PREV_COEF_CONTEXTS]
-                                    [ENTROPY_NODES] [2];
-unsigned int hybrid_tree_update_hist_16x16 [BLOCK_TYPES_16X16]
-                                           [COEF_BANDS]
-                                           [PREV_COEF_CONTEXTS]
-                                           [ENTROPY_NODES] [2];
-
-extern unsigned int active_section;
-#endif
-
-#ifdef MODE_STATS
-int count_mb_seg[4] = { 0, 0, 0, 0 };
-#endif
-
-#define vp9_cost_upd  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8)
-#define vp9_cost_upd256  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
-
-#define SEARCH_NEWP
-static int update_bits[255];
-
-static void compute_update_table() {
-  int i;
-  for (i = 0; i < 255; i++)
-    update_bits[i] = vp9_count_term_subexp(i, SUBEXP_PARAM, 255);
-}
-
-static int split_index(int i, int n, int modulus) {
-  int max1 = (n - 1 - modulus / 2) / modulus + 1;
-  if (i % modulus == modulus / 2) i = i / modulus;
-  else i = max1 + i - (i + modulus - modulus / 2) / modulus;
-  return i;
-}
-
-static int remap_prob(int v, int m) {
-  const int n = 256;
-  const int modulus = MODULUS_PARAM;
-  int i;
-  if ((m << 1) <= n)
-    i = vp9_recenter_nonneg(v, m) - 1;
-  else
-    i = vp9_recenter_nonneg(n - 1 - v, n - 1 - m) - 1;
-
-  i = split_index(i, n - 1, modulus);
-  return i;
-}
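remap_prob recenters the new probability around the old one so that small changes map to
small indices, and split_index then interleaves the index space so that every modulus-th
value gets the cheapest codes. A sketch of the recentering step follows; this helper is an
assumed typical shape of vp9_recenter_nonneg, not code copied from this patch:

    /* Assumed sketch: values near m map to small codes, e.g. v == m -> 0,
     * v == m - 1 -> 1, v == m + 1 -> 2, growing outward from there. */
    static int recenter_nonneg_sketch(int v, int m) {
      if (v > (m << 1))
        return v;                    /* far above m: transmit as-is */
      else if (v >= m)
        return (v - m) << 1;         /* at or above m: even codes   */
      else
        return ((m - v) << 1) - 1;   /* below m: odd codes          */
    }

Under that reading, remap_prob(130, 128) for example yields a small index, so the
subexponential code written by vp9_encode_term_subexp spends only a few bits on a small
probability update.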
-
-static void write_prob_diff_update(vp9_writer *const bc,
-                                   vp9_prob newp, vp9_prob oldp) {
-  int delp = remap_prob(newp, oldp);
-  vp9_encode_term_subexp(bc, delp, SUBEXP_PARAM, 255);
-}
-
-static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
-  int delp = remap_prob(newp, oldp);
-  return update_bits[delp] * 256;
-}
-
-static void update_mode(
-  vp9_writer *const bc,
-  int n,
-  vp9_token tok               [/* n */],
-  vp9_tree tree,
-  vp9_prob Pnew               [/* n-1 */],
-  vp9_prob Pcur               [/* n-1 */],
-  unsigned int bct            [/* n-1 */] [2],
-  const unsigned int num_events[/* n */]
-) {
-  unsigned int new_b = 0, old_b = 0;
-  int i = 0;
-
-  vp9_tree_probs_from_distribution(
-    n--, tok, tree,
-    Pnew, bct, num_events,
-    256, 1
-  );
-
-  do {
-    new_b += cost_branch(bct[i], Pnew[i]);
-    old_b += cost_branch(bct[i], Pcur[i]);
-  } while (++i < n);
-
-  if (new_b + (n << 8) < old_b) {
-    int i = 0;
-
-    vp9_write_bit(bc, 1);
-
-    do {
-      const vp9_prob p = Pnew[i];
-
-      vp9_write_literal(bc, Pcur[i] = p ? p : 1, 8);
-    } while (++i < n);
-  } else
-    vp9_write_bit(bc, 0);
-}
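Note how update_mode writes the new probability set only when the estimated coding gain
(old_b - new_b) exceeds the fixed per-probability overhead term n << 8; otherwise a single
0 bit keeps the previous probabilities.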
-
-static void update_mbintra_mode_probs(VP9_COMP* const cpi,
-                                      vp9_writer* const bc) {
-  VP9_COMMON *const cm = &cpi->common;
-
-  {
-    vp9_prob Pnew   [VP9_YMODES - 1];
-    unsigned int bct [VP9_YMODES - 1] [2];
-
-    update_mode(
-      bc, VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
-      Pnew, cm->fc.ymode_prob, bct, (unsigned int *)cpi->ymode_count
-    );
-  }
-}
-
-static int get_prob(int num, int den) {
-  int p;
-  if (den <= 0)
-    return 128;
-  p = (num * 255 + (den >> 1)) / den;
-  if (p > 255)
-    return 255;
-  else if (p < 1)
-    return 1;
-  return p;
-}
-
-static int get_binary_prob(int n0, int n1) {
-  return get_prob(n0, n0 + n1);
-}
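get_prob maps a count ratio onto the codec's 1..255 probability scale with rounding; for
example, get_binary_prob(3, 1) = get_prob(3, 4) = (3 * 255 + 2) / 4 = 191, roughly 3/4
expressed on that scale, while a zero denominator falls back to the neutral 128.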
-
-void vp9_update_skip_probs(VP9_COMP *cpi) {
-  VP9_COMMON *const pc = &cpi->common;
-  int prob_skip_false[3] = {0, 0, 0};
-  int k;
-
-  for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
-    pc->mbskip_pred_probs[k] = get_binary_prob(cpi->skip_false_count[k],
-                                               cpi->skip_true_count[k]);
-  }
-}
-
-static void update_switchable_interp_probs(VP9_COMP *cpi,
-                                           vp9_writer* const bc) {
-  VP9_COMMON *const pc = &cpi->common;
-  unsigned int branch_ct[32][2];
-  int i, j;
-  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
-    vp9_tree_probs_from_distribution(
-        VP9_SWITCHABLE_FILTERS,
-        vp9_switchable_interp_encodings, vp9_switchable_interp_tree,
-        pc->fc.switchable_interp_prob[j], branch_ct,
-        cpi->switchable_interp_count[j], 256, 1);
-    for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
-      if (pc->fc.switchable_interp_prob[j][i] < 1)
-        pc->fc.switchable_interp_prob[j][i] = 1;
-      vp9_write_literal(bc, pc->fc.switchable_interp_prob[j][i], 8);
-    }
-  }
-}
-
-// This function updates the reference frame prediction stats
-static void update_refpred_stats(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  int i;
-  int tot_count;
-  vp9_prob new_pred_probs[PREDICTION_PROBS];
-  int old_cost, new_cost;
-
-  // Set the prediction probability structures to defaults
-  if (cm->frame_type == KEY_FRAME) {
-    // Set the prediction probabilities to defaults
-    cm->ref_pred_probs[0] = 120;
-    cm->ref_pred_probs[1] = 80;
-    cm->ref_pred_probs[2] = 40;
-
-    vpx_memset(cpi->ref_pred_probs_update, 0,
-               sizeof(cpi->ref_pred_probs_update));
-  } else {
-    // From the prediction counts set the probabilities for each context
-    for (i = 0; i < PREDICTION_PROBS; i++) {
-      new_pred_probs[i] = get_binary_prob(cpi->ref_pred_count[i][0],
-                                          cpi->ref_pred_count[i][1]);
-
-      // Decide whether or not to update the reference frame probs.
-      // Returned costs are in 1/256 bit units.
-      old_cost =
-        (cpi->ref_pred_count[i][0] * vp9_cost_zero(cm->ref_pred_probs[i])) +
-        (cpi->ref_pred_count[i][1] * vp9_cost_one(cm->ref_pred_probs[i]));
-
-      new_cost =
-        (cpi->ref_pred_count[i][0] * vp9_cost_zero(new_pred_probs[i])) +
-        (cpi->ref_pred_count[i][1] * vp9_cost_one(new_pred_probs[i]));
-
-      // Cost saving must be >= 8 bits (2048 in these units)
-      if ((old_cost - new_cost) >= 2048) {
-        cpi->ref_pred_probs_update[i] = 1;
-        cm->ref_pred_probs[i] = new_pred_probs[i];
-      } else
-        cpi->ref_pred_probs_update[i] = 0;
-
-    }
-  }
-}
-
-static void update_mvcount(VP9_COMP *cpi, MACROBLOCK *x,
-                           int_mv *best_ref_mv, int_mv *second_best_ref_mv) {
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  MV mv;
-
-  if (mbmi->mode == SPLITMV) {
-    int i;
-
-    for (i = 0; i < x->partition_info->count; i++) {
-      if (x->partition_info->bmi[i].mode == NEW4X4) {
-        if (x->e_mbd.allow_high_precision_mv) {
-          mv.row = (x->partition_info->bmi[i].mv.as_mv.row
-                    - best_ref_mv->as_mv.row);
-          mv.col = (x->partition_info->bmi[i].mv.as_mv.col
-                    - best_ref_mv->as_mv.col);
-          vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 1);
-          if (x->e_mbd.mode_info_context->mbmi.second_ref_frame) {
-            mv.row = (x->partition_info->bmi[i].second_mv.as_mv.row
-                      - second_best_ref_mv->as_mv.row);
-            mv.col = (x->partition_info->bmi[i].second_mv.as_mv.col
-                      - second_best_ref_mv->as_mv.col);
-            vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv,
-                              &cpi->NMVcount, 1);
-          }
-        } else {
-          mv.row = (x->partition_info->bmi[i].mv.as_mv.row
-                    - best_ref_mv->as_mv.row);
-          mv.col = (x->partition_info->bmi[i].mv.as_mv.col
-                    - best_ref_mv->as_mv.col);
-          vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 0);
-          if (x->e_mbd.mode_info_context->mbmi.second_ref_frame) {
-            mv.row = (x->partition_info->bmi[i].second_mv.as_mv.row
-                      - second_best_ref_mv->as_mv.row);
-            mv.col = (x->partition_info->bmi[i].second_mv.as_mv.col
-                      - second_best_ref_mv->as_mv.col);
-            vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv,
-                              &cpi->NMVcount, 0);
-          }
-        }
-      }
-    }
-  } else if (mbmi->mode == NEWMV) {
-    if (x->e_mbd.allow_high_precision_mv) {
-      mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
-      mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
-      vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 1);
-      if (mbmi->second_ref_frame) {
-        mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
-        mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
-        vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 1);
-      }
-    } else {
-      mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
-      mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
-      vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 0);
-      if (mbmi->second_ref_frame) {
-        mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
-        mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
-        vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 0);
-      }
-    }
-  }
-}
-
-static void write_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_ymode_tree, p, vp9_ymode_encodings + m);
-}
-
-static void kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_kf_ymode_tree, p, vp9_kf_ymode_encodings + m);
-}
-
-#if CONFIG_SUPERBLOCKS
-static void sb_kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_uv_mode_tree, p, vp9_sb_kf_ymode_encodings + m);
-}
-#endif
-
-static void write_i8x8_mode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_i8x8_mode_tree, p, vp9_i8x8_mode_encodings + m);
-}
-
-static void write_uv_mode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_uv_mode_tree, p, vp9_uv_mode_encodings + m);
-}
-
-
-static void write_bmode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_bmode_tree, p, vp9_bmode_encodings + m);
-}
-
-static void write_split(vp9_writer *bc, int x, const vp9_prob *p) {
-  write_token(bc, vp9_mbsplit_tree, p, vp9_mbsplit_encodings + x);
-}
-
-static int prob_update_savings(const unsigned int *ct,
-                               const vp9_prob oldp, const vp9_prob newp,
-                               const vp9_prob upd) {
-  const int old_b = cost_branch256(ct, oldp);
-  const int new_b = cost_branch256(ct, newp);
-  const int update_b = 2048 + vp9_cost_upd256;
-  return (old_b - new_b - update_b);
-}
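-
-// Both savings functions return (old cost - new cost - update cost) in
-// 1/256-bit units, so a positive result means the explicit update pays for
-// itself; the constant 2048 above is the 8 * 256 cost of the plain 8-bit
-// literal used when no difference coding is available.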
-
-static int prob_diff_update_savings(const unsigned int *ct,
-                                    const vp9_prob oldp, const vp9_prob newp,
-                                    const vp9_prob upd) {
-  const int old_b = cost_branch256(ct, oldp);
-  const int new_b = cost_branch256(ct, newp);
-  const int update_b = (newp == oldp ? 0 :
-                        prob_diff_update_cost(newp, oldp) + vp9_cost_upd256);
-  return (old_b - new_b - update_b);
-}
-
-static int prob_diff_update_savings_search(const unsigned int *ct,
-                                           const vp9_prob oldp, vp9_prob *bestp,
-                                           const vp9_prob upd) {
-  const int old_b = cost_branch256(ct, oldp);
-  int new_b, update_b, savings, bestsavings, step;
-  vp9_prob newp, bestnewp;
-
-  bestsavings = 0;
-  bestnewp = oldp;
-
-  step = (*bestp > oldp ? -1 : 1);
-  for (newp = *bestp; newp != oldp; newp += step) {
-    new_b = cost_branch256(ct, newp);
-    update_b = prob_diff_update_cost(newp, oldp) + vp9_cost_upd256;
-    savings = old_b - new_b - update_b;
-    if (savings > bestsavings) {
-      bestsavings = savings;
-      bestnewp = newp;
-    }
-  }
-  *bestp = bestnewp;
-  return bestsavings;
-}
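-
-// Minimal self-contained sketch (assumed helper, not part of libvpx) of the
-// same search pattern as above: walk from a candidate value toward the old
-// value in unit steps, score each point, and keep the best. Any scoring
-// callback in the same units can be plugged in.
-static int sketch_linear_refine(int oldv, int cand,
-                                int (*score)(int newv, int oldv)) {
-  int best_score = 0, best = oldv;
-  int step = (cand > oldv) ? -1 : 1;
-  int v;
-  for (v = cand; v != oldv; v += step) {
-    const int s = score(v, oldv);
-    if (s > best_score) {
-      best_score = s;
-      best = v;
-    }
-  }
-  return best;
-}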
-
-static void pack_mb_tokens(vp9_writer* const bc,
-                           TOKENEXTRA **tp,
-                           const TOKENEXTRA *const stop) {
-  unsigned int split;
-  unsigned int shift;
-  int count = bc->count;
-  unsigned int range = bc->range;
-  unsigned int lowvalue = bc->lowvalue;
-  TOKENEXTRA *p = *tp;
-
-  while (p < stop) {
-    const int t = p->Token;
-    vp9_token *const a = vp9_coef_encodings + t;
-    const vp9_extra_bit_struct *const b = vp9_extra_bits + t;
-    int i = 0;
-    const unsigned char *pp = p->context_tree;
-    int v = a->value;
-    int n = a->Len;
-
-    if (t == EOSB_TOKEN) {
-      ++p;
-      break;
-    }
-
-    /* skip one or two nodes */
-    if (p->skip_eob_node) {
-      n -= p->skip_eob_node;
-      i = 2 * p->skip_eob_node;
-    }
-
-    do {
-      const int bb = (v >> --n) & 1;
-      split = 1 + (((range - 1) * pp[i >> 1]) >> 8);
-      i = vp9_coef_tree[i + bb];
-
-      if (bb) {
-        lowvalue += split;
-        range = range - split;
-      } else {
-        range = split;
-      }
-
-      shift = vp9_norm[range];
-      range <<= shift;
-      count += shift;
-
-      if (count >= 0) {
-        int offset = shift - count;
-
-        if ((lowvalue << (offset - 1)) & 0x80000000) {
-          int x = bc->pos - 1;
-
-          while (x >= 0 && bc->buffer[x] == 0xff) {
-            bc->buffer[x] = (unsigned char)0;
-            x--;
-          }
-
-          bc->buffer[x] += 1;
-        }
-
-        bc->buffer[bc->pos++] = (lowvalue >> (24 - offset));
-        lowvalue <<= offset;
-        shift = count;
-        lowvalue &= 0xffffff;
-        count -= 8;
-      }
-
-      lowvalue <<= shift;
-    } while (n);
-
-
-    if (b->base_val) {
-      const int e = p->Extra, L = b->Len;
-
-      if (L) {
-        const unsigned char *pp = b->prob;
-        int v = e >> 1;
-        int n = L;              /* number of bits in v, assumed nonzero */
-        int i = 0;
-
-        do {
-          const int bb = (v >> --n) & 1;
-          split = 1 + (((range - 1) * pp[i >> 1]) >> 8);
-          i = b->tree[i + bb];
-
-          if (bb) {
-            lowvalue += split;
-            range = range - split;
-          } else {
-            range = split;
-          }
-
-          shift = vp9_norm[range];
-          range <<= shift;
-          count += shift;
-
-          if (count >= 0) {
-            int offset = shift - count;
-
-            if ((lowvalue << (offset - 1)) & 0x80000000) {
-              int x = bc->pos - 1;
-
-              while (x >= 0 && bc->buffer[x] == 0xff) {
-                bc->buffer[x] = (unsigned char)0;
-                x--;
-              }
-
-              bc->buffer[x] += 1;
-            }
-
-            bc->buffer[bc->pos++] = (lowvalue >> (24 - offset));
-            lowvalue <<= offset;
-            shift = count;
-            lowvalue &= 0xffffff;
-            count -= 8;
-          }
-
-          lowvalue <<= shift;
-        } while (n);
-      }
-
-      {
-        // Write the low (sign) bit of the extra value at a flat 1/2 prob.
-        split = (range + 1) >> 1;
-
-        if (e & 1) {
-          lowvalue += split;
-          range = range - split;
-        } else {
-          range = split;
-        }
-
-        range <<= 1;
-
-        if ((lowvalue & 0x80000000)) {
-          int x = bc->pos - 1;
-
-          while (x >= 0 && bc->buffer[x] == 0xff) {
-            bc->buffer[x] = (unsigned char)0;
-            x--;
-          }
-
-          bc->buffer[x] += 1;
-
-        }
-
-        lowvalue  <<= 1;
-
-        if (!++count) {
-          count = -8;
-          bc->buffer[bc->pos++] = (lowvalue >> 24);
-          lowvalue &= 0xffffff;
-        }
-      }
-
-    }
-    ++p;
-  }
-
-  bc->count = count;
-  bc->lowvalue = lowvalue;
-  bc->range = range;
-  *tp = p;
-}
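-
-// Minimal sketch (assumed helper, not part of libvpx) of one boolean-coder
-// step, mirroring the loop inlined above: prob scales the split point of
-// the current interval, and the coded bit selects the sub-interval.
-static void sketch_bool_encode_step(unsigned int *range,
-                                    unsigned int *lowvalue,
-                                    int bit, unsigned char prob) {
-  const unsigned int split = 1 + (((*range - 1) * prob) >> 8);
-  if (bit) {
-    *lowvalue += split;  /* keep the upper part of the interval */
-    *range -= split;
-  } else {
-    *range = split;      /* keep the lower part */
-  }
-  /* Renormalization -- shifting range back up via vp9_norm[] and emitting
-   * bytes, including the 0xff carry walk -- is as in pack_mb_tokens(). */
-}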
-
-static void write_partition_size(unsigned char *cx_data, int size) {
-  // Write the 24-bit partition size in little-endian byte order.
-  cx_data[0] = size & 0xff;
-  cx_data[1] = (size >> 8) & 0xff;
-  cx_data[2] = (size >> 16) & 0xff;
-}
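-
-// Example: write_partition_size(p, 0x012345) stores the 24-bit size
-// little-endian: p[0] = 0x45, p[1] = 0x23, p[2] = 0x01.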
-
-static void write_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m,
-                         const vp9_prob *p) {
-#if CONFIG_DEBUG
-  assert(NEARESTMV <= m  &&  m <= SPLITMV);
-#endif
-  write_token(bc, vp9_mv_ref_tree, p,
-              vp9_mv_ref_encoding_array - NEARESTMV + m);
-}
-
-#if CONFIG_SUPERBLOCKS
-static void write_sb_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m,
-                            const vp9_prob *p) {
-#if CONFIG_DEBUG
-  assert(NEARESTMV <= m  &&  m < SPLITMV);
-#endif
-  write_token(bc, vp9_sb_mv_ref_tree, p,
-              vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
-}
-#endif
-
-static void write_sub_mv_ref(vp9_writer *bc, B_PREDICTION_MODE m,
-                             const vp9_prob *p) {
-#if CONFIG_DEBUG
-  assert(LEFT4X4 <= m  &&  m <= NEW4X4);
-#endif
-  write_token(bc, vp9_sub_mv_ref_tree, p,
-              vp9_sub_mv_ref_encoding_array - LEFT4X4 + m);
-}
-
-static void write_nmv(vp9_writer *bc, const MV *mv, const int_mv *ref,
-                      const nmv_context *nmvc, int usehp) {
-  MV e;
-  e.row = mv->row - ref->as_mv.row;
-  e.col = mv->col - ref->as_mv.col;
-
-  vp9_encode_nmv(bc, &e, &ref->as_mv, nmvc);
-  vp9_encode_nmv_fp(bc, &e, &ref->as_mv, nmvc, usehp);
-}
-
-#if CONFIG_NEW_MVREF
-static int vp9_cost_mv_ref_id(vp9_prob * ref_id_probs, int mv_ref_id) {
-  int cost;
-
-  // Encode the index for the MV reference.
-  switch (mv_ref_id) {
-    case 0:
-      cost = vp9_cost_zero(ref_id_probs[0]);
-      break;
-    case 1:
-      cost = vp9_cost_one(ref_id_probs[0]);
-      cost += vp9_cost_zero(ref_id_probs[1]);
-      break;
-    case 2:
-      cost = vp9_cost_one(ref_id_probs[0]);
-      cost += vp9_cost_one(ref_id_probs[1]);
-      cost += vp9_cost_zero(ref_id_probs[2]);
-      break;
-    case 3:
-      cost = vp9_cost_one(ref_id_probs[0]);
-      cost += vp9_cost_one(ref_id_probs[1]);
-      cost += vp9_cost_one(ref_id_probs[2]);
-      break;
-
-      // TRAP.. This should not happen
-    default:
-      assert(0);
-      break;
-  }
-
-  return cost;
-}
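-
-// Example: mv_ref_id 2 walks the tree as one(p0), one(p1), zero(p2), so its
-// cost is vp9_cost_one(p[0]) + vp9_cost_one(p[1]) + vp9_cost_zero(p[2]),
-// matching the bits emitted by vp9_write_mv_ref_id() below.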
-
-static void vp9_write_mv_ref_id(vp9_writer *w,
-                                vp9_prob * ref_id_probs,
-                                int mv_ref_id) {
-  // Encode the index for the MV reference.
-  switch (mv_ref_id) {
-    case 0:
-      vp9_write(w, 0, ref_id_probs[0]);
-      break;
-    case 1:
-      vp9_write(w, 1, ref_id_probs[0]);
-      vp9_write(w, 0, ref_id_probs[1]);
-      break;
-    case 2:
-      vp9_write(w, 1, ref_id_probs[0]);
-      vp9_write(w, 1, ref_id_probs[1]);
-      vp9_write(w, 0, ref_id_probs[2]);
-      break;
-    case 3:
-      vp9_write(w, 1, ref_id_probs[0]);
-      vp9_write(w, 1, ref_id_probs[1]);
-      vp9_write(w, 1, ref_id_probs[2]);
-      break;
-
-      // TRAP.. This should not happen
-    default:
-      assert(0);
-      break;
-  }
-}
-
-// Estimate the cost of coding the vector against each reference candidate
-static unsigned int pick_best_mv_ref(MACROBLOCK *x,
-                                     MV_REFERENCE_FRAME ref_frame,
-                                     int_mv target_mv,
-                                     int_mv * mv_ref_list,
-                                     int_mv * best_ref) {
-  int i;
-  int best_index = 0;
-  int cost, cost2;
-  int zero_seen = (mv_ref_list[0].as_int) ? FALSE : TRUE;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int max_mv = MV_MAX;
-
-  cost = vp9_cost_mv_ref_id(xd->mb_mv_ref_id_probs[ref_frame], 0) +
-         vp9_mv_bit_cost(&target_mv,
-                         &mv_ref_list[0],
-                         XMVCOST, 96,
-                         xd->allow_high_precision_mv);
-
-
-  // Use 4 for now: for (i = 1; i < MAX_MV_REFS; ++i) {
-  for (i = 1; i < 4; ++i) {
-    // If we see a 0,0 reference vector for a second time we have reached
-    // the end of the list of valid candidate vectors.
-    if (!mv_ref_list[i].as_int) {
-      if (zero_seen)
-        break;
-      else
-        zero_seen = TRUE;
-    }
-
-    // Check for cases where the reference choice would give rise to an
-    // uncodable/out of range residual for row or col.
-    if ((abs(target_mv.as_mv.row - mv_ref_list[i].as_mv.row) > max_mv) ||
-        (abs(target_mv.as_mv.col - mv_ref_list[i].as_mv.col) > max_mv)) {
-      continue;
-    }
-
-    cost2 = vp9_cost_mv_ref_id(xd->mb_mv_ref_id_probs[ref_frame], i) +
-            vp9_mv_bit_cost(&target_mv,
-                            &mv_ref_list[i],
-                            XMVCOST, 96,
-                            xd->allow_high_precision_mv);
-
-    if (cost2 < cost) {
-      cost = cost2;
-      best_index = i;
-    }
-  }
-
-  best_ref->as_int = mv_ref_list[best_index].as_int;
-
-  return best_index;
-}
-#endif
-
-// This function writes the current macroblock's segment id to the bitstream.
-// It should only be called if a segment map update is indicated.
-static void write_mb_segid(vp9_writer *bc,
-                           const MB_MODE_INFO *mi, const MACROBLOCKD *xd) {
-  // Encode the MB segment id.
-  int seg_id = mi->segment_id;
-#if CONFIG_SUPERBLOCKS
-  if (mi->encoded_as_sb) {
-    if (xd->mb_to_right_edge > 0)
-      seg_id = seg_id && xd->mode_info_context[1].mbmi.segment_id;
-    if (xd->mb_to_bottom_edge > 0) {
-      seg_id = seg_id &&
-               xd->mode_info_context[xd->mode_info_stride].mbmi.segment_id;
-      if (xd->mb_to_right_edge > 0)
-        seg_id = seg_id &&
-                xd->mode_info_context[xd->mode_info_stride + 1].mbmi.segment_id;
-    }
-  }
-#endif
-  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
-    switch (seg_id) {
-      case 0:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
-        break;
-      case 1:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[1]);
-        break;
-      case 2:
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[2]);
-        break;
-      case 3:
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[2]);
-        break;
-
-        // TRAP.. This should not happen
-      default:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
-        break;
-    }
-  }
-}
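-
-// The pairs above form a small two-level tree: ids 0 and 1 share probs[1]
-// under a first bit of 0, ids 2 and 3 share probs[2] under a first bit of 1.
-// E.g. seg_id 2 is coded as (1, probs[0]) then (0, probs[2]).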
-
-// This function encodes the reference frame
-static void encode_ref_frame(vp9_writer *const bc,
-                             VP9_COMMON *const cm,
-                             MACROBLOCKD *xd,
-                             int segment_id,
-                             MV_REFERENCE_FRAME rf) {
-  int seg_ref_active;
-  int seg_ref_count = 0;
-  seg_ref_active = vp9_segfeature_active(xd,
-                                         segment_id,
-                                         SEG_LVL_REF_FRAME);
-
-  if (seg_ref_active) {
-    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
-                    vp9_check_segref(xd, segment_id, LAST_FRAME) +
-                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
-                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
-  }
-
-  // If segment level coding of this signal is disabled...
-  // or the segment allows multiple reference frame options
-  if (!seg_ref_active || (seg_ref_count > 1)) {
-    // Values used in prediction model coding
-    unsigned char prediction_flag;
-    vp9_prob pred_prob;
-    MV_REFERENCE_FRAME pred_rf;
-
-    // Get the context probability for the prediction flag
-    pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
-
-    // Get the predicted value.
-    pred_rf = vp9_get_pred_ref(cm, xd);
-
-    // Did the chosen reference frame match its predicted value?
-    prediction_flag =
-      (xd->mode_info_context->mbmi.ref_frame == pred_rf);
-
-    vp9_set_pred_flag(xd, PRED_REF, prediction_flag);
-    vp9_write(bc, prediction_flag, pred_prob);
-
-    // If not predicted correctly then code value explicitly
-    if (!prediction_flag) {
-      vp9_prob mod_refprobs[PREDICTION_PROBS];
-
-      vpx_memcpy(mod_refprobs,
-                 cm->mod_refprobs[pred_rf], sizeof(mod_refprobs));
-
-      // If segment coding is enabled, blank out options that can't occur by
-      // setting the branch probability to 0.
-      if (seg_ref_active) {
-        mod_refprobs[INTRA_FRAME] *=
-          vp9_check_segref(xd, segment_id, INTRA_FRAME);
-        mod_refprobs[LAST_FRAME] *=
-          vp9_check_segref(xd, segment_id, LAST_FRAME);
-        mod_refprobs[GOLDEN_FRAME] *=
-          (vp9_check_segref(xd, segment_id, GOLDEN_FRAME) *
-           vp9_check_segref(xd, segment_id, ALTREF_FRAME));
-      }
-
-      if (mod_refprobs[0]) {
-        vp9_write(bc, (rf != INTRA_FRAME), mod_refprobs[0]);
-      }
-
-      // Inter coded
-      if (rf != INTRA_FRAME) {
-        if (mod_refprobs[1]) {
-          vp9_write(bc, (rf != LAST_FRAME), mod_refprobs[1]);
-        }
-
-        if (rf != LAST_FRAME) {
-          if (mod_refprobs[2]) {
-            vp9_write(bc, (rf != GOLDEN_FRAME), mod_refprobs[2]);
-          }
-        }
-      }
-    }
-  }
-
-  // If using the prediction model we have nothing further to do because
-  // the reference frame is fully coded by the segment
-}
-
-// Update the probabilities used to encode reference frame data
-static void update_ref_probs(VP9_COMP *const cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-
-  const int *const rfct = cpi->count_mb_ref_frame_usage;
-  const int rf_intra = rfct[INTRA_FRAME];
-  const int rf_inter = rfct[LAST_FRAME] +
-                       rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
-
-  cm->prob_intra_coded = get_binary_prob(rf_intra, rf_inter);
-  cm->prob_last_coded = get_prob(rfct[LAST_FRAME], rf_inter);
-  cm->prob_gf_coded = get_binary_prob(rfct[GOLDEN_FRAME], rfct[ALTREF_FRAME]);
-
-  // Compute a modified set of probabilities to use when prediction of the
-  // reference frame fails
-  vp9_compute_mod_refprobs(cm);
-}
-
-static void pack_inter_mode_mvs(VP9_COMP *const cpi, vp9_writer *const bc) {
-  int i;
-  VP9_COMMON *const pc = &cpi->common;
-  const nmv_context *nmvc = &pc->fc.nmvc;
-  MACROBLOCK *x = &cpi->mb;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  MODE_INFO *m;
-  MODE_INFO *prev_m;
-  TOKENEXTRA *tok = cpi->tok;
-  TOKENEXTRA *tok_end = tok + cpi->tok_count;
-
-  const int mis = pc->mode_info_stride;
-  int mb_row, mb_col;
-  int row, col;
-
-  // Values used in prediction model coding
-  vp9_prob pred_prob;
-  unsigned char prediction_flag;
-
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
-
-  cpi->mb.partition_info = cpi->mb.pi;
-
-  mb_row = 0;
-  for (row = 0; row < pc->mb_rows; row += 2) {
-    m = pc->mi + row * mis;
-    prev_m = pc->prev_mi + row * mis;
-
-    mb_col = 0;
-    for (col = 0; col < pc->mb_cols; col += 2) {
-      int i;
-
-      // Process the 4 MBs in the order:
-      // top-left, top-right, bottom-left, bottom-right
-#if CONFIG_SUPERBLOCKS
-      vp9_write(bc, m->mbmi.encoded_as_sb, pc->sb_coded);
-#endif
-      for (i = 0; i < 4; i++) {
-        MB_MODE_INFO *mi;
-        MV_REFERENCE_FRAME rf;
-        MB_PREDICTION_MODE mode;
-        int segment_id;
-
-        int dy = row_delta[i];
-        int dx = col_delta[i];
-        int offset_extended = dy * mis + dx;
-
-        if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
-          // MB lies outside frame, move on
-          mb_row += dy;
-          mb_col += dx;
-          m += offset_extended;
-          prev_m += offset_extended;
-          cpi->mb.partition_info += offset_extended;
-          continue;
-        }
-
-        mi = &m->mbmi;
-        rf = mi->ref_frame;
-        mode = mi->mode;
-        segment_id = mi->segment_id;
-
-        // Distance of MB to the various image edges.
-        // These are specified to 1/8th pel as they are always compared to
-        // MV values that are in 1/8th pel units
-        xd->mb_to_left_edge = -((mb_col * 16) << 3);
-        xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
-        xd->mb_to_top_edge = -((mb_row * 16) << 3);
-        xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
-
-        // Make sure the MacroBlockD mode info pointer is set correctly
-        xd->mode_info_context = m;
-        xd->prev_mode_info_context = prev_m;
-
-#ifdef ENTROPY_STATS
-        active_section = 9;
-#endif
-        if (cpi->mb.e_mbd.update_mb_segmentation_map) {
-          // Is temporal coding of the segment map enabled
-          if (pc->temporal_update) {
-            prediction_flag = vp9_get_pred_flag(xd, PRED_SEG_ID);
-            pred_prob = vp9_get_pred_prob(pc, xd, PRED_SEG_ID);
-
-            // Code the segment id prediction flag for this mb
-            vp9_write(bc, prediction_flag, pred_prob);
-
-            // If the mb segment id wasn't predicted code explicitly
-            if (!prediction_flag)
-              write_mb_segid(bc, mi, &cpi->mb.e_mbd);
-          } else {
-            // Normal unpredicted coding
-            write_mb_segid(bc, mi, &cpi->mb.e_mbd);
-          }
-        }
-
-        if (pc->mb_no_coeff_skip &&
-            (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-             (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
-          int skip_coeff = mi->mb_skip_coeff;
-#if CONFIG_SUPERBLOCKS
-          if (mi->encoded_as_sb) {
-            skip_coeff &= m[1].mbmi.mb_skip_coeff;
-            skip_coeff &= m[mis].mbmi.mb_skip_coeff;
-            skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
-          }
-#endif
-          vp9_write(bc, skip_coeff,
-                    vp9_get_pred_prob(pc, xd, PRED_MBSKIP));
-        }
-
-        // Encode the reference frame.
-        encode_ref_frame(bc, pc, xd, segment_id, rf);
-
-        if (rf == INTRA_FRAME) {
-#ifdef ENTROPY_STATS
-          active_section = 6;
-#endif
-
-          // TODO(rbultje) write using SB tree structure
-
-          if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-            write_ymode(bc, mode, pc->fc.ymode_prob);
-          }
-
-          if (mode == B_PRED) {
-            int j = 0;
-#if CONFIG_COMP_INTRA_PRED
-            int uses_second =
-              m->bmi[0].as_mode.second !=
-              (B_PREDICTION_MODE)(B_DC_PRED - 1);
-            vp9_write(bc, uses_second, 128);
-#endif
-            do {
-#if CONFIG_COMP_INTRA_PRED
-              B_PREDICTION_MODE mode2 = m->bmi[j].as_mode.second;
-#endif
-              write_bmode(bc, m->bmi[j].as_mode.first,
-                          pc->fc.bmode_prob);
-              /*
-              if (!cpi->dummy_packing) {
-                int p;
-                for (p = 0; p < VP9_BINTRAMODES - 1; ++p)
-                  printf(" %d", pc->fc.bmode_prob[p]);
-                printf("\nbmode[%d][%d]: %d\n", pc->current_video_frame, j, m->bmi[j].as_mode.first);
-              }
-              */
-#if CONFIG_COMP_INTRA_PRED
-              if (uses_second) {
-                write_bmode(bc, mode2, pc->fc.bmode_prob);
-              }
-#endif
-            } while (++j < 16);
-          }
-          if (mode == I8X8_PRED) {
-            write_i8x8_mode(bc, m->bmi[0].as_mode.first,
-                            pc->fc.i8x8_mode_prob);
-            write_i8x8_mode(bc, m->bmi[2].as_mode.first,
-                            pc->fc.i8x8_mode_prob);
-            write_i8x8_mode(bc, m->bmi[8].as_mode.first,
-                            pc->fc.i8x8_mode_prob);
-            write_i8x8_mode(bc, m->bmi[10].as_mode.first,
-                            pc->fc.i8x8_mode_prob);
-          } else {
-            write_uv_mode(bc, mi->uv_mode,
-                          pc->fc.uv_mode_prob[mode]);
-          }
-        } else {
-          int_mv best_mv, best_second_mv;
-          int ct[4];
-
-          vp9_prob mv_ref_p [VP9_MVREFS - 1];
-
-          {
-            int_mv n1, n2;
-
-            // Only used for context just now and soon to be deprecated.
-            vp9_find_near_mvs(xd, m, prev_m, &n1, &n2, &best_mv, ct,
-                              rf, cpi->common.ref_frame_sign_bias);
-#if CONFIG_NEWBESTREFMV
-            best_mv.as_int = mi->ref_mvs[rf][0].as_int;
-#endif
-
-            vp9_mv_ref_probs(&cpi->common, mv_ref_p, ct);
-
-#ifdef ENTROPY_STATS
-            accum_mv_refs(mode, ct);
-#endif
-          }
-
-#ifdef ENTROPY_STATS
-          active_section = 3;
-#endif
-
-          // Is the segment coding of mode enabled
-          if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-#if CONFIG_SUPERBLOCKS
-            if (mi->encoded_as_sb) {
-              write_sb_mv_ref(bc, mode, mv_ref_p);
-            } else
-#endif
-            {
-              write_mv_ref(bc, mode, mv_ref_p);
-            }
-            vp9_accum_mv_refs(&cpi->common, mode, ct);
-          }
-
-#if CONFIG_PRED_FILTER
-          // Is the prediction filter enabled
-          if (mode >= NEARESTMV && mode < SPLITMV) {
-            if (cpi->common.pred_filter_mode == 2)
-              vp9_write(bc, mi->pred_filter_enabled,
-                        pc->prob_pred_filter_off);
-            else
-              assert(mi->pred_filter_enabled ==
-                     cpi->common.pred_filter_mode);
-          }
-#endif
-          if (mode >= NEARESTMV && mode <= SPLITMV) {
-            if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-              write_token(bc, vp9_switchable_interp_tree,
-                          vp9_get_pred_probs(&cpi->common, xd,
-                                             PRED_SWITCHABLE_INTERP),
-                          vp9_switchable_interp_encodings +
-                              vp9_switchable_interp_map[mi->interp_filter]);
-            } else {
-              assert (mi->interp_filter ==
-                      cpi->common.mcomp_filter_type);
-            }
-          }
-          if (mi->second_ref_frame &&
-              (mode == NEWMV || mode == SPLITMV)) {
-            int_mv n1, n2;
-
-            // Only used for context just now and soon to be deprecated.
-            vp9_find_near_mvs(xd, m, prev_m,
-                              &n1, &n2, &best_second_mv, ct,
-                              mi->second_ref_frame,
-                              cpi->common.ref_frame_sign_bias);
-
-#if CONFIG_NEWBESTREFMV
-            best_second_mv.as_int =
-              mi->ref_mvs[mi->second_ref_frame][0].as_int;
-#endif
-          }
-
-          // does the feature use compound prediction or not
-          // (if not specified at the frame/segment level)
-          if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-            vp9_write(bc, mi->second_ref_frame != INTRA_FRAME,
-                      vp9_get_pred_prob(pc, xd, PRED_COMP));
-          }
-
-          {
-            switch (mode) { /* new, split require MVs */
-              case NEWMV:
-#ifdef ENTROPY_STATS
-                active_section = 5;
-#endif
-
-#if CONFIG_NEW_MVREF
-                {
-                  unsigned int best_index;
-
-                  // Choose the best mv reference
-                  best_index = pick_best_mv_ref(x, rf, mi->mv[0],
-                                                mi->ref_mvs[rf], &best_mv);
-
-                  // Encode the index of the choice.
-                  vp9_write_mv_ref_id(bc,
-                                      xd->mb_mv_ref_id_probs[rf], best_index);
-
-                  cpi->best_ref_index_counts[rf][best_index]++;
-
-                }
-#endif
-
-                write_nmv(bc, &mi->mv[0].as_mv, &best_mv,
-                          (const nmv_context*) nmvc,
-                          xd->allow_high_precision_mv);
-
-                if (mi->second_ref_frame) {
-#if CONFIG_NEW_MVREF
-                  unsigned int best_index;
-                  MV_REFERENCE_FRAME sec_ref_frame = mi->second_ref_frame;
-
-                  best_index =
-                    pick_best_mv_ref(x, sec_ref_frame, mi->mv[1],
-                                     mi->ref_mvs[sec_ref_frame],
-                                     &best_second_mv);
-
-                  // Encode the index of the choice.
-                  vp9_write_mv_ref_id(bc,
-                                      xd->mb_mv_ref_id_probs[sec_ref_frame],
-                                      best_index);
-
-                  cpi->best_ref_index_counts[sec_ref_frame][best_index]++;
-#endif
-                  write_nmv(bc, &mi->mv[1].as_mv, &best_second_mv,
-                            (const nmv_context*) nmvc,
-                            xd->allow_high_precision_mv);
-                }
-                break;
-              case SPLITMV: {
-                int j = 0;
-
-#ifdef MODE_STATS
-                ++count_mb_seg [mi->partitioning];
-#endif
-
-                write_split(bc, mi->partitioning, cpi->common.fc.mbsplit_prob);
-                cpi->mbsplit_count[mi->partitioning]++;
-
-                do {
-                  B_PREDICTION_MODE blockmode;
-                  int_mv blockmv;
-                  const int *const  L =
-                    vp9_mbsplits [mi->partitioning];
-                  int k = -1;  /* first block in subset j */
-                  int mv_contz;
-                  int_mv leftmv, abovemv;
-
-                  blockmode = cpi->mb.partition_info->bmi[j].mode;
-                  blockmv = cpi->mb.partition_info->bmi[j].mv;
-#if CONFIG_DEBUG
-                  while (j != L[++k])
-                    if (k >= 16)
-                      assert(0);
-#else
-                  while (j != L[++k]);
-#endif
-                  leftmv.as_int = left_block_mv(m, k);
-                  abovemv.as_int = above_block_mv(m, k, mis);
-                  mv_contz = vp9_mv_cont(&leftmv, &abovemv);
-
-                  write_sub_mv_ref(bc, blockmode,
-                                   cpi->common.fc.sub_mv_ref_prob [mv_contz]);
-                  cpi->sub_mv_ref_count[mv_contz][blockmode - LEFT4X4]++;
-                  if (blockmode == NEW4X4) {
-#ifdef ENTROPY_STATS
-                    active_section = 11;
-#endif
-                    write_nmv(bc, &blockmv.as_mv, &best_mv,
-                              (const nmv_context*) nmvc,
-                              xd->allow_high_precision_mv);
-
-                    if (mi->second_ref_frame) {
-                      write_nmv(bc,
-                                &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
-                                &best_second_mv,
-                                (const nmv_context*) nmvc,
-                                xd->allow_high_precision_mv);
-                    }
-                  }
-                } while (++j < cpi->mb.partition_info->count);
-              }
-              break;
-              default:
-                break;
-            }
-          }
-
-          // Update the mvcounts used to tune mv probs but only if this is
-          // the real pack run.
-          if (!cpi->dummy_packing) {
-            update_mvcount(cpi, x, &best_mv, &best_second_mv);
-          }
-        }
-
-        if (
-#if CONFIG_SUPERBLOCKS
-            !mi->encoded_as_sb &&
-#endif
-            ((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
-             (rf != INTRA_FRAME && !(mode == SPLITMV &&
-                                     mi->partitioning == PARTITIONING_4X4))) &&
-            pc->txfm_mode == TX_MODE_SELECT &&
-            !((pc->mb_no_coeff_skip && mi->mb_skip_coeff) ||
-              (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-               vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
-          TX_SIZE sz = mi->txfm_size;
-          // FIXME(rbultje) code ternary symbol once all experiments are merged
-          vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
-          if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV)
-            vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
-        }
-
-#ifdef ENTROPY_STATS
-        active_section = 1;
-#endif
-        assert(tok < tok_end);
-        pack_mb_tokens(bc, &tok, tok_end);
-
-#if CONFIG_SUPERBLOCKS
-        if (m->mbmi.encoded_as_sb) {
-          assert(!i);
-          mb_col += 2;
-          m += 2;
-          cpi->mb.partition_info += 2;
-          prev_m += 2;
-          break;
-        }
-#endif
-
-        // Next MB
-        mb_row += dy;
-        mb_col += dx;
-        m += offset_extended;
-        prev_m += offset_extended;
-        cpi->mb.partition_info += offset_extended;
-#if CONFIG_DEBUG
-        assert((prev_m - cpi->common.prev_mip) == (m - cpi->common.mip));
-        assert((prev_m - cpi->common.prev_mi) == (m - cpi->common.mi));
-#endif
-      }
-    }
-
-    // Next SB
-    mb_row += 2;
-    m += mis + (1 - (pc->mb_cols & 0x1));
-    prev_m += mis + (1 - (pc->mb_cols & 0x1));
-    cpi->mb.partition_info += mis + (1 - (pc->mb_cols & 0x1));
-  }
-}
-
-
-static void write_mb_modes_kf(const VP9_COMMON  *c,
-                              const MACROBLOCKD *xd,
-                              const MODE_INFO   *m,
-                              int                mode_info_stride,
-                              vp9_writer *const  bc) {
-  const int mis = mode_info_stride;
-  int ym;
-  int segment_id;
-
-  ym = m->mbmi.mode;
-  segment_id = m->mbmi.segment_id;
-
-  if (xd->update_mb_segmentation_map) {
-    write_mb_segid(bc, &m->mbmi, xd);
-  }
-
-  if (c->mb_no_coeff_skip &&
-      (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-       (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
-    int skip_coeff = m->mbmi.mb_skip_coeff;
-#if CONFIG_SUPERBLOCKS
-    if (m->mbmi.encoded_as_sb) {
-      skip_coeff &= m[1].mbmi.mb_skip_coeff;
-      skip_coeff &= m[mis].mbmi.mb_skip_coeff;
-      skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
-    }
-#endif
-    vp9_write(bc, skip_coeff,
-              vp9_get_pred_prob(c, xd, PRED_MBSKIP));
-  }
-
-#if CONFIG_SUPERBLOCKS
-  if (m->mbmi.encoded_as_sb) {
-    sb_kfwrite_ymode(bc, ym,
-                     c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
-  } else
-#endif
-  {
-    kfwrite_ymode(bc, ym,
-                  c->kf_ymode_prob[c->kf_ymode_probs_index]);
-  }
-
-  if (ym == B_PRED) {
-    int i = 0;
-#if CONFIG_COMP_INTRA_PRED
-    int uses_second =
-      m->bmi[0].as_mode.second !=
-      (B_PREDICTION_MODE)(B_DC_PRED - 1);
-    vp9_write(bc, uses_second, 128);
-#endif
-    do {
-      const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE L = left_block_mode(m, i);
-      const int bm = m->bmi[i].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-      const int bm2 = m->bmi[i].as_mode.second;
-#endif
-
-#ifdef ENTROPY_STATS
-      ++intra_mode_stats [A] [L] [bm];
-#endif
-
-      write_bmode(bc, bm, c->kf_bmode_prob [A] [L]);
-      // printf("    mode: %d\n", bm);
-#if CONFIG_COMP_INTRA_PRED
-      if (uses_second) {
-        write_bmode(bc, bm2, c->kf_bmode_prob [A] [L]);
-      }
-#endif
-    } while (++i < 16);
-  }
-  if (ym == I8X8_PRED) {
-    write_i8x8_mode(bc, m->bmi[0].as_mode.first,
-                    c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[0].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[2].as_mode.first,
-                    c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[2].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[8].as_mode.first,
-                    c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[8].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[10].as_mode.first,
-                    c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[10].as_mode.first); fflush(stdout);
-  } else
-    write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
-
-  if (
-#if CONFIG_SUPERBLOCKS
-      !m->mbmi.encoded_as_sb &&
-#endif
-      ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
-      !((c->mb_no_coeff_skip && m->mbmi.mb_skip_coeff) ||
-        (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
-    TX_SIZE sz = m->mbmi.txfm_size;
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
-    if (sz != TX_4X4 && ym <= TM_PRED)
-      vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);
-  }
-}
-
-static void write_kfmodes(VP9_COMP* const cpi, vp9_writer* const bc) {
-  VP9_COMMON *const c = &cpi->common;
-  const int mis = c->mode_info_stride;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  MODE_INFO *m;
-  int i;
-  int row, col;
-  int mb_row, mb_col;
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
-  TOKENEXTRA *tok = cpi->tok;
-  TOKENEXTRA *tok_end = tok + cpi->tok_count;
-
-  mb_row = 0;
-  for (row = 0; row < c->mb_rows; row += 2) {
-    m = c->mi + row * mis;
-
-    mb_col = 0;
-    for (col = 0; col < c->mb_cols; col += 2) {
-#if CONFIG_SUPERBLOCKS
-      vp9_write(bc, m->mbmi.encoded_as_sb, c->sb_coded);
-#endif
-      // Process the 4 MBs in the order:
-      // top-left, top-right, bottom-left, bottom-right
-      for (i = 0; i < 4; i++) {
-        int dy = row_delta[i];
-        int dx = col_delta[i];
-        int offset_extended = dy * mis + dx;
-
-        if ((mb_row >= c->mb_rows) || (mb_col >= c->mb_cols)) {
-          // MB lies outside frame, move on
-          mb_row += dy;
-          mb_col += dx;
-          m += offset_extended;
-          continue;
-        }
-
-        // Make sure the MacroBlockD mode info pointer is set correctly
-        xd->mode_info_context = m;
-
-        write_mb_modes_kf(c, xd, m, mis, bc);
-#ifdef ENTROPY_STATS
-        active_section = 8;
-#endif
-        assert(tok < tok_end);
-        pack_mb_tokens(bc, &tok, tok_end);
-
-#if CONFIG_SUPERBLOCKS
-        if (m->mbmi.encoded_as_sb) {
-          assert(!i);
-          mb_col += 2;
-          m += 2;
-          break;
-        }
-#endif
-        // Next MB
-        mb_row += dy;
-        mb_col += dx;
-        m += offset_extended;
-      }
-    }
-    mb_row += 2;
-  }
-}
-
-
-/* This function is used for debugging probability trees. */
-static void print_prob_tree(vp9_prob
-                            coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
-  /* print coef probability tree */
-  int i, j, k, l;
-  FILE *f = fopen("enc_tree_probs.txt", "a");
-  fprintf(f, "{\n");
-  for (i = 0; i < BLOCK_TYPES; i++) {
-    fprintf(f, "  {\n");
-    for (j = 0; j < COEF_BANDS; j++) {
-      fprintf(f, "    {\n");
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        fprintf(f, "      {");
-        for (l = 0; l < ENTROPY_NODES; l++) {
-          fprintf(f, "%3u, ",
-                  (unsigned int)(coef_probs [i][j][k][l]));
-        }
-        fprintf(f, " }\n");
-      }
-      fprintf(f, "    }\n");
-    }
-    fprintf(f, "  }\n");
-  }
-  fprintf(f, "}\n");
-  fclose(f);
-}
-
-static void build_coeff_contexts(VP9_COMP *cpi) {
-  int i = 0, j, k;
-#ifdef ENTROPY_STATS
-  int t = 0;
-#endif
-  for (i = 0; i < BLOCK_TYPES; ++i) {
-    for (j = 0; j < COEF_BANDS; ++j) {
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          cpi->frame_coef_probs [i][j][k],
-          cpi->frame_branch_ct [i][j][k],
-          cpi->coef_counts [i][j][k],
-          256, 1
-        );
-#ifdef ENTROPY_STATS
-        if (!cpi->dummy_packing)
-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            context_counters[i][j][k][t] += cpi->coef_counts[i][j][k][t];
-#endif
-      }
-    }
-  }
-  for (i = 0; i < BLOCK_TYPES; ++i) {
-    for (j = 0; j < COEF_BANDS; ++j) {
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          cpi->frame_hybrid_coef_probs [i][j][k],
-          cpi->frame_hybrid_branch_ct [i][j][k],
-          cpi->hybrid_coef_counts [i][j][k],
-          256, 1
-        );
-#ifdef ENTROPY_STATS
-        if (!cpi->dummy_packing)
-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            hybrid_context_counters[i][j][k][t] += cpi->hybrid_coef_counts[i][j][k][t];
-#endif
-      }
-    }
-  }
-
-  if (cpi->common.txfm_mode != ONLY_4X4) {
-    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
-      for (j = 0; j < COEF_BANDS; ++j) {
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          /* at every context */
-          /* calc probs and branch cts for this frame only */
-          // vp9_prob new_p           [ENTROPY_NODES];
-          // unsigned int branch_ct   [ENTROPY_NODES] [2];
-          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-            continue;
-          vp9_tree_probs_from_distribution(
-            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-            cpi->frame_coef_probs_8x8 [i][j][k],
-            cpi->frame_branch_ct_8x8 [i][j][k],
-            cpi->coef_counts_8x8 [i][j][k],
-            256, 1
-          );
-#ifdef ENTROPY_STATS
-          if (!cpi->dummy_packing)
-            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-              context_counters_8x8[i][j][k][t] += cpi->coef_counts_8x8[i][j][k][t];
-#endif
-        }
-      }
-    }
-    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
-      for (j = 0; j < COEF_BANDS; ++j) {
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          /* at every context */
-          /* calc probs and branch cts for this frame only */
-          // vp9_prob new_p           [ENTROPY_NODES];
-          // unsigned int branch_ct   [ENTROPY_NODES] [2];
-          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-            continue;
-          vp9_tree_probs_from_distribution(
-            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-            cpi->frame_hybrid_coef_probs_8x8 [i][j][k],
-            cpi->frame_hybrid_branch_ct_8x8 [i][j][k],
-            cpi->hybrid_coef_counts_8x8 [i][j][k],
-            256, 1
-          );
-#ifdef ENTROPY_STATS
-          if (!cpi->dummy_packing)
-            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-              hybrid_context_counters_8x8[i][j][k][t] += cpi->hybrid_coef_counts_8x8[i][j][k][t];
-#endif
-        }
-      }
-    }
-  }
-
-  if (cpi->common.txfm_mode > ALLOW_8X8) {
-    for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
-      for (j = 0; j < COEF_BANDS; ++j) {
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-            continue;
-          vp9_tree_probs_from_distribution(
-            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-            cpi->frame_coef_probs_16x16[i][j][k],
-            cpi->frame_branch_ct_16x16[i][j][k],
-            cpi->coef_counts_16x16[i][j][k], 256, 1);
-#ifdef ENTROPY_STATS
-          if (!cpi->dummy_packing)
-            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-              context_counters_16x16[i][j][k][t] += cpi->coef_counts_16x16[i][j][k][t];
-#endif
-        }
-      }
-    }
-    for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
-      for (j = 0; j < COEF_BANDS; ++j) {
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-            continue;
-          vp9_tree_probs_from_distribution(
-            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-            cpi->frame_hybrid_coef_probs_16x16[i][j][k],
-            cpi->frame_hybrid_branch_ct_16x16[i][j][k],
-            cpi->hybrid_coef_counts_16x16[i][j][k], 256, 1);
-#ifdef ENTROPY_STATS
-          if (!cpi->dummy_packing)
-            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-              hybrid_context_counters_16x16[i][j][k][t] +=
-                cpi->hybrid_coef_counts_16x16[i][j][k][t];
-#endif
-        }
-      }
-    }
-  }
-}
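-
-// The guard "k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))", repeated
-// for every block size above, appears to skip (type, band, context)
-// combinations the token scan can never produce; the identical guard in
-// update_coef_probs_common() below must stay in sync so that only
-// probabilities the decoder also models are ever signalled.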
-
-static void update_coef_probs_common(
-    vp9_writer* const bc,
-    vp9_prob new_frame_coef_probs[BLOCK_TYPES][COEF_BANDS]
-                                 [PREV_COEF_CONTEXTS][ENTROPY_NODES],
-    vp9_prob old_frame_coef_probs[BLOCK_TYPES][COEF_BANDS]
-                                 [PREV_COEF_CONTEXTS][ENTROPY_NODES],
-    unsigned int frame_branch_ct[BLOCK_TYPES][COEF_BANDS]
-                                [PREV_COEF_CONTEXTS][ENTROPY_NODES][2]) {
-  int i, j, k, t;
-  int update[2] = {0, 0};
-  int savings;
-  // vp9_prob bestupd = find_coef_update_prob(cpi);
-
-  /* dry run to see if there is any update at all needed */
-  savings = 0;
-  for (i = 0; i < BLOCK_TYPES; ++i) {
-    for (j = !i; j < COEF_BANDS; ++j) {
-      int prev_coef_savings[ENTROPY_NODES] = {0};
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          vp9_prob newp = new_frame_coef_probs[i][j][k][t];
-          const vp9_prob oldp = old_frame_coef_probs[i][j][k][t];
-          const vp9_prob upd = COEF_UPDATE_PROB;
-          int s = prev_coef_savings[t];
-          int u = 0;
-          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-            continue;
-#if defined(SEARCH_NEWP)
-          s = prob_diff_update_savings_search(
-                frame_branch_ct[i][j][k][t],
-                oldp, &newp, upd);
-          if (s > 0 && newp != oldp)
-            u = 1;
-          if (u)
-            savings += s - (int)(vp9_cost_zero(upd));
-          else
-            savings -= (int)(vp9_cost_zero(upd));
-#else
-          s = prob_update_savings(
-                frame_branch_ct[i][j][k][t],
-                oldp, newp, upd);
-          if (s > 0)
-            u = 1;
-          if (u)
-            savings += s;
-#endif
-
-          update[u]++;
-        }
-      }
-    }
-  }
-
-  // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
-  /* Is coef updated at all */
-  if (update[1] == 0 || savings < 0) {
-    vp9_write_bit(bc, 0);
-  } else {
-    vp9_write_bit(bc, 1);
-    for (i = 0; i < BLOCK_TYPES; ++i) {
-      for (j = !i; j < COEF_BANDS; ++j) {
-        int prev_coef_savings[ENTROPY_NODES] = {0};
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          // calc probs and branch cts for this frame only
-          for (t = 0; t < ENTROPY_NODES; ++t) {
-            vp9_prob newp = new_frame_coef_probs[i][j][k][t];
-            vp9_prob *oldp = old_frame_coef_probs[i][j][k] + t;
-            const vp9_prob upd = COEF_UPDATE_PROB;
-            int s = prev_coef_savings[t];
-            int u = 0;
-            if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-              continue;
-
-#if defined(SEARCH_NEWP)
-            s = prob_diff_update_savings_search(
-                  frame_branch_ct[i][j][k][t],
-                  *oldp, &newp, upd);
-            if (s > 0 && newp != *oldp)
-              u = 1;
-#else
-            s = prob_update_savings(
-                  frame_branch_ct[i][j][k][t],
-                  *oldp, newp, upd);
-            if (s > 0)
-              u = 1;
-#endif
-            vp9_write(bc, u, upd);
-#ifdef ENTROPY_STATS
-            if (!cpi->dummy_packing)
-              ++ tree_update_hist [i][j][k][t] [u];
-#endif
-            if (u) {
-              /* send/use new probability */
-              write_prob_diff_update(bc, newp, *oldp);
-              *oldp = newp;
-            }
-          }
-        }
-      }
-    }
-  }
-}
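-
-// Worked example for the dry run above (hypothetical numbers, 1/256-bit
-// units): a node whose searched saving s = 600 contributes
-// 600 - vp9_cost_zero(upd), while a node left alone still pays the cost of
-// its "no update" flag and contributes -vp9_cost_zero(upd). The frame-level
-// update bit is written as 1 only if at least one node updates and the net
-// total is non-negative.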
-
-static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
-  vp9_clear_system_state();
-
-  // Build the coefficient contexts based on counts collected in encode loop
-  build_coeff_contexts(cpi);
-
-  update_coef_probs_common(bc,
-                           cpi->frame_coef_probs,
-                           cpi->common.fc.coef_probs,
-                           cpi->frame_branch_ct);
-
-  update_coef_probs_common(bc,
-                           cpi->frame_hybrid_coef_probs,
-                           cpi->common.fc.hybrid_coef_probs,
-                           cpi->frame_hybrid_branch_ct);
-
-  /* do not do this if not even allowed */
-  if (cpi->common.txfm_mode != ONLY_4X4) {
-    update_coef_probs_common(bc,
-                             cpi->frame_coef_probs_8x8,
-                             cpi->common.fc.coef_probs_8x8,
-                             cpi->frame_branch_ct_8x8);
-
-    update_coef_probs_common(bc,
-                             cpi->frame_hybrid_coef_probs_8x8,
-                             cpi->common.fc.hybrid_coef_probs_8x8,
-                             cpi->frame_hybrid_branch_ct_8x8);
-  }
-
-  if (cpi->common.txfm_mode > ALLOW_8X8) {
-    update_coef_probs_common(bc,
-                             cpi->frame_coef_probs_16x16,
-                             cpi->common.fc.coef_probs_16x16,
-                             cpi->frame_branch_ct_16x16);
-    update_coef_probs_common(bc,
-                             cpi->frame_hybrid_coef_probs_16x16,
-                             cpi->common.fc.hybrid_coef_probs_16x16,
-                             cpi->frame_hybrid_branch_ct_16x16);
-  }
-}
-
-#ifdef PACKET_TESTING
-FILE *vpxlogc = 0;
-#endif
-
-static void put_delta_q(vp9_writer *bc, int delta_q) {
-  if (delta_q != 0) {
-    vp9_write_bit(bc, 1);
-    vp9_write_literal(bc, abs(delta_q), 4);
-
-    if (delta_q < 0)
-      vp9_write_bit(bc, 1);
-    else
-      vp9_write_bit(bc, 0);
-  } else
-    vp9_write_bit(bc, 0);
-}
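-
-// Example: put_delta_q(bc, -3) writes the update bit 1, the magnitude 3 as
-// a 4-bit literal, then sign bit 1; put_delta_q(bc, 0) writes a single 0.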
-
-static void decide_kf_ymode_entropy(VP9_COMP *cpi) {
-  int mode_cost[MB_MODE_COUNT];
-  int cost;
-  int bestcost = INT_MAX;
-  int bestindex = 0;
-  int i, j;
-
-  for (i = 0; i < 8; i++) {
-    vp9_cost_tokens(mode_cost, cpi->common.kf_ymode_prob[i], vp9_kf_ymode_tree);
-    cost = 0;
-    for (j = 0; j < VP9_YMODES; j++) {
-      cost += mode_cost[j] * cpi->ymode_count[j];
-    }
-#if CONFIG_SUPERBLOCKS
-    vp9_cost_tokens(mode_cost, cpi->common.sb_kf_ymode_prob[i],
-                    vp9_sb_ymode_tree);
-    for (j = 0; j < VP9_I32X32_MODES; j++) {
-      cost += mode_cost[j] * cpi->sb_ymode_count[j];
-    }
-#endif
-    if (cost < bestcost) {
-      bestindex = i;
-      bestcost = cost;
-    }
-  }
-  cpi->common.kf_ymode_probs_index = bestindex;
-}
-
-static void segment_reference_frames(VP9_COMP *cpi) {
-  VP9_COMMON *oci = &cpi->common;
-  MODE_INFO *mi = oci->mi;
-  int ref[MAX_MB_SEGMENTS] = {0};
-  int i, j;
-  int mb_index = 0;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-
-  for (i = 0; i < oci->mb_rows; i++) {
-    for (j = 0; j < oci->mb_cols; j++, mb_index++) {
-      ref[mi[mb_index].mbmi.segment_id] |= (1 << mi[mb_index].mbmi.ref_frame);
-    }
-    mb_index++;
-  }
-  for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-    vp9_enable_segfeature(xd, i, SEG_LVL_REF_FRAME);
-    vp9_set_segdata(xd, i, SEG_LVL_REF_FRAME, ref[i]);
-  }
-}
-
-void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
-                        unsigned long *size) {
-  int i, j;
-  VP9_HEADER oh;
-  VP9_COMMON *const pc = &cpi->common;
-  vp9_writer header_bc, residual_bc;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  int extra_bytes_packed = 0;
-
-  unsigned char *cx_data = dest;
-
-  oh.show_frame = (int) pc->show_frame;
-  oh.type = (int)pc->frame_type;
-  oh.version = pc->version;
-  oh.first_partition_length_in_bytes = 0;
-
-  cx_data += 3;
-
-#if defined(SECTIONBITS_OUTPUT)
-  Sectionbits[active_section = 1] += sizeof(VP9_HEADER) * 8 * 256;
-#endif
-
-  compute_update_table();
-
-  /* vp9_kf_default_bmode_probs() is called in vp9_setup_key_frame() once
-   * for each K frame before encode frame. pc->kf_bmode_prob doesn't get
-   * changed anywhere else. No need to call it again here. --yw
-   * vp9_kf_default_bmode_probs( pc->kf_bmode_prob);
-   */
-
-  /* every keyframe sends a start code, width, height, scale factor, clamp
-   * and color type.
-   */
-  if (oh.type == KEY_FRAME) {
-    int v;
-
-    // Start / sync code
-    cx_data[0] = 0x9D;
-    cx_data[1] = 0x01;
-    cx_data[2] = 0x2a;
-
-    v = (pc->horiz_scale << 14) | pc->Width;
-    cx_data[3] = v;
-    cx_data[4] = v >> 8;
-
-    v = (pc->vert_scale << 14) | pc->Height;
-    cx_data[5] = v;
-    cx_data[6] = v >> 8;
-
-    extra_bytes_packed = 7;
-    cx_data += extra_bytes_packed;
-
-    vp9_start_encode(&header_bc, cx_data);
-
-    // signal clr type
-    vp9_write_bit(&header_bc, pc->clr_type);
-    vp9_write_bit(&header_bc, pc->clamp_type);
-
-  } else {
-    vp9_start_encode(&header_bc, cx_data);
-  }
-
-  // Signal whether or not Segmentation is enabled
-  vp9_write_bit(&header_bc, (xd->segmentation_enabled) ? 1 : 0);
-
-  // Indicate which features are enabled
-  if (xd->segmentation_enabled) {
-    // Indicate whether or not the segmentation map is being updated.
-    vp9_write_bit(&header_bc, (xd->update_mb_segmentation_map) ? 1 : 0);
-
-    // If it is, then indicate the method that will be used.
-    if (xd->update_mb_segmentation_map) {
-      // Select the coding strategy (temporal or spatial)
-      vp9_choose_segmap_coding_method(cpi);
-      // Send the tree probabilities used to decode unpredicted
-      // macro-block segments
-      for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
-        int data = xd->mb_segment_tree_probs[i];
-
-        if (data != 255) {
-          vp9_write_bit(&header_bc, 1);
-          vp9_write_literal(&header_bc, data, 8);
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
-
-      // Write out the chosen coding method.
-      vp9_write_bit(&header_bc, (pc->temporal_update) ? 1 : 0);
-      if (pc->temporal_update) {
-        for (i = 0; i < PREDICTION_PROBS; i++) {
-          int data = pc->segment_pred_probs[i];
-
-          if (data != 255) {
-            vp9_write_bit(&header_bc, 1);
-            vp9_write_literal(&header_bc, data, 8);
-          } else {
-            vp9_write_bit(&header_bc, 0);
-          }
-        }
-      }
-    }
-
-    vp9_write_bit(&header_bc, (xd->update_mb_segmentation_data) ? 1 : 0);
-
-    // segment_reference_frames(cpi);
-
-    if (xd->update_mb_segmentation_data) {
-      signed char Data;
-
-      vp9_write_bit(&header_bc, (xd->mb_segment_abs_delta) ? 1 : 0);
-
-      // For each segment id...
-      for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-        // For each segmentation codable feature...
-        for (j = 0; j < SEG_LVL_MAX; j++) {
-          Data = vp9_get_segdata(xd, i, j);
-
-          // If the feature is enabled...
-          if (vp9_segfeature_active(xd, i, j)) {
-            vp9_write_bit(&header_bc, 1);
-
-            // Is the segment data signed?
-            if (vp9_is_segfeature_signed(j)) {
-              // Encode the relevant feature data
-              if (Data < 0) {
-                Data = - Data;
-                vp9_write_literal(&header_bc, Data,
-                                  vp9_seg_feature_data_bits(j));
-                vp9_write_bit(&header_bc, 1);
-              } else {
-                vp9_write_literal(&header_bc, Data,
-                                  vp9_seg_feature_data_bits(j));
-                vp9_write_bit(&header_bc, 0);
-              }
-            }
-            // Unsigned data element so no sign bit needed
-            else
-              vp9_write_literal(&header_bc, Data,
-                                vp9_seg_feature_data_bits(j));
-          } else
-            vp9_write_bit(&header_bc, 0);
-        }
-      }
-    }
-  }
-
-  // Encode any updates to the probabilities used for the reference frame
-  // prediction model status flags
-  update_refpred_stats(cpi);
-  if (pc->frame_type != KEY_FRAME) {
-    for (i = 0; i < PREDICTION_PROBS; i++) {
-      if (cpi->ref_pred_probs_update[i]) {
-        vp9_write_bit(&header_bc, 1);
-        vp9_write_literal(&header_bc, pc->ref_pred_probs[i], 8);
-      } else {
-        vp9_write_bit(&header_bc, 0);
-      }
-    }
-  }
-
-#if CONFIG_SUPERBLOCKS
-  {
-    /* sb mode probability */
-    const int sb_max = (((pc->mb_rows + 1) >> 1) * ((pc->mb_cols + 1) >> 1));
-
-    pc->sb_coded = get_prob(sb_max - cpi->sb_count, sb_max);
-    vp9_write_literal(&header_bc, pc->sb_coded, 8);
-  }
-#endif
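get_prob() above lives in the common tree-coder code; a sketch of its behavior (the exact rounding is an assumption): it maps a count ratio to an 8-bit probability, clamped so the bool coder never sees 0 or 256.

    /* Sketch of get_prob(): num/den as an 8-bit probability in [1, 255].
     * Rounding is an assumption; den == 0 falls back to even odds. */
    static unsigned char get_prob_sketch(int num, int den) {
      int p;
      if (den == 0)
        return 128;
      p = (num * 256 + (den >> 1)) / den;
      return (unsigned char)(p < 1 ? 1 : (p > 255 ? 255 : p));
    }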
-
-  {
-    if (pc->txfm_mode == TX_MODE_SELECT) {
-      pc->prob_tx[0] = get_prob(cpi->txfm_count[0] + cpi->txfm_count_8x8p[0],
-                                cpi->txfm_count[0] + cpi->txfm_count[1] + cpi->txfm_count[2] +
-                                cpi->txfm_count_8x8p[0] + cpi->txfm_count_8x8p[1]);
-      pc->prob_tx[1] = get_prob(cpi->txfm_count[1], cpi->txfm_count[1] + cpi->txfm_count[2]);
-    } else {
-      pc->prob_tx[0] = 128;
-      pc->prob_tx[1] = 128;
-    }
-    vp9_write_literal(&header_bc, pc->txfm_mode, 2);
-    if (pc->txfm_mode == TX_MODE_SELECT) {
-      vp9_write_literal(&header_bc, pc->prob_tx[0], 8);
-      vp9_write_literal(&header_bc, pc->prob_tx[1], 8);
-    }
-  }
-
-  // Encode the loop filter level and type
-  vp9_write_bit(&header_bc, pc->filter_type);
-  vp9_write_literal(&header_bc, pc->filter_level, 6);
-  vp9_write_literal(&header_bc, pc->sharpness_level, 3);
-
-  // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled).
-  vp9_write_bit(&header_bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0);
-
-  if (xd->mode_ref_lf_delta_enabled) {
-    // Do the deltas need to be updated
-    int send_update = xd->mode_ref_lf_delta_update;
-
-    vp9_write_bit(&header_bc, send_update);
-    if (send_update) {
-      int Data;
-
-      // Send update for the reference frame deltas
-      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
-        Data = xd->ref_lf_deltas[i];
-
-        // Frame level data
-        if (xd->ref_lf_deltas[i] != xd->last_ref_lf_deltas[i]) {
-          xd->last_ref_lf_deltas[i] = xd->ref_lf_deltas[i];
-          vp9_write_bit(&header_bc, 1);
-
-          if (Data > 0) {
-            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
-            vp9_write_bit(&header_bc, 0);    // sign
-          } else {
-            Data = -Data;
-            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
-            vp9_write_bit(&header_bc, 1);    // sign
-          }
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
-
-      // Send update for the mode based deltas
-      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
-        Data = xd->mode_lf_deltas[i];
-
-        if (xd->mode_lf_deltas[i] != xd->last_mode_lf_deltas[i]) {
-          xd->last_mode_lf_deltas[i] = xd->mode_lf_deltas[i];
-          vp9_write_bit(&header_bc, 1);
-
-          if (Data > 0) {
-            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
-            vp9_write_bit(&header_bc, 0);    // sign
-          } else {
-            Data = -Data;
-            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
-            vp9_write_bit(&header_bc, 1);    // sign
-          }
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
-    }
-  }
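Each delta is thus sent as a 6-bit magnitude followed by a sign bit. A sketch of the counterpart read, with illustrative decoder-side names:

    /* Sketch: read back one loop-filter delta (names are illustrative;
     * the real decoder has its own reader). */
    static int read_lf_delta(BOOL_DECODER *bd) {
      int data = vp9_read_literal(bd, 6);  /* magnitude, 0..63     */
      if (vp9_read_bit(bd))                /* sign: 1 for negative */
        data = -data;
      return data;
    }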
-
-  // Signal whether the multi-token partition is enabled; currently always
-  // coded as 0 (a single partition).
-  // vp9_write_literal(&header_bc, pc->multi_token_partition, 2);
-  vp9_write_literal(&header_bc, 0, 2);
-
-  // Frame Q baseline quantizer index
-  vp9_write_literal(&header_bc, pc->base_qindex, QINDEX_BITS);
-
-  // Transmit DC, second order and UV quantizer delta information
-  put_delta_q(&header_bc, pc->y1dc_delta_q);
-  put_delta_q(&header_bc, pc->y2dc_delta_q);
-  put_delta_q(&header_bc, pc->y2ac_delta_q);
-  put_delta_q(&header_bc, pc->uvdc_delta_q);
-  put_delta_q(&header_bc, pc->uvac_delta_q);
-
-  // On a key frame, all reference buffers are updated from the new key frame
-  if (pc->frame_type != KEY_FRAME) {
-    // Should the GF or ARF be updated using the transmitted frame or buffer
-    vp9_write_bit(&header_bc, pc->refresh_golden_frame);
-    vp9_write_bit(&header_bc, pc->refresh_alt_ref_frame);
-
-    // For inter frames the current default behavior is that when
-    // cm->refresh_golden_frame is set we copy the old GF over to
-    // the ARF buffer. This is purely an encoder decision at present.
-    if (pc->refresh_golden_frame)
-      pc->copy_buffer_to_arf  = 2;
-
-    // If not being updated from the current frame, should the GF or ARF be
-    // updated from another buffer?
-    if (!pc->refresh_golden_frame)
-      vp9_write_literal(&header_bc, pc->copy_buffer_to_gf, 2);
-
-    if (!pc->refresh_alt_ref_frame)
-      vp9_write_literal(&header_bc, pc->copy_buffer_to_arf, 2);
-
-    // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer)
-    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
-    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
-
-    // Signal whether to allow high MV precision
-    vp9_write_bit(&header_bc, (xd->allow_high_precision_mv) ? 1 : 0);
-    if (pc->mcomp_filter_type == SWITCHABLE) {
-      /* Check to see if only one of the filters is actually used */
-      int count[VP9_SWITCHABLE_FILTERS];
-      int i, j, c = 0;
-      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-        count[i] = 0;
-        for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
-          count[i] += cpi->switchable_interp_count[j][i];
-        }
-        c += (count[i] > 0);
-      }
-      if (c == 1) {
-        /* Only one filter is used. So set the filter at frame level */
-        for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-          if (count[i]) {
-            pc->mcomp_filter_type = vp9_switchable_interp[i];
-            break;
-          }
-        }
-      }
-    }
-    // Signal the type of subpel filter to use
-    vp9_write_bit(&header_bc, (pc->mcomp_filter_type == SWITCHABLE));
-    if (pc->mcomp_filter_type != SWITCHABLE)
-      vp9_write_literal(&header_bc, (pc->mcomp_filter_type), 2);
-  }
-
-  vp9_write_bit(&header_bc, pc->refresh_entropy_probs);
-
-  if (pc->frame_type != KEY_FRAME)
-    vp9_write_bit(&header_bc, pc->refresh_last_frame);
-
-#ifdef ENTROPY_STATS
-  if (pc->frame_type == INTER_FRAME)
-    active_section = 0;
-  else
-    active_section = 7;
-#endif
-
-  vp9_clear_system_state();  // __asm emms;
-
-  vp9_copy(cpi->common.fc.pre_coef_probs, cpi->common.fc.coef_probs);
-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs, cpi->common.fc.hybrid_coef_probs);
-  vp9_copy(cpi->common.fc.pre_coef_probs_8x8, cpi->common.fc.coef_probs_8x8);
-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_8x8, cpi->common.fc.hybrid_coef_probs_8x8);
-  vp9_copy(cpi->common.fc.pre_coef_probs_16x16, cpi->common.fc.coef_probs_16x16);
-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_16x16, cpi->common.fc.hybrid_coef_probs_16x16);
-  vp9_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob);
-  vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);
-  vp9_copy(cpi->common.fc.pre_bmode_prob, cpi->common.fc.bmode_prob);
-  vp9_copy(cpi->common.fc.pre_sub_mv_ref_prob, cpi->common.fc.sub_mv_ref_prob);
-  vp9_copy(cpi->common.fc.pre_mbsplit_prob, cpi->common.fc.mbsplit_prob);
-  vp9_copy(cpi->common.fc.pre_i8x8_mode_prob, cpi->common.fc.i8x8_mode_prob);
-  cpi->common.fc.pre_nmvc = cpi->common.fc.nmvc;
-  vp9_zero(cpi->sub_mv_ref_count);
-  vp9_zero(cpi->mbsplit_count);
-  vp9_zero(cpi->common.fc.mv_ref_ct)
-  vp9_zero(cpi->common.fc.mv_ref_ct_a)
-
-  update_coef_probs(cpi, &header_bc);
-
-#ifdef ENTROPY_STATS
-  active_section = 2;
-#endif
-
-  // Write out the mb_no_coeff_skip flag
-  vp9_write_bit(&header_bc, pc->mb_no_coeff_skip);
-  if (pc->mb_no_coeff_skip) {
-    int k;
-
-    vp9_update_skip_probs(cpi);
-    for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-      vp9_write_literal(&header_bc, pc->mbskip_pred_probs[k], 8);
-  }
-
-  if (pc->frame_type == KEY_FRAME) {
-    if (!pc->kf_ymode_probs_update) {
-      vp9_write_literal(&header_bc, pc->kf_ymode_probs_index, 3);
-    }
-  } else {
-    // Update the probabilities used to encode reference frame data
-    update_ref_probs(cpi);
-
-#ifdef ENTROPY_STATS
-    active_section = 1;
-#endif
-
-#if CONFIG_PRED_FILTER
-    // Write the prediction filter mode used for this frame
-    vp9_write_literal(&header_bc, pc->pred_filter_mode, 2);
-
-    // Write prediction filter on/off probability if signaling at MB level
-    if (pc->pred_filter_mode == 2)
-      vp9_write_literal(&header_bc, pc->prob_pred_filter_off, 8);
-
-#endif
-    if (pc->mcomp_filter_type == SWITCHABLE)
-      update_switchable_interp_probs(cpi, &header_bc);
-
-    vp9_write_literal(&header_bc, pc->prob_intra_coded, 8);
-    vp9_write_literal(&header_bc, pc->prob_last_coded, 8);
-    vp9_write_literal(&header_bc, pc->prob_gf_coded, 8);
-
-    {
-      const int comp_pred_mode = cpi->common.comp_pred_mode;
-      const int use_compound_pred = (comp_pred_mode != SINGLE_PREDICTION_ONLY);
-      const int use_hybrid_pred = (comp_pred_mode == HYBRID_PREDICTION);
-
-      vp9_write(&header_bc, use_compound_pred, 128);
-      if (use_compound_pred) {
-        vp9_write(&header_bc, use_hybrid_pred, 128);
-        if (use_hybrid_pred) {
-          for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
-            pc->prob_comppred[i] = get_binary_prob(cpi->single_pred_count[i],
-                                                   cpi->comp_pred_count[i]);
-            vp9_write_literal(&header_bc, pc->prob_comppred[i], 8);
-          }
-        }
-      }
-    }
-
-    update_mbintra_mode_probs(cpi, &header_bc);
-
-#if CONFIG_NEW_MVREF
-    // Temporary default probabilities for encoding the MV ref id signal
-    vpx_memset(xd->mb_mv_ref_id_probs, 192, sizeof(xd->mb_mv_ref_id_probs));
-#endif
-
-    vp9_write_nmvprobs(cpi, xd->allow_high_precision_mv, &header_bc);
-  }
-
-  vp9_stop_encode(&header_bc);
-
-  oh.first_partition_length_in_bytes = header_bc.pos;
-
-  /* update frame tag */
-  {
-    int v = (oh.first_partition_length_in_bytes << 5) |
-            (oh.show_frame << 4) |
-            (oh.version << 1) |
-            oh.type;
-
-    dest[0] = v;
-    dest[1] = v >> 8;
-    dest[2] = v >> 16;
-  }
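The matching parse of the 3-byte frame tag, as a sketch using the same bit layout assembled above:

    /* Sketch: unpack the little-endian frame tag built above. */
    static void parse_frame_tag(const unsigned char *d,
                                int *type, int *version,
                                int *show_frame, int *first_part_size) {
      unsigned int v = d[0] | (d[1] << 8) | (d[2] << 16);
      *type            = v & 1;          /* 1 bit   */
      *version         = (v >> 1) & 7;   /* 3 bits  */
      *show_frame      = (v >> 4) & 1;   /* 1 bit   */
      *first_part_size = v >> 5;         /* 19 bits */
    }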
-
-  *size = VP9_HEADER_SIZE + extra_bytes_packed + header_bc.pos;
-  vp9_start_encode(&residual_bc, cx_data + header_bc.pos);
-
-  if (pc->frame_type == KEY_FRAME) {
-    decide_kf_ymode_entropy(cpi);
-    write_kfmodes(cpi, &residual_bc);
-  } else {
-    pack_inter_mode_mvs(cpi, &residual_bc);
-    vp9_update_mode_context(&cpi->common);
-  }
-
-
-  vp9_stop_encode(&residual_bc);
-
-  *size += residual_bc.pos;
-
-}
-
-#ifdef ENTROPY_STATS
-void print_tree_update_probs() {
-  int i, j, k, l;
-  FILE *f = fopen("coefupdprob.h", "w");
-  int Sum;
-  fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
-
-  fprintf(f, "const vp9_prob\n"
-          "vp9_coef_update_probs[BLOCK_TYPES]\n"
-          "                     [COEF_BANDS]\n"
-          "                     [PREV_COEF_CONTEXTS]\n"
-          "                     [ENTROPY_NODES] = {\n");
-  for (i = 0; i < BLOCK_TYPES; i++) {
-    fprintf(f, "  { \n");
-    for (j = 0; j < COEF_BANDS; j++) {
-      fprintf(f, "    {\n");
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        fprintf(f, "      {");
-        for (l = 0; l < ENTROPY_NODES; l++) {
-          fprintf(f, "%3ld, ",
-              get_binary_prob(tree_update_hist[i][j][k][l][0],
-                              tree_update_hist[i][j][k][l][1]));
-        }
-        fprintf(f, "},\n");
-      }
-      fprintf(f, "    },\n");
-    }
-    fprintf(f, "  },\n");
-  }
-  fprintf(f, "};\n");
-
-  fprintf(f, "const vp9_prob\n"
-          "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]\n"
-          "                         [COEF_BANDS]\n"
-          "                         [PREV_COEF_CONTEXTS]\n"
-          "                         [ENTROPY_NODES] = {\n");
-  for (i = 0; i < BLOCK_TYPES_8X8; i++) {
-    fprintf(f, "  { \n");
-    for (j = 0; j < COEF_BANDS; j++) {
-      fprintf(f, "    {\n");
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        fprintf(f, "      {");
-        for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {
-          fprintf(f, "%3ld, ",
-              get_binary_prob(tree_update_hist_8x8[i][j][k][l][0],
-                              tree_update_hist_8x8[i][j][k][l][1]));
-        }
-        fprintf(f, "},\n");
-      }
-      fprintf(f, "    },\n");
-    }
-    fprintf(f, "  },\n");
-  }
-  fprintf(f, "};\n");
-
-  fprintf(f, "const vp9_prob\n"
-          "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]\n"
-          "                           [COEF_BANDS]\n"
-          "                           [PREV_COEF_CONTEXTS]\n"
-          "                           [ENTROPY_NODES] = {\n");
-  for (i = 0; i < BLOCK_TYPES_16X16; i++) {
-    fprintf(f, "  { \n");
-    for (j = 0; j < COEF_BANDS; j++) {
-      fprintf(f, "    {\n");
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        fprintf(f, "      {");
-        for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {
-          fprintf(f, "%3ld, ",
-              get_binary_prob(tree_update_hist_16x16[i][j][k][l][0],
-                              tree_update_hist_16x16[i][j][k][l][1]));
-        }
-        fprintf(f, "},\n");
-      }
-      fprintf(f, "    },\n");
-    }
-    fprintf(f, "  },\n");
-  }
-  fprintf(f, "};\n");
-
-  fclose(f);
-  f = fopen("treeupdate.bin", "wb");
-  fwrite(tree_update_hist, sizeof(tree_update_hist), 1, f);
-  fwrite(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
-  fwrite(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
-  fclose(f);
-}
-#endif
--- a/vp8/encoder/bitstream.h
+++ /dev/null
@@ -1,17 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_BITSTREAM_H
-#define __INC_BITSTREAM_H
-
-void vp9_update_skip_probs(VP9_COMP *cpi);
-
-#endif
--- a/vp8/encoder/block.h
+++ /dev/null
@@ -1,184 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_BLOCK_H
-#define __INC_BLOCK_H
-
-#include "vp8/common/onyx.h"
-#include "vp8/common/entropymv.h"
-#include "vp8/common/entropy.h"
-#include "vpx_ports/mem.h"
-#include "vp8/common/onyxc_int.h"
-
-// motion search site
-typedef struct {
-  MV mv;
-  int offset;
-} search_site;
-
-typedef struct block {
-  // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
-  short *src_diff;
-  short *coeff;
-
-  // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
-  short *quant;
-  short *quant_fast;      // fast quant deprecated for now
-  unsigned char *quant_shift;
-  short *zbin;
-  short *zbin_8x8;
-  short *zbin_16x16;
-  short *zrun_zbin_boost;
-  short *zrun_zbin_boost_8x8;
-  short *zrun_zbin_boost_16x16;
-  short *round;
-
-  // Zbin Over Quant value
-  short zbin_extra;
-
-  unsigned char **base_src;
-  unsigned char **base_second_src;
-  int src;
-  int src_stride;
-
-  int eob_max_offset;
-  int eob_max_offset_8x8;
-  int eob_max_offset_16x16;
-} BLOCK;
-
-typedef struct {
-  int count;
-  struct {
-    B_PREDICTION_MODE mode;
-    int_mv mv;
-    int_mv second_mv;
-  } bmi[16];
-} PARTITION_INFO;
-
-// Structure to hold snapshot of coding context during the mode picking process
-// TODO Do we need all of these?
-typedef struct {
-  MODE_INFO mic;
-  PARTITION_INFO partition_info;
-  int_mv best_ref_mv;
-  int_mv second_best_ref_mv;
-#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
-  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
-#endif
-  int rate;
-  int distortion;
-  int64_t intra_error;
-  int best_mode_index;
-  int rddiv;
-  int rdmult;
-  int hybrid_pred_diff;
-  int comp_pred_diff;
-  int single_pred_diff;
-  int64_t txfm_rd_diff[NB_TXFM_MODES];
-} PICK_MODE_CONTEXT;
-
-typedef struct macroblock {
-  DECLARE_ALIGNED(16, short, src_diff[400]);  // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
-  DECLARE_ALIGNED(16, short, coeff[400]);     // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
-  DECLARE_ALIGNED(16, unsigned char, thismb[256]);    // 16x16 Y
-
-  unsigned char *thismb_ptr;
-  // 16 Y blocks, 4 U blocks, 4 V blocks,
-  // 1 DC 2nd order block each with 16 entries
-  BLOCK block[25];
-
-  YV12_BUFFER_CONFIG src;
-
-  MACROBLOCKD e_mbd;
-  PARTITION_INFO *partition_info; /* work pointer */
-  PARTITION_INFO *pi;   /* Corresponds to upper left visible macroblock */
-  PARTITION_INFO *pip;  /* Base of allocated array */
-
-  search_site *ss;
-  int ss_count;
-  int searches_per_step;
-
-  int errorperbit;
-  int sadperbit16;
-  int sadperbit4;
-  int rddiv;
-  int rdmult;
-  unsigned int *mb_activity_ptr;
-  int *mb_norm_activity_ptr;
-  signed int act_zbin_adj;
-
-  int nmvjointcost[MV_JOINTS];
-  int nmvcosts[2][MV_VALS];
-  int *nmvcost[2];
-  int nmvcosts_hp[2][MV_VALS];
-  int *nmvcost_hp[2];
-
-  int nmvjointsadcost[MV_JOINTS];
-  int nmvsadcosts[2][MV_VALS];
-  int *nmvsadcost[2];
-  int nmvsadcosts_hp[2][MV_VALS];
-  int *nmvsadcost_hp[2];
-
-  int mbmode_cost[2][MB_MODE_COUNT];
-  int intra_uv_mode_cost[2][MB_MODE_COUNT];
-  int bmode_costs[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES];
-  int i8x8_mode_costs[MB_MODE_COUNT];
-  int inter_bmode_costs[B_MODE_COUNT];
-  int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
-                             [VP9_SWITCHABLE_FILTERS];
-
-  // These define limits to motion vector components to prevent them
-  // from extending outside the UMV borders
-  int mv_col_min;
-  int mv_col_max;
-  int mv_row_min;
-  int mv_row_max;
-
-  int skip;
-
-  int encode_breakout;
-
-  // char * gf_active_ptr;
-  signed char *gf_active_ptr;
-
-  unsigned char *active_ptr;
-
-  unsigned int token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
-    [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-  unsigned int hybrid_token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
-    [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-
-  int optimize;
-
-  // Structure to hold context for each of the 4 MBs within a SB:
-  // when encoded as 4 independent MBs:
-  PICK_MODE_CONTEXT mb_context[4];
-#if CONFIG_SUPERBLOCKS
-  // when 4 MBs share coding parameters:
-  PICK_MODE_CONTEXT sb_context[4];
-#endif
-
-  void (*vp9_short_fdct4x4)(short *input, short *output, int pitch);
-  void (*vp9_short_fdct8x4)(short *input, short *output, int pitch);
-  void (*short_walsh4x4)(short *input, short *output, int pitch);
-  void (*quantize_b_4x4)(BLOCK *b, BLOCKD *d);
-  void (*quantize_b_4x4_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
-  void (*vp9_short_fdct8x8)(short *input, short *output, int pitch);
-  void (*vp9_short_fdct16x16)(short *input, short *output, int pitch);
-  void (*short_fhaar2x2)(short *input, short *output, int pitch);
-  void (*quantize_b_16x16)(BLOCK *b, BLOCKD *d);
-  void (*quantize_b_8x8)(BLOCK *b, BLOCKD *d);
-  void (*quantize_b_2x2)(BLOCK *b, BLOCKD *d);
-
-} MACROBLOCK;
-
-
-#endif
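The transform and quantize members of MACROBLOCK are function pointers so that platform-specific versions can be plugged in at initialization; a sketch of the dispatch (the assignment site is an assumption, the C fallbacks on the right are defined elsewhere in this patch):

    /* Sketch: wire up and call the C fallbacks. */
    x->vp9_short_fdct4x4 = vp9_short_fdct4x4_c;
    x->vp9_short_fdct8x8 = vp9_short_fdct8x8_c;
    x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);  /* pitch in bytes */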
--- a/vp8/encoder/boolhuff.c
+++ /dev/null
@@ -1,153 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "boolhuff.h"
-
-#if defined(SECTIONBITS_OUTPUT)
-unsigned __int64 Sectionbits[500];
-
-#endif
-
-#ifdef ENTROPY_STATS
-unsigned int active_section = 0;
-#endif
-
-const unsigned int vp9_prob_cost[256] = {
-  2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
-  1023, 1000,  979,  959,  940,  922,  905,  889,  873,  858,  843,  829,  816,  803,  790,  778,
-  767,  755,  744,  733,  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
-  617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,  534,  528,  522,  516,
-  511,  505,  499,  494,  488,  483,  477,  472,  467,  462,  457,  452,  447,  442,  437,  433,
-  428,  424,  419,  415,  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
-  361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,  317,  314,  311,  307,
-  304,  301,  297,  294,  291,  288,  285,  281,  278,  275,  272,  269,  266,  263,  260,  257,
-  255,  252,  249,  246,  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
-  211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,  181,  179,  177,  174,
-  172,  170,  168,  165,  163,  161,  159,  156,  154,  152,  150,  148,  145,  143,  141,  139,
-  137,  135,  133,  131,  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
-  105,  103,  101,   99,   97,   95,   93,   92,   90,   88,   86,   84,   82,   81,   79,   77,
-  75,   73,   72,   70,   68,   66,   65,   63,   61,   60,   58,   56,   55,   53,   51,   50,
-  48,   46,   45,   43,   41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
-  22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,   1
-};
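The table stores bit costs in 1/256-bit units: entry p is roughly -256*log2(p/256). A sketch of how such a table can be generated (the exact rounding of the table above is an assumption; the values below land within a unit of it):

    #include <math.h>
    /* Sketch: regenerate a vp9_prob_cost-style table.  Entry 0 mirrors
     * entry 1 since probability 0 never reaches the coder. */
    static void gen_prob_cost(unsigned int cost[256]) {
      int p;
      for (p = 1; p < 256; p++)
        cost[p] = (unsigned int)(-256.0 * log2(p / 256.0));
      cost[0] = cost[1];
    }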
-
-void vp9_start_encode(BOOL_CODER *br, unsigned char *source) {
-
-  br->lowvalue = 0;
-  br->range    = 255;
-  br->value    = 0;
-  br->count    = -24;
-  br->buffer   = source;
-  br->pos      = 0;
-}
-
-void vp9_stop_encode(BOOL_CODER *br) {
-  int i;
-
-  for (i = 0; i < 32; i++)
-    encode_bool(br, 0, 128);
-}
-
-
-void vp9_encode_value(BOOL_CODER *br, int data, int bits) {
-  int bit;
-
-  for (bit = bits - 1; bit >= 0; bit--)
-    encode_bool(br, (1 & (data >> bit)), 0x80);
-}
-
-int vp9_recenter_nonneg(int v, int m) {
-  if (v > (m << 1)) return v;
-  else if (v >= m) return ((v - m) << 1);
-  else return ((m - v) << 1) - 1;
-}
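Recentering interleaves values around the prediction m so that values close to m get small codes: v = m maps to 0, m-1 and m+1 map to 1 and 2, and so on, while v > 2m passes through unchanged. The inverse, shown as a sketch for reference (the decoder keeps its own copy):

    /* Sketch: inverse of vp9_recenter_nonneg(). */
    static int inv_recenter_nonneg(int v, int m) {
      if (v > (m << 1)) return v;                   /* pass-through  */
      else if (v & 1)   return m - ((v + 1) >> 1);  /* odd: below m  */
      else              return m + (v >> 1);        /* even: above m */
    }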
-
-static int get_unsigned_bits(unsigned num_values) {
-  int cat = 0;
-  if ((num_values--) <= 1) return 0;
-  while (num_values > 0) {
-    cat++;
-    num_values >>= 1;
-  }
-  return cat;
-}
-
-void vp9_encode_uniform(BOOL_CODER *br, int v, int n) {
-  int l = get_unsigned_bits(n);
-  int m;
-  if (l == 0) return;
-  m = (1 << l) - n;
-  if (v < m)
-    vp9_encode_value(br, v, l - 1);
-  else {
-    vp9_encode_value(br, m + ((v - m) >> 1), l - 1);
-    vp9_encode_value(br, (v - m) & 1, 1);
-  }
-}
-
-int vp9_count_uniform(int v, int n) {
-  int l = get_unsigned_bits(n);
-  int m;
-  if (l == 0) return 0;
-  m = (1 << l) - n;
-  if (v < m)
-    return l - 1;
-  else
-    return l;
-}
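For an alphabet of n symbols where n is not a power of two, this quasi-uniform code gives the first m = 2^l - n values l-1 bits and the rest l bits. A quick standalone check of the lengths (test code, an assumption, not part of the library):

    #include <assert.h>
    /* n = 192: l = 8, m = 64, so values below 64 take 7 bits, the rest 8. */
    static void check_uniform_costs(void) {
      assert(vp9_count_uniform(10, 192) == 7);
      assert(vp9_count_uniform(100, 192) == 8);
    }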
-
-void vp9_encode_term_subexp(BOOL_CODER *br, int word, int k, int num_syms) {
-  int i = 0;
-  int mk = 0;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (num_syms <= mk + 3 * a) {
-      vp9_encode_uniform(br, word - mk, num_syms - mk);
-      break;
-    } else {
-      int t = (word >= mk + a);
-      vp9_encode_value(br, t, 1);
-      if (t) {
-        i = i + 1;
-        mk += a;
-      } else {
-        vp9_encode_value(br, word - mk, b);
-        break;
-      }
-    }
-  }
-}
-
-int vp9_count_term_subexp(int word, int k, int num_syms) {
-  int count = 0;
-  int i = 0;
-  int mk = 0;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (num_syms <= mk + 3 * a) {
-      count += vp9_count_uniform(word - mk, num_syms - mk);
-      break;
-    } else {
-      int t = (word >= mk + a);
-      count++;
-      if (t) {
-        i = i + 1;
-        mk += a;
-      } else {
-        count += b;
-        break;
-      }
-    }
-  }
-  return count;
-}
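The terminated subexponential code spends one flag bit per bucket, with bucket sizes doubling after the first two, and falls back to the quasi-uniform code for the tail. A small standalone check of the resulting lengths (test code, an assumption):

    #include <assert.h>
    /* k = 4, 256 symbols: 0..15 cost 1+4 bits, 16..31 cost 2+4 bits,
     * 32..63 cost 3+5 bits. */
    static void check_subexp_costs(void) {
      assert(vp9_count_term_subexp(0,  4, 256) == 5);
      assert(vp9_count_term_subexp(17, 4, 256) == 6);
      assert(vp9_count_term_subexp(40, 4, 256) == 8);
    }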
--- a/vp8/encoder/boolhuff.h
+++ /dev/null
@@ -1,111 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     boolhuff.h
-*
-*   Description  :     Bool Coder header file.
-*
-****************************************************************************/
-#ifndef __INC_BOOLHUFF_H
-#define __INC_BOOLHUFF_H
-
-#include "vpx_ports/mem.h"
-
-typedef struct {
-  unsigned int lowvalue;
-  unsigned int range;
-  unsigned int value;
-  int count;
-  unsigned int pos;
-  unsigned char *buffer;
-
-  // Variables used to track bit costs without outputting to the bitstream
-  unsigned int  measure_cost;
-  unsigned long bit_counter;
-} BOOL_CODER;
-
-extern void vp9_start_encode(BOOL_CODER *bc, unsigned char *buffer);
-
-extern void vp9_encode_value(BOOL_CODER *br, int data, int bits);
-extern void vp9_stop_encode(BOOL_CODER *bc);
-extern const unsigned int vp9_prob_cost[256];
-
-extern void vp9_encode_uniform(BOOL_CODER *bc, int v, int n);
-extern void vp9_encode_term_subexp(BOOL_CODER *bc, int v, int k, int n);
-extern int vp9_count_uniform(int v, int n);
-extern int vp9_count_term_subexp(int v, int k, int n);
-extern int vp9_recenter_nonneg(int v, int m);
-
-DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
-
-
-static void encode_bool(BOOL_CODER *br, int bit, int probability) {
-  unsigned int split;
-  int count = br->count;
-  unsigned int range = br->range;
-  unsigned int lowvalue = br->lowvalue;
-  register unsigned int shift;
-
-#ifdef ENTROPY_STATS
-#if defined(SECTIONBITS_OUTPUT)
-
-  if (bit)
-    Sectionbits[active_section] += vp9_prob_cost[255 - probability];
-  else
-    Sectionbits[active_section] += vp9_prob_cost[probability];
-
-#endif
-#endif
-
-  split = 1 + (((range - 1) * probability) >> 8);
-
-  range = split;
-
-  if (bit) {
-    lowvalue += split;
-    range = br->range - split;
-  }
-
-  shift = vp9_norm[range];
-
-  range <<= shift;
-  count += shift;
-
-  if (count >= 0) {
-    int offset = shift - count;
-
-    if ((lowvalue << (offset - 1)) & 0x80000000) {
-      int x = br->pos - 1;
-
-      while (x >= 0 && br->buffer[x] == 0xff) {
-        br->buffer[x] = (unsigned char)0;
-        x--;
-      }
-
-      br->buffer[x] += 1;
-    }
-
-    br->buffer[br->pos++] = (lowvalue >> (24 - offset));
-    lowvalue <<= offset;
-    shift = count;
-    lowvalue &= 0xffffff;
-    count -= 8;
-  }
-
-  lowvalue <<= shift;
-  br->count = count;
-  br->lowvalue = lowvalue;
-  br->range = range;
-}
-
-#endif
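Typical use of this coder pairs vp9_start_encode() with a series of encode_bool()/vp9_encode_value() calls and a final flush; a minimal sketch (buffer size and values are illustrative):

    /* Minimal bool-coder usage sketch. */
    void bool_coder_demo(void) {
      unsigned char buf[64];
      BOOL_CODER bc;
      vp9_start_encode(&bc, buf);
      encode_bool(&bc, 1, 128);     /* one bit at even odds       */
      encode_bool(&bc, 0, 200);     /* a likely zero: under 1 bit */
      vp9_encode_value(&bc, 5, 3);  /* 3 raw bits, MSB first      */
      vp9_stop_encode(&bc);         /* flush; bc.pos = bytes used */
    }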
--- a/vp8/encoder/dct.c
+++ /dev/null
@@ -1,1109 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <assert.h>
-#include <math.h>
-#include "vpx_ports/config.h"
-#include "vp8/common/idct.h"
-#include "vp8/common/systemdependent.h"
-
-#include "vp8/common/blockd.h"
-
-// TODO: these transforms can be converted into integer forms to reduce
-//       the complexity
-static const float dct_4[16] = {
-  0.500000000000000,  0.500000000000000,  0.500000000000000,  0.500000000000000,
-  0.653281482438188,  0.270598050073099, -0.270598050073099, -0.653281482438188,
-  0.500000000000000, -0.500000000000000, -0.500000000000000,  0.500000000000000,
-  0.270598050073099, -0.653281482438188,  0.653281482438188, -0.270598050073099
-};
-
-static const float adst_4[16] = {
-  0.228013428883779,  0.428525073124360,  0.577350269189626,  0.656538502008139,
-  0.577350269189626,  0.577350269189626,  0.000000000000000, -0.577350269189626,
-  0.656538502008139, -0.228013428883779, -0.577350269189626,  0.428525073124359,
-  0.428525073124360, -0.656538502008139,  0.577350269189626, -0.228013428883779
-};
-
-static const float dct_8[64] = {
-  0.353553390593274,   0.353553390593274,   0.353553390593274,   0.353553390593274,
-  0.353553390593274,   0.353553390593274,   0.353553390593274,   0.353553390593274,
-  0.490392640201615,   0.415734806151273,   0.277785116509801,   0.097545161008064,
- -0.097545161008064,  -0.277785116509801,  -0.415734806151273,  -0.490392640201615,
-  0.461939766255643,   0.191341716182545,  -0.191341716182545,  -0.461939766255643,
- -0.461939766255643,  -0.191341716182545,   0.191341716182545,   0.461939766255643,
-  0.415734806151273,  -0.097545161008064,  -0.490392640201615,  -0.277785116509801,
-  0.277785116509801,   0.490392640201615,   0.097545161008064,  -0.415734806151273,
-  0.353553390593274,  -0.353553390593274,  -0.353553390593274,   0.353553390593274,
-  0.353553390593274,  -0.353553390593274,  -0.353553390593274,   0.353553390593274,
-  0.277785116509801,  -0.490392640201615,   0.097545161008064,   0.415734806151273,
- -0.415734806151273,  -0.097545161008064,   0.490392640201615,  -0.277785116509801,
-  0.191341716182545,  -0.461939766255643,   0.461939766255643,  -0.191341716182545,
- -0.191341716182545,   0.461939766255643,  -0.461939766255643,   0.191341716182545,
-  0.097545161008064,  -0.277785116509801,   0.415734806151273,  -0.490392640201615,
-  0.490392640201615,  -0.415734806151273,   0.277785116509801,  -0.097545161008064
-};
-
-static const float adst_8[64] = {
-  0.089131608307533,   0.175227946595735,   0.255357107325376,   0.326790388032145,
-  0.387095214016349,   0.434217976756762,   0.466553967085785,   0.483002021635509,
-  0.255357107325376,   0.434217976756762,   0.483002021635509,   0.387095214016349,
-  0.175227946595735,  -0.089131608307533,  -0.326790388032145,  -0.466553967085785,
-  0.387095214016349,   0.466553967085785,   0.175227946595735,  -0.255357107325376,
- -0.483002021635509,  -0.326790388032145,   0.089131608307533,   0.434217976756762,
-  0.466553967085785,   0.255357107325376,  -0.326790388032145,  -0.434217976756762,
-  0.089131608307533,   0.483002021635509,   0.175227946595735,  -0.387095214016348,
-  0.483002021635509,  -0.089131608307533,  -0.466553967085785,   0.175227946595735,
-  0.434217976756762,  -0.255357107325376,  -0.387095214016348,   0.326790388032145,
-  0.434217976756762,  -0.387095214016348,  -0.089131608307533,   0.466553967085786,
- -0.326790388032145,  -0.175227946595735,   0.483002021635509,  -0.255357107325375,
-  0.326790388032145,  -0.483002021635509,   0.387095214016349,  -0.089131608307534,
- -0.255357107325377,   0.466553967085785,  -0.434217976756762,   0.175227946595736,
-  0.175227946595735,  -0.326790388032145,   0.434217976756762,  -0.483002021635509,
-  0.466553967085785,  -0.387095214016348,   0.255357107325376,  -0.089131608307532
-};
-
-/* Converted the transforms to integers. */
-static const int16_t dct_i4[16] = {
-  16384,  16384,  16384,  16384,
-  21407,   8867,  -8867, -21407,
-  16384, -16384, -16384,  16384,
-   8867, -21407,  21407,  -8867
-};
-
-static const int16_t adst_i4[16] = {
-   7472,  14042,  18919,  21513,
-  18919,  18919,      0, -18919,
-  21513,  -7472, -18919,  14042,
-  14042, -21513,  18919,  -7472
-};
-
-static const int16_t dct_i8[64] = {
-   11585,  11585,  11585,  11585,
-   11585,  11585,  11585,  11585,
-   16069,  13623,   9102,   3196,
-   -3196,  -9102, -13623, -16069,
-   15137,   6270,  -6270, -15137,
-  -15137,  -6270,   6270,  15137,
-   13623,  -3196, -16069,  -9102,
-    9102,  16069,   3196, -13623,
-   11585, -11585, -11585,  11585,
-   11585, -11585, -11585,  11585,
-    9102, -16069,   3196,  13623,
-  -13623,  -3196,  16069,  -9102,
-    6270, -15137,  15137,  -6270,
-   -6270,  15137, -15137,   6270,
-    3196,  -9102,  13623, -16069,
-   16069, -13623,   9102,  -3196
-};
-
-static const int16_t adst_i8[64] = {
-    2921,   5742,   8368,  10708,
-   12684,  14228,  15288,  15827,
-    8368,  14228,  15827,  12684,
-    5742,  -2921, -10708, -15288,
-   12684,  15288,   5742,  -8368,
-  -15827, -10708,   2921,  14228,
-   15288,   8368, -10708, -14228,
-    2921,  15827,   5742, -12684,
-   15827,  -2921, -15288,   5742,
-   14228,  -8368, -12684,  10708,
-   14228, -12684,  -2921,  15288,
-  -10708,  -5742,  15827,  -8368,
-   10708, -15827,  12684,  -2921,
-   -8368,  15288, -14228,   5742,
-    5742, -10708,  14228, -15827,
-   15288, -12684,   8368,  -2921
-};
-
-static const float dct_16[256] = {
-  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,
-  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,
-  0.351851,  0.338330,  0.311806,  0.273300,  0.224292,  0.166664,  0.102631,  0.034654,
- -0.034654, -0.102631, -0.166664, -0.224292, -0.273300, -0.311806, -0.338330, -0.351851,
-  0.346760,  0.293969,  0.196424,  0.068975, -0.068975, -0.196424, -0.293969, -0.346760,
- -0.346760, -0.293969, -0.196424, -0.068975,  0.068975,  0.196424,  0.293969,  0.346760,
-  0.338330,  0.224292,  0.034654, -0.166664, -0.311806, -0.351851, -0.273300, -0.102631,
-  0.102631,  0.273300,  0.351851,  0.311806,  0.166664, -0.034654, -0.224292, -0.338330,
-  0.326641,  0.135299, -0.135299, -0.326641, -0.326641, -0.135299,  0.135299,  0.326641,
-  0.326641,  0.135299, -0.135299, -0.326641, -0.326641, -0.135299,  0.135299,  0.326641,
-  0.311806,  0.034654, -0.273300, -0.338330, -0.102631,  0.224292,  0.351851,  0.166664,
- -0.166664, -0.351851, -0.224292,  0.102631,  0.338330,  0.273300, -0.034654, -0.311806,
-  0.293969, -0.068975, -0.346760, -0.196424,  0.196424,  0.346760,  0.068975, -0.293969,
- -0.293969,  0.068975,  0.346760,  0.196424, -0.196424, -0.346760, -0.068975,  0.293969,
-  0.273300, -0.166664, -0.338330,  0.034654,  0.351851,  0.102631, -0.311806, -0.224292,
-  0.224292,  0.311806, -0.102631, -0.351851, -0.034654,  0.338330,  0.166664, -0.273300,
-  0.250000, -0.250000, -0.250000,  0.250000,  0.250000, -0.250000, -0.250000,  0.250000,
-  0.250000, -0.250000, -0.250000,  0.250000,  0.250000, -0.250000, -0.250000,  0.250000,
-  0.224292, -0.311806, -0.102631,  0.351851, -0.034654, -0.338330,  0.166664,  0.273300,
- -0.273300, -0.166664,  0.338330,  0.034654, -0.351851,  0.102631,  0.311806, -0.224292,
-  0.196424, -0.346760,  0.068975,  0.293969, -0.293969, -0.068975,  0.346760, -0.196424,
- -0.196424,  0.346760, -0.068975, -0.293969,  0.293969,  0.068975, -0.346760,  0.196424,
-  0.166664, -0.351851,  0.224292,  0.102631, -0.338330,  0.273300,  0.034654, -0.311806,
-  0.311806, -0.034654, -0.273300,  0.338330, -0.102631, -0.224292,  0.351851, -0.166664,
-  0.135299, -0.326641,  0.326641, -0.135299, -0.135299,  0.326641, -0.326641,  0.135299,
-  0.135299, -0.326641,  0.326641, -0.135299, -0.135299,  0.326641, -0.326641,  0.135299,
-  0.102631, -0.273300,  0.351851, -0.311806,  0.166664,  0.034654, -0.224292,  0.338330,
- -0.338330,  0.224292, -0.034654, -0.166664,  0.311806, -0.351851,  0.273300, -0.102631,
-  0.068975, -0.196424,  0.293969, -0.346760,  0.346760, -0.293969,  0.196424, -0.068975,
- -0.068975,  0.196424, -0.293969,  0.346760, -0.346760,  0.293969, -0.196424,  0.068975,
-  0.034654, -0.102631,  0.166664, -0.224292,  0.273300, -0.311806,  0.338330, -0.351851,
-  0.351851, -0.338330,  0.311806, -0.273300,  0.224292, -0.166664,  0.102631, -0.034654
-};
-
-static const float adst_16[256] = {
-  0.033094,  0.065889,  0.098087,  0.129396,  0.159534,  0.188227,  0.215215,  0.240255,
-  0.263118,  0.283599,  0.301511,  0.316693,  0.329007,  0.338341,  0.344612,  0.347761,
-  0.098087,  0.188227,  0.263118,  0.316693,  0.344612,  0.344612,  0.316693,  0.263118,
-  0.188227,  0.098087,  0.000000, -0.098087, -0.188227, -0.263118, -0.316693, -0.344612,
-  0.159534,  0.283599,  0.344612,  0.329007,  0.240255,  0.098087, -0.065889, -0.215215,
- -0.316693, -0.347761, -0.301511, -0.188227, -0.033094,  0.129396,  0.263118,  0.338341,
-  0.215215,  0.338341,  0.316693,  0.159534, -0.065889, -0.263118, -0.347761, -0.283599,
- -0.098087,  0.129396,  0.301511,  0.344612,  0.240255,  0.033094, -0.188227, -0.329007,
-  0.263118,  0.344612,  0.188227, -0.098087, -0.316693, -0.316693, -0.098087,  0.188227,
-  0.344612,  0.263118,  0.000000, -0.263118, -0.344612, -0.188227,  0.098087,  0.316693,
-  0.301511,  0.301511,  0.000000, -0.301511, -0.301511, -0.000000,  0.301511,  0.301511,
-  0.000000, -0.301511, -0.301511, -0.000000,  0.301511,  0.301511,  0.000000, -0.301511,
-  0.329007,  0.215215, -0.188227, -0.338341, -0.033094,  0.316693,  0.240255, -0.159534,
- -0.344612, -0.065889,  0.301511,  0.263118, -0.129396, -0.347761, -0.098087,  0.283599,
-  0.344612,  0.098087, -0.316693, -0.188227,  0.263118,  0.263118, -0.188227, -0.316693,
-  0.098087,  0.344612,  0.000000, -0.344612, -0.098087,  0.316693,  0.188227, -0.263118,
-  0.347761, -0.033094, -0.344612,  0.065889,  0.338341, -0.098087, -0.329007,  0.129396,
-  0.316693, -0.159534, -0.301511,  0.188227,  0.283599, -0.215215, -0.263118,  0.240255,
-  0.338341, -0.159534, -0.263118,  0.283599,  0.129396, -0.344612,  0.033094,  0.329007,
- -0.188227, -0.240255,  0.301511,  0.098087, -0.347761,  0.065889,  0.316693, -0.215215,
-  0.316693, -0.263118, -0.098087,  0.344612, -0.188227, -0.188227,  0.344612, -0.098087,
- -0.263118,  0.316693,  0.000000, -0.316693,  0.263118,  0.098087, -0.344612,  0.188227,
-  0.283599, -0.329007,  0.098087,  0.215215, -0.347761,  0.188227,  0.129396, -0.338341,
-  0.263118,  0.033094, -0.301511,  0.316693, -0.065889, -0.240255,  0.344612, -0.159534,
-  0.240255, -0.347761,  0.263118, -0.033094, -0.215215,  0.344612, -0.283599,  0.065889,
-  0.188227, -0.338341,  0.301511, -0.098087, -0.159534,  0.329007, -0.316693,  0.129396,
-  0.188227, -0.316693,  0.344612, -0.263118,  0.098087,  0.098087, -0.263118,  0.344612,
- -0.316693,  0.188227,  0.000000, -0.188227,  0.316693, -0.344612,  0.263118, -0.098087,
-  0.129396, -0.240255,  0.316693, -0.347761,  0.329007, -0.263118,  0.159534, -0.033094,
- -0.098087,  0.215215, -0.301511,  0.344612, -0.338341,  0.283599, -0.188227,  0.065889,
-  0.065889, -0.129396,  0.188227, -0.240255,  0.283599, -0.316693,  0.338341, -0.347761,
-  0.344612, -0.329007,  0.301511, -0.263118,  0.215215, -0.159534,  0.098087, -0.033094
-};
-
-/* Converted the transforms to integers. */
-static const int16_t dct_i16[256] = {
-    8192,   8192,   8192,   8192,   8192,   8192,   8192,   8192,
-    8192,   8192,   8192,   8192,   8192,   8192,   8192,   8192,
-   11529,  11086,  10217,   8955,   7350,   5461,   3363,   1136,
-   -1136,  -3363,  -5461,  -7350,  -8955, -10217, -11086, -11529,
-   11363,   9633,   6436,   2260,  -2260,  -6436,  -9633, -11363,
-  -11363,  -9633,  -6436,  -2260,   2260,   6436,   9633,  11363,
-   11086,   7350,   1136,  -5461, -10217, -11529,  -8955,  -3363,
-    3363,   8955,  11529,  10217,   5461,  -1136,  -7350, -11086,
-   10703,   4433,  -4433, -10703, -10703,  -4433,   4433,  10703,
-   10703,   4433,  -4433, -10703, -10703,  -4433,   4433,  10703,
-   10217,   1136,  -8955, -11086,  -3363,   7350,  11529,   5461,
-   -5461, -11529,  -7350,   3363,  11086,   8955,  -1136, -10217,
-    9633,  -2260, -11363,  -6436,   6436,  11363,   2260,  -9633,
-   -9633,   2260,  11363,   6436,  -6436, -11363,  -2260,   9633,
-    8955,  -5461, -11086,   1136,  11529,   3363, -10217,  -7350,
-    7350,  10217,  -3363, -11529,  -1136,  11086,   5461,  -8955,
-    8192,  -8192,  -8192,   8192,   8192,  -8192,  -8192,   8192,
-    8192,  -8192,  -8192,   8192,   8192,  -8192,  -8192,   8192,
-    7350, -10217,  -3363,  11529,  -1136, -11086,   5461,   8955,
-   -8955,  -5461,  11086,   1136, -11529,   3363,  10217,  -7350,
-    6436, -11363,   2260,   9633,  -9633,  -2260,  11363,  -6436,
-   -6436,  11363,  -2260,  -9633,   9633,   2260, -11363,   6436,
-    5461, -11529,   7350,   3363, -11086,   8955,   1136, -10217,
-   10217,  -1136,  -8955,  11086,  -3363,  -7350,  11529,  -5461,
-    4433, -10703,  10703,  -4433,  -4433,  10703, -10703,   4433,
-    4433, -10703,  10703,  -4433,  -4433,  10703, -10703,   4433,
-    3363,  -8955,  11529, -10217,   5461,   1136,  -7350,  11086,
-  -11086,   7350,  -1136,  -5461,  10217, -11529,   8955,  -3363,
-    2260,  -6436,   9633, -11363,  11363,  -9633,   6436,  -2260,
-   -2260,   6436,  -9633,  11363, -11363,   9633,  -6436,   2260,
-    1136,  -3363,   5461,  -7350,   8955, -10217,  11086, -11529,
-   11529, -11086,  10217,  -8955,   7350,  -5461,   3363,  -1136
-};
-
-static const int16_t adst_i16[256] = {
-    1084,   2159,   3214,   4240,   5228,   6168,   7052,   7873,
-    8622,   9293,   9880,  10377,  10781,  11087,  11292,  11395,
-    3214,   6168,   8622,  10377,  11292,  11292,  10377,   8622,
-    6168,   3214,      0,  -3214,  -6168,  -8622, -10377, -11292,
-    5228,   9293,  11292,  10781,   7873,   3214,  -2159,  -7052,
-  -10377, -11395,  -9880,  -6168,  -1084,   4240,   8622,  11087,
-    7052,  11087,  10377,   5228,  -2159,  -8622, -11395,  -9293,
-   -3214,   4240,   9880,  11292,   7873,   1084,  -6168, -10781,
-    8622,  11292,   6168,  -3214, -10377, -10377,  -3214,   6168,
-   11292,   8622,      0,  -8622, -11292,  -6168,   3214,  10377,
-    9880,   9880,      0,  -9880,  -9880,      0,   9880,   9880,
-       0,  -9880,  -9880,      0,   9880,   9880,      0,  -9880,
-   10781,   7052,  -6168, -11087,  -1084,  10377,   7873,  -5228,
-  -11292,  -2159,   9880,   8622,  -4240, -11395,  -3214,   9293,
-   11292,   3214, -10377,  -6168,   8622,   8622,  -6168, -10377,
-    3214,  11292,      0, -11292,  -3214,  10377,   6168,  -8622,
-   11395,  -1084, -11292,   2159,  11087,  -3214, -10781,   4240,
-   10377,  -5228,  -9880,   6168,   9293,  -7052,  -8622,   7873,
-   11087,  -5228,  -8622,   9293,   4240, -11292,   1084,  10781,
-   -6168,  -7873,   9880,   3214, -11395,   2159,  10377,  -7052,
-   10377,  -8622,  -3214,  11292,  -6168,  -6168,  11292,  -3214,
-   -8622,  10377,      0, -10377,   8622,   3214, -11292,   6168,
-    9293, -10781,   3214,   7052, -11395,   6168,   4240, -11087,
-    8622,   1084,  -9880,  10377,  -2159,  -7873,  11292,  -5228,
-    7873, -11395,   8622,  -1084,  -7052,  11292,  -9293,   2159,
-    6168, -11087,   9880,  -3214,  -5228,  10781, -10377,   4240,
-    6168, -10377,  11292,  -8622,   3214,   3214,  -8622,  11292,
-  -10377,   6168,      0,  -6168,  10377, -11292,   8622,  -3214,
-    4240,  -7873,  10377, -11395,  10781,  -8622,   5228,  -1084,
-   -3214,   7052,  -9880,  11292, -11087,   9293,  -6168,   2159,
-    2159,  -4240,   6168,  -7873,   9293, -10377,  11087, -11395,
-   11292, -10781,   9880,  -8622,   7052,  -5228,   3214,  -1084
-};
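Comparing the tables, each integer basis is the float basis scaled by 2^15 and rounded (my reading of the values; the conversion code itself is not part of this file). A sketch:

    #include <math.h>
    #include <stdint.h>
    /* Sketch: quantize an n x n float basis to Q15 fixed point. */
    static void quantize_basis(const float *in, int16_t *out, int n) {
      int i;
      for (i = 0; i < n * n; i++)
        out[i] = (int16_t)floor(in[i] * 32768.0 + 0.5);
    }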
-
-static const int xC1S7 = 16069;
-static const int xC2S6 = 15137;
-static const int xC3S5 = 13623;
-static const int xC4S4 = 11585;
-static const int xC5S3 =  9102;
-static const int xC6S2 =  6270;
-static const int xC7S1 =  3196;
-
-#define SHIFT_BITS 14
-#define DOROUND(X) X += (1<<(SHIFT_BITS-1));
-
-#define FINAL_SHIFT 3
-#define FINAL_ROUNDING (1<<(FINAL_SHIFT -1))
-#define IN_SHIFT (FINAL_SHIFT+1)
-
-
-void vp9_short_fdct8x8_c(short *InputData, short *OutputData, int pitch) {
-  int loop;
-  int short_pitch = pitch >> 1;
-  int is07, is12, is34, is56;
-  int is0734, is1256;
-  int id07, id12, id34, id56;
-  int irot_input_x, irot_input_y;
-  int icommon_product1;      // Re-used product  (c4s4 * (s12 - s56))
-  int icommon_product2;      // Re-used product  (c4s4 * (d12 + d56))
-  int temp1, temp2;          // intermediate variable for computation
-
-  int  InterData[64];
-  int  *ip = InterData;
-  short *op = OutputData;
-
-  for (loop = 0; loop < 8; loop++) {
-    // Pre calculate some common sums and differences.
-    is07 = (InputData[0] + InputData[7]) << IN_SHIFT;
-    is12 = (InputData[1] + InputData[2]) << IN_SHIFT;
-    is34 = (InputData[3] + InputData[4]) << IN_SHIFT;
-    is56 = (InputData[5] + InputData[6]) << IN_SHIFT;
-    id07 = (InputData[0] - InputData[7]) << IN_SHIFT;
-    id12 = (InputData[1] - InputData[2]) << IN_SHIFT;
-    id34 = (InputData[3] - InputData[4]) << IN_SHIFT;
-    id56 = (InputData[5] - InputData[6]) << IN_SHIFT;
-
-    is0734 = is07 + is34;
-    is1256 = is12 + is56;
-
-    // Pre-Calculate some common product terms.
-    icommon_product1 = xC4S4 * (is12 - is56);
-    DOROUND(icommon_product1)
-    icommon_product1 >>= SHIFT_BITS;
-
-    icommon_product2 = xC4S4 * (id12 + id56);
-    DOROUND(icommon_product2)
-    icommon_product2 >>= SHIFT_BITS;
-
-
-    ip[0] = (xC4S4 * (is0734 + is1256));
-    DOROUND(ip[0]);
-    ip[0] >>= SHIFT_BITS;
-
-    ip[4] = (xC4S4 * (is0734 - is1256));
-    DOROUND(ip[4]);
-    ip[4] >>= SHIFT_BITS;
-
-    // Define inputs to rotation for outputs 2 and 6
-    irot_input_x = id12 - id56;
-    irot_input_y = is07 - is34;
-
-    // Apply rotation for outputs 2 and 6.
-    temp1 = xC6S2 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC2S6 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[2] = temp1 + temp2;
-
-    temp1 = xC6S2 * irot_input_y;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC2S6 * irot_input_x;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[6] = temp1 - temp2;
-
-    // Define inputs to rotation for outputs 1 and 7
-    irot_input_x = icommon_product1 + id07;
-    irot_input_y = -(id34 + icommon_product2);
-
-    // Apply rotation for outputs 1 and 7.
-    temp1 = xC1S7 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC7S1 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[1] = temp1 - temp2;
-
-    temp1 = xC7S1 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC1S7 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[7] = temp1 + temp2;
-
-    // Define inputs to rotation for outputs 3 and 5
-    irot_input_x = id07 - icommon_product1;
-    irot_input_y = id34 - icommon_product2;
-
-    // Apply rotation for outputs 3 and 5.
-    temp1 = xC3S5 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC5S3 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[3] = temp1 - temp2;
-
-
-    temp1 = xC5S3 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC3S5 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    ip[5] = temp1 + temp2;
-
-    // Increment data pointer for next row
-    InputData += short_pitch;
-    ip += 8;
-  }
-
-  // Performed DCT on rows, now transform the columns
-  ip = InterData;
-  for (loop = 0; loop < 8; loop++) {
-    // Pre calculate some common sums and differences.
-    is07 = ip[0 * 8] + ip[7 * 8];
-    is12 = ip[1 * 8] + ip[2 * 8];
-    is34 = ip[3 * 8] + ip[4 * 8];
-    is56 = ip[5 * 8] + ip[6 * 8];
-
-    id07 = ip[0 * 8] - ip[7 * 8];
-    id12 = ip[1 * 8] - ip[2 * 8];
-    id34 = ip[3 * 8] - ip[4 * 8];
-    id56 = ip[5 * 8] - ip[6 * 8];
-
-    is0734 = is07 + is34;
-    is1256 = is12 + is56;
-
-    // Pre-Calculate some common product terms
-    icommon_product1 = xC4S4 * (is12 - is56);
-    icommon_product2 = xC4S4 * (id12 + id56);
-    DOROUND(icommon_product1)
-    DOROUND(icommon_product2)
-    icommon_product1 >>= SHIFT_BITS;
-    icommon_product2 >>= SHIFT_BITS;
-
-
-    temp1 = xC4S4 * (is0734 + is1256);
-    temp2 = xC4S4 * (is0734 - is1256);
-    DOROUND(temp1);
-    DOROUND(temp2);
-    temp1 >>= SHIFT_BITS;
-
-    temp2 >>= SHIFT_BITS;
-    op[0 * 8] = (temp1 + FINAL_ROUNDING) >> FINAL_SHIFT;
-    op[4 * 8] = (temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    // Define inputs to rotation for outputs 2 and 6
-    irot_input_x = id12 - id56;
-    irot_input_y = is07 - is34;
-
-    // Apply rotation for outputs 2 and 6.
-    temp1 = xC6S2 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC2S6 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[2 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    temp1 = xC6S2 * irot_input_y;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC2S6 * irot_input_x;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[6 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    // Define inputs to rotation for outputs 1 and 7
-    irot_input_x = icommon_product1 + id07;
-    irot_input_y = -(id34 + icommon_product2);
-
-    // Apply rotation for outputs 1 and 7.
-    temp1 = xC1S7 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC7S1 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[1 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    temp1 = xC7S1 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC1S7 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[7 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    // Define inputs to rotation for outputs 3 and 5
-    irot_input_x = id07 - icommon_product1;
-    irot_input_y = id34 - icommon_product2;
-
-    // Apply rotation for outputs 3 and 5.
-    temp1 = xC3S5 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC5S3 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[3 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-
-    temp1 = xC5S3 * irot_input_x;
-    DOROUND(temp1);
-    temp1 >>= SHIFT_BITS;
-    temp2 = xC3S5 * irot_input_y;
-    DOROUND(temp2);
-    temp2 >>= SHIFT_BITS;
-    op[5 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
-
-    // Increment data pointer for next column.
-    ip++;
-    op++;
-  }
-}
-
-void vp9_short_fhaar2x2_c(short *input, short *output, int pitch) {
-  /* [1 1; 1 -1] orthogonal transform */
-  /* uses positions 0, 1, 4 and 8 of the 4x4 output block */
-  int i;
-  short *ip1 = input;
-  short *op1 = output;
-  for (i = 0; i < 16; i++) {
-    op1[i] = 0;
-  }
-
-  op1[0] = (ip1[0] + ip1[1] + ip1[4] + ip1[8] + 1) >> 1;
-  op1[1] = (ip1[0] - ip1[1] + ip1[4] - ip1[8]) >> 1;
-  op1[4] = (ip1[0] + ip1[1] - ip1[4] - ip1[8]) >> 1;
-  op1[8] = (ip1[0] - ip1[1] - ip1[4] + ip1[8]) >> 1;
-}
-
-/* For test */
-#define TEST_INT 1
-#if TEST_INT
-#define vp9_fht_int_c vp9_fht_c
-#else
-#define vp9_fht_float_c vp9_fht_c
-#endif
-
-void vp9_fht_float_c(const int16_t *input, int pitch, int16_t *output,
-               TX_TYPE tx_type, int tx_dim) {
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    int i, j, k;
-    float bufa[256], bufb[256];  // buffers for the floating-point test
-                                 // implementation; it could be simplified
-                                 // in conjunction with the integer version
-    const int16_t *ip = input;
-    int16_t *op = output;
-
-    float *pfa = &bufa[0];
-    float *pfb = &bufb[0];
-
-    // pointers to vertical and horizontal transforms
-    const float *ptv, *pth;
-
-    assert(tx_type != DCT_DCT);
-    // load and convert residual array into floating-point
-    for (j = 0; j < tx_dim; j++) {
-      for (i = 0; i < tx_dim; i++) {
-        pfa[i] = (float)ip[i];
-      }
-      pfa += tx_dim;
-      ip  += pitch / 2;
-    }
-
-    // vertical transformation
-    pfa = &bufa[0];
-    pfb = &bufb[0];
-
-    switch (tx_type) {
-      case ADST_ADST :
-      case ADST_DCT  :
-        ptv = (tx_dim == 4) ? &adst_4[0] :
-                              ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
-        break;
-
-      default :
-        ptv = (tx_dim == 4) ? &dct_4[0] :
-                              ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
-        break;
-    }
-
-    for (j = 0; j < tx_dim; j++) {
-      for (i = 0; i < tx_dim; i++) {
-        pfb[i] = 0;
-        for (k = 0; k < tx_dim; k++) {
-          pfb[i] += ptv[k] * pfa[(k * tx_dim)];
-        }
-        pfa += 1;
-      }
-      pfb += tx_dim;
-      ptv += tx_dim;
-      pfa = &bufa[0];
-    }
-
-    // horizontal transformation
-    pfa = &bufa[0];
-    pfb = &bufb[0];
-
-    switch (tx_type) {
-      case ADST_ADST :
-      case  DCT_ADST :
-        pth = (tx_dim == 4) ? &adst_4[0] :
-                              ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
-        break;
-
-      default :
-        pth = (tx_dim == 4) ? &dct_4[0] :
-                              ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
-        break;
-    }
-
-    for (j = 0; j < tx_dim; j++) {
-      for (i = 0; i < tx_dim; i++) {
-        pfa[i] = 0;
-        for (k = 0; k < tx_dim; k++) {
-          pfa[i] += pfb[k] * pth[k];
-        }
-        pth += tx_dim;
-      }
-
-      pfa += tx_dim;
-      pfb += tx_dim;
-      // pth -= tx_dim * tx_dim;
-
-      switch (tx_type) {
-        case ADST_ADST :
-        case  DCT_ADST :
-          pth = (tx_dim == 4) ? &adst_4[0] :
-                                ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
-          break;
-
-        default :
-          pth = (tx_dim == 4) ? &dct_4[0] :
-                                ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
-          break;
-      }
-    }
-
-    // convert to short integer format and load BLOCKD buffer
-    op = output;
-    pfa = &bufa[0];
-
-    for (j = 0; j < tx_dim; j++) {
-      for (i = 0; i < tx_dim; i++) {
-        op[i] = (pfa[i] > 0 ) ? (int16_t)( 8 * pfa[i] + 0.49) :
-                                     -(int16_t)(- 8 * pfa[i] + 0.49);
-      }
-      op  += tx_dim;
-      pfa += tx_dim;
-    }
-  }
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-}
-
-/* Converted the transforms to integer form. */
-#define VERTICAL_SHIFT 11
-#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
-#define HORIZONTAL_SHIFT 16
-#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
-void vp9_fht_int_c(const int16_t *input, int pitch, int16_t *output,
-                   TX_TYPE tx_type, int tx_dim) {
-  int i, j, k;
-  int16_t imbuf[256];
-
-  const int16_t *ip = input;
-  int16_t *op = output;
-  int16_t *im = &imbuf[0];
-
-  /* pointers to vertical and horizontal transforms. */
-  const int16_t *ptv = NULL, *pth = NULL;
-
-  switch (tx_type) {
-    case ADST_ADST :
-      ptv = pth = (tx_dim == 4) ? &adst_i4[0]
-                                  : ((tx_dim == 8) ? &adst_i8[0]
-                                                     : &adst_i16[0]);
-      break;
-    case ADST_DCT  :
-      ptv = (tx_dim == 4) ? &adst_i4[0]
-                            : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
-      pth = (tx_dim == 4) ? &dct_i4[0]
-                            : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
-      break;
-    case  DCT_ADST :
-      ptv = (tx_dim == 4) ? &dct_i4[0]
-                            : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
-      pth = (tx_dim == 4) ? &adst_i4[0]
-                            : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
-      break;
-    case  DCT_DCT :
-      ptv = pth = (tx_dim == 4) ? &dct_i4[0]
-                                  : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-
-  /* vertical transformation */
-  for (j = 0; j < tx_dim; j++) {
-    for (i = 0; i < tx_dim; i++) {
-      int temp = 0;
-
-      for (k = 0; k < tx_dim; k++) {
-        temp += ptv[k] * ip[(k * (pitch >> 1))];
-      }
-
-      im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
-      ip++;
-    }
-    im += tx_dim;  // advance one row in the intermediate buffer
-    ptv += tx_dim;
-    ip = input;
-  }
-
-  /* horizontal transformation */
-  im = &imbuf[0];
-
-  for (j = 0; j < tx_dim; j++) {
-    const int16_t *pthc = pth;
-
-    for (i = 0; i < tx_dim; i++) {
-      int temp = 0;
-
-      for (k = 0; k < tx_dim; k++) {
-        temp += im[k] * pthc[k];
-      }
-
-      op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
-      pthc += tx_dim;
-    }
-
-    im += tx_dim;  // advance one row in the intermediate buffer
-    op += tx_dim;
-  }
-}
-
-void vp9_short_fdct4x4_c(short *input, short *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ((ip[0] + ip[3]) << 5);
-    b1 = ((ip[1] + ip[2]) << 5);
-    c1 = ((ip[1] - ip[2]) << 5);
-    d1 = ((ip[0] - ip[3]) << 5);
-
-    op[0] = a1 + b1;
-    op[2] = a1 - b1;
-
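-    // 2217/4096 and 5352/4096 approximate sqrt(2) * sin(PI / 8) and
-    // sqrt(2) * cos(PI / 8) in Q12; the added constants fold in rounding bias.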
-    op[1] = (c1 * 2217 + d1 * 5352 +  14500) >> 12;
-    op[3] = (d1 * 2217 - c1 * 5352 +   7500) >> 12;
-
-    ip += pitch / 2;
-    op += 4;
-  }
-  ip = output;
-  op = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[12];
-    b1 = ip[4] + ip[8];
-    c1 = ip[4] - ip[8];
-    d1 = ip[0] - ip[12];
-
-    op[0]  = (a1 + b1 + 7) >> 4;
-    op[8]  = (a1 - b1 + 7) >> 4;
-
-    op[4]  = ((c1 * 2217 + d1 * 5352 +  12000) >> 16) + (d1 != 0);
-    op[12] = (d1 * 2217 - c1 * 5352 +  51000) >> 16;
-
-    ip++;
-    op++;
-  }
-}
-
-void vp9_short_fdct8x4_c(short *input, short *output, int pitch) {
-  vp9_short_fdct4x4_c(input, output, pitch);
-  vp9_short_fdct4x4_c(input + 4, output + 16, pitch);
-}
-
-void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-  int pitch_short = pitch >> 1;
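-  // pitch is given in bytes; pitch_short is the same stride in int16_t units.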
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
-    b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
-    c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
-    d1 = ip[0 * pitch_short] - ip[3 * pitch_short];
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[4] = (c1 + d1) >> 1;
-    op[8] = (a1 - b1) >> 1;
-    op[12] = (d1 - c1) >> 1;
-
-    ip++;
-    op++;
-  }
-  ip = output;
-  op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[3];
-    b1 = ip[1] + ip[2];
-    c1 = ip[1] - ip[2];
-    d1 = ip[0] - ip[3];
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[1] = (c1 + d1) >> 1;
-    op[2] = (a1 - b1) >> 1;
-    op[3] = (d1 - c1) >> 1;
-
-    ip += 4;
-    op += 4;
-  }
-}
-
-#if CONFIG_LOSSLESS
-void vp9_short_walsh4x4_lossless_c(short *input, short *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-  int pitch_short = pitch >> 1;
-
-  for (i = 0; i < 4; i++) {
-    a1 = (ip[0 * pitch_short] + ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
-    b1 = (ip[1 * pitch_short] + ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
-    c1 = (ip[1 * pitch_short] - ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
-    d1 = (ip[0 * pitch_short] - ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[4] = (c1 + d1) >> 1;
-    op[8] = (a1 - b1) >> 1;
-    op[12] = (d1 - c1) >> 1;
-
-    ip++;
-    op++;
-  }
-  ip = output;
-  op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[3];
-    b1 = ip[1] + ip[2];
-    c1 = ip[1] - ip[2];
-    d1 = ip[0] - ip[3];
-
-    op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[1] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[2] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-    op[3] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
-
-    ip += 4;
-    op += 4;
-  }
-}
-
-void vp9_short_walsh4x4_x8_c(short *input, short *output, int pitch) {
-  int i;
-  int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
-  int pitch_short = pitch >> 1;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
-    b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
-    c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
-    d1 = ip[0 * pitch_short] - ip[3 * pitch_short];
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[4] = (c1 + d1) >> 1;
-    op[8] = (a1 - b1) >> 1;
-    op[12] = (d1 - c1) >> 1;
-
-    ip++;
-    op++;
-  }
-  ip = output;
-  op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[3];
-    b1 = ip[1] + ip[2];
-    c1 = ip[1] - ip[2];
-    d1 = ip[0] - ip[3];
-
-    op[0] = ((a1 + b1 + 1) >> 1) << WHT_UPSCALE_FACTOR;
-    op[1] = ((c1 + d1) >> 1) << WHT_UPSCALE_FACTOR;
-    op[2] = ((a1 - b1) >> 1) << WHT_UPSCALE_FACTOR;
-    op[3] = ((d1 - c1) >> 1) << WHT_UPSCALE_FACTOR;
-
-    ip += 4;
-    op += 4;
-  }
-}
-
-void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) {
-  vp9_short_walsh4x4_x8_c(input,   output,    pitch);
-  vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch);
-}
-#endif
-
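-// C1..C15 are cos(k * PI / 32) for k = 1..15: the butterfly constants of
-// the floating-point 16-point DCT below (C8 = cos(PI / 4) = 1 / sqrt(2)).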
-static const double C1 = 0.995184726672197;
-static const double C2 = 0.98078528040323;
-static const double C3 = 0.956940335732209;
-static const double C4 = 0.923879532511287;
-static const double C5 = 0.881921264348355;
-static const double C6 = 0.831469612302545;
-static const double C7 = 0.773010453362737;
-static const double C8 = 0.707106781186548;
-static const double C9 = 0.634393284163646;
-static const double C10 = 0.555570233019602;
-static const double C11 = 0.471396736825998;
-static const double C12 = 0.38268343236509;
-static const double C13 = 0.290284677254462;
-static const double C14 = 0.195090322016128;
-static const double C15 = 0.098017140329561;
-
-static void dct16x16_1d(double input[16], double output[16]) {
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    double step[16];
-    double intermediate[16];
-    double temp1, temp2;
-
-    // step 1
-    step[ 0] = input[0] + input[15];
-    step[ 1] = input[1] + input[14];
-    step[ 2] = input[2] + input[13];
-    step[ 3] = input[3] + input[12];
-    step[ 4] = input[4] + input[11];
-    step[ 5] = input[5] + input[10];
-    step[ 6] = input[6] + input[ 9];
-    step[ 7] = input[7] + input[ 8];
-    step[ 8] = input[7] - input[ 8];
-    step[ 9] = input[6] - input[ 9];
-    step[10] = input[5] - input[10];
-    step[11] = input[4] - input[11];
-    step[12] = input[3] - input[12];
-    step[13] = input[2] - input[13];
-    step[14] = input[1] - input[14];
-    step[15] = input[0] - input[15];
-
-    // step 2
-    output[0] = step[0] + step[7];
-    output[1] = step[1] + step[6];
-    output[2] = step[2] + step[5];
-    output[3] = step[3] + step[4];
-    output[4] = step[3] - step[4];
-    output[5] = step[2] - step[5];
-    output[6] = step[1] - step[6];
-    output[7] = step[0] - step[7];
-
-    temp1 = step[ 8]*C7;
-    temp2 = step[15]*C9;
-    output[ 8] = temp1 + temp2;
-
-    temp1 = step[ 9]*C11;
-    temp2 = step[14]*C5;
-    output[ 9] = temp1 - temp2;
-
-    temp1 = step[10]*C3;
-    temp2 = step[13]*C13;
-    output[10] = temp1 + temp2;
-
-    temp1 = step[11]*C15;
-    temp2 = step[12]*C1;
-    output[11] = temp1 - temp2;
-
-    temp1 = step[11]*C1;
-    temp2 = step[12]*C15;
-    output[12] = temp2 + temp1;
-
-    temp1 = step[10]*C13;
-    temp2 = step[13]*C3;
-    output[13] = temp2 - temp1;
-
-    temp1 = step[ 9]*C5;
-    temp2 = step[14]*C11;
-    output[14] = temp2 + temp1;
-
-    temp1 = step[ 8]*C9;
-    temp2 = step[15]*C7;
-    output[15] = temp2 - temp1;
-
-    // step 3
-    step[ 0] = output[0] + output[3];
-    step[ 1] = output[1] + output[2];
-    step[ 2] = output[1] - output[2];
-    step[ 3] = output[0] - output[3];
-
-    temp1 = output[4]*C14;
-    temp2 = output[7]*C2;
-    step[ 4] = temp1 + temp2;
-
-    temp1 = output[5]*C10;
-    temp2 = output[6]*C6;
-    step[ 5] = temp1 + temp2;
-
-    temp1 = output[5]*C6;
-    temp2 = output[6]*C10;
-    step[ 6] = temp2 - temp1;
-
-    temp1 = output[4]*C2;
-    temp2 = output[7]*C14;
-    step[ 7] = temp2 - temp1;
-
-    step[ 8] = output[ 8] + output[11];
-    step[ 9] = output[ 9] + output[10];
-    step[10] = output[ 9] - output[10];
-    step[11] = output[ 8] - output[11];
-
-    step[12] = output[12] + output[15];
-    step[13] = output[13] + output[14];
-    step[14] = output[13] - output[14];
-    step[15] = output[12] - output[15];
-
-    // step 4
-    output[ 0] = (step[ 0] + step[ 1]);
-    output[ 8] = (step[ 0] - step[ 1]);
-
-    temp1 = step[2]*C12;
-    temp2 = step[3]*C4;
-    temp1 = temp1 + temp2;
-    output[ 4] = 2*(temp1*C8);
-
-    temp1 = step[2]*C4;
-    temp2 = step[3]*C12;
-    temp1 = temp2 - temp1;
-    output[12] = 2*(temp1*C8);
-
-    output[ 2] = 2*((step[4] + step[ 5])*C8);
-    output[14] = 2*((step[7] - step[ 6])*C8);
-
-    temp1 = step[4] - step[5];
-    temp2 = step[6] + step[7];
-    output[ 6] = (temp1 + temp2);
-    output[10] = (temp1 - temp2);
-
-    intermediate[8] = step[8] + step[14];
-    intermediate[9] = step[9] + step[15];
-
-    temp1 = intermediate[8]*C12;
-    temp2 = intermediate[9]*C4;
-    temp1 = temp1 - temp2;
-    output[3] = 2*(temp1*C8);
-
-    temp1 = intermediate[8]*C4;
-    temp2 = intermediate[9]*C12;
-    temp1 = temp2 + temp1;
-    output[13] = 2*(temp1*C8);
-
-    output[ 9] = 2*((step[10] + step[11])*C8);
-
-    intermediate[11] = step[10] - step[11];
-    intermediate[12] = step[12] + step[13];
-    intermediate[13] = step[12] - step[13];
-    intermediate[14] = step[ 8] - step[14];
-    intermediate[15] = step[ 9] - step[15];
-
-    output[15] = (intermediate[11] + intermediate[12]);
-    output[ 1] = -(intermediate[11] - intermediate[12]);
-
-    output[ 7] = 2*(intermediate[13]*C8);
-
-    temp1 = intermediate[14]*C12;
-    temp2 = intermediate[15]*C4;
-    temp1 = temp1 - temp2;
-    output[11] = -2*(temp1*C8);
-
-    temp1 = intermediate[14]*C4;
-    temp2 = intermediate[15]*C12;
-    temp1 = temp2 + temp1;
-    output[ 5] = 2*(temp1*C8);
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
-
-void vp9_short_fdct16x16_c(short *input, short *out, int pitch) {
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-  {
-    int shortpitch = pitch >> 1;
-    int i, j;
-    double output[256];
-    // First transform columns
-    for (i = 0; i < 16; i++) {
-        double temp_in[16], temp_out[16];
-        for (j = 0; j < 16; j++)
-            temp_in[j] = input[j*shortpitch + i];
-        dct16x16_1d(temp_in, temp_out);
-        for (j = 0; j < 16; j++)
-            output[j*16 + i] = temp_out[j];
-    }
-    // Then transform rows
-    for (i = 0; i < 16; ++i) {
-        double temp_in[16], temp_out[16];
-        for (j = 0; j < 16; ++j)
-            temp_in[j] = output[j + i*16];
-        dct16x16_1d(temp_in, temp_out);
-        for (j = 0; j < 16; ++j)
-            output[j + i*16] = temp_out[j];
-    }
-    // Scale by some magic number
-    for (i = 0; i < 256; i++)
-        out[i] = (short)round(output[i]/2);
-  }
-  vp9_clear_system_state(); // Make it simd safe : __asm emms;
-}
--- a/vp8/encoder/encodeframe.c
+++ /dev/null
@@ -1,2342 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "encodemb.h"
-#include "encodemv.h"
-#include "vp8/common/common.h"
-#include "onyx_int.h"
-#include "vp8/common/extend.h"
-#include "vp8/common/entropymode.h"
-#include "vp8/common/quant_common.h"
-#include "segmentation.h"
-#include "vp8/common/setupintrarecon.h"
-#include "vp8/common/reconintra4x4.h"
-#include "encodeintra.h"
-#include "vp8/common/reconinter.h"
-#include "vp8/common/invtrans.h"
-#include "rdopt.h"
-#include "vp8/common/findnearmv.h"
-#include "vp8/common/reconintra.h"
-#include "vp8/common/seg_common.h"
-#include "vpx_rtcd.h"
-#include <stdio.h>
-#include <math.h>
-#include <limits.h>
-#include "vp8/common/subpixel.h"
-#include "vpx_ports/vpx_timer.h"
-#include "vp8/common/pred_common.h"
-
-#define DBG_PRNT_SEGMAP 0
-#if CONFIG_NEWBESTREFMV
-#include "vp8/common/mvref_common.h"
-#endif
-
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define RTCD(x)     &cpi->common.rtcd.x
-#define IF_RTCD(x)  (x)
-#else
-#define RTCD(x)     NULL
-#define IF_RTCD(x)  NULL
-#endif
-
-#ifdef ENC_DEBUG
-int enc_debug = 0;
-int mb_row_debug, mb_col_debug;
-#endif
-
-extern void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex);
-
-extern void vp9_auto_select_speed(VP9_COMP *cpi);
-
-int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                  int recon_yoffset, int recon_uvoffset,
-                                  int *returnrate, int *returndistortion);
-
-extern void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                           int recon_yoffset,
-                                           int recon_uvoffset, int *r, int *d);
-
-void vp9_build_block_offsets(MACROBLOCK *x);
-
-void vp9_setup_block_ptrs(MACROBLOCK *x);
-
-void vp9_encode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
-                                 int recon_yoffset, int recon_uvoffset,
-                                 int output_enabled);
-
-void vp9_encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
-                                 int recon_yoffset, int recon_uvoffset,
-                                 int mb_col, int mb_row);
-
-void vp9_encode_intra_macro_block(VP9_COMP *cpi, MACROBLOCK *x,
-                                  TOKENEXTRA **t, int output_enabled);
-
-void vp9_encode_intra_super_block(VP9_COMP *cpi, MACROBLOCK *x,
-                                  TOKENEXTRA **t, int mb_col);
-
-static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
-
-#ifdef MODE_STATS
-unsigned int inter_y_modes[MB_MODE_COUNT];
-unsigned int inter_uv_modes[VP9_UV_MODES];
-unsigned int inter_b_modes[B_MODE_COUNT];
-unsigned int y_modes[VP9_YMODES];
-unsigned int i8x8_modes[VP9_I8X8_MODES];
-unsigned int uv_modes[VP9_UV_MODES];
-unsigned int uv_modes_y[VP9_YMODES][VP9_UV_MODES];
-unsigned int b_modes[B_MODE_COUNT];
-#endif
-
-
-/* activity_avg must be positive, or flat regions could get a zero weight
- *  (infinite lambda), which confounds analysis.
- * This also avoids the need for divide by zero checks in
- *  vp9_activity_masking().
- */
-#define VP9_ACTIVITY_AVG_MIN (64)
-
-/* This is used as a reference when computing the source variance for the
- *  purposes of activity masking.
- * Eventually this should be replaced by custom no-reference routines,
- *  which will be faster.
- */
-static const unsigned char VP9_VAR_OFFS[16] = {
-  128, 128, 128, 128, 128, 128, 128, 128,
-  128, 128, 128, 128, 128, 128, 128, 128
-};
-
-
-// Original activity measure from Tim T's code.
-static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) {
-  unsigned int act;
-  unsigned int sse;
-  /* TODO: This could also be done over smaller areas (8x8), but that would
-   *  require extensive changes elsewhere, as lambda is assumed to be fixed
-   *  over an entire MB in most of the code.
-   * Another option is to compute four 8x8 variances, and pick a single
-   *  lambda using a non-linear combination (e.g., the smallest, or second
-   *  smallest, etc.).
-   */
-  act = vp9_variance16x16(x->src.y_buffer, x->src.y_stride, VP9_VAR_OFFS, 0,
-                          &sse);
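-  // Variance against the flat all-128 reference is just the MB's own
-  // variance, since subtracting a constant leaves variance unchanged.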
-  act = act << 4;
-
-  /* If the region is flat, lower the activity some more. */
-  if (act < 8 << 12)
-    act = act < 5 << 12 ? act : 5 << 12;
-
-  return act;
-}
-
-// Stub for alternative experimental activity measures.
-static unsigned int alt_activity_measure(VP9_COMP *cpi,
-                                         MACROBLOCK *x, int use_dc_pred) {
-  return vp9_encode_intra(cpi, x, use_dc_pred);
-}
-
-
-// Measure the activity of the current macroblock
-// What we measure here is TBD, so it is abstracted into this function
-#define ALT_ACT_MEASURE 1
-static unsigned int mb_activity_measure(VP9_COMP *cpi, MACROBLOCK *x,
-                                        int mb_row, int mb_col) {
-  unsigned int mb_activity;
-
-  if (ALT_ACT_MEASURE) {
-    int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
-
-    // Or use an alternative.
-    mb_activity = alt_activity_measure(cpi, x, use_dc_pred);
-  } else {
-    // Original activity measure from Tim T's code.
-    mb_activity = tt_activity_measure(cpi, x);
-  }
-
-  if (mb_activity < VP9_ACTIVITY_AVG_MIN)
-    mb_activity = VP9_ACTIVITY_AVG_MIN;
-
-  return mb_activity;
-}
-
-// Calculate an "average" mb activity value for the frame
-#define ACT_MEDIAN 0
-static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
-#if ACT_MEDIAN
-  // Find median: Simple n^2 algorithm for experimentation
-  {
-    unsigned int median;
-    unsigned int i, j;
-    unsigned int *sortlist;
-    unsigned int tmp;
-
-    // Create a list to sort into
-    CHECK_MEM_ERROR(sortlist,
-                    vpx_calloc(sizeof(unsigned int), cpi->common.MBs));
-
-    // Copy the activity map into the sort list
-    vpx_memcpy(sortlist, cpi->mb_activity_map,
-               sizeof(unsigned int) * cpi->common.MBs);
-
-    // Ripple each value down to its correct position
-    for (i = 1; i < cpi->common.MBs; i ++) {
-      for (j = i; j > 0; j --) {
-        if (sortlist[j] < sortlist[j - 1]) {
-          // Swap values
-          tmp = sortlist[j - 1];
-          sortlist[j - 1] = sortlist[j];
-          sortlist[j] = tmp;
-        } else
-          break;
-      }
-    }
-
-    // Even number of MBs, so estimate the median as the mean of the two
-    // values either side of the midpoint.
-    median = (1 + sortlist[cpi->common.MBs >> 1] +
-              sortlist[(cpi->common.MBs >> 1) + 1]) >> 1;
-
-    cpi->activity_avg = median;
-
-    vpx_free(sortlist);
-  }
-#else
-  // Simple mean for now
-  cpi->activity_avg = (unsigned int)(activity_sum / cpi->common.MBs);
-#endif
-
-  if (cpi->activity_avg < VP9_ACTIVITY_AVG_MIN)
-    cpi->activity_avg = VP9_ACTIVITY_AVG_MIN;
-
-  // Experimental code: return fixed value normalized for several clips
-  if (ALT_ACT_MEASURE)
-    cpi->activity_avg = 100000;
-}
-
-#define USE_ACT_INDEX   0
-#define OUTPUT_NORM_ACT_STATS   0
-
-#if USE_ACT_INDEX
-// Calculate an activity index for each mb
-static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {
-  VP9_COMMON *const cm = &cpi->common;
-  int mb_row, mb_col;
-
-  int64_t act;
-  int64_t a;
-  int64_t b;
-
-#if OUTPUT_NORM_ACT_STATS
-  FILE *f = fopen("norm_act.stt", "a");
-  fprintf(f, "\n%12d\n", cpi->activity_avg);
-#endif
-
-  // Reset pointers to start of activity map
-  x->mb_activity_ptr = cpi->mb_activity_map;
-
-  // Calculate normalized mb activity number.
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    // for each macroblock col in image
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      // Read activity from the map
-      act = *(x->mb_activity_ptr);
-
-      // Calculate a normalized activity number
-      a = act + 4 * cpi->activity_avg;
-      b = 4 * act + cpi->activity_avg;
-
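-      // b / a lies in (1/4, 4); the rounded ratio is offset so that an MB
-      // of exactly average activity gets an index of 0.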
-      if (b >= a)
-        *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1;
-      else
-        *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b);
-
-#if OUTPUT_NORM_ACT_STATS
-      fprintf(f, " %6d", *(x->mb_activity_ptr));
-#endif
-      // Increment activity map pointers
-      x->mb_activity_ptr++;
-    }
-
-#if OUTPUT_NORM_ACT_STATS
-    fprintf(f, "\n");
-#endif
-
-  }
-
-#if OUTPUT_NORM_ACT_STATS
-  fclose(f);
-#endif
-
-}
-#endif
-
-// Loop through all MBs, noting the activity of each, computing the average
-// activity, and calculating a normalized activity for each MB.
-static void build_activity_map(VP9_COMP *cpi) {
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-  VP9_COMMON *const cm = &cpi->common;
-
-#if ALT_ACT_MEASURE
-  YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
-  int recon_yoffset;
-  int recon_y_stride = new_yv12->y_stride;
-#endif
-
-  int mb_row, mb_col;
-  unsigned int mb_activity;
-  int64_t activity_sum = 0;
-
-  // for each macroblock row in image
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-#if ALT_ACT_MEASURE
-    // reset above block coeffs
-    xd->up_available = (mb_row != 0);
-    recon_yoffset = (mb_row * recon_y_stride * 16);
-#endif
-    // for each macroblock col in image
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-#if ALT_ACT_MEASURE
-      xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
-      xd->left_available = (mb_col != 0);
-      recon_yoffset += 16;
-#endif
-      // Copy current mb to a buffer
-      vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-
-      // measure activity
-      mb_activity = mb_activity_measure(cpi, x, mb_row, mb_col);
-
-      // Keep frame sum
-      activity_sum += mb_activity;
-
-      // Store MB level activity details.
-      *x->mb_activity_ptr = mb_activity;
-
-      // Increment activity map pointer
-      x->mb_activity_ptr++;
-
-      // adjust to the next column of source macroblocks
-      x->src.y_buffer += 16;
-    }
-
-
-    // adjust to the next row of mbs
-    x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
-
-#if ALT_ACT_MEASURE
-    // extend the recon for intra prediction
-    vp9_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
-                      xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
-#endif
-
-  }
-
-  // Calculate an "average" MB activity
-  calc_av_activity(cpi, activity_sum);
-
-#if USE_ACT_INDEX
-  // Calculate an activity index number for each mb
-  calc_activity_index(cpi, x);
-#endif
-
-}
-
-// Macroblock activity masking
-void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
-#if USE_ACT_INDEX
-  x->rdmult += *(x->mb_activity_ptr) * (x->rdmult >> 2);
-  x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
-  x->errorperbit += (x->errorperbit == 0);
-#else
-  int64_t a;
-  int64_t b;
-  int64_t act = *(x->mb_activity_ptr);
-
-  // Apply the masking to the RD multiplier.
-  a = act + (2 * cpi->activity_avg);
-  b = (2 * act) + cpi->activity_avg;
-
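-  // The ratio b / a = (2 * act + avg) / (act + 2 * avg) lies in (1/2, 2),
-  // so busy MBs at most double the multiplier and flat MBs at most halve it.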
-  x->rdmult = (unsigned int)(((int64_t)x->rdmult * b + (a >> 1)) / a);
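-  // Re-derive errorperbit from the adjusted rdmult, clamping it to at least 1.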
-  x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
-  x->errorperbit += (x->errorperbit == 0);
-#endif
-
-  // Activity based Zbin adjustment
-  adjust_act_zbin(cpi, x);
-}
-
-static void update_state(VP9_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
-  int i;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MODE_INFO *mi = &ctx->mic;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  int mb_mode = mi->mbmi.mode;
-  int mb_mode_index = ctx->best_mode_index;
-
-#if CONFIG_DEBUG
-  assert(mb_mode < MB_MODE_COUNT);
-  assert(mb_mode_index < MAX_MODES);
-  assert(mi->mbmi.ref_frame < MAX_REF_FRAMES);
-#endif
-
-  // Restore the coding context of the MB to the one that was in place
-  // when the mode was picked for it
-  vpx_memcpy(xd->mode_info_context, mi, sizeof(MODE_INFO));
-#if CONFIG_SUPERBLOCKS
-  if (mi->mbmi.encoded_as_sb) {
-    const int mis = cpi->common.mode_info_stride;
-    if (xd->mb_to_right_edge > 0)
-      vpx_memcpy(xd->mode_info_context + 1, mi, sizeof(MODE_INFO));
-    if (xd->mb_to_bottom_edge > 0) {
-      vpx_memcpy(xd->mode_info_context + mis, mi, sizeof(MODE_INFO));
-      if (xd->mb_to_right_edge > 0)
-        vpx_memcpy(xd->mode_info_context + mis + 1, mi, sizeof(MODE_INFO));
-    }
-  }
-#endif
-
-  if (mb_mode == B_PRED) {
-    for (i = 0; i < 16; i++) {
-      xd->block[i].bmi.as_mode = xd->mode_info_context->bmi[i].as_mode;
-      assert(xd->block[i].bmi.as_mode.first < MB_MODE_COUNT);
-    }
-  } else if (mb_mode == I8X8_PRED) {
-    for (i = 0; i < 16; i++) {
-      xd->block[i].bmi = xd->mode_info_context->bmi[i];
-    }
-  } else if (mb_mode == SPLITMV) {
-    vpx_memcpy(x->partition_info, &ctx->partition_info,
-               sizeof(PARTITION_INFO));
-
-    mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
-    mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
-  }
-
-  {
-    int segment_id = mbmi->segment_id;
-    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB)) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i];
-      }
-    }
-  }
-
-  if (cpi->common.frame_type == KEY_FRAME) {
-    // Restore the coding modes to that held in the coding context
-    // if (mb_mode == B_PRED)
-    //    for (i = 0; i < 16; i++)
-    //    {
-    //        xd->block[i].bmi.as_mode =
-    //                          xd->mode_info_context->bmi[i].as_mode;
-    //        assert(xd->mode_info_context->bmi[i].as_mode < MB_MODE_COUNT);
-    //    }
-#if CONFIG_INTERNAL_STATS
-    static const int kf_mode_index[] = {
-      THR_DC /*DC_PRED*/,
-      THR_V_PRED /*V_PRED*/,
-      THR_H_PRED /*H_PRED*/,
-      THR_D45_PRED /*D45_PRED*/,
-      THR_D135_PRED /*D135_PRED*/,
-      THR_D117_PRED /*D117_PRED*/,
-      THR_D153_PRED /*D153_PRED*/,
-      THR_D27_PRED /*D27_PRED*/,
-      THR_D63_PRED /*D63_PRED*/,
-      THR_TM /*TM_PRED*/,
-      THR_I8X8_PRED /*I8X8_PRED*/,
-      THR_B_PRED /*B_PRED*/,
-    };
-    cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++;
-#endif
-  } else {
-    /*
-            // Reduce the activation RD thresholds for the best choice mode
-            if ((cpi->rd_baseline_thresh[mb_mode_index] > 0) &&
-                (cpi->rd_baseline_thresh[mb_mode_index] < (INT_MAX >> 2)))
-            {
-                int best_adjustment = (cpi->rd_thresh_mult[mb_mode_index] >> 2);
-
-                cpi->rd_thresh_mult[mb_mode_index] =
-                        (cpi->rd_thresh_mult[mb_mode_index]
-                         >= (MIN_THRESHMULT + best_adjustment)) ?
-                                cpi->rd_thresh_mult[mb_mode_index] - best_adjustment :
-                                MIN_THRESHMULT;
-                cpi->rd_threshes[mb_mode_index] =
-                        (cpi->rd_baseline_thresh[mb_mode_index] >> 7)
-                        * cpi->rd_thresh_mult[mb_mode_index];
-
-            }
-    */
-    // Note how often each mode chosen as best
-    cpi->mode_chosen_counts[mb_mode_index]++;
-
-    cpi->prediction_error += ctx->distortion;
-    cpi->intra_error += ctx->intra_error;
-
-    cpi->rd_comp_pred_diff[0] += ctx->single_pred_diff;
-    cpi->rd_comp_pred_diff[1] += ctx->comp_pred_diff;
-    cpi->rd_comp_pred_diff[2] += ctx->hybrid_pred_diff;
-  }
-}
-
-static void pick_mb_modes(VP9_COMP *cpi,
-                          VP9_COMMON *cm,
-                          int mb_row,
-                          int mb_col,
-                          MACROBLOCK  *x,
-                          MACROBLOCKD *xd,
-                          TOKENEXTRA **tp,
-                          int *totalrate,
-                          int *totaldist) {
-  int i;
-  int map_index;
-  int recon_yoffset, recon_uvoffset;
-  int ref_fb_idx = cm->lst_fb_idx;
-  int dst_fb_idx = cm->new_fb_idx;
-  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-  ENTROPY_CONTEXT_PLANES left_context[2];
-  ENTROPY_CONTEXT_PLANES above_context[2];
-  ENTROPY_CONTEXT_PLANES *initial_above_context_ptr =
-      cm->above_context + mb_col;
-
-  // Offsets to move pointers from MB to MB within a SB in raster order
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
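-  // Applied after each MB, these deltas visit the 2x2 SB in raster order,
-  // (0,0) (0,1) (1,0) (1,1), then step to the top-left MB of the next SB.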
-
-  /* Function should not modify L & A contexts; save and restore on exit */
-  vpx_memcpy(left_context,
-             cm->left_context,
-             sizeof(left_context));
-  vpx_memcpy(above_context,
-             initial_above_context_ptr,
-             sizeof(above_context));
-
-  /* Encode MBs in raster order within the SB */
-  for (i = 0; i < 4; i++) {
-    int dy = row_delta[i];
-    int dx = col_delta[i];
-    int offset_unextended = dy * cm->mb_cols + dx;
-    int offset_extended   = dy * xd->mode_info_stride + dx;
-    MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-
-    // TODO Many of the index items here can be computed more efficiently!
-
-    if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
-      // MB lies outside frame, move on
-      mb_row += dy;
-      mb_col += dx;
-
-      // Update pointers
-      x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
-      x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
-      x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
-
-      x->gf_active_ptr += offset_unextended;
-      x->partition_info += offset_extended;
-      xd->mode_info_context += offset_extended;
-      xd->prev_mode_info_context += offset_extended;
-#if CONFIG_DEBUG
-      assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-             (xd->mode_info_context - cpi->common.mip));
-#endif
-      continue;
-    }
-
-    // Index of the MB in the SB 0..3
-    xd->mb_index = i;
-
-    map_index = (mb_row * cpi->common.mb_cols) + mb_col;
-    x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
-
-    // set above context pointer
-    xd->above_context = cm->above_context + mb_col;
-
-    // Restore the appropriate left context depending on which
-    // row in the SB the MB is situated
-    xd->left_context = cm->left_context + (i >> 1);
-
-    // Set up distance of MB to edge of frame in 1/8th pel units
-    xd->mb_to_top_edge    = -((mb_row * 16) << 3);
-    xd->mb_to_left_edge   = -((mb_col * 16) << 3);
-    xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
-    xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-
-    // Set up limit values for MV components to prevent them from
-    // extending beyond the UMV borders assuming 16x16 block size
-    x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-    x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-    x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                     (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
-    x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                     (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
-
-    xd->up_available   = (mb_row != 0);
-    xd->left_available = (mb_col != 0);
-
-    recon_yoffset  = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-    recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col *  8);
-
-    xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-    xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-    xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-
-    // Copy current MB to a work buffer
-    vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-
-    x->rddiv = cpi->RDDIV;
-    x->rdmult = cpi->RDMULT;
-
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-      vp9_activity_masking(cpi, x);
-
-    // Is segmentation enabled
-    if (xd->segmentation_enabled) {
-      // Code to set segment id in xd->mbmi.segment_id
-      if (xd->update_mb_segmentation_map)
-        mbmi->segment_id = cpi->segmentation_map[map_index];
-      else
-        mbmi->segment_id = cm->last_frame_seg_map[map_index];
-      if (mbmi->segment_id > 3)
-        mbmi->segment_id = 0;
-
-      vp9_mb_init_quantizer(cpi, x);
-    } else
-      // Set to Segment 0 by default
-      mbmi->segment_id = 0;
-
-    x->active_ptr = cpi->active_map + map_index;
-
-#if CONFIG_SUPERBLOCKS
-    xd->mode_info_context->mbmi.encoded_as_sb = 0;
-#endif
-
-    cpi->update_context = 0;    // TODO Do we need this now??
-
-    vp9_intra_prediction_down_copy(xd);
-
-    // Find best coding mode & reconstruct the MB so it is available
-    // as a predictor for MBs that follow in the SB
-    if (cm->frame_type == KEY_FRAME) {
-      int r, d;
-      vp9_rd_pick_intra_mode(cpi, x, &r, &d);
-      *totalrate += r;
-      *totaldist += d;
-
-      // Dummy encode, do not do the tokenization
-      vp9_encode_intra_macro_block(cpi, x, tp, 0);
-      // Note the encoder may have changed the segment_id
-
-      // Save the coding context
-      vpx_memcpy(&x->mb_context[i].mic, xd->mode_info_context,
-                 sizeof(MODE_INFO));
-    } else {
-      int seg_id, r, d;
-
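-      // seg0_progress tracks how far through segment 0 the encoder is, as a
-      // 16.16 fixed-point fraction.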
-      if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
-          !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
-          vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
-          vp9_check_segref(xd, 1, INTRA_FRAME)  +
-          vp9_check_segref(xd, 1, LAST_FRAME)   +
-          vp9_check_segref(xd, 1, GOLDEN_FRAME) +
-          vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
-        cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
-      } else {
-        cpi->seg0_progress = (((mb_col & ~1) * 2 +
-                               (mb_row & ~1) * cm->mb_cols + i) << 16) /
-                             cm->MBs;
-      }
-
-      vp9_pick_mode_inter_macroblock(cpi, x, recon_yoffset,
-                                     recon_uvoffset, &r, &d);
-      *totalrate += r;
-      *totaldist += d;
-
-      // Dummy encode, do not do the tokenization
-      vp9_encode_inter_macroblock(cpi, x, tp,
-                                  recon_yoffset, recon_uvoffset, 0);
-
-      seg_id = mbmi->segment_id;
-      if (cpi->mb.e_mbd.segmentation_enabled && seg_id == 0) {
-        cpi->seg0_idx++;
-      }
-      if (!xd->segmentation_enabled ||
-          !vp9_segfeature_active(xd, seg_id, SEG_LVL_REF_FRAME) ||
-          vp9_check_segref(xd, seg_id, INTRA_FRAME)  +
-          vp9_check_segref(xd, seg_id, LAST_FRAME)   +
-          vp9_check_segref(xd, seg_id, GOLDEN_FRAME) +
-          vp9_check_segref(xd, seg_id, ALTREF_FRAME) > 1) {
-        // Get the prediction context and status
-        int pred_flag = vp9_get_pred_flag(xd, PRED_REF);
-        int pred_context = vp9_get_pred_context(cm, xd, PRED_REF);
-
-        // Count prediction success
-        cpi->ref_pred_count[pred_context][pred_flag]++;
-      }
-    }
-
-    // Next MB
-    mb_row += dy;
-    mb_col += dx;
-
-    x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
-    x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
-    x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
-
-    x->gf_active_ptr += offset_unextended;
-    x->partition_info += offset_extended;
-    xd->mode_info_context += offset_extended;
-    xd->prev_mode_info_context += offset_extended;
-
-#if CONFIG_DEBUG
-    assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-           (xd->mode_info_context - cpi->common.mip));
-#endif
-  }
-
-  /* Restore L & A coding context to those in place on entry */
-  vpx_memcpy(cm->left_context,
-             left_context,
-             sizeof(left_context));
-  vpx_memcpy(initial_above_context_ptr,
-             above_context,
-             sizeof(above_context));
-}
-
-#if CONFIG_SUPERBLOCKS
-static void pick_sb_modes(VP9_COMP *cpi,
-                          VP9_COMMON *cm,
-                          int mb_row,
-                          int mb_col,
-                          MACROBLOCK  *x,
-                          MACROBLOCKD *xd,
-                          TOKENEXTRA **tp,
-                          int *totalrate,
-                          int *totaldist) {
-  int map_index;
-  int recon_yoffset, recon_uvoffset;
-  int ref_fb_idx = cm->lst_fb_idx;
-  int dst_fb_idx = cm->new_fb_idx;
-  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-  ENTROPY_CONTEXT_PLANES left_context[2];
-  ENTROPY_CONTEXT_PLANES above_context[2];
-  ENTROPY_CONTEXT_PLANES *initial_above_context_ptr =
-      cm->above_context + mb_col;
-
-  /* Function should not modify L & A contexts; save and restore on exit */
-  vpx_memcpy(left_context,
-             cm->left_context,
-             sizeof(left_context));
-  vpx_memcpy(above_context,
-             initial_above_context_ptr,
-             sizeof(above_context));
-
-  map_index = (mb_row * cpi->common.mb_cols) + mb_col;
-  x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
-
-  /* set above context pointer */
-  xd->above_context = cm->above_context + mb_col;
-
-  /* Restore the appropriate left context depending on which
-   * row in the SB the MB is situated */
-  xd->left_context = cm->left_context;
-
-  // Set up distance of MB to edge of frame in 1/8th pel units
-  xd->mb_to_top_edge    = -((mb_row * 16) << 3);
-  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
-  xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
-  xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-
-  /* Set up limit values for MV components to prevent them from
-   * extending beyond the UMV borders assuming 16x16 block size */
-  x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-  x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-  x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                   (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
-  x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                   (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
-
-  xd->up_available   = (mb_row != 0);
-  xd->left_available = (mb_col != 0);
-
-  recon_yoffset  = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-  recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col *  8);
-
-  xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-  xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-  xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-#if 0 // FIXME
-  /* Copy current MB to a work buffer */
-  vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-#endif
-  x->rddiv = cpi->RDDIV;
-  x->rdmult = cpi->RDMULT;
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-    vp9_activity_masking(cpi, x);
-  /* Is segmentation enabled */
-  if (xd->segmentation_enabled) {
-    /* Code to set segment id in xd->mbmi.segment_id */
-    if (xd->update_mb_segmentation_map)
-      xd->mode_info_context->mbmi.segment_id =
-            cpi->segmentation_map[map_index] &&
-            cpi->segmentation_map[map_index + 1] &&
-            cpi->segmentation_map[map_index + cm->mb_cols] &&
-            cpi->segmentation_map[map_index + cm->mb_cols + 1];
-    else
-      xd->mode_info_context->mbmi.segment_id =
-            cm->last_frame_seg_map[map_index] &&
-            cm->last_frame_seg_map[map_index + 1] &&
-            cm->last_frame_seg_map[map_index + cm->mb_cols] &&
-            cm->last_frame_seg_map[map_index + cm->mb_cols + 1];
-    if (xd->mode_info_context->mbmi.segment_id > 3)
-      xd->mode_info_context->mbmi.segment_id = 0;
-
-    vp9_mb_init_quantizer(cpi, x);
-  } else {
-    /* Set to Segment 0 by default */
-    xd->mode_info_context->mbmi.segment_id = 0;
-  }
-
-  x->active_ptr = cpi->active_map + map_index;
-
-  cpi->update_context = 0;    // TODO Do we need this now??
-
-  /* Find best coding mode & reconstruct the MB so it is available
-   * as a predictor for MBs that follow in the SB */
-  if (cm->frame_type == KEY_FRAME) {
-    vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist);
-
-    /* Save the coding context */
-    vpx_memcpy(&x->sb_context[0].mic, xd->mode_info_context,
-               sizeof(MODE_INFO));
-  } else {
-    if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
-        !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
-        vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
-        vp9_check_segref(xd, 1, INTRA_FRAME)  +
-        vp9_check_segref(xd, 1, LAST_FRAME)   +
-        vp9_check_segref(xd, 1, GOLDEN_FRAME) +
-        vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
-      cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
-    } else {
-      cpi->seg0_progress =
-        (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols) << 16) / cm->MBs;
-    }
-
-    vp9_rd_pick_inter_mode_sb(cpi, x, recon_yoffset, recon_uvoffset,
-                              totalrate, totaldist);
-  }
-
-  /* Restore L & A coding context to those in place on entry */
-  vpx_memcpy(cm->left_context,
-             left_context,
-             sizeof(left_context));
-  vpx_memcpy(initial_above_context_ptr,
-             above_context,
-             sizeof(above_context));
-}
-#endif
-
-static void encode_sb(VP9_COMP *cpi,
-                      VP9_COMMON *cm,
-                      int mbrow,
-                      int mbcol,
-                      MACROBLOCK  *x,
-                      MACROBLOCKD *xd,
-                      TOKENEXTRA **tp) {
-  int i;
-  int map_index;
-  int mb_row, mb_col;
-  int recon_yoffset, recon_uvoffset;
-  int ref_fb_idx = cm->lst_fb_idx;
-  int dst_fb_idx = cm->new_fb_idx;
-  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
-
-  mb_row = mbrow;
-  mb_col = mbcol;
-
-  /* Encode MBs in raster order within the SB */
-  for (i = 0; i < 4; i++) {
-    int dy = row_delta[i];
-    int dx = col_delta[i];
-    int offset_extended   = dy * xd->mode_info_stride + dx;
-    int offset_unextended = dy * cm->mb_cols + dx;
-    MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-
-    if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
-      // MB lies outside frame, move on
-      mb_row += dy;
-      mb_col += dx;
-
-      x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
-      x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
-      x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
-
-      x->gf_active_ptr      += offset_unextended;
-      x->partition_info     += offset_extended;
-      xd->mode_info_context += offset_extended;
-      xd->prev_mode_info_context += offset_extended;
-
-#if CONFIG_DEBUG
-      assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-             (xd->mode_info_context - cpi->common.mip));
-#endif
-      continue;
-    }
-
-    xd->mb_index = i;
-
-#ifdef ENC_DEBUG
-    enc_debug = (cpi->common.current_video_frame == 0 &&
-                 mb_row == 0 && mb_col == 0);
-    mb_col_debug = mb_col;
-    mb_row_debug = mb_row;
-#endif
-
-    // Restore MB state to that when it was picked
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      update_state(cpi, x, &x->sb_context[i]);
-      cpi->sb_count++;
-    } else
-#endif
-      update_state(cpi, x, &x->mb_context[i]);
-
-    map_index = (mb_row * cpi->common.mb_cols) + mb_col;
-    x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
-
-    // reset above block coeffs
-    xd->above_context = cm->above_context + mb_col;
-    xd->left_context  = cm->left_context + (i >> 1);
-
-    // Set up distance of MB to edge of the frame in 1/8th pel units
-    xd->mb_to_top_edge    = -((mb_row * 16) << 3);
-    xd->mb_to_left_edge   = -((mb_col * 16) << 3);
-    xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
-    xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      // Set up limit values for MV components to prevent them from
-      // extending beyond the UMV borders assuming 32x32 block size
-      x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-      x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-      x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                       (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
-      x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                       (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
-    } else {
-#endif
-      // Set up limit values for MV components to prevent them from
-      // extending beyond the UMV borders assuming 16x16 block size
-      x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-      x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-      x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                       (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
-      x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                       (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
-#if CONFIG_SUPERBLOCKS
-    }
-#endif
-
-    xd->up_available = (mb_row != 0);
-    xd->left_available = (mb_col != 0);
-
-    recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-    recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
-
-    xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-    xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-    xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-
-    // Copy current MB to a work buffer
-    vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-      vp9_activity_masking(cpi, x);
-
-    // Is segmentation enabled
-    if (xd->segmentation_enabled) {
-      vp9_mb_init_quantizer(cpi, x);
-    }
-
-    x->active_ptr = cpi->active_map + map_index;
-
-    cpi->update_context = 0;
-
-#if CONFIG_SUPERBLOCKS
-    if (!xd->mode_info_context->mbmi.encoded_as_sb)
-#endif
-      vp9_intra_prediction_down_copy(xd);
-
-    if (cm->frame_type == KEY_FRAME) {
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb)
-        vp9_encode_intra_super_block(cpi, x, tp, mb_col);
-      else
-#endif
-        vp9_encode_intra_macro_block(cpi, x, tp, 1);
-        // Note the encoder may have changed the segment_id
-
-#ifdef MODE_STATS
-      y_modes[mbmi->mode]++;
-#endif
-    } else {
-      unsigned char *segment_id;
-      int seg_ref_active;
-
-      if (xd->mode_info_context->mbmi.ref_frame) {
-        unsigned char pred_context;
-
-        pred_context = vp9_get_pred_context(cm, xd, PRED_COMP);
-
-        if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME)
-          cpi->single_pred_count[pred_context]++;
-        else
-          cpi->comp_pred_count[pred_context]++;
-      }
-
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb)
-        vp9_encode_inter_superblock(cpi, x, tp, recon_yoffset, recon_uvoffset,
-                                    mb_col, mb_row);
-      else
-#endif
-        vp9_encode_inter_macroblock(cpi, x, tp,
-                                    recon_yoffset, recon_uvoffset, 1);
-        // Note the encoder may have changed the segment_id
-
-#ifdef MODE_STATS
-      inter_y_modes[mbmi->mode]++;
-
-      if (mbmi->mode == SPLITMV) {
-        int b;
-
-        for (b = 0; b < x->partition_info->count; b++) {
-          inter_b_modes[x->partition_info->bmi[b].mode]++;
-        }
-      }
-
-#endif
-
-      // If we have just a single reference frame coded for a segment then
-      // exclude from the reference frame counts used to work out
-      // probabilities. NOTE: At the moment we don't support custom trees
-      // for the reference frame coding for each segment but this is a
-      // possible future action.
-      segment_id = &mbmi->segment_id;
-      seg_ref_active = vp9_segfeature_active(xd, *segment_id,
-                                             SEG_LVL_REF_FRAME);
-      if (!seg_ref_active ||
-          ((vp9_check_segref(xd, *segment_id, INTRA_FRAME) +
-            vp9_check_segref(xd, *segment_id, LAST_FRAME) +
-            vp9_check_segref(xd, *segment_id, GOLDEN_FRAME) +
-            vp9_check_segref(xd, *segment_id, ALTREF_FRAME)) > 1)) {
-        cpi->count_mb_ref_frame_usage[mbmi->ref_frame]++;
-      }
-
-      // Count of last ref frame 0,0 usage
-      if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
-        cpi->inter_zz_count++;
-    }
-
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      x->src.y_buffer += 32;
-      x->src.u_buffer += 16;
-      x->src.v_buffer += 16;
-
-      x->gf_active_ptr      += 2;
-      x->partition_info     += 2;
-      xd->mode_info_context += 2;
-      xd->prev_mode_info_context += 2;
-
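-      // Mark the end of this SB's tokens in the output buffer.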
-      (*tp)->Token = EOSB_TOKEN;
-      (*tp)++;
-      if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp;
-      break;
-    }
-#endif
-
-    // Next MB
-    mb_row += dy;
-    mb_col += dx;
-
-    x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
-    x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
-    x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
-
-    x->gf_active_ptr      += offset_unextended;
-    x->partition_info     += offset_extended;
-    xd->mode_info_context += offset_extended;
-    xd->prev_mode_info_context += offset_extended;
-
-#if CONFIG_DEBUG
-    assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-           (xd->mode_info_context - cpi->common.mip));
-#endif
-    (*tp)->Token = EOSB_TOKEN;
-    (*tp)++;
-    if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp;
-  }
-
-  // debug output
-#if DBG_PRNT_SEGMAP
-  {
-    FILE *statsfile;
-    statsfile = fopen("segmap2.stt", "a");
-    fprintf(statsfile, "\n");
-    fclose(statsfile);
-  }
-#endif
-}
-
-static void encode_sb_row(VP9_COMP *cpi,
-                          VP9_COMMON *cm,
-                          int mb_row,
-                          MACROBLOCK  *x,
-                          MACROBLOCKD *xd,
-                          TOKENEXTRA **tp,
-                          int *totalrate) {
-  int mb_col;
-  int mb_cols = cm->mb_cols;
-
-  // Initialize the left context for the new SB row
-  vpx_memset(cm->left_context, 0, sizeof(cm->left_context));
-
-  // Code each SB in the row
-  for (mb_col = 0; mb_col < mb_cols; mb_col += 2) {
-    int mb_rate = 0, mb_dist = 0;
-#if CONFIG_SUPERBLOCKS
-    int sb_rate = INT_MAX, sb_dist;
-#endif
-
-#if CONFIG_DEBUG
-    MODE_INFO *mic = xd->mode_info_context;
-    PARTITION_INFO *pi = x->partition_info;
-    signed char  *gfa = x->gf_active_ptr;
-    unsigned char *yb = x->src.y_buffer;
-    unsigned char *ub = x->src.u_buffer;
-    unsigned char *vb = x->src.v_buffer;
-#endif
-
-#if CONFIG_SUPERBLOCKS
-    // Pick modes assuming the SB is coded as 4 independent MBs
-    xd->mode_info_context->mbmi.encoded_as_sb = 0;
-#endif
-    pick_mb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &mb_rate, &mb_dist);
-#if CONFIG_SUPERBLOCKS
-    mb_rate += vp9_cost_bit(cm->sb_coded, 0);
-#endif
-
-    x->src.y_buffer -= 32;
-    x->src.u_buffer -= 16;
-    x->src.v_buffer -= 16;
-
-    x->gf_active_ptr -= 2;
-    x->partition_info -= 2;
-    xd->mode_info_context -= 2;
-    xd->prev_mode_info_context -= 2;
-
-#if CONFIG_DEBUG
-    assert(x->gf_active_ptr == gfa);
-    assert(x->partition_info == pi);
-    assert(xd->mode_info_context == mic);
-    assert(x->src.y_buffer == yb);
-    assert(x->src.u_buffer == ub);
-    assert(x->src.v_buffer == vb);
-#endif
-
-#if CONFIG_SUPERBLOCKS
-    if (!(((    mb_cols & 1) && mb_col ==     mb_cols - 1) ||
-          ((cm->mb_rows & 1) && mb_row == cm->mb_rows - 1))) {
-      /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
-      xd->mode_info_context->mbmi.encoded_as_sb = 1;
-      pick_sb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &sb_rate, &sb_dist);
-      sb_rate += vp9_cost_bit(cm->sb_coded, 1);
-    }
-
-    /* Decide whether to encode as a SB or 4xMBs */
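-    /* RDCOST forms a Lagrangian cost from rate and distortion using the
-     * rdmult/rddiv weights; the SB form wins only when its cost is lower. */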
-    if (sb_rate < INT_MAX &&
-        RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) <
-          RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist)) {
-      xd->mode_info_context->mbmi.encoded_as_sb = 1;
-      xd->mode_info_context[1].mbmi.encoded_as_sb = 1;
-      xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 1;
-      xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 1;
-      *totalrate += sb_rate;
-    } else
-#endif
-    {
-#if CONFIG_SUPERBLOCKS
-      xd->mode_info_context->mbmi.encoded_as_sb = 0;
-      if (cm->mb_cols - 1 > mb_col)
-        xd->mode_info_context[1].mbmi.encoded_as_sb = 0;
-      if (cm->mb_rows - 1 > mb_row) {
-        xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 0;
-        if (cm->mb_cols - 1 > mb_col)
-          xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 0;
-      }
-#endif
-      *totalrate += mb_rate;
-    }
-
-    /* Encode SB using best computed mode(s) */
-    encode_sb(cpi, cm, mb_row, mb_col, x, xd, tp);
-
-#if CONFIG_DEBUG
-    assert(x->gf_active_ptr == gfa + 2);
-    assert(x->partition_info == pi + 2);
-    assert(xd->mode_info_context == mic + 2);
-    assert(x->src.y_buffer == yb + 32);
-    assert(x->src.u_buffer == ub + 16);
-    assert(x->src.v_buffer == vb + 16);
-#endif
-  }
-
-  // this is to account for the border
-  x->gf_active_ptr += mb_cols - (mb_cols & 0x1);
-  x->partition_info += xd->mode_info_stride + 1 - (mb_cols & 0x1);
-  xd->mode_info_context += xd->mode_info_stride + 1 - (mb_cols & 0x1);
-  xd->prev_mode_info_context += xd->mode_info_stride + 1 - (mb_cols & 0x1);
-
-#if CONFIG_DEBUG
-  assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-         (xd->mode_info_context - cpi->common.mip));
-#endif
-}
-
-static void init_encode_frame_mb_context(VP9_COMP *cpi) {
-  MACROBLOCK *const x = &cpi->mb;
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  // GF active flags data structure
-  x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
-
-  // Activity map pointer
-  x->mb_activity_ptr = cpi->mb_activity_map;
-
-  x->act_zbin_adj = 0;
-  cpi->seg0_idx = 0;
-  vpx_memset(cpi->ref_pred_count, 0, sizeof(cpi->ref_pred_count));
-
-  x->partition_info = x->pi;
-
-  xd->mode_info_context = cm->mi;
-  xd->mode_info_stride = cm->mode_info_stride;
-  xd->prev_mode_info_context = cm->prev_mi;
-
-  xd->frame_type = cm->frame_type;
-
-  xd->frames_since_golden = cm->frames_since_golden;
-  xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;
-
-  // reset intra mode contexts
-  if (cm->frame_type == KEY_FRAME)
-    vp9_init_mbmode_probs(cm);
-
-  // Copy data over into macro block data structures.
-  x->src = *cpi->Source;
-  xd->pre = cm->yv12_fb[cm->lst_fb_idx];
-  xd->dst = cm->yv12_fb[cm->new_fb_idx];
-
-  // set up frame for intra coded blocks
-  vp9_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);
-
-  vp9_build_block_offsets(x);
-
-  vp9_setup_block_dptrs(&x->e_mbd);
-
-  vp9_setup_block_ptrs(x);
-
-  xd->mode_info_context->mbmi.mode = DC_PRED;
-  xd->mode_info_context->mbmi.uv_mode = DC_PRED;
-
-  vp9_zero(cpi->count_mb_ref_frame_usage)
-  vp9_zero(cpi->bmode_count)
-  vp9_zero(cpi->ymode_count)
-  vp9_zero(cpi->i8x8_mode_count)
-  vp9_zero(cpi->y_uv_mode_count)
-  vp9_zero(cpi->sub_mv_ref_count)
-  vp9_zero(cpi->mbsplit_count)
-  vp9_zero(cpi->common.fc.mv_ref_ct)
-  vp9_zero(cpi->common.fc.mv_ref_ct_a)
-#if CONFIG_SUPERBLOCKS
-  vp9_zero(cpi->sb_ymode_count)
-  cpi->sb_count = 0;
-#endif
-
-  vpx_memset(cm->above_context, 0,
-             sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
-
-  xd->fullpixel_mask = 0xffffffff;
-  if (cm->full_pixel)
-    xd->fullpixel_mask = 0xfffffff8;
-}
-
-static void encode_frame_internal(VP9_COMP *cpi) {
-  int mb_row;
-  MACROBLOCK *const x = &cpi->mb;
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  TOKENEXTRA *tp = cpi->tok;
-  int totalrate;
-
-  //printf("encode_frame_internal\n");
-
-  // Compute a modified set of reference frame probabilities to use when
-  // prediction fails. These are based on the current general estimates for
-  // this frame which may be updated with each iteration of the recode loop.
-  vp9_compute_mod_refprobs(cm);
-
-#if CONFIG_NEW_MVREF
-  // temp stats reset
-  vp9_zero(cpi->best_ref_index_counts);
-#endif
-
-// debug output
-#if DBG_PRNT_SEGMAP
-  {
-    FILE *statsfile;
-    statsfile = fopen("segmap2.stt", "a");
-    fprintf(statsfile, "\n");
-    fclose(statsfile);
-  }
-#endif
-
-  totalrate = 0;
-
-  // Functions setup for all frame types so we can use MC in AltRef
-  vp9_setup_interp_filters(xd, cm->mcomp_filter_type, cm);
-
-  // Reset frame count of inter 0,0 motion vector usage.
-  cpi->inter_zz_count = 0;
-
-  cpi->prediction_error = 0;
-  cpi->intra_error = 0;
-  cpi->skip_true_count[0] = cpi->skip_true_count[1] =
-      cpi->skip_true_count[2] = 0;
-  cpi->skip_false_count[0] = cpi->skip_false_count[1] =
-      cpi->skip_false_count[2] = 0;
-
-#if CONFIG_PRED_FILTER
-  if (cm->current_video_frame == 0) {
-    // Initially assume that we'll signal the prediction filter
-    // state at the frame level and that it is off.
-    cpi->common.pred_filter_mode = 0;
-    cpi->common.prob_pred_filter_off = 128;
-  }
-  cpi->pred_filter_on_count = 0;
-  cpi->pred_filter_off_count = 0;
-#endif
-  vp9_zero(cpi->switchable_interp_count);
-
-  xd->mode_info_context = cm->mi;
-  xd->prev_mode_info_context = cm->prev_mi;
-
-  vp9_zero(cpi->NMVcount);
-  vp9_zero(cpi->coef_counts);
-  vp9_zero(cpi->hybrid_coef_counts);
-  vp9_zero(cpi->coef_counts_8x8);
-  vp9_zero(cpi->hybrid_coef_counts_8x8);
-  vp9_zero(cpi->coef_counts_16x16);
-  vp9_zero(cpi->hybrid_coef_counts_16x16);
-
-  vp9_frame_init_quantizer(cpi);
-
-  vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
-  vp9_initialize_me_consts(cpi, cm->base_qindex);
-
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-    // Initialize encode frame context.
-    init_encode_frame_mb_context(cpi);
-
-    // Build a frame level activity map
-    build_activity_map(cpi);
-  }
-
-  // Re-initialize the encode frame context.
-  init_encode_frame_mb_context(cpi);
-
-  vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff));
-  vpx_memset(cpi->single_pred_count, 0, sizeof(cpi->single_pred_count));
-  vpx_memset(cpi->comp_pred_count, 0, sizeof(cpi->comp_pred_count));
-  vpx_memset(cpi->txfm_count, 0, sizeof(cpi->txfm_count));
-  vpx_memset(cpi->txfm_count_8x8p, 0, sizeof(cpi->txfm_count_8x8p));
-  vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff));
-  {
-    struct vpx_usec_timer  emr_timer;
-    vpx_usec_timer_start(&emr_timer);
-
-    {
-      // For each row of SBs in the frame
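-      // Each pass covers two MB rows (one SB row). encode_sb_row walks the
-      // src pointers across the row, so afterwards they are rewound by the
-      // width walked (offset = mb_cols rounded up to even, 16 luma pels
-      // per MB) and stepped down by 32 luma / 16 chroma rows.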
-      for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
-        int offset = (cm->mb_cols + 1) & ~0x1;
-
-        encode_sb_row(cpi, cm, mb_row, x, xd, &tp, &totalrate);
-
-        // advance to the next row of SBs
-        x->src.y_buffer += 32 * x->src.y_stride - 16 * offset;
-        x->src.u_buffer += 16 * x->src.uv_stride - 8 * offset;
-        x->src.v_buffer += 16 * x->src.uv_stride - 8 * offset;
-      }
-
-      cpi->tok_count = tp - cpi->tok;
-    }
-
-    vpx_usec_timer_mark(&emr_timer);
-    cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
-
-  }
-
-  // 256 rate units to the bit;
-  // projected_frame_size is in units of bytes
-  cpi->projected_frame_size = totalrate >> 8;
-
-
-#if 0
-  // Keep record of the total distortion this time around for future use
-  cpi->last_frame_distortion = cpi->frame_distortion;
-#endif
-
-}
-
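-// Returns 1 if at least two reference frames are effectively in use for
-// this frame (taking any segment-level reference frame restriction into
-// account), i.e. whether compound prediction is worth considering.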
-static int check_dual_ref_flags(VP9_COMP *cpi) {
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  int ref_flags = cpi->ref_frame_flags;
-
-  if (vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
-    if ((ref_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) == (VP9_LAST_FLAG | VP9_GOLD_FLAG) &&
-        vp9_check_segref(xd, 1, LAST_FRAME))
-      return 1;
-    if ((ref_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) == (VP9_GOLD_FLAG | VP9_ALT_FLAG) &&
-        vp9_check_segref(xd, 1, GOLDEN_FRAME))
-      return 1;
-    if ((ref_flags & (VP9_ALT_FLAG  | VP9_LAST_FLAG)) == (VP9_ALT_FLAG  | VP9_LAST_FLAG) &&
-        vp9_check_segref(xd, 1, ALTREF_FRAME))
-      return 1;
-    return 0;
-  } else {
-    return (!!(ref_flags & VP9_GOLD_FLAG) +
-            !!(ref_flags & VP9_LAST_FLAG) +
-            !!(ref_flags & VP9_ALT_FLAG)) >= 2;
-  }
-}
-
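-// Clamps the signalled per-MB transform size down to txfm_max. This is
-// only legal for macroblocks whose coefficients are entirely skipped (see
-// the assert below), since for those the transform size does not affect
-// the reconstruction.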
-static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
-  VP9_COMMON *cm = &cpi->common;
-  int mb_row, mb_col, mis = cm->mode_info_stride, segment_id;
-  MODE_INFO *mi, *mi_ptr = cm->mi;
-#if CONFIG_SUPERBLOCKS
-  MODE_INFO *sb_mi_ptr = cm->mi, *sb_mi;
-  MB_MODE_INFO *sb_mbmi;
-#endif
-  MB_MODE_INFO *mbmi;
-  MACROBLOCK *x = &cpi->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++, mi_ptr += mis) {
-    mi = mi_ptr;
-#if CONFIG_SUPERBLOCKS
-    sb_mi = sb_mi_ptr;
-#endif
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++, mi++) {
-      mbmi = &mi->mbmi;
-#if CONFIG_SUPERBLOCKS
-      sb_mbmi = &sb_mi->mbmi;
-#endif
-      if (
-#if CONFIG_SUPERBLOCKS
-          !sb_mbmi->encoded_as_sb &&
-#endif
-          mbmi->txfm_size > txfm_max) {
-        segment_id = mbmi->segment_id;
-        xd->mode_info_context = mi;
-        assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-                vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
-               (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
-        mbmi->txfm_size = txfm_max;
-      }
-#if CONFIG_SUPERBLOCKS
-      if (mb_col & 1)
-        sb_mi += 2;
-#endif
-    }
-#if CONFIG_SUPERBLOCKS
-    if (mb_row & 1)
-      sb_mi_ptr += 2 * mis;
-#endif
-  }
-}
-
-void vp9_encode_frame(VP9_COMP *cpi) {
-  if (cpi->sf.RD) {
-    int i, frame_type, pred_type;
-    TXFM_MODE txfm_type;
-
-    /*
-     * This code does a single RD pass over the whole frame assuming
-     * either compound, single or hybrid prediction as per whatever has
-     * worked best for that type of frame in the past.
-     * It also predicts whether another coding mode would have worked
-     * better than this coding mode. If that is the case, it remembers
-     * that for subsequent frames.
-     * It performs the same analysis for transform size selection as well.
-     */
-    if (cpi->common.frame_type == KEY_FRAME)
-      frame_type = 0;
-    else if (cpi->is_src_frame_alt_ref && cpi->common.refresh_golden_frame)
-      frame_type = 3;
-    else if (cpi->common.refresh_golden_frame || cpi->common.refresh_alt_ref_frame)
-      frame_type = 1;
-    else
-      frame_type = 2;
-
-    /* prediction (compound, single or hybrid) mode selection */
-    if (frame_type == 3)
-      pred_type = SINGLE_PREDICTION_ONLY;
-    else if (cpi->rd_prediction_type_threshes[frame_type][1] >
-                 cpi->rd_prediction_type_threshes[frame_type][0] &&
-             cpi->rd_prediction_type_threshes[frame_type][1] >
-                 cpi->rd_prediction_type_threshes[frame_type][2] &&
-             check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
-      pred_type = COMP_PREDICTION_ONLY;
-    else if (cpi->rd_prediction_type_threshes[frame_type][0] >
-                 cpi->rd_prediction_type_threshes[frame_type][2])
-      pred_type = SINGLE_PREDICTION_ONLY;
-    else
-      pred_type = HYBRID_PREDICTION;
-
-    /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */
-#if CONFIG_LOSSLESS
-    if (cpi->oxcf.lossless) {
-      txfm_type = ONLY_4X4;
-    } else
-#endif
-    /* FIXME (rbultje)
-     * this is a hack (no really), basically to work around the complete
-     * nonsense coefficient cost prediction for keyframes. The probabilities
-     * are reset to defaults, and thus we basically have no idea how expensive
- * a 4x4 vs. 8x8 will really be. The result is that any estimate of which
- * of the two is better is utterly bogus.
-     * I'd like to eventually remove this hack, but in order to do that, we
-     * need to move the frame reset code from the frame encode init to the
-     * bitstream write code, or alternatively keep a backup of the previous
-     * keyframe's probabilities as an estimate of what the current keyframe's
-     * coefficient cost distributions may look like. */
-    if (frame_type == 0) {
-      txfm_type = ALLOW_16X16;
-    } else
-#if 0
-    /* FIXME (rbultje)
-     * this code is disabled for a similar reason as the code above; the
-     * problem is that each time we "revert" to 4x4 only (or even 8x8 only),
-     * the coefficient probabilities for 16x16 (and 8x8) start lagging behind,
-     * thus leading to them lagging further behind and not being chosen for
-     * subsequent frames either. This is essentially a local minimum problem
-     * that we can probably fix by estimating real costs more closely within
-     * a frame, perhaps by re-calculating costs on-the-fly as frame encoding
-     * progresses. */
-    if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
-            cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
-        cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
-            cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&
-        cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
-            cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
-      txfm_type = TX_MODE_SELECT;
-    } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
-                  cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]
-            && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
-                  cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]
-               ) {
-      txfm_type = ONLY_4X4;
-    } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
-                  cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
-      txfm_type = ALLOW_16X16;
-    } else
-      txfm_type = ALLOW_8X8;
-#else
-    txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
-                 cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
-    ALLOW_16X16 : TX_MODE_SELECT;
-#endif
-    cpi->common.txfm_mode = txfm_type;
-    if (txfm_type != TX_MODE_SELECT) {
-      cpi->common.prob_tx[0] = 128;
-      cpi->common.prob_tx[1] = 128;
-    }
-    cpi->common.comp_pred_mode = pred_type;
-    encode_frame_internal(cpi);
-
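-    // Both threshold arrays below are kept as exponential moving averages:
-    // thresh = (thresh + per-MB diff) / 2, so recent frames carry
-    // geometrically more weight than older ones.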
-    for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
-      const int diff = cpi->rd_comp_pred_diff[i] / cpi->common.MBs;
-      cpi->rd_prediction_type_threshes[frame_type][i] += diff;
-      cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
-    }
-
-    for (i = 0; i < NB_TXFM_MODES; ++i) {
-      int64_t pd = cpi->rd_tx_select_diff[i];
-      int diff;
-      if (i == TX_MODE_SELECT)
-        pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZE_MAX - 1), 0);
-      diff = pd / cpi->common.MBs;
-      cpi->rd_tx_select_threshes[frame_type][i] += diff;
-      cpi->rd_tx_select_threshes[frame_type][i] /= 2;
-    }
-
-    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-      int single_count_zero = 0;
-      int comp_count_zero = 0;
-
-      for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
-        single_count_zero += cpi->single_pred_count[i];
-        comp_count_zero += cpi->comp_pred_count[i];
-      }
-
-      if (comp_count_zero == 0) {
-        cpi->common.comp_pred_mode = SINGLE_PREDICTION_ONLY;
-      } else if (single_count_zero == 0) {
-        cpi->common.comp_pred_mode = COMP_PREDICTION_ONLY;
-      }
-    }
-
-    if (cpi->common.txfm_mode == TX_MODE_SELECT) {
-      const int count4x4 = cpi->txfm_count[TX_4X4] + cpi->txfm_count_8x8p[TX_4X4];
-      const int count8x8 = cpi->txfm_count[TX_8X8];
-      const int count8x8_8x8p = cpi->txfm_count_8x8p[TX_8X8];
-      const int count16x16 = cpi->txfm_count[TX_16X16];
-
-      if (count4x4 == 0 && count16x16 == 0) {
-        cpi->common.txfm_mode = ALLOW_8X8;
-        reset_skip_txfm_size(cpi, TX_8X8);
-      } else if (count8x8 == 0 && count16x16 == 0 && count8x8_8x8p == 0) {
-        cpi->common.txfm_mode = ONLY_4X4;
-        reset_skip_txfm_size(cpi, TX_4X4);
-      } else if (count8x8 == 0 && count4x4 == 0) {
-        cpi->common.txfm_mode = ALLOW_16X16;
-      }
-    }
-  } else {
-    encode_frame_internal(cpi);
-  }
-
-}
-
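-// Points each of the 25 blocks at its slice of the shared buffers: blocks
-// 0-15 cover the 16x16 luma diff (offset 0), 16-19 the 8x8 U diff (offset
-// 256), 20-23 the 8x8 V diff (offset 320), and block 24 holds the
-// second-order (Y2) block at offset 384. Coefficients are 16 per block.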
-void vp9_setup_block_ptrs(MACROBLOCK *x) {
-  int r, c;
-  int i;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      x->block[r * 4 + c].src_diff = x->src_diff + r * 4 * 16 + c * 4;
-    }
-  }
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++) {
-      x->block[16 + r * 2 + c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4;
-    }
-  }
-
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++) {
-      x->block[20 + r * 2 + c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4;
-    }
-  }
-
-  x->block[24].src_diff = x->src_diff + 384;
-
-
-  for (i = 0; i < 25; i++) {
-    x->block[i].coeff = x->coeff + i * 16;
-  }
-}
-
-void vp9_build_block_offsets(MACROBLOCK *x) {
-  int block = 0;
-  int br, bc;
-
-  vp9_build_block_doffsets(&x->e_mbd);
-
-  // y blocks
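-  // Luma blocks read from x->thismb, a contiguous 16x16 copy of the source
-  // macroblock, so their stride is a constant 16 rather than the frame's
-  // y_stride (the commented-out lines below show the old scheme).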
-  x->thismb_ptr = &x->thismb[0];
-  for (br = 0; br < 4; br++) {
-    for (bc = 0; bc < 4; bc++) {
-      BLOCK *this_block = &x->block[block];
-      // this_block->base_src = &x->src.y_buffer;
-      // this_block->src_stride = x->src.y_stride;
-      // this_block->src = 4 * br * this_block->src_stride + 4 * bc;
-      this_block->base_src = &x->thismb_ptr;
-      this_block->src_stride = 16;
-      this_block->src = 4 * br * 16 + 4 * bc;
-      ++block;
-    }
-  }
-
-  // u blocks
-  for (br = 0; br < 2; br++) {
-    for (bc = 0; bc < 2; bc++) {
-      BLOCK *this_block = &x->block[block];
-      this_block->base_src = &x->src.u_buffer;
-      this_block->src_stride = x->src.uv_stride;
-      this_block->src = 4 * br * this_block->src_stride + 4 * bc;
-      ++block;
-    }
-  }
-
-  // v blocks
-  for (br = 0; br < 2; br++) {
-    for (bc = 0; bc < 2; bc++) {
-      BLOCK *this_block = &x->block[block];
-      this_block->base_src = &x->src.v_buffer;
-      this_block->src_stride = x->src.uv_stride;
-      this_block->src = 4 * br * this_block->src_stride + 4 * bc;
-      ++block;
-    }
-  }
-}
-
-static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
-  const MACROBLOCKD *xd = &x->e_mbd;
-  const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
-  const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode;
-
-#ifdef MODE_STATS
-  const int is_key = cpi->common.frame_type == KEY_FRAME;
-
-  ++ (is_key ? uv_modes : inter_uv_modes)[uvm];
-  ++ uv_modes_y[m][uvm];
-
-  if (m == B_PRED) {
-    unsigned int *const bct = is_key ? b_modes : inter_b_modes;
-
-    int b = 0;
-
-    do {
-      ++ bct[xd->block[b].bmi.as_mode.first];
-    } while (++b < 16);
-  }
-
-  if (m == I8X8_PRED) {
-    i8x8_modes[xd->block[0].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[2].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[8].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[10].bmi.as_mode.first]++;
-  }
-#endif
-
-#if CONFIG_SUPERBLOCKS
-  if (xd->mode_info_context->mbmi.encoded_as_sb) {
-    ++cpi->sb_ymode_count[m];
-  } else
-#endif
-    ++cpi->ymode_count[m];
-  if (m != I8X8_PRED)
-    ++cpi->y_uv_mode_count[m][uvm];
-  else {
-    cpi->i8x8_mode_count[xd->block[0].bmi.as_mode.first]++;
-    cpi->i8x8_mode_count[xd->block[2].bmi.as_mode.first]++;
-    cpi->i8x8_mode_count[xd->block[8].bmi.as_mode.first]++;
-    cpi->i8x8_mode_count[xd->block[10].bmi.as_mode.first]++;
-  }
-  if (m == B_PRED) {
-    int b = 0;
-    do {
-      ++ cpi->bmode_count[xd->block[b].bmi.as_mode.first];
-    } while (++b < 16);
-  }
-}
-
-// Experimental stub function to create a per MB zbin adjustment based on
-// some previously calculated measure of MB activity.
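-// The adjustment scales with the ratio of this MB's activity to the frame
-// average: zero near the average, positive for well above-average activity
-// and negative for well below-average activity.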
-static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) {
-#if USE_ACT_INDEX
-  x->act_zbin_adj = *(x->mb_activity_ptr);
-#else
-  int64_t a;
-  int64_t b;
-  int64_t act = *(x->mb_activity_ptr);
-
-  // Apply the masking to the RD multiplier.
-  a = act + 4 * cpi->activity_avg;
-  b = 4 * act + cpi->activity_avg;
-
-  if (act > cpi->activity_avg)
-    x->act_zbin_adj = (int)(((int64_t)b + (a >> 1)) / a) - 1;
-  else
-    x->act_zbin_adj = 1 - (int)(((int64_t)a + (b >> 1)) / b);
-#endif
-}
-
-#if CONFIG_SUPERBLOCKS
-static void update_sb_skip_coeff_state(VP9_COMP *cpi,
-                                       MACROBLOCK *x,
-                                       ENTROPY_CONTEXT_PLANES ta[4],
-                                       ENTROPY_CONTEXT_PLANES tl[4],
-                                       TOKENEXTRA *t[4],
-                                       TOKENEXTRA **tp,
-                                       int skip[4])
-{
-  TOKENEXTRA tokens[4][16 * 24];
-  int n_tokens[4], n;
-
-  // if there were no skips, we don't need to do anything
-  if (!skip[0] && !skip[1] && !skip[2] && !skip[3])
-    return;
-
-  // if we don't do coeff skipping for this frame, we don't
-  // need to do anything here
-  if (!cpi->common.mb_no_coeff_skip)
-    return;
-
-  // if all 4 MBs skipped coeff coding, nothing to be done
-  if (skip[0] && skip[1] && skip[2] && skip[3])
-    return;
-
-  // so the situation now is that we want to skip coeffs
-  // for some MBs, but not all, and we didn't code EOB
-  // coefficients for them. However, the skip flag for this
-  // SB will be 0 overall, so we need to insert EOBs in the
-  // middle of the token tree. Do so here.
-  n_tokens[0] = t[1] - t[0];
-  n_tokens[1] = t[2] - t[1];
-  n_tokens[2] = t[3] - t[2];
-  n_tokens[3] = *tp  - t[3];
-  if (n_tokens[0])
-    memcpy(tokens[0], t[0], n_tokens[0] * sizeof(*t[0]));
-  if (n_tokens[1])
-    memcpy(tokens[1], t[1], n_tokens[1] * sizeof(*t[0]));
-  if (n_tokens[2])
-    memcpy(tokens[2], t[2], n_tokens[2] * sizeof(*t[0]));
-  if (n_tokens[3])
-    memcpy(tokens[3], t[3], n_tokens[3] * sizeof(*t[0]));
-
-  // reset pointer, stuff EOBs where necessary
-  *tp = t[0];
-  for (n = 0; n < 4; n++) {
-    if (skip[n]) {
-      x->e_mbd.above_context = &ta[n];
-      x->e_mbd.left_context  = &tl[n];
-      vp9_stuff_mb(cpi, &x->e_mbd, tp, 0);
-    } else {
-      if (n_tokens[n]) {
-        memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
-      }
-      (*tp) += n_tokens[n];
-    }
-  }
-}
-
-void vp9_encode_intra_super_block(VP9_COMP *cpi,
-                                  MACROBLOCK *x,
-                                  TOKENEXTRA **t,
-                                  int mb_col) {
-  const int output_enabled = 1;
-  int n;
-  MACROBLOCKD *xd = &x->e_mbd;
-  VP9_COMMON *cm = &cpi->common;
-  const uint8_t *src = x->src.y_buffer;
-  uint8_t *dst = xd->dst.y_buffer;
-  const uint8_t *usrc = x->src.u_buffer;
-  uint8_t *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer;
-  uint8_t *vdst = xd->dst.v_buffer;
-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  const VP9_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
-  TOKENEXTRA *tp[4];
-  int skip[4];
-  MODE_INFO *mi = x->e_mbd.mode_info_context;
-  ENTROPY_CONTEXT_PLANES ta[4], tl[4];
-
-  if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) {
-    adjust_act_zbin(cpi, x);
-    vp9_update_zbin_extra(cpi, x);
-  }
-
-  vp9_build_intra_predictors_sby_s(&x->e_mbd);
-  vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
-
-  assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
-
-    xd->above_context = cm->above_context + mb_col + (n & 1);
-    xd->left_context = cm->left_context + (n >> 1);
-
-    vp9_subtract_mby_s_c(x->src_diff,
-                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
-                         src_y_stride,
-                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
-                         dst_y_stride);
-    vp9_subtract_mbuv_s_c(x->src_diff,
-                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          src_uv_stride,
-                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          dst_uv_stride);
-    vp9_transform_mb_8x8(x);
-    vp9_quantize_mb_8x8(x);
-    if (x->optimize) {
-      vp9_optimize_mby_8x8(x, rtcd);
-      vp9_optimize_mbuv_8x8(x, rtcd);
-    }
-    vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
-    vp9_recon_mby_s_c(&x->e_mbd, dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
-    vp9_recon_mbuv_s_c(&x->e_mbd,
-                       udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                       vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
-
-    if (output_enabled) {
-      memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
-      memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
-      tp[n] = *t;
-      xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
-      vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
-      skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
-    }
-  }
-
-  if (output_enabled) {
-    // Update intra stats and the SB skip/coeff state
-    xd->mode_info_context = mi;
-    sum_intra_stats(cpi, x);
-    update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
-  }
-}
-#endif /* CONFIG_SUPERBLOCKS */
-
-void vp9_encode_intra_macro_block(VP9_COMP *cpi,
-                                  MACROBLOCK *x,
-                                  TOKENEXTRA **t,
-                                  int output_enabled) {
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) {
-    adjust_act_zbin(cpi, x);
-    vp9_update_zbin_extra(cpi, x);
-  }
-  if (mbmi->mode == I8X8_PRED) {
-    vp9_encode_intra8x8mby(IF_RTCD(&cpi->rtcd), x);
-    vp9_encode_intra8x8mbuv(IF_RTCD(&cpi->rtcd), x);
-  } else if (mbmi->mode == B_PRED) {
-    vp9_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
-  } else {
-    vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
-  }
-
-  if (mbmi->mode != I8X8_PRED) {
-    vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
-  }
-
-  if (output_enabled) {
-    int segment_id = mbmi->segment_id;
-
-    // Gather intra stats and tokenize
-    sum_intra_stats(cpi, x);
-    vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
-
-    if (cpi->common.txfm_mode == TX_MODE_SELECT &&
-        !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
-          (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) &&
-           vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
-      if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED) {
-        cpi->txfm_count[mbmi->txfm_size]++;
-      } else if (mbmi->mode == I8X8_PRED) {
-        cpi->txfm_count_8x8p[mbmi->txfm_size]++;
-      }
-    } else if (cpi->common.txfm_mode >= ALLOW_16X16 && mbmi->mode <= TM_PRED) {
-      mbmi->txfm_size = TX_16X16;
-    } else
-    if (cpi->common.txfm_mode >= ALLOW_8X8 && mbmi->mode != B_PRED) {
-      mbmi->txfm_size = TX_8X8;
-    } else {
-      mbmi->txfm_size = TX_4X4;
-    }
-  }
-#if CONFIG_NEWBESTREFMV
-  else
-    vp9_tokenize_mb(cpi, &x->e_mbd, t, 1);
-#endif
-}
-
-extern void vp9_fix_contexts(MACROBLOCKD *xd);
-
-void vp9_encode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                 TOKENEXTRA **t, int recon_yoffset,
-                                 int recon_uvoffset, int output_enabled) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-  unsigned char *segment_id = &mbmi->segment_id;
-  int seg_ref_active;
-  unsigned char ref_pred_flag;
-
-  x->skip = 0;
-#if CONFIG_SUPERBLOCKS
-  assert(!xd->mode_info_context->mbmi.encoded_as_sb);
-#endif
-
-  vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-    // Adjust the zbin based on this MB rate.
-    adjust_act_zbin(cpi, x);
-  }
-
-  {
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    cpi->zbin_mode_boost = 0;
-    if (cpi->zbin_mode_boost_enabled) {
-      if (mbmi->ref_frame != INTRA_FRAME) {
-        if (mbmi->mode == ZEROMV) {
-          if (mbmi->ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (mbmi->mode == SPLITMV)
-          cpi->zbin_mode_boost = 0;
-        else
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-      }
-    }
-
-    vp9_update_zbin_extra(cpi, x);
-  }
-
-  seg_ref_active = vp9_segfeature_active(xd, *segment_id, SEG_LVL_REF_FRAME);
-
-  // SET VARIOUS PREDICTION FLAGS
-
-  // Did the chosen reference frame match its predicted value?
-  ref_pred_flag = ((mbmi->ref_frame == vp9_get_pred_ref(cm, xd)));
-  vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
-
-  if (mbmi->ref_frame == INTRA_FRAME) {
-    if (mbmi->mode == B_PRED) {
-      vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
-      vp9_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
-    } else if (mbmi->mode == I8X8_PRED) {
-      vp9_encode_intra8x8mby(IF_RTCD(&cpi->rtcd), x);
-      vp9_encode_intra8x8mbuv(IF_RTCD(&cpi->rtcd), x);
-    } else {
-      vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
-      vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
-    }
-
-    if (output_enabled)
-      sum_intra_stats(cpi, x);
-  } else {
-    int ref_fb_idx;
-
-    if (mbmi->ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.lst_fb_idx;
-    else if (mbmi->ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.gld_fb_idx;
-    else
-      ref_fb_idx = cpi->common.alt_fb_idx;
-
-    xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
-    xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
-    xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
-
-    if (mbmi->second_ref_frame) {
-      int second_ref_fb_idx;
-
-      if (mbmi->second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.lst_fb_idx;
-      else if (mbmi->second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.gld_fb_idx;
-      else
-        second_ref_fb_idx = cpi->common.alt_fb_idx;
-
-      xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +
-                                recon_yoffset;
-      xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +
-                                recon_uvoffset;
-      xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +
-                                recon_uvoffset;
-    }
-
-    if (!x->skip) {
-      vp9_encode_inter16x16(IF_RTCD(&cpi->rtcd), x);
-
-      // Clear mb_skip_coeff if mb_no_coeff_skip is not set
-      if (!cpi->common.mb_no_coeff_skip)
-        mbmi->mb_skip_coeff = 0;
-
-    } else {
-      vp9_build_1st_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
-                                             xd->dst.u_buffer, xd->dst.v_buffer,
-                                             xd->dst.y_stride,
-                                             xd->dst.uv_stride);
-    }
-  }
-
-  if (!x->skip) {
-#ifdef ENC_DEBUG
-    if (enc_debug) {
-      int i;
-      printf("Segment=%d [%d, %d]: %d %d:\n", mbmi->segment_id, mb_col_debug,
-             mb_row_debug, xd->mb_to_left_edge, xd->mb_to_top_edge);
-      for (i = 0; i < 400; i++) {
-        printf("%3d ", xd->qcoeff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("eobs = ");
-      for (i = 0; i < 25; i++)
-        printf("%d:%d ", i, xd->block[i].eob);
-      printf("\n");
-      fflush(stdout);
-    }
-#endif
-
-    vp9_tokenize_mb(cpi, xd, t, !output_enabled);
-
-#ifdef ENC_DEBUG
-    if (enc_debug) {
-      printf("Tokenized\n");
-      fflush(stdout);
-    }
-#endif
-  } else {
-    int mb_skip_context =
-      cpi->common.mb_no_coeff_skip ?
-      (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
-      (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
-      0;
-    if (cpi->common.mb_no_coeff_skip) {
-      mbmi->mb_skip_coeff = 1;
-      if (output_enabled)
-        cpi->skip_true_count[mb_skip_context]++;
-      vp9_fix_contexts(xd);
-    } else {
-      vp9_stuff_mb(cpi, xd, t, !output_enabled);
-      mbmi->mb_skip_coeff = 0;
-      if (output_enabled)
-        cpi->skip_false_count[mb_skip_context]++;
-    }
-  }
-
-  if (output_enabled) {
-    int segment_id = mbmi->segment_id;
-    if (cpi->common.txfm_mode == TX_MODE_SELECT &&
-        !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
-          (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) &&
-           vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
-      if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
-          mbmi->mode != SPLITMV) {
-        cpi->txfm_count[mbmi->txfm_size]++;
-      } else if (mbmi->mode == I8X8_PRED ||
-                 (mbmi->mode == SPLITMV &&
-                  mbmi->partitioning != PARTITIONING_4X4)) {
-        cpi->txfm_count_8x8p[mbmi->txfm_size]++;
-      }
-    } else if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
-        mbmi->mode != SPLITMV && cpi->common.txfm_mode >= ALLOW_16X16) {
-      mbmi->txfm_size = TX_16X16;
-    } else if (mbmi->mode != B_PRED &&
-               !(mbmi->mode == SPLITMV &&
-                 mbmi->partitioning == PARTITIONING_4X4) &&
-               cpi->common.txfm_mode >= ALLOW_8X8) {
-      mbmi->txfm_size = TX_8X8;
-    } else {
-      mbmi->txfm_size = TX_4X4;
-    }
-  }
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
-                                 int recon_yoffset, int recon_uvoffset,
-                                 int mb_col, int mb_row) {
-  const int output_enabled = 1;
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const uint8_t *src = x->src.y_buffer;
-  uint8_t *dst = xd->dst.y_buffer;
-  const uint8_t *usrc = x->src.u_buffer;
-  uint8_t *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer;
-  uint8_t *vdst = xd->dst.v_buffer;
-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  const VP9_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
-  unsigned int segment_id = xd->mode_info_context->mbmi.segment_id;
-  int seg_ref_active;
-  unsigned char ref_pred_flag;
-  int n;
-  TOKENEXTRA *tp[4];
-  int skip[4];
-  MODE_INFO *mi = x->e_mbd.mode_info_context;
-  ENTROPY_CONTEXT_PLANES ta[4], tl[4];
-
-  x->skip = 0;
-
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-    // Adjust the zbin based on this MB rate.
-    adjust_act_zbin(cpi, x);
-  }
-
-  {
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    cpi->zbin_mode_boost = 0;
-    if (cpi->zbin_mode_boost_enabled) {
-      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
-        if (xd->mode_info_context->mbmi.mode == ZEROMV) {
-          if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (xd->mode_info_context->mbmi.mode == SPLITMV)
-          cpi->zbin_mode_boost = 0;
-        else
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-      }
-    }
-
-    vp9_update_zbin_extra(cpi, x);
-  }
-
-  seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
-
-  // SET VARIOUS PREDICTION FLAGS
-
-  // Did the chosen reference frame match its predicted value?
-  ref_pred_flag = ((xd->mode_info_context->mbmi.ref_frame ==
-                    vp9_get_pred_ref(cm, xd)));
-  vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
-
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-    vp9_build_intra_predictors_sby_s(&x->e_mbd);
-    vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
-  } else {
-    int ref_fb_idx;
-
-    if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.lst_fb_idx;
-    else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.gld_fb_idx;
-    else
-      ref_fb_idx = cpi->common.alt_fb_idx;
-
-    xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
-    xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
-    xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
-
-    if (xd->mode_info_context->mbmi.second_ref_frame) {
-      int second_ref_fb_idx;
-
-      if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.lst_fb_idx;
-      else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.gld_fb_idx;
-      else
-        second_ref_fb_idx = cpi->common.alt_fb_idx;
-
-      xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +
-                                    recon_yoffset;
-      xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +
-                                    recon_uvoffset;
-      xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +
-                                    recon_uvoffset;
-    }
-
-    vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
-                                       xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride);
-  }
-
-  assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
-
-    vp9_subtract_mby_s_c(x->src_diff,
-                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
-                         src_y_stride,
-                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
-                         dst_y_stride);
-    vp9_subtract_mbuv_s_c(x->src_diff,
-                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          src_uv_stride,
-                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          dst_uv_stride);
-    vp9_transform_mb_8x8(x);
-    vp9_quantize_mb_8x8(x);
-    if (x->optimize) {
-      vp9_optimize_mby_8x8(x, rtcd);
-      vp9_optimize_mbuv_8x8(x, rtcd);
-    }
-    vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
-    vp9_recon_mby_s_c(&x->e_mbd,
-                      dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
-    vp9_recon_mbuv_s_c(&x->e_mbd,
-                       udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                       vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
-
-    if (!x->skip) {
-      if (output_enabled) {
-        xd->left_context = cm->left_context + (n >> 1);
-        xd->above_context = cm->above_context + mb_col + (n & 1);
-        memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
-        memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
-        tp[n] = *t;
-        xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
-        vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
-        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
-      }
-    } else {
-      int mb_skip_context =
-        cpi->common.mb_no_coeff_skip ?
-          (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
-            (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
-          0;
-      if (cpi->common.mb_no_coeff_skip) {
-        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-        xd->left_context = cm->left_context + (n >> 1);
-        xd->above_context = cm->above_context + mb_col + (n & 1);
-        memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
-        memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
-        tp[n] = *t;
-        cpi->skip_true_count[mb_skip_context]++;
-        vp9_fix_contexts(xd);
-      } else {
-        vp9_stuff_mb(cpi, xd, t, 0);
-        xd->mode_info_context->mbmi.mb_skip_coeff = 0;
-        cpi->skip_false_count[mb_skip_context]++;
-      }
-    }
-  }
-
-  xd->mode_info_context = mi;
-  update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
-}
-#endif
--- a/vp8/encoder/encodeintra.c
+++ /dev/null
@@ -1,289 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "vpx_rtcd.h"
-#include "vp8/common/idct.h"
-#include "quantize.h"
-#include "vp8/common/reconintra.h"
-#include "vp8/common/reconintra4x4.h"
-#include "encodemb.h"
-#include "vp8/common/invtrans.h"
-#include "encodeintra.h"
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
-int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
-  int i;
-  int intra_pred_var = 0;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  (void) cpi;
-
-  if (use_16x16_pred) {
-    mbmi->mode = DC_PRED;
-#if CONFIG_COMP_INTRA_PRED
-    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-    mbmi->uv_mode = DC_PRED;
-    mbmi->ref_frame = INTRA_FRAME;
-
-    vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
-  } else {
-    for (i = 0; i < 16; i++) {
-      x->e_mbd.block[i].bmi.as_mode.first = B_DC_PRED;
-      vp9_encode_intra4x4block(IF_RTCD(&cpi->rtcd), x, i);
-    }
-  }
-
-  intra_pred_var = vp9_get_mb_ss(x->src_diff);
-
-  return intra_pred_var;
-}
-
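-// Encodes one 4x4 intra luma block: predict, subtract, forward transform
-// (using the hybrid ADST/DCT variants when the prediction mode maps to a
-// non-DCT tx_type), quantize, inverse transform and reconstruct.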
-void vp9_encode_intra4x4block(const VP9_ENCODER_RTCD *rtcd,
-                              MACROBLOCK *x, int ib) {
-  BLOCKD *b = &x->e_mbd.block[ib];
-  BLOCK *be = &x->block[ib];
-  TX_TYPE tx_type;
-
-#if CONFIG_COMP_INTRA_PRED
-  if (b->bmi.as_mode.second == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
-#endif
-    vp9_intra4x4_predict(b, b->bmi.as_mode.first, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-  } else {
-    vp9_comp_intra4x4_predict(b, b->bmi.as_mode.first, b->bmi.as_mode.second,
-                              b->predictor);
-  }
-#endif
-
-  vp9_subtract_b(be, b, 16);
-
-  tx_type = get_tx_type(&x->e_mbd, b);
-  if (tx_type != DCT_DCT) {
-    vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
-    vp9_ht_quantize_b_4x4(be, b, tx_type);
-    vp9_ihtllm_c(b->dqcoeff, b->diff, 32, tx_type, 4);
-  } else {
-    x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-    x->quantize_b_4x4(be, b);
-    vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 32);
-  }
-
-  vp9_recon_b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-}
-
-void vp9_encode_intra4x4mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *mb) {
-  int i;
-
-  for (i = 0; i < 16; i++)
-    vp9_encode_intra4x4block(rtcd, mb, i);
-  return;
-}
-
-void vp9_encode_intra16x16mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-  TX_TYPE tx_type;
-
-#if CONFIG_COMP_INTRA_PRED
-  if (xd->mode_info_context->mbmi.second_mode == (MB_PREDICTION_MODE)(DC_PRED - 1))
-#endif
-    vp9_build_intra_predictors_mby(xd);
-#if CONFIG_COMP_INTRA_PRED
-  else
-    vp9_build_comp_intra_predictors_mby(xd);
-#endif
-
-  vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
-
-  if (tx_size == TX_16X16) {
-    BLOCKD  *bd = &xd->block[0];
-    tx_type = get_tx_type(xd, bd);
-    if (tx_type != DCT_DCT) {
-      vp9_fht(b->src_diff, 32, b->coeff, tx_type, 16);
-      vp9_quantize_mby_16x16(x);
-      if (x->optimize)
-        vp9_optimize_mby_16x16(x, rtcd);
-      vp9_ihtllm_c(bd->dqcoeff, bd->diff, 32, tx_type, 16);
-    } else {
-      vp9_transform_mby_16x16(x);
-      vp9_quantize_mby_16x16(x);
-      if (x->optimize)
-        vp9_optimize_mby_16x16(x, rtcd);
-      vp9_inverse_transform_mby_16x16(IF_RTCD(&rtcd->common->idct), xd);
-    }
-  } else if (tx_size == TX_8X8) {
-    vp9_transform_mby_8x8(x);
-    vp9_quantize_mby_8x8(x);
-    if (x->optimize)
-      vp9_optimize_mby_8x8(x, rtcd);
-    vp9_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), xd);
-  } else {
-    vp9_transform_mby_4x4(x);
-    vp9_quantize_mby_4x4(x);
-    if (x->optimize)
-      vp9_optimize_mby_4x4(x, rtcd);
-    vp9_inverse_transform_mby_4x4(IF_RTCD(&rtcd->common->idct), xd);
-  }
-
-  vp9_recon_mby(xd);
-}
-
-void vp9_encode_intra16x16mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-
-#if CONFIG_COMP_INTRA_PRED
-  if (xd->mode_info_context->mbmi.second_uv_mode == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
-    vp9_build_intra_predictors_mbuv(xd);
-#if CONFIG_COMP_INTRA_PRED
-  } else {
-    vp9_build_comp_intra_predictors_mbuv(xd);
-  }
-#endif
-
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    xd->predictor, x->src.uv_stride);
-
-  if (tx_size == TX_4X4) {
-    vp9_transform_mbuv_4x4(x);
-    vp9_quantize_mbuv_4x4(x);
-    if (x->optimize)
-      vp9_optimize_mbuv_4x4(x, rtcd);
-    vp9_inverse_transform_mbuv_4x4(IF_RTCD(&rtcd->common->idct), xd);
-  } else /* 16x16 or 8x8 */ {
-    vp9_transform_mbuv_8x8(x);
-    vp9_quantize_mbuv_8x8(x);
-    if (x->optimize)
-      vp9_optimize_mbuv_8x8(x, rtcd);
-    vp9_inverse_transform_mbuv_8x8(IF_RTCD(&rtcd->common->idct), xd);
-  }
-
-  vp9_recon_intra_mbuv(xd);
-}
-
-void vp9_encode_intra8x8(const VP9_ENCODER_RTCD *rtcd,
-                         MACROBLOCK *x, int ib) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  BLOCKD *b = &xd->block[ib];
-  BLOCK *be = &x->block[ib];
-  const int iblock[4] = {0, 1, 4, 5};
-  int i;
-  TX_TYPE tx_type;
-
-#if CONFIG_COMP_INTRA_PRED
-  if (b->bmi.as_mode.second == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
-    vp9_intra8x8_predict(b, b->bmi.as_mode.first, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-  } else {
-    vp9_comp_intra8x8_predict(b, b->bmi.as_mode.first, b->bmi.as_mode.second,
-                              b->predictor);
-  }
-#endif
-
-  if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-    int idx = (ib & 0x02) ? (ib + 2) : ib;
-
-    // generate residual blocks
-    vp9_subtract_4b_c(be, b, 16);
-
-    tx_type = get_tx_type(xd, xd->block + idx);
-    if (tx_type != DCT_DCT) {
-      vp9_fht(be->src_diff, 32, (x->block + idx)->coeff,
-                tx_type, 8);
-      x->quantize_b_8x8(x->block + idx, xd->block + idx);
-      vp9_ihtllm_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,
-                   tx_type, 8);
-    } else {
-      x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
-      x->quantize_b_8x8(x->block + idx, xd->block + idx);
-      vp9_idct_idct8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
-    }
-  } else {
-    for (i = 0; i < 4; i++) {
-      b = &xd->block[ib + iblock[i]];
-      be = &x->block[ib + iblock[i]];
-      vp9_subtract_b(be, b, 16);
-      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-      x->quantize_b_4x4(be, b);
-      vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 32);
-    }
-  }
-
-  // reconstruct submacroblock
-  for (i = 0; i < 4; i++) {
-    b = &xd->block[ib + iblock[i]];
-    vp9_recon_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
-                  b->dst_stride);
-  }
-}
-
-void vp9_encode_intra8x8mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
-  int i, ib;
-
-  for (i = 0; i < 4; i++) {
-    ib = vp9_i8x8_block[i];
-    vp9_encode_intra8x8(rtcd, x, ib);
-  }
-}
-
-void vp9_encode_intra_uv4x4(const VP9_ENCODER_RTCD *rtcd,
-                            MACROBLOCK *x, int ib,
-                            int mode, int second) {
-  BLOCKD *b = &x->e_mbd.block[ib];
-  BLOCK *be = &x->block[ib];
-
-#if CONFIG_COMP_INTRA_PRED
-  if (second == -1) {
-#endif
-    vp9_intra_uv4x4_predict(b, mode, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-  } else {
-    vp9_comp_intra_uv4x4_predict(b, mode, second, b->predictor);
-  }
-#endif
-
-  vp9_subtract_b(be, b, 8);
-
-  x->vp9_short_fdct4x4(be->src_diff, be->coeff, 16);
-  x->quantize_b_4x4(be, b);
-  vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 16);
-
-  vp9_recon_uv_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
-                   b->dst_stride);
-}
-
-void vp9_encode_intra8x8mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
-  int i, ib, mode, second;
-  BLOCKD *b;
-
-  for (i = 0; i < 4; i++) {
-    ib = vp9_i8x8_block[i];
-    b = &x->e_mbd.block[ib];
-    mode = b->bmi.as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-    second = b->bmi.as_mode.second;
-#else
-    second = -1;
-#endif
-    /* u */
-    vp9_encode_intra_uv4x4(rtcd, x, i + 16, mode, second);
-    /* v */
-    vp9_encode_intra_uv4x4(rtcd, x, i + 20, mode, second);
-  }
-}
--- a/vp8/encoder/encodeintra.h
+++ /dev/null
@@ -1,27 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __ENCODEINTRA_H_
-#define __ENCODEINTRA_H_
-
-#include "onyx_int.h"
-
-int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
-void vp9_encode_intra16x16mby(const VP9_ENCODER_RTCD *, MACROBLOCK *x);
-void vp9_encode_intra16x16mbuv(const VP9_ENCODER_RTCD *, MACROBLOCK *x);
-void vp9_encode_intra4x4mby(const VP9_ENCODER_RTCD *, MACROBLOCK *mb);
-void vp9_encode_intra4x4block(const VP9_ENCODER_RTCD *rtcd,
-                              MACROBLOCK *x, int ib);
-void vp9_encode_intra8x8mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
-void vp9_encode_intra8x8mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
-void vp9_encode_intra8x8(const VP9_ENCODER_RTCD *rtcd,
-                         MACROBLOCK *x, int ib);
-
-#endif  // __ENCODEINTRA_H_
--- a/vp8/encoder/encodemb.c
+++ /dev/null
@@ -1,950 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_ports/config.h"
-#include "encodemb.h"
-#include "vp8/common/reconinter.h"
-#include "quantize.h"
-#include "tokenize.h"
-#include "vp8/common/invtrans.h"
-#include "vp8/common/reconintra.h"
-#include "vpx_mem/vpx_mem.h"
-#include "rdopt.h"
-#include "vp8/common/systemdependent.h"
-#include "vpx_rtcd.h"
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
-void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) {
-  unsigned char *src_ptr = (*(be->base_src) + be->src);
-  short *diff_ptr = be->src_diff;
-  unsigned char *pred_ptr = bd->predictor;
-  int src_stride = be->src_stride;
-
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      diff_ptr[c] = src_ptr[c] - pred_ptr[c];
-    }
-
-    diff_ptr += pitch;
-    pred_ptr += pitch;
-    src_ptr  += src_stride;
-  }
-}
-
-void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) {
-  unsigned char *src_ptr = (*(be->base_src) + be->src);
-  short *diff_ptr = be->src_diff;
-  unsigned char *pred_ptr = bd->predictor;
-  int src_stride = be->src_stride;
-  int r, c;
-
-  for (r = 0; r < 8; r++) {
-    for (c = 0; c < 8; c++) {
-      diff_ptr[c] = src_ptr[c] - pred_ptr[c];
-    }
-    diff_ptr += pitch;
-    pred_ptr += pitch;
-    src_ptr  += src_stride;
-  }
-}
-
-void vp9_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
-                           const unsigned char *vsrc, int src_stride,
-                           const unsigned char *upred,
-                           const unsigned char *vpred, int dst_stride) {
-  short *udiff = diff + 256;
-  short *vdiff = diff + 320;
-  int r, c;
-
-  for (r = 0; r < 8; r++) {
-    for (c = 0; c < 8; c++) {
-      udiff[c] = usrc[c] - upred[c];
-    }
-
-    udiff += 8;
-    upred += dst_stride;
-    usrc  += src_stride;
-  }
-
-  for (r = 0; r < 8; r++) {
-    for (c = 0; c < 8; c++) {
-      vdiff[c] = vsrc[c] - vpred[c];
-    }
-
-    vdiff += 8;
-    vpred += dst_stride;
-    vsrc  += src_stride;
-  }
-}
-
-void vp9_subtract_mbuv_c(short *diff, unsigned char *usrc,
-                         unsigned char *vsrc, unsigned char *pred, int stride) {
-  unsigned char *upred = pred + 256;
-  unsigned char *vpred = pred + 320;
-
-  vp9_subtract_mbuv_s_c(diff, usrc, vsrc, stride, upred, vpred, 8);
-}
-
-void vp9_subtract_mby_s_c(short *diff, const unsigned char *src, int src_stride,
-                          const unsigned char *pred, int dst_stride) {
-  int r, c;
-
-  for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++) {
-      diff[c] = src[c] - pred[c];
-    }
-
-    diff += 16;
-    pred += dst_stride;
-    src  += src_stride;
-  }
-}
-
-void vp9_subtract_mby_c(short *diff, unsigned char *src,
-                        unsigned char *pred, int stride) {
-  vp9_subtract_mby_s_c(diff, src, stride, pred, 16);
-}
-
-static void subtract_mb(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
-  BLOCK *b = &x->block[0];
-
-  vp9_subtract_mby(x->src_diff, *(b->base_src), x->e_mbd.predictor,
-                   b->src_stride);
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-}
-
-static void build_dcblock_4x4(MACROBLOCK *x) {
-  short *src_diff_ptr = &x->src_diff[384];
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    src_diff_ptr[i] = x->coeff[i * 16];
-  }
-}
-
-void vp9_transform_mby_4x4(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 16; i += 2) {
-    x->vp9_short_fdct8x4(&x->block[i].src_diff[0],
-                         &x->block[i].coeff[0], 32);
-  }
-
-  if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
-    // build dc block from 16 y dc values
-    build_dcblock_4x4(x);
-
-    // do 2nd order transform on the dc block
-    x->short_walsh4x4(&x->block[24].src_diff[0],
-                      &x->block[24].coeff[0], 8);
-  }
-}
-
-void vp9_transform_mbuv_4x4(MACROBLOCK *x) {
-  int i;
-
-  for (i = 16; i < 24; i += 2) {
-    x->vp9_short_fdct8x4(&x->block[i].src_diff[0],
-                         &x->block[i].coeff[0], 16);
-  }
-}
-
-static void transform_mb_4x4(MACROBLOCK *x) {
-  vp9_transform_mby_4x4(x);
-  vp9_transform_mbuv_4x4(x);
-}
-
-static void build_dcblock_8x8(MACROBLOCK *x) {
-  int16_t *src_diff_ptr = x->block[24].src_diff;
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    src_diff_ptr[i] = 0;
-  }
-  src_diff_ptr[0] = x->coeff[0 * 16];
-  src_diff_ptr[1] = x->coeff[4 * 16];
-  src_diff_ptr[4] = x->coeff[8 * 16];
-  src_diff_ptr[8] = x->coeff[12 * 16];
-}
-
-void vp9_transform_mby_8x8(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 9; i += 8) {
-    x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
-                         &x->block[i].coeff[0], 32);
-  }
-  for (i = 2; i < 11; i += 8) {
-    x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
-                         &x->block[i + 2].coeff[0], 32);
-  }
-
-  if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
-    // build dc block from 2x2 y dc values
-    build_dcblock_8x8(x);
-
-    // do 2nd order transform on the dc block
-    x->short_fhaar2x2(&x->block[24].src_diff[0],
-                      &x->block[24].coeff[0], 8);
-  }
-}
-
-void vp9_transform_mbuv_8x8(MACROBLOCK *x) {
-  int i;
-
-  for (i = 16; i < 24; i += 4) {
-    x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
-                         &x->block[i].coeff[0], 16);
-  }
-}
-
-void vp9_transform_mb_8x8(MACROBLOCK *x) {
-  vp9_transform_mby_8x8(x);
-  vp9_transform_mbuv_8x8(x);
-}
-
-void vp9_transform_mby_16x16(MACROBLOCK *x) {
-  vp9_clear_system_state();
-  x->vp9_short_fdct16x16(&x->block[0].src_diff[0],
-                         &x->block[0].coeff[0], 32);
-}
-
-void vp9_transform_mb_16x16(MACROBLOCK *x) {
-  vp9_transform_mby_16x16(x);
-  vp9_transform_mbuv_8x8(x);
-}
-
-#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
-#define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
-typedef struct vp9_token_state vp9_token_state;
-
-struct vp9_token_state {
-  int           rate;
-  int           error;
-  int           next;
-  signed char   token;
-  short         qc;
-};
-
-// TODO: run experiments to find optimal multiplier values
-#define Y1_RD_MULT 4
-#define UV_RD_MULT 2
-#define Y2_RD_MULT 4
-
-static const int plane_rd_mult[4] = {
-  Y1_RD_MULT,
-  Y2_RD_MULT,
-  UV_RD_MULT,
-  Y1_RD_MULT
-};
-
-#define UPDATE_RD_COST()\
-{\
-  rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
-  rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
-  if (rd_cost0 == rd_cost1) {\
-    rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
-    rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
-  }\
-}
-
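-// Trellis quantization: for each nonzero coefficient, two candidate states
-// are tracked (the original quantized value, and that value stepped one
-// level towards zero when the step cannot overshoot), rate/distortion is
-// accumulated along both paths, and the cheaper path is walked back to
-// emit the final coefficients and EOB.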
-static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
-                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                       const VP9_ENCODER_RTCD *rtcd, int tx_size) {
-  BLOCK *b;
-  BLOCKD *d;
-  vp9_token_state tokens[65][2];
-  uint64_t best_mask[2];
-  const short *dequant_ptr;
-  const short *coeff_ptr;
-  short *qcoeff_ptr;
-  short *dqcoeff_ptr;
-  int eob;
-  int i0;
-  int rc;
-  int x;
-  int sz = 0;
-  int next;
-  int rdmult;
-  int rddiv;
-  int final_eob;
-  int64_t rd_cost0, rd_cost1;
-  int rate0, rate1;
-  int error0, error1;
-  int t0, t1;
-  int best;
-  int band;
-  int pt;
-  int err_mult = plane_rd_mult[type];
-  int default_eob;
-  int const *scan, *bands;
-
-  b = &mb->block[i];
-  d = &mb->e_mbd.block[i];
-  switch (tx_size) {
-    default:
-    case TX_4X4:
-      scan = vp9_default_zig_zag1d;
-      bands = vp9_coef_bands;
-      default_eob = 16;
-      // TODO: this isn't called (for intra4x4 modes), but will be left in
-      // since it could be used later
-      {
-        TX_TYPE tx_type = get_tx_type(&mb->e_mbd, d);
-        if (tx_type != DCT_DCT) {
-          switch (tx_type) {
-            case ADST_DCT:
-              scan = vp9_row_scan;
-              break;
-
-            case DCT_ADST:
-              scan = vp9_col_scan;
-              break;
-
-            default:
-              scan = vp9_default_zig_zag1d;
-              break;
-          }
-        } else {
-          scan = vp9_default_zig_zag1d;
-        }
-      }
-      break;
-    case TX_8X8:
-      scan = vp9_default_zig_zag1d_8x8;
-      bands = vp9_coef_bands_8x8;
-      default_eob = 64;
-      break;
-  }
-
-  dequant_ptr = d->dequant;
-  coeff_ptr = b->coeff;
-  qcoeff_ptr = d->qcoeff;
-  dqcoeff_ptr = d->dqcoeff;
-  i0 = (type == PLANE_TYPE_Y_NO_DC);
-  eob = d->eob;
-
-  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
-  rdmult = mb->rdmult * err_mult;
-  if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
-    rdmult = (rdmult * 9) >> 4;
-  rddiv = mb->rddiv;
-  best_mask[0] = best_mask[1] = 0;
-  /* Initialize the sentinel node of the trellis. */
-  tokens[eob][0].rate = 0;
-  tokens[eob][0].error = 0;
-  tokens[eob][0].next = default_eob;
-  tokens[eob][0].token = DCT_EOB_TOKEN;
-  tokens[eob][0].qc = 0;
-  *(tokens[eob] + 1) = *(tokens[eob] + 0);
-  next = eob;
-  for (i = eob; i-- > i0;) {
-    int base_bits;
-    int d2;
-    int dx;
-
-    rc = scan[i];
-    x = qcoeff_ptr[rc];
-    /* Only add a trellis state for non-zero coefficients. */
-    if (x) {
-      int shortcut = 0;
-      error0 = tokens[next][0].error;
-      error1 = tokens[next][1].error;
-      /* Evaluate the first possibility for this state. */
-      rate0 = tokens[next][0].rate;
-      rate1 = tokens[next][1].rate;
-      t0 = (vp9_dct_value_tokens_ptr + x)->Token;
-      /* Consider both possible successor states. */
-      if (next < default_eob) {
-        band = bands[i + 1];
-        pt = vp9_prev_token_class[t0];
-        rate0 +=
-          mb->token_costs[tx_size][type][band][pt][tokens[next][0].token];
-        rate1 +=
-          mb->token_costs[tx_size][type][band][pt][tokens[next][1].token];
-      }
-      UPDATE_RD_COST();
-      /* And pick the best. */
-      best = rd_cost1 < rd_cost0;
-      base_bits = *(vp9_dct_value_cost_ptr + x);
-      dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
-      d2 = dx * dx;
-      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
-      tokens[i][0].error = d2 + (best ? error1 : error0);
-      tokens[i][0].next = next;
-      tokens[i][0].token = t0;
-      tokens[i][0].qc = x;
-      best_mask[0] |= best << i;
-      /* Evaluate the second possibility for this state. */
-      rate0 = tokens[next][0].rate;
-      rate1 = tokens[next][1].rate;
-
-      if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc])) &&
-          (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) + dequant_ptr[rc != 0]))
-        shortcut = 1;
-      else
-        shortcut = 0;
-
-      if (shortcut) {
-        sz = -(x < 0);
-        x -= 2 * sz + 1;
-      }
-
-      /* Consider both possible successor states. */
-      if (!x) {
-        /* If we reduced this coefficient to zero, check to see if
-         *  we need to move the EOB back here.
-         */
-        t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
-             DCT_EOB_TOKEN : ZERO_TOKEN;
-        t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
-             DCT_EOB_TOKEN : ZERO_TOKEN;
-      } else {
-        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;
-      }
-      if (next < default_eob) {
-        band = bands[i + 1];
-        if (t0 != DCT_EOB_TOKEN) {
-          pt = vp9_prev_token_class[t0];
-          rate0 += mb->token_costs[tx_size][type][band][pt][
-              tokens[next][0].token];
-        }
-        if (t1 != DCT_EOB_TOKEN) {
-          pt = vp9_prev_token_class[t1];
-          rate1 += mb->token_costs[tx_size][type][band][pt][
-              tokens[next][1].token];
-        }
-      }
-
-      UPDATE_RD_COST();
-      /* And pick the best. */
-      best = rd_cost1 < rd_cost0;
-      base_bits = *(vp9_dct_value_cost_ptr + x);
-
-      if (shortcut) {
-        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
-        d2 = dx * dx;
-      }
-      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
-      tokens[i][1].error = d2 + (best ? error1 : error0);
-      tokens[i][1].next = next;
-      tokens[i][1].token = best ? t1 : t0;
-      tokens[i][1].qc = x;
-      best_mask[1] |= best << i;
-      /* Finally, make this the new head of the trellis. */
-      next = i;
-    }
-    /* There's no choice to make for a zero coefficient, so we don't
-     *  add a new trellis node, but we do need to update the costs.
-     */
-    else {
-      band = bands[i + 1];
-      t0 = tokens[next][0].token;
-      t1 = tokens[next][1].token;
-      /* Update the cost of each path if we're past the EOB token. */
-      if (t0 != DCT_EOB_TOKEN) {
-        tokens[next][0].rate += mb->token_costs[tx_size][type][band][0][t0];
-        tokens[next][0].token = ZERO_TOKEN;
-      }
-      if (t1 != DCT_EOB_TOKEN) {
-        tokens[next][1].rate += mb->token_costs[tx_size][type][band][0][t1];
-        tokens[next][1].token = ZERO_TOKEN;
-      }
-      /* Don't update next, because we didn't add a new node. */
-    }
-  }
-
-  /* Now pick the best path through the whole trellis. */
-  band = bands[i + 1];
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  rate0 = tokens[next][0].rate;
-  rate1 = tokens[next][1].rate;
-  error0 = tokens[next][0].error;
-  error1 = tokens[next][1].error;
-  t0 = tokens[next][0].token;
-  t1 = tokens[next][1].token;
-  rate0 += mb->token_costs[tx_size][type][band][pt][t0];
-  rate1 += mb->token_costs[tx_size][type][band][pt][t1];
-  UPDATE_RD_COST();
-  best = rd_cost1 < rd_cost0;
-  final_eob = i0 - 1;
-  for (i = next; i < eob; i = next) {
-    x = tokens[i][best].qc;
-    if (x)
-      final_eob = i;
-    rc = scan[i];
-    qcoeff_ptr[rc] = x;
-    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]);
-
-    next = tokens[i][best].next;
-    best = (best_mask[best] >> i) & 1;
-  }
-  final_eob++;
-
-  d->eob = final_eob;
-  *a = *l = (d->eob != !type);
-}
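For reference, optimize_b() decides between the two candidate roundings with UPDATE_RD_COST(), defined near the top of this file. A sketch of those macros as assumed from the VP8-era source (treat the exact forms as an assumption; the RDTRUNC tie-break keeps the comparison deterministic when the scaled costs collide):

  #define RDCOST(RM, DM, R, D)  (((128 + (R) * (RM)) >> 8) + (DM) * (D))
  #define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)

  #define UPDATE_RD_COST()                                \
    {                                                     \
      rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);    \
      rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);    \
      if (rd_cost0 == rd_cost1) {                         \
        rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0); \
        rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1); \
      }                                                   \
    }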
-
-/**************************************************************************
-Our inverse Hadamard transform is effectively a weighted sum of all 16
-inputs, each with weight +1 or -1, followed by a last-stage scaling of
-(sum + 1) >> 2. The DC-only IDCT is (dc + 16) >> 5. So if every stage sum
-lies between -65 and 63, the output after the inverse WHT and IDCT is all
-zero. A sum of absolute values smaller than 65 guarantees that all 16
-(+1/-1)-weighted sums in the WHT fall between -65 and +65.
-**************************************************************************/
-#define SUM_2ND_COEFF_THRESH 65
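A quick numeric check of the bound (illustrative only, assuming arithmetic right shifts):

  /* sum = -65: dc = (-65 + 1) >> 2 = -16;  out = (-16 + 16) >> 5 = 0
   * sum =  62: dc = ( 62 + 1) >> 2 =  15;  out = ( 15 + 16) >> 5 = 0
   * so stage sums inside the band reconstruct to all-zero output. */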
-
-static void check_reset_2nd_coeffs(MACROBLOCKD *xd,
-                                   ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
-  int sum = 0;
-  int i;
-  BLOCKD *bd = &xd->block[24];
-  if (bd->dequant[0] >= SUM_2ND_COEFF_THRESH
-      && bd->dequant[1] >= SUM_2ND_COEFF_THRESH)
-    return;
-
-  for (i = 0; i < bd->eob; i++) {
-    int coef = bd->dqcoeff[vp9_default_zig_zag1d[i]];
-    sum += (coef >= 0) ? coef : -coef;
-    if (sum >= SUM_2ND_COEFF_THRESH)
-      return;
-  }
-
-  if (sum < SUM_2ND_COEFF_THRESH) {
-    for (i = 0; i < bd->eob; i++) {
-      int rc = vp9_default_zig_zag1d[i];
-      bd->qcoeff[rc] = 0;
-      bd->dqcoeff[rc] = 0;
-    }
-    bd->eob = 0;
-    *a = *l = (bd->eob != 0);
-  }
-}
-
-#define SUM_2ND_COEFF_THRESH_8X8 32
-static void check_reset_8x8_2nd_coeffs(MACROBLOCKD *xd,
-                                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
-  int sum = 0;
-  BLOCKD *bd = &xd->block[24];
-  int coef;
-
-  coef = bd->dqcoeff[0];
-  sum += (coef >= 0) ? coef : -coef;
-  coef = bd->dqcoeff[1];
-  sum += (coef >= 0) ? coef : -coef;
-  coef = bd->dqcoeff[4];
-  sum += (coef >= 0) ? coef : -coef;
-  coef = bd->dqcoeff[8];
-  sum += (coef >= 0) ? coef : -coef;
-
-  if (sum < SUM_2ND_COEFF_THRESH_8X8) {
-    bd->qcoeff[0] = 0;
-    bd->dqcoeff[0] = 0;
-    bd->qcoeff[1] = 0;
-    bd->dqcoeff[1] = 0;
-    bd->qcoeff[4] = 0;
-    bd->dqcoeff[4] = 0;
-    bd->qcoeff[8] = 0;
-    bd->dqcoeff[8] = 0;
-    bd->eob = 0;
-    *a = *l = (bd->eob != 0);
-  }
-}
-
-void vp9_optimize_mby_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  int b;
-  PLANE_TYPE type;
-  int has_2nd_order;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
-  MB_PREDICTION_MODE mode = x->e_mbd.mode_info_context->mbmi.mode;
-
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
-    return;
-
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  has_2nd_order = (mode != B_PRED && mode != I8X8_PRED && mode != SPLITMV);
-  type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
-
-  for (b = 0; b < 16; b++) {
-    optimize_b(x, b, type,
-               ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
-  }
-
-  if (has_2nd_order) {
-    b = 24;
-    optimize_b(x, b, PLANE_TYPE_Y2,
-               ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
-    check_reset_2nd_coeffs(&x->e_mbd,
-                           ta + vp9_block2above[b], tl + vp9_block2left[b]);
-  }
-}
-
-void vp9_optimize_mbuv_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  int b;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
-
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
-    return;
-
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  for (b = 16; b < 24; b++) {
-    optimize_b(x, b, PLANE_TYPE_UV,
-               ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
-  }
-}
-
-static void optimize_mb_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  vp9_optimize_mby_4x4(x, rtcd);
-  vp9_optimize_mbuv_4x4(x, rtcd);
-}
-
-void vp9_optimize_mby_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  int b;
-  PLANE_TYPE type;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
-  int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
-
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
-    return;
-
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-  type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
-  for (b = 0; b < 16; b += 4) {
-    optimize_b(x, b, type,
-               ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
-               rtcd, TX_8X8);
-    ta[vp9_block2above_8x8[b] + 1] = ta[vp9_block2above_8x8[b]];
-    tl[vp9_block2left_8x8[b] + 1]  = tl[vp9_block2left_8x8[b]];
-  }
-
-  // 8x8 blocks always have a 2nd order Haar block.
-  if (has_2nd_order) {
-    check_reset_8x8_2nd_coeffs(&x->e_mbd,
-                               ta + vp9_block2above_8x8[24],
-                               tl + vp9_block2left_8x8[24]);
-  }
-}
-
-void vp9_optimize_mbuv_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  int b;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
-
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
-    return;
-
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  for (b = 16; b < 24; b += 4) {
-    optimize_b(x, b, PLANE_TYPE_UV,
-               ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
-               rtcd, TX_8X8);
-    ta[vp9_block2above_8x8[b] + 1] = ta[vp9_block2above_8x8[b]];
-    tl[vp9_block2left_8x8[b] + 1]  = tl[vp9_block2left_8x8[b]];
-  }
-}
-
-static void optimize_mb_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  vp9_optimize_mby_8x8(x, rtcd);
-  vp9_optimize_mbuv_8x8(x, rtcd);
-}
-
-static void optimize_b_16x16(MACROBLOCK *mb, int i, PLANE_TYPE type,
-                             ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                             const VP9_ENCODER_RTCD *rtcd) {
-  BLOCK *b = &mb->block[i];
-  BLOCKD *d = &mb->e_mbd.block[i];
-  vp9_token_state tokens[257][2];
-  unsigned best_index[257][2];
-  const short *dequant_ptr = d->dequant, *coeff_ptr = b->coeff;
-  short *qcoeff_ptr = d->qcoeff;
-  short *dqcoeff_ptr = d->dqcoeff;
-  int eob = d->eob, final_eob, sz = 0;
-  int rc, x, next;
-  int64_t rdmult, rddiv, rd_cost0, rd_cost1;
-  int rate0, rate1, error0, error1, t0, t1;
-  int best, band, pt;
-  int err_mult = plane_rd_mult[type];
-
-  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
-  rdmult = mb->rdmult * err_mult;
-  if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
-    rdmult = (rdmult * 9) >> 4;
-  rddiv = mb->rddiv;
-  memset(best_index, 0, sizeof(best_index));
-  /* Initialize the sentinel node of the trellis. */
-  tokens[eob][0].rate = 0;
-  tokens[eob][0].error = 0;
-  tokens[eob][0].next = 256;
-  tokens[eob][0].token = DCT_EOB_TOKEN;
-  tokens[eob][0].qc = 0;
-  tokens[eob][1] = tokens[eob][0];
-  next = eob;
-  for (i = eob; i-- > 0;) {
-    int base_bits, d2, dx;
-
-    rc = vp9_default_zig_zag1d_16x16[i];
-    x = qcoeff_ptr[rc];
-    /* Only add a trellis state for non-zero coefficients. */
-    if (x) {
-      int shortcut = 0;
-      error0 = tokens[next][0].error;
-      error1 = tokens[next][1].error;
-      /* Evaluate the first possibility for this state. */
-      rate0 = tokens[next][0].rate;
-      rate1 = tokens[next][1].rate;
-      t0 = (vp9_dct_value_tokens_ptr + x)->Token;
-      /* Consider both possible successor states. */
-      if (next < 256) {
-        band = vp9_coef_bands_16x16[i + 1];
-        pt = vp9_prev_token_class[t0];
-        rate0 +=
-          mb->token_costs[TX_16X16][type][band][pt][tokens[next][0].token];
-        rate1 +=
-          mb->token_costs[TX_16X16][type][band][pt][tokens[next][1].token];
-      }
-      UPDATE_RD_COST();
-      /* And pick the best. */
-      best = rd_cost1 < rd_cost0;
-      base_bits = *(vp9_dct_value_cost_ptr + x);
-      dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
-      d2 = dx * dx;
-      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
-      tokens[i][0].error = d2 + (best ? error1 : error0);
-      tokens[i][0].next = next;
-      tokens[i][0].token = t0;
-      tokens[i][0].qc = x;
-      best_index[i][0] = best;
-      /* Evaluate the second possibility for this state. */
-      rate0 = tokens[next][0].rate;
-      rate1 = tokens[next][1].rate;
-
-      if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff_ptr[rc])) &&
-          (abs(x) * dequant_ptr[rc != 0] <
-           abs(coeff_ptr[rc]) + dequant_ptr[rc != 0]))
-        shortcut = 1;
-      else
-        shortcut = 0;
-
-      if (shortcut) {
-        sz = -(x < 0);
-        x -= 2 * sz + 1;
-      }
-
-      /* Consider both possible successor states. */
-      if (!x) {
-        /* If we reduced this coefficient to zero, check to see if
-         *  we need to move the EOB back here.
-         */
-        t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
-             DCT_EOB_TOKEN : ZERO_TOKEN;
-        t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
-             DCT_EOB_TOKEN : ZERO_TOKEN;
-      } else {
-        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;
-      }
-      if (next < 256) {
-        band = vp9_coef_bands_16x16[i + 1];
-        if (t0 != DCT_EOB_TOKEN) {
-          pt = vp9_prev_token_class[t0];
-          rate0 += mb->token_costs[TX_16X16][type][band][pt]
-              [tokens[next][0].token];
-        }
-        if (t1 != DCT_EOB_TOKEN) {
-          pt = vp9_prev_token_class[t1];
-          rate1 += mb->token_costs[TX_16X16][type][band][pt]
-              [tokens[next][1].token];
-        }
-      }
-      UPDATE_RD_COST();
-      /* And pick the best. */
-      best = rd_cost1 < rd_cost0;
-      base_bits = *(vp9_dct_value_cost_ptr + x);
-
-      if (shortcut) {
-        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
-        d2 = dx * dx;
-      }
-      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
-      tokens[i][1].error = d2 + (best ? error1 : error0);
-      tokens[i][1].next = next;
-      tokens[i][1].token = best ? t1 : t0;
-      tokens[i][1].qc = x;
-      best_index[i][1] = best;
-      /* Finally, make this the new head of the trellis. */
-      next = i;
-    }
-    /* There's no choice to make for a zero coefficient, so we don't
-     *  add a new trellis node, but we do need to update the costs.
-     */
-    else {
-      band = vp9_coef_bands_16x16[i + 1];
-      t0 = tokens[next][0].token;
-      t1 = tokens[next][1].token;
-      /* Update the cost of each path if we're past the EOB token. */
-      if (t0 != DCT_EOB_TOKEN) {
-        tokens[next][0].rate += mb->token_costs[TX_16X16][type][band][0][t0];
-        tokens[next][0].token = ZERO_TOKEN;
-      }
-      if (t1 != DCT_EOB_TOKEN) {
-        tokens[next][1].rate += mb->token_costs[TX_16X16][type][band][0][t1];
-        tokens[next][1].token = ZERO_TOKEN;
-      }
-      /* Don't update next, because we didn't add a new node. */
-    }
-  }
-
-  /* Now pick the best path through the whole trellis. */
-  band = vp9_coef_bands_16x16[i + 1];
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  rate0 = tokens[next][0].rate;
-  rate1 = tokens[next][1].rate;
-  error0 = tokens[next][0].error;
-  error1 = tokens[next][1].error;
-  t0 = tokens[next][0].token;
-  t1 = tokens[next][1].token;
-  rate0 += mb->token_costs[TX_16X16][type][band][pt][t0];
-  rate1 += mb->token_costs[TX_16X16][type][band][pt][t1];
-  UPDATE_RD_COST();
-  best = rd_cost1 < rd_cost0;
-  final_eob = -1;
-
-  for (i = next; i < eob; i = next) {
-    x = tokens[i][best].qc;
-    if (x)
-      final_eob = i;
-    rc = vp9_default_zig_zag1d_16x16[i];
-    qcoeff_ptr[rc] = x;
-    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]);
-
-    next = tokens[i][best].next;
-    best = best_index[i][best];
-  }
-  final_eob++;
-
-  d->eob = final_eob;
-  *a = *l = (d->eob != !type);
-}
-
-void vp9_optimize_mby_16x16(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
-    return;
-
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-  optimize_b_16x16(x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, rtcd);
-}
-
-static void optimize_mb_16x16(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  vp9_optimize_mby_16x16(x, rtcd);
-  vp9_optimize_mbuv_8x8(x, rtcd);
-}
-
-void vp9_encode_inter16x16(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-
-  vp9_build_inter_predictors_mb(xd);
-  subtract_mb(rtcd, x);
-
-  if (tx_size == TX_16X16) {
-    vp9_transform_mb_16x16(x);
-    vp9_quantize_mb_16x16(x);
-    if (x->optimize)
-      optimize_mb_16x16(x, rtcd);
-    vp9_inverse_transform_mb_16x16(IF_RTCD(&rtcd->common->idct), xd);
-  } else if (tx_size == TX_8X8) {
-    if (xd->mode_info_context->mbmi.mode == SPLITMV) {
-      assert(xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4);
-      vp9_transform_mby_8x8(x);
-      vp9_transform_mbuv_4x4(x);
-      vp9_quantize_mby_8x8(x);
-      vp9_quantize_mbuv_4x4(x);
-      if (x->optimize) {
-        vp9_optimize_mby_8x8(x, rtcd);
-        vp9_optimize_mbuv_4x4(x, rtcd);
-      }
-      vp9_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), xd);
-      vp9_inverse_transform_mbuv_4x4(IF_RTCD(&rtcd->common->idct), xd);
-    } else {
-      vp9_transform_mb_8x8(x);
-      vp9_quantize_mb_8x8(x);
-      if (x->optimize)
-        optimize_mb_8x8(x, rtcd);
-      vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), xd);
-    }
-  } else {
-    transform_mb_4x4(x);
-    vp9_quantize_mb_4x4(x);
-    if (x->optimize)
-      optimize_mb_4x4(x, rtcd);
-    vp9_inverse_transform_mb_4x4(IF_RTCD(&rtcd->common->idct), xd);
-  }
-
-  vp9_recon_mb(xd);
-}
-
-/* this function is used by first pass only */
-void vp9_encode_inter16x16y(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-
-#if CONFIG_PRED_FILTER
-  // Disable the prediction filter for firstpass
-  xd->mode_info_context->mbmi.pred_filter_enabled = 0;
-#endif
-
-  vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
-
-  vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
-
-  vp9_transform_mby_4x4(x);
-  vp9_quantize_mby_4x4(x);
-  vp9_inverse_transform_mby_4x4(IF_RTCD(&rtcd->common->idct), xd);
-
-  vp9_recon_mby(xd);
-}
--- a/vp8/encoder/encodemb.h
+++ /dev/null
@@ -1,70 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ENCODEMB_H
-#define __INC_ENCODEMB_H
-
-#include "vpx_ports/config.h"
-#include "block.h"
-
-typedef struct {
-  MB_PREDICTION_MODE mode;
-  MV_REFERENCE_FRAME ref_frame;
-  MV_REFERENCE_FRAME second_ref_frame;
-#if CONFIG_PRED_FILTER
-  int pred_filter_flag;
-#endif
-} MODE_DEFINITION;
-
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define ENCODEMB_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define ENCODEMB_INVOKE(ctx,fn) vp9_encodemb_##fn
-#endif
-
-
-
-#include "onyx_int.h"
-struct VP9_ENCODER_RTCD;
-void vp9_encode_inter16x16(const struct VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
-
-void vp9_transform_mbuv_4x4(MACROBLOCK *x);
-void vp9_transform_mby_4x4(MACROBLOCK *x);
-
-void vp9_optimize_mby_4x4(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-void vp9_optimize_mbuv_4x4(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-void vp9_encode_inter16x16y(const struct VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
-
-void vp9_transform_mb_8x8(MACROBLOCK *mb);
-void vp9_transform_mby_8x8(MACROBLOCK *x);
-void vp9_transform_mbuv_8x8(MACROBLOCK *x);
-void vp9_build_dcblock_8x8(MACROBLOCK *b);
-void vp9_optimize_mby_8x8(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-void vp9_optimize_mbuv_8x8(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-
-void vp9_transform_mb_16x16(MACROBLOCK *mb);
-void vp9_transform_mby_16x16(MACROBLOCK *x);
-void vp9_optimize_mby_16x16(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-
-void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
-
-#if CONFIG_SUPERBLOCKS
-void vp9_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
-                           const unsigned char *vsrc, int src_stride,
-                           const unsigned char *upred,
-                           const unsigned char *vpred, int dst_stride);
-void vp9_subtract_mby_s_c(short *diff, const unsigned char *src,
-                          int src_stride, const unsigned char *pred,
-                          int dst_stride);
-#endif
-
-#endif
--- a/vp8/encoder/encodemv.c
+++ /dev/null
@@ -1,547 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/common.h"
-#include "encodemv.h"
-#include "vp8/common/entropymode.h"
-#include "vp8/common/systemdependent.h"
-
-#include <math.h>
-
-#ifdef ENTROPY_STATS
-extern unsigned int active_section;
-#endif
-
-#ifdef NMV_STATS
-nmv_context_counts tnmvcounts;
-#endif
-
-static void encode_nmv_component(vp9_writer* const bc,
-                                 int v,
-                                 int r,
-                                 const nmv_component* const mvcomp) {
-  int s, z, c, o, d;
-  assert(v != 0);             /* should not be zero */
-  s = v < 0;
-  vp9_write(bc, s, mvcomp->sign);
-  z = (s ? -v : v) - 1;       /* magnitude - 1 */
-
-  c = vp9_get_mv_class(z, &o);
-
-  write_token(bc, vp9_mv_class_tree, mvcomp->classes,
-              vp9_mv_class_encodings + c);
-
-  d = (o >> 3);               /* int mv data */
-
-  if (c == MV_CLASS_0) {
-    write_token(bc, vp9_mv_class0_tree, mvcomp->class0,
-                vp9_mv_class0_encodings + d);
-  } else {
-    int i, b;
-    b = c + CLASS0_BITS - 1;  /* number of bits */
-    for (i = 0; i < b; ++i)
-      vp9_write(bc, ((d >> i) & 1), mvcomp->bits[i]);
-  }
-}
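A worked decomposition of one component (class-0 case; it is assumed here that vp9_get_mv_class() returns o == z for magnitudes below CLASS0_SIZE * 8):

  /* v = -10 in 1/8-pel units:
   *   s = 1                    sign (negative)
   *   z = |v| - 1 = 9          magnitude minus one
   *   c = MV_CLASS_0, o = 9    class and in-class offset
   *   d = o >> 3       = 1     integer-pel part, coded via the class0 tree
   *   f = (o >> 1) & 3 = 0     fractional-pel part (coded in the _fp pass)
   *   e = o & 1        = 1     high-precision bit  (coded in the _fp pass)
   */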
-
-static void encode_nmv_component_fp(vp9_writer *bc,
-                                    int v,
-                                    int r,
-                                    const nmv_component* const mvcomp,
-                                    int usehp) {
-  int s, z, c, o, d, f, e;
-  assert(v != 0);             /* should not be zero */
-  s = v < 0;
-  z = (s ? -v : v) - 1;       /* magnitude - 1 */
-
-  c = vp9_get_mv_class(z, &o);
-
-  d = (o >> 3);               /* int mv data */
-  f = (o >> 1) & 3;           /* fractional pel mv data */
-  e = (o & 1);                /* high precision mv data */
-
-  /* Code the fractional pel bits */
-  if (c == MV_CLASS_0) {
-    write_token(bc, vp9_mv_fp_tree, mvcomp->class0_fp[d],
-                vp9_mv_fp_encodings + f);
-  } else {
-    write_token(bc, vp9_mv_fp_tree, mvcomp->fp,
-                vp9_mv_fp_encodings + f);
-  }
-  /* Code the high precision bit */
-  if (usehp) {
-    if (c == MV_CLASS_0) {
-      vp9_write(bc, e, mvcomp->class0_hp);
-    } else {
-      vp9_write(bc, e, mvcomp->hp);
-    }
-  }
-}
-
-static void build_nmv_component_cost_table(int *mvcost,
-                                           const nmv_component* const mvcomp,
-                                           int usehp) {
-  int i, v;
-  int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
-  int bits_cost[MV_OFFSET_BITS][2];
-  int class0_fp_cost[CLASS0_SIZE][4], fp_cost[4];
-  int class0_hp_cost[2], hp_cost[2];
-
-  sign_cost[0] = vp9_cost_zero(mvcomp->sign);
-  sign_cost[1] = vp9_cost_one(mvcomp->sign);
-  vp9_cost_tokens(class_cost, mvcomp->classes, vp9_mv_class_tree);
-  vp9_cost_tokens(class0_cost, mvcomp->class0, vp9_mv_class0_tree);
-  for (i = 0; i < MV_OFFSET_BITS; ++i) {
-    bits_cost[i][0] = vp9_cost_zero(mvcomp->bits[i]);
-    bits_cost[i][1] = vp9_cost_one(mvcomp->bits[i]);
-  }
-
-  for (i = 0; i < CLASS0_SIZE; ++i)
-    vp9_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], vp9_mv_fp_tree);
-  vp9_cost_tokens(fp_cost, mvcomp->fp, vp9_mv_fp_tree);
-
-  if (usehp) {
-    class0_hp_cost[0] = vp9_cost_zero(mvcomp->class0_hp);
-    class0_hp_cost[1] = vp9_cost_one(mvcomp->class0_hp);
-    hp_cost[0] = vp9_cost_zero(mvcomp->hp);
-    hp_cost[1] = vp9_cost_one(mvcomp->hp);
-  }
-  mvcost[0] = 0;
-  for (v = 1; v <= MV_MAX; ++v) {
-    int z, c, o, d, e, f, cost = 0;
-    z = v - 1;
-    c = vp9_get_mv_class(z, &o);
-    cost += class_cost[c];
-    d = (o >> 3);               /* int mv data */
-    f = (o >> 1) & 3;           /* fractional pel mv data */
-    e = (o & 1);                /* high precision mv data */
-    if (c == MV_CLASS_0) {
-      cost += class0_cost[d];
-    } else {
-      int i, b;
-      b = c + CLASS0_BITS - 1;  /* number of bits */
-      for (i = 0; i < b; ++i)
-        cost += bits_cost[i][((d >> i) & 1)];
-    }
-    if (c == MV_CLASS_0) {
-      cost += class0_fp_cost[d][f];
-    } else {
-      cost += fp_cost[f];
-    }
-    if (usehp) {
-      if (c == MV_CLASS_0) {
-        cost += class0_hp_cost[e];
-      } else {
-        cost += hp_cost[e];
-      }
-    }
-    mvcost[v] = cost + sign_cost[0];
-    mvcost[-v] = cost + sign_cost[1];
-  }
-}
-
-static int update_nmv_savings(const unsigned int ct[2],
-                              const vp9_prob cur_p,
-                              const vp9_prob new_p,
-                              const vp9_prob upd_p) {
-
-#ifdef LOW_PRECISION_MV_UPDATE
-  vp9_prob mod_p = new_p | 1;
-#else
-  vp9_prob mod_p = new_p;
-#endif
-  const int cur_b = cost_branch256(ct, cur_p);
-  const int mod_b = cost_branch256(ct, mod_p);
-  const int cost = 7 * 256 +
-#ifndef LOW_PRECISION_MV_UPDATE
-      256 +
-#endif
-      (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
-  if (cur_b - mod_b - cost > 0) {
-    return cur_b - mod_b - cost;
-  } else {
-    return -vp9_cost_zero(upd_p);
-  }
-}
-
-static int update_nmv(
-  vp9_writer *const bc,
-  const unsigned int ct[2],
-  vp9_prob *const cur_p,
-  const vp9_prob new_p,
-  const vp9_prob upd_p) {
-
-#ifdef LOW_PRECISION_MV_UPDATE
-  vp9_prob mod_p = new_p | 1;
-#else
-  vp9_prob mod_p = new_p;
-#endif
-
-  const int cur_b = cost_branch256(ct, *cur_p);
-  const int mod_b = cost_branch256(ct, mod_p);
-  const int cost = 7 * 256 +
-#ifndef LOW_PRECISION_MV_UPDATE
-      256 +
-#endif
-      (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
-
-  if (cur_b - mod_b > cost) {
-    *cur_p = mod_p;
-    vp9_write(bc, 1, upd_p);
-#ifdef LOW_PRECISION_MV_UPDATE
-    vp9_write_literal(bc, mod_p >> 1, 7);
-#else
-    vp9_write_literal(bc, mod_p, 8);
-#endif
-    return 1;
-  } else {
-    vp9_write(bc, 0, upd_p);
-    return 0;
-  }
-}
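The savings arithmetic above is in the 1/256-bit units used by cost_branch256() and the vp9_cost_*() helpers, so the fixed charge decomposes as:

  /*   7 * 256                             the 7-bit literal written by
   *                                       vp9_write_literal(bc, mod_p >> 1, 7)
   * +     256                             the 8th bit when
   *                                       LOW_PRECISION_MV_UPDATE is unset
   * + cost_one(upd_p) - cost_zero(upd_p)  flagging "updated" vs. "kept"  */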
-
-#ifdef NMV_STATS
-void init_nmvstats() {
-  vp9_zero(tnmvcounts);
-}
-
-void print_nmvstats() {
-  nmv_context prob;
-  unsigned int branch_ct_joint[MV_JOINTS - 1][2];
-  unsigned int branch_ct_sign[2][2];
-  unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
-  unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
-  unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
-  unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
-  unsigned int branch_ct_fp[2][4 - 1][2];
-  unsigned int branch_ct_class0_hp[2][2];
-  unsigned int branch_ct_hp[2][2];
-  int i, j, k;
-  vp9_counts_to_nmv_context(&tnmvcounts, &prob, 1,
-                            branch_ct_joint, branch_ct_sign, branch_ct_classes,
-                            branch_ct_class0, branch_ct_bits,
-                            branch_ct_class0_fp, branch_ct_fp,
-                            branch_ct_class0_hp, branch_ct_hp);
-
-  printf("\nCounts =\n  { ");
-  for (j = 0; j < MV_JOINTS; ++j)
-    printf("%d, ", tnmvcounts.joints[j]);
-  printf("},\n");
-  for (i = 0; i < 2; ++i) {
-    printf("  {\n");
-    printf("    %d/%d,\n", tnmvcounts.comps[i].sign[0],
-                           tnmvcounts.comps[i].sign[1]);
-    printf("    { ");
-    for (j = 0; j < MV_CLASSES; ++j)
-      printf("%d, ", tnmvcounts.comps[i].classes[j]);
-    printf("},\n");
-    printf("    { ");
-    for (j = 0; j < CLASS0_SIZE; ++j)
-      printf("%d, ", tnmvcounts.comps[i].class0[j]);
-    printf("},\n");
-    printf("    { ");
-    for (j = 0; j < MV_OFFSET_BITS; ++j)
-      printf("%d/%d, ", tnmvcounts.comps[i].bits[j][0],
-                        tnmvcounts.comps[i].bits[j][1]);
-    printf("},\n");
-
-    printf("    {");
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      printf("{");
-      for (k = 0; k < 4; ++k)
-        printf("%d, ", tnmvcounts.comps[i].class0_fp[j][k]);
-      printf("}, ");
-    }
-    printf("},\n");
-
-    printf("    { ");
-    for (j = 0; j < 4; ++j)
-      printf("%d, ", tnmvcounts.comps[i].fp[j]);
-    printf("},\n");
-
-    printf("    %d/%d,\n",
-           tnmvcounts.comps[i].class0_hp[0],
-           tnmvcounts.comps[i].class0_hp[1]);
-    printf("    %d/%d,\n",
-           tnmvcounts.comps[i].hp[0],
-           tnmvcounts.comps[i].hp[1]);
-    printf("  },\n");
-  }
-
-  printf("\nProbs =\n  { ");
-  for (j = 0; j < MV_JOINTS - 1; ++j)
-    printf("%d, ", prob.joints[j]);
-  printf("},\n");
-  for (i = 0; i < 2; ++i) {
-    printf("  {\n");
-    printf("    %d,\n", prob.comps[i].sign);
-    printf("    { ");
-    for (j = 0; j < MV_CLASSES - 1; ++j)
-      printf("%d, ", prob.comps[i].classes[j]);
-    printf("},\n");
-    printf("    { ");
-    for (j = 0; j < CLASS0_SIZE - 1; ++j)
-      printf("%d, ", prob.comps[i].class0[j]);
-    printf("},\n");
-    printf("    { ");
-    for (j = 0; j < MV_OFFSET_BITS; ++j)
-      printf("%d, ", prob.comps[i].bits[j]);
-    printf("},\n");
-    printf("    { ");
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      printf("{");
-      for (k = 0; k < 3; ++k)
-        printf("%d, ", prob.comps[i].class0_fp[j][k]);
-      printf("}, ");
-    }
-    printf("},\n");
-    printf("    { ");
-    for (j = 0; j < 3; ++j)
-      printf("%d, ", prob.comps[i].fp[j]);
-    printf("},\n");
-
-    printf("    %d,\n", prob.comps[i].class0_hp);
-    printf("    %d,\n", prob.comps[i].hp);
-    printf("  },\n");
-  }
-}
-
-static void add_nmvcount(nmv_context_counts* const dst,
-                         const nmv_context_counts* const src) {
-  int i, j, k;
-  for (j = 0; j < MV_JOINTS; ++j) {
-    dst->joints[j] += src->joints[j];
-  }
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < MV_VALS; ++j) {
-      dst->comps[i].mvcount[j] += src->comps[i].mvcount[j];
-    }
-    dst->comps[i].sign[0] += src->comps[i].sign[0];
-    dst->comps[i].sign[1] += src->comps[i].sign[1];
-    for (j = 0; j < MV_CLASSES; ++j) {
-      dst->comps[i].classes[j] += src->comps[i].classes[j];
-    }
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      dst->comps[i].class0[j] += src->comps[i].class0[j];
-    }
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      dst->comps[i].bits[j][0] += src->comps[i].bits[j][0];
-      dst->comps[i].bits[j][1] += src->comps[i].bits[j][1];
-    }
-  }
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      for (k = 0; k < 4; ++k) {
-        dst->comps[i].class0_fp[j][k] += src->comps[i].class0_fp[j][k];
-      }
-    }
-    for (j = 0; j < 4; ++j) {
-      dst->comps[i].fp[j] += src->comps[i].fp[j];
-    }
-    dst->comps[i].class0_hp[0] += src->comps[i].class0_hp[0];
-    dst->comps[i].class0_hp[1] += src->comps[i].class0_hp[1];
-    dst->comps[i].hp[0] += src->comps[i].hp[0];
-    dst->comps[i].hp[1] += src->comps[i].hp[1];
-  }
-}
-#endif
-
-void vp9_write_nmvprobs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
-  int i, j;
-  nmv_context prob;
-  unsigned int branch_ct_joint[MV_JOINTS - 1][2];
-  unsigned int branch_ct_sign[2][2];
-  unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
-  unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
-  unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
-  unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
-  unsigned int branch_ct_fp[2][4 - 1][2];
-  unsigned int branch_ct_class0_hp[2][2];
-  unsigned int branch_ct_hp[2][2];
-  int savings = 0;
-
-#ifdef NMV_STATS
-  if (!cpi->dummy_packing)
-    add_nmvcount(&tnmvcounts, &cpi->NMVcount);
-#endif
-  vp9_counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
-                            branch_ct_joint, branch_ct_sign, branch_ct_classes,
-                            branch_ct_class0, branch_ct_bits,
-                            branch_ct_class0_fp, branch_ct_fp,
-                            branch_ct_class0_hp, branch_ct_hp);
-  /* write updates if they help */
-#ifdef MV_GROUP_UPDATE
-  for (j = 0; j < MV_JOINTS - 1; ++j) {
-    savings += update_nmv_savings(branch_ct_joint[j],
-                                  cpi->common.fc.nmvc.joints[j],
-                                  prob.joints[j],
-                                  VP9_NMV_UPDATE_PROB);
-  }
-  for (i = 0; i < 2; ++i) {
-    savings += update_nmv_savings(branch_ct_sign[i],
-                                  cpi->common.fc.nmvc.comps[i].sign,
-                                  prob.comps[i].sign,
-                                  VP9_NMV_UPDATE_PROB);
-    for (j = 0; j < MV_CLASSES - 1; ++j) {
-      savings += update_nmv_savings(branch_ct_classes[i][j],
-                                    cpi->common.fc.nmvc.comps[i].classes[j],
-                                    prob.comps[i].classes[j],
-                                    VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
-      savings += update_nmv_savings(branch_ct_class0[i][j],
-                                    cpi->common.fc.nmvc.comps[i].class0[j],
-                                    prob.comps[i].class0[j],
-                                    VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      savings += update_nmv_savings(branch_ct_bits[i][j],
-                                    cpi->common.fc.nmvc.comps[i].bits[j],
-                                    prob.comps[i].bits[j],
-                                    VP9_NMV_UPDATE_PROB);
-    }
-  }
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      int k;
-      for (k = 0; k < 3; ++k) {
-        savings += update_nmv_savings(branch_ct_class0_fp[i][j][k],
-                                      cpi->common.fc.nmvc.comps[i].class0_fp[j][k],
-                                      prob.comps[i].class0_fp[j][k],
-                                      VP9_NMV_UPDATE_PROB);
-      }
-    }
-    for (j = 0; j < 3; ++j) {
-      savings += update_nmv_savings(branch_ct_fp[i][j],
-                                    cpi->common.fc.nmvc.comps[i].fp[j],
-                                    prob.comps[i].fp[j],
-                                    VP9_NMV_UPDATE_PROB);
-    }
-  }
-  if (usehp) {
-    for (i = 0; i < 2; ++i) {
-      savings += update_nmv_savings(branch_ct_class0_hp[i],
-                                    cpi->common.fc.nmvc.comps[i].class0_hp,
-                                    prob.comps[i].class0_hp,
-                                    VP9_NMV_UPDATE_PROB);
-      savings += update_nmv_savings(branch_ct_hp[i],
-                                    cpi->common.fc.nmvc.comps[i].hp,
-                                    prob.comps[i].hp,
-                                    VP9_NMV_UPDATE_PROB);
-    }
-  }
-  if (savings <= 0) {
-    vp9_write_bit(bc, 0);
-    return;
-  }
-  vp9_write_bit(bc, 1);
-#endif
-
-  for (j = 0; j < MV_JOINTS - 1; ++j) {
-    update_nmv(bc, branch_ct_joint[j],
-               &cpi->common.fc.nmvc.joints[j],
-               prob.joints[j],
-               VP9_NMV_UPDATE_PROB);
-  }
-  for (i = 0; i < 2; ++i) {
-    update_nmv(bc, branch_ct_sign[i],
-               &cpi->common.fc.nmvc.comps[i].sign,
-               prob.comps[i].sign,
-               VP9_NMV_UPDATE_PROB);
-    for (j = 0; j < MV_CLASSES - 1; ++j) {
-      update_nmv(bc, branch_ct_classes[i][j],
-                 &cpi->common.fc.nmvc.comps[i].classes[j],
-                 prob.comps[i].classes[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
-      update_nmv(bc, branch_ct_class0[i][j],
-                 &cpi->common.fc.nmvc.comps[i].class0[j],
-                 prob.comps[i].class0[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-    for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      update_nmv(bc, branch_ct_bits[i][j],
-                 &cpi->common.fc.nmvc.comps[i].bits[j],
-                 prob.comps[i].bits[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-  }
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
-      int k;
-      for (k = 0; k < 3; ++k) {
-        update_nmv(bc, branch_ct_class0_fp[i][j][k],
-                   &cpi->common.fc.nmvc.comps[i].class0_fp[j][k],
-                   prob.comps[i].class0_fp[j][k],
-                   VP9_NMV_UPDATE_PROB);
-      }
-    }
-    for (j = 0; j < 3; ++j) {
-      update_nmv(bc, branch_ct_fp[i][j],
-                 &cpi->common.fc.nmvc.comps[i].fp[j],
-                 prob.comps[i].fp[j],
-                 VP9_NMV_UPDATE_PROB);
-    }
-  }
-  if (usehp) {
-    for (i = 0; i < 2; ++i) {
-      update_nmv(bc, branch_ct_class0_hp[i],
-                 &cpi->common.fc.nmvc.comps[i].class0_hp,
-                 prob.comps[i].class0_hp,
-                 VP9_NMV_UPDATE_PROB);
-      update_nmv(bc, branch_ct_hp[i],
-                 &cpi->common.fc.nmvc.comps[i].hp,
-                 prob.comps[i].hp,
-                 VP9_NMV_UPDATE_PROB);
-    }
-  }
-}
-
-void vp9_encode_nmv(vp9_writer* const bc, const MV* const mv,
-                    const MV* const ref, const nmv_context* const mvctx) {
-  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
-  write_token(bc, vp9_mv_joint_tree, mvctx->joints,
-              vp9_mv_joint_encodings + j);
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    encode_nmv_component(bc, mv->row, ref->col, &mvctx->comps[0]);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    encode_nmv_component(bc, mv->col, ref->col, &mvctx->comps[1]);
-  }
-}
-
-void vp9_encode_nmv_fp(vp9_writer* const bc, const MV* const mv,
-                       const MV* const ref, const nmv_context* const mvctx,
-                       int usehp) {
-  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
-  usehp = usehp && vp9_use_nmv_hp(ref);
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    encode_nmv_component_fp(bc, mv->row, ref->row, &mvctx->comps[0], usehp);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    encode_nmv_component_fp(bc, mv->col, ref->col, &mvctx->comps[1], usehp);
-  }
-}
-
-void vp9_build_nmv_cost_table(int *mvjoint,
-                              int *mvcost[2],
-                              const nmv_context* const mvctx,
-                              int usehp,
-                              int mvc_flag_v,
-                              int mvc_flag_h) {
-  vp9_clear_system_state();
-  vp9_cost_tokens(mvjoint, mvctx->joints, vp9_mv_joint_tree);
-  if (mvc_flag_v)
-    build_nmv_component_cost_table(mvcost[0], &mvctx->comps[0], usehp);
-  if (mvc_flag_h)
-    build_nmv_component_cost_table(mvcost[1], &mvctx->comps[1], usehp);
-}
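A hedged usage sketch (the surrounding cpi/xd field names are assumptions, not quoted from this tree): rebuilding the encoder's MV cost tables after the frame's nmv context changes. The mvcost pointers aim at the middle of their buffers because build_nmv_component_cost_table() writes both mvcost[v] and mvcost[-v]:

  int joint_cost[MV_JOINTS];
  int row_costs[MV_VALS], col_costs[MV_VALS];
  int *mvcost[2] = { row_costs + MV_MAX, col_costs + MV_MAX };

  vp9_build_nmv_cost_table(joint_cost, mvcost, &cpi->common.fc.nmvc,
                           xd->allow_high_precision_mv, 1, 1);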
--- a/vp8/encoder/encodemv.h
+++ /dev/null
@@ -1,30 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ENCODEMV_H
-#define __INC_ENCODEMV_H
-
-#include "onyx_int.h"
-
-void vp9_write_nmvprobs(VP9_COMP* const, int usehp, vp9_writer* const);
-void vp9_encode_nmv(vp9_writer* const w, const MV* const mv,
-                    const MV* const ref, const nmv_context* const mvctx);
-void vp9_encode_nmv_fp(vp9_writer* const w, const MV* const mv,
-                       const MV* const ref, const nmv_context *mvctx,
-                       int usehp);
-void vp9_build_nmv_cost_table(int *mvjoint,
-                              int *mvcost[2],
-                              const nmv_context *mvctx,
-                              int usehp,
-                              int mvc_flag_v,
-                              int mvc_flag_h);
-
-#endif
--- a/vp8/encoder/firstpass.c
+++ /dev/null
@@ -1,2533 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "math.h"
-#include "limits.h"
-#include "block.h"
-#include "onyx_int.h"
-#include "variance.h"
-#include "encodeintra.h"
-#include "vp8/common/setupintrarecon.h"
-#include "mcomp.h"
-#include "firstpass.h"
-#include "vpx_scale/vpxscale.h"
-#include "encodemb.h"
-#include "vp8/common/extend.h"
-#include "vp8/common/systemdependent.h"
-#include "vpx_scale/yv12extend.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/swapyv12buffer.h"
-#include <stdio.h>
-#include "rdopt.h"
-#include "ratectrl.h"
-#include "vp8/common/quant_common.h"
-#include "vp8/common/entropymv.h"
-#include "encodemv.h"
-
-#define OUTPUT_FPF 0
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
-extern void vp9_build_block_offsets(MACROBLOCK *x);
-
-extern void vp9_setup_block_ptrs(MACROBLOCK *x);
-
-extern void vp9_frame_init_quantizer(VP9_COMP *cpi);
-
-extern void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb,
-                                   int_mv *mv);
-
-extern void vp9_alloc_compressor_data(VP9_COMP *cpi);
-
-#define IIFACTOR   12.5
-#define IIKFACTOR1 12.5
-#define IIKFACTOR2 15.0
-#define RMAX       128.0
-#define GF_RMAX    96.0
-#define ERR_DIVISOR   150.0
-
-#define KF_MB_INTRA_MIN 300
-#define GF_MB_INTRA_MIN 200
-
-#define DOUBLE_DIVIDE_CHECK(X) ((X) < 0 ? (X) - .000001 : (X) + .000001)
-
-#define POW1 ((double)cpi->oxcf.two_pass_vbrbias / 100.0)
-#define POW2 ((double)cpi->oxcf.two_pass_vbrbias / 100.0)
-
-static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame);
-
-static int select_cq_level(int qindex) {
-  int ret_val = QINDEX_RANGE - 1;
-  int i;
-
-  double target_q = (vp9_convert_qindex_to_q(qindex) * 0.5847) + 1.0;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    if (target_q <= vp9_convert_qindex_to_q(i)) {
-      ret_val = i;
-      break;
-    }
-  }
-
-  return ret_val;
-}
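For example (numbers illustrative): if vp9_convert_qindex_to_q(qindex) is 40.0, the loop returns the first index whose q reaches target_q = 40.0 * 0.5847 + 1.0, roughly 24.4, i.e. a constrained-quality level a little above half the active quantizer.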
-
-
-// Reset the first-pass stats-in pointer to the given position.
-static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *Position) {
-  cpi->twopass.stats_in = Position;
-}
-
-static int lookup_next_frame_stats(VP9_COMP *cpi, FIRSTPASS_STATS *next_frame) {
-  if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
-    return EOF;
-
-  *next_frame = *cpi->twopass.stats_in;
-  return 1;
-}
-
-// Read frame stats at an offset from the current position
-static int read_frame_stats(VP9_COMP *cpi,
-                            FIRSTPASS_STATS *frame_stats,
-                            int offset) {
-  FIRSTPASS_STATS *fps_ptr = cpi->twopass.stats_in;
-
-  // Check legality of offset
-  if (offset >= 0) {
-    if (&fps_ptr[offset] >= cpi->twopass.stats_in_end)
-      return EOF;
-  } else if (offset < 0) {
-    if (&fps_ptr[offset] < cpi->twopass.stats_in_start)
-      return EOF;
-  }
-
-  *frame_stats = fps_ptr[offset];
-  return 1;
-}
-
-static int input_stats(VP9_COMP *cpi, FIRSTPASS_STATS *fps) {
-  if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
-    return EOF;
-
-  *fps = *cpi->twopass.stats_in;
-  cpi->twopass.stats_in =
-    (void *)((char *)cpi->twopass.stats_in + sizeof(FIRSTPASS_STATS));
-  return 1;
-}
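The cast-based bump above is byte-wise pointer arithmetic spelled out; on a FIRSTPASS_STATS pointer it is equivalent to cpi->twopass.stats_in + 1.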
-
-static void output_stats(const VP9_COMP            *cpi,
-                         struct vpx_codec_pkt_list *pktlist,
-                         FIRSTPASS_STATS            *stats) {
-  struct vpx_codec_cx_pkt pkt;
-  pkt.kind = VPX_CODEC_STATS_PKT;
-  pkt.data.twopass_stats.buf = stats;
-  pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
-  vpx_codec_pkt_list_add(pktlist, &pkt);
-
-// TEMP debug code
-#if OUTPUT_FPF
-
-  {
-    FILE *fpfile;
-    fpfile = fopen("firstpass.stt", "a");
-
-    fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
-            "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
-            "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
-            stats->frame,
-            stats->intra_error,
-            stats->coded_error,
-            stats->sr_coded_error,
-            stats->ssim_weighted_pred_err,
-            stats->pcnt_inter,
-            stats->pcnt_motion,
-            stats->pcnt_second_ref,
-            stats->pcnt_neutral,
-            stats->MVr,
-            stats->mvr_abs,
-            stats->MVc,
-            stats->mvc_abs,
-            stats->MVrv,
-            stats->MVcv,
-            stats->mv_in_out_count,
-            stats->new_mv_count,
-            stats->count,
-            stats->duration);
-    fclose(fpfile);
-  }
-#endif
-}
-
-static void zero_stats(FIRSTPASS_STATS *section) {
-  section->frame      = 0.0;
-  section->intra_error = 0.0;
-  section->coded_error = 0.0;
-  section->sr_coded_error = 0.0;
-  section->ssim_weighted_pred_err = 0.0;
-  section->pcnt_inter  = 0.0;
-  section->pcnt_motion  = 0.0;
-  section->pcnt_second_ref = 0.0;
-  section->pcnt_neutral = 0.0;
-  section->MVr        = 0.0;
-  section->mvr_abs     = 0.0;
-  section->MVc        = 0.0;
-  section->mvc_abs     = 0.0;
-  section->MVrv       = 0.0;
-  section->MVcv       = 0.0;
-  section->mv_in_out_count  = 0.0;
-  section->new_mv_count = 0.0;
-  section->count      = 0.0;
-  section->duration   = 1.0;
-}
-
-static void accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) {
-  section->frame += frame->frame;
-  section->intra_error += frame->intra_error;
-  section->coded_error += frame->coded_error;
-  section->sr_coded_error += frame->sr_coded_error;
-  section->ssim_weighted_pred_err += frame->ssim_weighted_pred_err;
-  section->pcnt_inter  += frame->pcnt_inter;
-  section->pcnt_motion += frame->pcnt_motion;
-  section->pcnt_second_ref += frame->pcnt_second_ref;
-  section->pcnt_neutral += frame->pcnt_neutral;
-  section->MVr        += frame->MVr;
-  section->mvr_abs     += frame->mvr_abs;
-  section->MVc        += frame->MVc;
-  section->mvc_abs     += frame->mvc_abs;
-  section->MVrv       += frame->MVrv;
-  section->MVcv       += frame->MVcv;
-  section->mv_in_out_count  += frame->mv_in_out_count;
-  section->new_mv_count += frame->new_mv_count;
-  section->count      += frame->count;
-  section->duration   += frame->duration;
-}
-
-static void subtract_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) {
-  section->frame -= frame->frame;
-  section->intra_error -= frame->intra_error;
-  section->coded_error -= frame->coded_error;
-  section->sr_coded_error -= frame->sr_coded_error;
-  section->ssim_weighted_pred_err -= frame->ssim_weighted_pred_err;
-  section->pcnt_inter  -= frame->pcnt_inter;
-  section->pcnt_motion -= frame->pcnt_motion;
-  section->pcnt_second_ref -= frame->pcnt_second_ref;
-  section->pcnt_neutral -= frame->pcnt_neutral;
-  section->MVr        -= frame->MVr;
-  section->mvr_abs     -= frame->mvr_abs;
-  section->MVc        -= frame->MVc;
-  section->mvc_abs     -= frame->mvc_abs;
-  section->MVrv       -= frame->MVrv;
-  section->MVcv       -= frame->MVcv;
-  section->mv_in_out_count  -= frame->mv_in_out_count;
-  section->new_mv_count -= frame->new_mv_count;
-  section->count      -= frame->count;
-  section->duration   -= frame->duration;
-}
-
-static void avg_stats(FIRSTPASS_STATS *section) {
-  if (section->count < 1.0)
-    return;
-
-  section->intra_error /= section->count;
-  section->coded_error /= section->count;
-  section->sr_coded_error /= section->count;
-  section->ssim_weighted_pred_err /= section->count;
-  section->pcnt_inter  /= section->count;
-  section->pcnt_second_ref /= section->count;
-  section->pcnt_neutral /= section->count;
-  section->pcnt_motion /= section->count;
-  section->MVr        /= section->count;
-  section->mvr_abs     /= section->count;
-  section->MVc        /= section->count;
-  section->mvc_abs     /= section->count;
-  section->MVrv       /= section->count;
-  section->MVcv       /= section->count;
-  section->mv_in_out_count   /= section->count;
-  section->duration   /= section->count;
-}
-
-// Calculate a modified error used in distributing bits between easier
-// and harder frames.
-static double calculate_modified_err(VP9_COMP *cpi,
-                                     FIRSTPASS_STATS *this_frame) {
-  double av_err = (cpi->twopass.total_stats->ssim_weighted_pred_err /
-                   cpi->twopass.total_stats->count);
-  double this_err = this_frame->ssim_weighted_pred_err;
-  double modified_err;
-
-  if (this_err > av_err)
-    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1);
-  else
-    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2);
-
-  return modified_err;
-}
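Two boundary settings make the bias behaviour concrete:

  /* two_pass_vbrbias = 100: POW1 = POW2 = 1.0, so
   *   modified_err = av_err * (this_err / av_err) = this_err  (no reshaping)
   * two_pass_vbrbias =  50: POW1 = POW2 = 0.5, so
   *   modified_err = av_err * sqrt(this_err / av_err),
   * pulling per-frame errors toward the average and flattening the
   * resulting bit allocation. */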
-
-static const double weight_table[256] = {
-  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
-  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
-  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
-  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
-  0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750,
-  0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750,
-  0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
-  0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
-};
-
-static double simple_weight(YV12_BUFFER_CONFIG *source) {
-  int i, j;
-
-  unsigned char *src = source->y_buffer;
-  double sum_weights = 0.0;
-
-  // Loop through the raw Y plane, examining levels and creating a
-  // weight for the image.
-  i = source->y_height;
-  do {
-    j = source->y_width;
-    do {
-      sum_weights += weight_table[*src];
-      src++;
-    } while (--j);
-    src -= source->y_width;
-    src += source->y_stride;
-  } while (--i);
-
-  sum_weights /= (source->y_height * source->y_width);
-
-  return sum_weights;
-}
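Reading the table back: luma levels 32 and below contribute 0.02, levels 33-63 ramp linearly, and 64 and above contribute 1.0. A frame whose pixels all sit at luma 48, for instance, averages weight_table[48] = 0.5, so simple_weight() returns 0.5.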
-
-
-// This function returns the current per frame maximum bitrate target
-static int frame_max_bits(VP9_COMP *cpi) {
-  // Max allocation for a single frame, based on the max section guidelines
-  // passed in and how many bits are left.
-  int max_bits;
-
-  // For VBR, base this on the bits and frames left plus the
-  // two_pass_vbrmax_section rate passed in by the user.
-  max_bits = (int)(((double)cpi->twopass.bits_left /
-                    (cpi->twopass.total_stats->count -
-                     (double)cpi->common.current_video_frame)) *
-                   ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
-
-  // Trap case where we are out of bits
-  if (max_bits < 0)
-    max_bits = 0;
-
-  return max_bits;
-}
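Worked example (numbers illustrative): with 2,000,000 bits left, 100 frames still to code, and two_pass_vbrmax_section = 400, the cap is (2000000 / 100) * (400 / 100) = 80,000 bits for any single frame.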
-
-void vp9_init_first_pass(VP9_COMP *cpi) {
-  zero_stats(cpi->twopass.total_stats);
-}
-
-void vp9_end_first_pass(VP9_COMP *cpi) {
-  output_stats(cpi, cpi->output_pkt_list, cpi->twopass.total_stats);
-}
-
-static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                             YV12_BUFFER_CONFIG *recon_buffer,
-                             int *best_motion_err, int recon_yoffset) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-  BLOCKD *d = &x->e_mbd.block[0];
-
-  unsigned char *src_ptr = (*(b->base_src) + b->src);
-  int src_stride = b->src_stride;
-  unsigned char *ref_ptr;
-  int ref_stride = d->pre_stride;
-
-  // Set up pointers for this macro block recon buffer
-  xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
-
-  ref_ptr = (unsigned char *)(*(d->base_pre) + d->pre);
-
-  vp9_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
-               (unsigned int *)(best_motion_err));
-}
-
-static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                     int_mv *ref_mv, MV *best_mv,
-                                     YV12_BUFFER_CONFIG *recon_buffer,
-                                     int *best_motion_err, int recon_yoffset) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-  BLOCKD *d = &x->e_mbd.block[0];
-  int num00;
-
-  int_mv tmp_mv;
-  int_mv ref_mv_full;
-
-  int tmp_err;
-  int step_param = 3;
-  int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
-  int n;
-  vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
-  int new_mv_mode_penalty = 256;
-
-  // override the default variance function to use MSE
-  v_fn_ptr.vf = vp9_mse16x16;
-
-  // Set up pointers for this macro block recon buffer
-  xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
-
-  // Initial step/diamond search centred on best mv
-  tmp_mv.as_int = 0;
-  ref_mv_full.as_mv.col = ref_mv->as_mv.col >> 3;
-  ref_mv_full.as_mv.row = ref_mv->as_mv.row >> 3;
-  tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv, step_param,
-                                    x->sadperbit16, &num00, &v_fn_ptr,
-                                    XMVCOST, ref_mv);
-  if (tmp_err < INT_MAX - new_mv_mode_penalty)
-    tmp_err += new_mv_mode_penalty;
-
-  if (tmp_err < *best_motion_err) {
-    *best_motion_err = tmp_err;
-    best_mv->row = tmp_mv.as_mv.row;
-    best_mv->col = tmp_mv.as_mv.col;
-  }
-
-  // Further step/diamond searches as necessary
-  n = num00;
-  num00 = 0;
-
-  while (n < further_steps) {
-    n++;
-
-    if (num00)
-      num00--;
-    else {
-      tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv,
-                                        step_param + n, x->sadperbit16,
-                                        &num00, &v_fn_ptr,
-                                        XMVCOST, ref_mv);
-      if (tmp_err < INT_MAX - new_mv_mode_penalty)
-        tmp_err += new_mv_mode_penalty;
-
-      if (tmp_err < *best_motion_err) {
-        *best_motion_err = tmp_err;
-        best_mv->row = tmp_mv.as_mv.row;
-        best_mv->col = tmp_mv.as_mv.col;
-      }
-    }
-  }
-}
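The refinement schedule, summarized (the num00 behaviour is assumed from the diamond-search contract rather than quoted):

  /* The first diamond search runs at step_param = 3, centred on the
   * reduced-precision ref_mv. Each further pass reruns the search one
   * step tighter; num00 reports how many of those tighter steps the
   * previous search already covered, so the loop decrements it to skip
   * redundant passes instead of re-searching the same neighbourhood. */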
-
-void vp9_first_pass(VP9_COMP *cpi) {
-  int mb_row, mb_col;
-  MACROBLOCK *const x = &cpi->mb;
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  int recon_yoffset, recon_uvoffset;
-  YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
-  YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
-  YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
-  int recon_y_stride = lst_yv12->y_stride;
-  int recon_uv_stride = lst_yv12->uv_stride;
-  int64_t intra_error = 0;
-  int64_t coded_error = 0;
-  int64_t sr_coded_error = 0;
-
-  int sum_mvr = 0, sum_mvc = 0;
-  int sum_mvr_abs = 0, sum_mvc_abs = 0;
-  int sum_mvrs = 0, sum_mvcs = 0;
-  int mvcount = 0;
-  int intercount = 0;
-  int second_ref_count = 0;
-  int intrapenalty = 256;
-  int neutral_count = 0;
-  int new_mv_count = 0;
-  int sum_in_vectors = 0;
-  uint32_t lastmv_as_int = 0;
-
-  int_mv zero_ref_mv;
-
-  zero_ref_mv.as_int = 0;
-
-  vp9_clear_system_state();  // __asm emms;
-
-  x->src = *cpi->Source;
-  xd->pre = *lst_yv12;
-  xd->dst = *new_yv12;
-
-  x->partition_info = x->pi;
-
-  xd->mode_info_context = cm->mi;
-
-  vp9_build_block_offsets(x);
-
-  vp9_setup_block_dptrs(&x->e_mbd);
-
-  vp9_setup_block_ptrs(x);
-
-  // set up the new frame for intra coded blocks
-  vp9_setup_intra_recon(new_yv12);
-  vp9_frame_init_quantizer(cpi);
-
-  // Initialise the MV cost table to the defaults
-  // if( cm->current_video_frame == 0)
-  // if ( 0 )
-  {
-    int flag[2] = {1, 1};
-    vp9_init_mv_probs(cm);
-    vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
-  }
-
-  // for each macroblock row in image
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    int_mv best_ref_mv;
-
-    best_ref_mv.as_int = 0;
-
-    // reset above block coeffs
-    xd->up_available = (mb_row != 0);
-    recon_yoffset = (mb_row * recon_y_stride * 16);
-    recon_uvoffset = (mb_row * recon_uv_stride * 8);
-
-    // Set up limit values for motion vectors to prevent them extending outside the UMV borders
-    x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
-    x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
-
-
-    // for each macroblock col in image
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      int this_error;
-      int gf_motion_error = INT_MAX;
-      int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
-
-      xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
-      xd->dst.u_buffer = new_yv12->u_buffer + recon_uvoffset;
-      xd->dst.v_buffer = new_yv12->v_buffer + recon_uvoffset;
-      xd->left_available = (mb_col != 0);
-
-      // Copy current mb to a buffer
-      vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-
-      // do intra 16x16 prediction
-      this_error = vp9_encode_intra(cpi, x, use_dc_pred);
-
-      // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)
-      // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv.
-      // When the error score is very low this causes us to pick all or lots of INTRA modes and throw lots of key frames.
-      // This penalty adds a cost matching that of a 0,0 mv to the intra case.
-      this_error += intrapenalty;
-
-      // Cumulative intra error total
-      intra_error += (int64_t)this_error;
-
-      // Set up limit values for motion vectors to prevent them extending outside the UMV borders
-      x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
-      x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
-
-      // Other than for the first frame do a motion search
-      if (cm->current_video_frame > 0) {
-        int tmp_err;
-        int motion_error = INT_MAX;
-        int_mv mv, tmp_mv;
-
-        // Simple 0,0 motion with no mv overhead
-        zz_motion_search(cpi, x, lst_yv12, &motion_error, recon_yoffset);
-        mv.as_int = tmp_mv.as_int = 0;
-
-        // Test last reference frame using the previous best mv as the
-        // starting point (best reference) for the search
-        first_pass_motion_search(cpi, x, &best_ref_mv,
-                                 &mv.as_mv, lst_yv12,
-                                 &motion_error, recon_yoffset);
-
-        // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
-        if (best_ref_mv.as_int) {
-          tmp_err = INT_MAX;
-          first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv,
-                                   lst_yv12, &tmp_err, recon_yoffset);
-
-          if (tmp_err < motion_error) {
-            motion_error = tmp_err;
-            mv.as_int = tmp_mv.as_int;
-          }
-        }
-
-        // Experimental search in an older reference frame
-        if (cm->current_video_frame > 1) {
-          // Simple 0,0 motion with no mv overhead
-          zz_motion_search(cpi, x, gld_yv12,
-                           &gf_motion_error, recon_yoffset);
-
-          first_pass_motion_search(cpi, x, &zero_ref_mv,
-                                   &tmp_mv.as_mv, gld_yv12,
-                                   &gf_motion_error, recon_yoffset);
-
-          if ((gf_motion_error < motion_error) &&
-              (gf_motion_error < this_error)) {
-            second_ref_count++;
-          }
-
-          // Reset to last frame as reference buffer
-          xd->pre.y_buffer = lst_yv12->y_buffer + recon_yoffset;
-          xd->pre.u_buffer = lst_yv12->u_buffer + recon_uvoffset;
-          xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
-
-          // In accumulating a score for the older reference frame
-          // take the best of the motion predicted score and
-          // the intra coded error (just as will be done for the
-          // accumulation of "coded_error" for the last frame).
-          if (gf_motion_error < this_error)
-            sr_coded_error += gf_motion_error;
-          else
-            sr_coded_error += this_error;
-        } else
-          sr_coded_error += motion_error;
-
-        /* Intra assumed best */
-        best_ref_mv.as_int = 0;
-
-        if (motion_error <= this_error) {
-          // Keep a count of cases where the inter and intra were
-          // very close and very low. This helps with scene cut
-          // detection for example in cropped clips with black bars
-          // at the sides or top and bottom.
-          if ((((this_error - intrapenalty) * 9) <=
-               (motion_error * 10)) &&
-              (this_error < (2 * intrapenalty))) {
-            neutral_count++;
-          }
-
-          mv.as_mv.row <<= 3;
-          mv.as_mv.col <<= 3;
-          this_error = motion_error;
-          vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
-          xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-          vp9_encode_inter16x16y(IF_RTCD(&cpi->rtcd), x);
-          sum_mvr += mv.as_mv.row;
-          sum_mvr_abs += abs(mv.as_mv.row);
-          sum_mvc += mv.as_mv.col;
-          sum_mvc_abs += abs(mv.as_mv.col);
-          sum_mvrs += mv.as_mv.row * mv.as_mv.row;
-          sum_mvcs += mv.as_mv.col * mv.as_mv.col;
-          intercount++;
-
-          best_ref_mv.as_int = mv.as_int;
-
-          // Was the vector non-zero?
-          if (mv.as_int) {
-            mvcount++;
-
-            // Was it different from the last non-zero vector?
-            if (mv.as_int != lastmv_as_int)
-              new_mv_count++;
-            lastmv_as_int = mv.as_int;
-
-            // Does the Row vector point inwards or outwards
-            if (mb_row < cm->mb_rows / 2) {
-              if (mv.as_mv.row > 0)
-                sum_in_vectors--;
-              else if (mv.as_mv.row < 0)
-                sum_in_vectors++;
-            } else if (mb_row > cm->mb_rows / 2) {
-              if (mv.as_mv.row > 0)
-                sum_in_vectors++;
-              else if (mv.as_mv.row < 0)
-                sum_in_vectors--;
-            }
-
-            // Does the Column vector point inwards or outwards
-            if (mb_col < cm->mb_cols / 2) {
-              if (mv.as_mv.col > 0)
-                sum_in_vectors--;
-              else if (mv.as_mv.col < 0)
-                sum_in_vectors++;
-            } else if (mb_col > cm->mb_cols / 2) {
-              if (mv.as_mv.col > 0)
-                sum_in_vectors++;
-              else if (mv.as_mv.col < 0)
-                sum_in_vectors--;
-            }
-          }
-        }
-      } else
-        sr_coded_error += (int64_t)this_error;
-
-      coded_error += (int64_t)this_error;
-
-      // adjust to the next column of macroblocks
-      x->src.y_buffer += 16;
-      x->src.u_buffer += 8;
-      x->src.v_buffer += 8;
-
-      recon_yoffset += 16;
-      recon_uvoffset += 8;
-    }
-
-    // adjust to the next row of mbs
-    x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
-    x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
-    x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
-
-    // extend the recon for intra prediction
-    vp9_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
-                      xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
-    vp9_clear_system_state();  // __asm emms;
-  }
-
-  vp9_clear_system_state();  // __asm emms;
-  {
-    double weight = 0.0;
-
-    FIRSTPASS_STATS fps;
-
-    fps.frame      = cm->current_video_frame;
-    fps.intra_error = intra_error >> 8;
-    fps.coded_error = coded_error >> 8;
-    fps.sr_coded_error = sr_coded_error >> 8;
-    weight = simple_weight(cpi->Source);
-
-
-    if (weight < 0.1)
-      weight = 0.1;
-
-    fps.ssim_weighted_pred_err = fps.coded_error * weight;
-
-    fps.pcnt_inter  = 0.0;
-    fps.pcnt_motion = 0.0;
-    fps.MVr        = 0.0;
-    fps.mvr_abs     = 0.0;
-    fps.MVc        = 0.0;
-    fps.mvc_abs     = 0.0;
-    fps.MVrv       = 0.0;
-    fps.MVcv       = 0.0;
-    fps.mv_in_out_count  = 0.0;
-    fps.new_mv_count = 0.0;
-    fps.count      = 1.0;
-
-    fps.pcnt_inter   = 1.0 * (double)intercount / cm->MBs;
-    fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
-    fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs;
-
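-    // The MV statistics below are averaged over macroblocks coded with a
-    // non-zero vector. MVrv / MVcv are (approximate) variances of the row
-    // and column components, and mv_in_out_count normalizes sum_in_vectors
-    // to the range [-1, 1] (each mb contributes +/-1 per component).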
-    if (mvcount > 0) {
-      fps.MVr = (double)sum_mvr / (double)mvcount;
-      fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount;
-      fps.MVc = (double)sum_mvc / (double)mvcount;
-      fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount;
-      fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / (double)mvcount;
-      fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) / (double)mvcount;
-      fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2);
-      fps.new_mv_count = new_mv_count;
-
-      fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
-    }
-
-    // TODO: handle the case when duration is set to 0, or something less
-    // than the full time between subsequent cpi->source_time_stamp values.
-    fps.duration = cpi->source->ts_end
-                   - cpi->source->ts_start;
-
-    // Don't output stats from a stack variable!
-    memcpy(cpi->twopass.this_frame_stats,
-           &fps,
-           sizeof(FIRSTPASS_STATS));
-    output_stats(cpi, cpi->output_pkt_list, cpi->twopass.this_frame_stats);
-    accumulate_stats(cpi->twopass.total_stats, &fps);
-  }
-
-  // Copy the previous Last Frame back into gf and arf buffers if
-  // the prediction is good enough... but also don't allow it to lag too far.
-  if ((cpi->twopass.sr_update_lag > 3) ||
-      ((cm->current_video_frame > 0) &&
-       (cpi->twopass.this_frame_stats->pcnt_inter > 0.20) &&
-       ((cpi->twopass.this_frame_stats->intra_error /
-         cpi->twopass.this_frame_stats->coded_error) > 2.0))) {
-    vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
-    cpi->twopass.sr_update_lag = 1;
-  } else
-    cpi->twopass.sr_update_lag++;
-
-  // swap frame pointers so last frame refers to the frame we just compressed
-  vp9_swap_yv12_buffer(lst_yv12, new_yv12);
-  vp8_yv12_extend_frame_borders(lst_yv12);
-
-  // Special case for the first frame. Copy into the GF buffer as a second reference.
-  if (cm->current_video_frame == 0) {
-    vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
-  }
-
-
-  // use this to see what the first pass reconstruction looks like
-  if (0) {
-    char filename[512];
-    FILE *recon_file;
-    sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
-
-    if (cm->current_video_frame == 0)
-      recon_file = fopen(filename, "wb");
-    else
-      recon_file = fopen(filename, "ab");
-
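-    // The empty if () body deliberately discards fwrite's return value;
-    // this reconstruction dump is debug-only (guarded by "if (0)" above).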
-    if (fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file));
-    fclose(recon_file);
-  }
-
-  cm->current_video_frame++;
-
-}
-
-// Estimate a cost per mb attributable to overheads such as the coding of
-// modes and motion vectors.
-// Currently simplistic in its assumptions for testing.
-
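-// Cost in bits of an event with probability prob: -log2(prob).
-// For example, bitcost(0.5) == 1.0 and bitcost(0.25) == 2.0.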
-static double bitcost(double prob) {
-  return -(log(prob) / log(2.0));
-}
-
-static long long estimate_modemvcost(VP9_COMP *cpi,
-                                     FIRSTPASS_STATS *fpstats) {
-  int mv_cost;
-  int mode_cost;
-
-  double av_pct_inter = fpstats->pcnt_inter / fpstats->count;
-  double av_pct_motion = fpstats->pcnt_motion / fpstats->count;
-  double av_intra = (1.0 - av_pct_inter);
-
-  double zz_cost;
-  double motion_cost;
-  double intra_cost;
-
-  zz_cost = bitcost(av_pct_inter - av_pct_motion);
-  motion_cost = bitcost(av_pct_motion);
-  intra_cost = bitcost(av_intra);
-
-  // Estimate of extra bits per mv overhead for mbs
-  // << 9 is the normalization to the (bits * 512) used in vp9_bits_per_mb
-  mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
-
-  // Crude estimate of overhead cost from modes
-  // << 9 is the normalization to (bits * 512) used in vp9_bits_per_mb
-  mode_cost =
-    (int)((((av_pct_inter - av_pct_motion) * zz_cost) +
-           (av_pct_motion * motion_cost) +
-           (av_intra * intra_cost)) * cpi->common.MBs) << 9;
-
-  // return mv_cost + mode_cost;
-  // TODO PGW Fix overhead costs for extended Q range
-  return 0;
-}
-
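-// The correction factor is (err_per_mb / err_divisor) ^ power_term, where
-// the power term rises with the real quantizer (q * 0.01 + pt_low, capped
-// at pt_high) and the result is clipped to the range [0.05, 2.0].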
-static double calc_correction_factor(double err_per_mb,
-                                     double err_divisor,
-                                     double pt_low,
-                                     double pt_high,
-                                     int Q) {
-  double power_term;
-  double error_term = err_per_mb / err_divisor;
-  double correction_factor;
-
-  // Adjustment based on actual quantizer to power term.
-  power_term = (vp9_convert_qindex_to_q(Q) * 0.01) + pt_low;
-  power_term = (power_term > pt_high) ? pt_high : power_term;
-
-  // Adjustments to error term
-  // TBD
-
-  // Calculate correction factor
-  correction_factor = pow(error_term, power_term);
-
-  // Clip range
-  correction_factor =
-    (correction_factor < 0.05)
-    ? 0.05 : (correction_factor > 2.0) ? 2.0 : correction_factor;
-
-  return correction_factor;
-}
-
-// Given a current maxQ value, sets a range for future values.
-// PGW TODO..
-// This code removes direct dependency on QIndex to determine the range
-// (now uses the actual quantizer) but has not been tuned.
-static void adjust_maxq_qrange(VP9_COMP *cpi) {
-  int i;
-  double q;
-
-  // Set the max corresponding to cpi->avg_q * 2.0
-  q = cpi->avg_q * 2.0;
-  cpi->twopass.maxq_max_limit = cpi->worst_quality;
-  for (i = cpi->best_quality; i <= cpi->worst_quality; i++) {
-    cpi->twopass.maxq_max_limit = i;
-    if (vp9_convert_qindex_to_q(i) >= q)
-      break;
-  }
-
-  // Set the min corresponding to cpi->avg_q * 0.5
-  q = cpi->avg_q * 0.5;
-  cpi->twopass.maxq_min_limit = cpi->best_quality;
-  for (i = cpi->worst_quality; i >= cpi->best_quality; i--) {
-    cpi->twopass.maxq_min_limit = i;
-    if (vp9_convert_qindex_to_q(i) <= q)
-      break;
-  }
-}
-
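-// Estimate the highest (worst) Q needed to code the remaining section at
-// section_target_bandwitdh: step Q up from maxq_min_limit until the
-// predicted bits per mb (normalized as bits * 512, as in vp9_bits_per_mb)
-// falls to the per-mb target for the section.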
-static int estimate_max_q(VP9_COMP *cpi,
-                          FIRSTPASS_STATS *fpstats,
-                          int section_target_bandwitdh,
-                          int overhead_bits) {
-  int Q;
-  int num_mbs = cpi->common.MBs;
-  int target_norm_bits_per_mb;
-
-  double section_err = (fpstats->coded_error / fpstats->count);
-  double sr_err_diff;
-  double sr_correction;
-  double err_per_mb = section_err / num_mbs;
-  double err_correction_factor;
-  double speed_correction = 1.0;
-  int overhead_bits_per_mb;
-
-  if (section_target_bandwitdh <= 0)
-    return cpi->twopass.maxq_max_limit;          // Highest value allowed
-
-  target_norm_bits_per_mb =
-    (section_target_bandwitdh < (1 << 20))
-    ? (512 * section_target_bandwitdh) / num_mbs
-    : 512 * (section_target_bandwitdh / num_mbs);
-
-  // Look at the drop in prediction quality between the last frame
-  // and the GF buffer (which contained an older frame).
-  sr_err_diff =
-    (fpstats->sr_coded_error - fpstats->coded_error) /
-    (fpstats->count * cpi->common.MBs);
-  sr_correction = (sr_err_diff / 32.0);
-  sr_correction = pow(sr_correction, 0.25);
-  if (sr_correction < 0.75)
-    sr_correction = 0.75;
-  else if (sr_correction > 1.25)
-    sr_correction = 1.25;
-
-  // Calculate a corrective factor based on a rolling ratio of bits spent
-  // vs target bits
-  if ((cpi->rolling_target_bits > 0) &&
-      (cpi->active_worst_quality < cpi->worst_quality)) {
-    double rolling_ratio;
-
-    rolling_ratio = (double)cpi->rolling_actual_bits /
-                    (double)cpi->rolling_target_bits;
-
-    if (rolling_ratio < 0.95)
-      cpi->twopass.est_max_qcorrection_factor -= 0.005;
-    else if (rolling_ratio > 1.05)
-      cpi->twopass.est_max_qcorrection_factor += 0.005;
-
-    cpi->twopass.est_max_qcorrection_factor =
-      (cpi->twopass.est_max_qcorrection_factor < 0.1)
-      ? 0.1
-      : (cpi->twopass.est_max_qcorrection_factor > 10.0)
-      ? 10.0 : cpi->twopass.est_max_qcorrection_factor;
-  }
-
-  // Corrections for higher compression speed settings
-  // (reduced compression expected)
-  if (cpi->compressor_speed == 1) {
-    if (cpi->oxcf.cpu_used <= 5)
-      speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
-    else
-      speed_correction = 1.25;
-  }
-
-  // Estimate of overhead bits per mb
-  // Correction to overhead bits for min allowed Q.
-  // PGW TODO.. This code is broken for the extended Q range
-  //            for now overhead set to 0.
-  overhead_bits_per_mb = overhead_bits / num_mbs;
-  overhead_bits_per_mb *= pow(0.98, (double)cpi->twopass.maxq_min_limit);
-
-  // Try and pick a max Q that will be high enough to encode the
-  // content at the given rate.
-  for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++) {
-    int bits_per_mb_at_this_q;
-
-    err_correction_factor =
-      calc_correction_factor(err_per_mb, ERR_DIVISOR, 0.4, 0.90, Q) *
-      sr_correction * speed_correction *
-      cpi->twopass.est_max_qcorrection_factor;
-
-    if (err_correction_factor < 0.05)
-      err_correction_factor = 0.05;
-    else if (err_correction_factor > 5.0)
-      err_correction_factor = 5.0;
-
-    bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, Q) + overhead_bits_per_mb;
-
-    bits_per_mb_at_this_q = (int)(.5 + err_correction_factor *
-                                  (double)bits_per_mb_at_this_q);
-
-    // Mode and motion overhead
-    // As Q rises in real encode loop rd code will force overhead down
-    // We make a crude adjustment for this here as *.98 per Q step.
-    // PGW TODO.. This code is broken for the extended Q range
-    //            for now overhead set to 0.
-    // overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
-
-    if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
-      break;
-  }
-
-  // Restriction on active max q for constrained quality mode.
-  if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-      (Q < cpi->cq_target_quality)) {
-    Q = cpi->cq_target_quality;
-  }
-
-  // Adjust maxq_min_limit and maxq_max_limit based on the
-  // average q observed in the clip for non kf/gf/arf frames.
-  // Give the average a chance to settle though.
-  // PGW TODO.. This code is broken for the extended Q range
-  if ((cpi->ni_frames >
-       ((unsigned int)cpi->twopass.total_stats->count >> 8)) &&
-      (cpi->ni_frames > 150)) {
-    adjust_maxq_qrange(cpi);
-  }
-
-  return Q;
-}
-
-// For cq mode estimate a cq level that matches the observed
-// complexity and data rate.
-static int estimate_cq(VP9_COMP *cpi,
-                       FIRSTPASS_STATS *fpstats,
-                       int section_target_bandwitdh,
-                       int overhead_bits) {
-  int Q;
-  int num_mbs = cpi->common.MBs;
-  int target_norm_bits_per_mb;
-
-  double section_err = (fpstats->coded_error / fpstats->count);
-  double err_per_mb = section_err / num_mbs;
-  double err_correction_factor;
-  double sr_err_diff;
-  double sr_correction;
-  double speed_correction = 1.0;
-  double clip_iiratio;
-  double clip_iifactor;
-  int overhead_bits_per_mb;
-
-
-  target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20))
-                            ? (512 * section_target_bandwitdh) / num_mbs
-                            : 512 * (section_target_bandwitdh / num_mbs);
-
-  // Estimate of overhead bits per mb
-  overhead_bits_per_mb = overhead_bits / num_mbs;
-
-  // Corrections for higher compression speed settings
-  // (reduced compression expected)
-  if (cpi->compressor_speed == 1) {
-    if (cpi->oxcf.cpu_used <= 5)
-      speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
-    else
-      speed_correction = 1.25;
-  }
-
-  // Look at the drop in prediction quality between the last frame
-  // and the GF buffer (which contained an older frame).
-  sr_err_diff =
-    (fpstats->sr_coded_error - fpstats->coded_error) /
-    (fpstats->count * cpi->common.MBs);
-  sr_correction = (sr_err_diff / 32.0);
-  sr_correction = pow(sr_correction, 0.25);
-  if (sr_correction < 0.75)
-    sr_correction = 0.75;
-  else if (sr_correction > 1.25)
-    sr_correction = 1.25;
-
-  // II ratio correction factor for clip as a whole
-  clip_iiratio = cpi->twopass.total_stats->intra_error /
-                 DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats->coded_error);
-  clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
-  if (clip_iifactor < 0.80)
-    clip_iifactor = 0.80;
-
-  // Try and pick a Q that can encode the content at the given rate.
-  for (Q = 0; Q < MAXQ; Q++) {
-    int bits_per_mb_at_this_q;
-
-    // Error per MB based correction factor
-    err_correction_factor =
-      calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, Q) *
-      sr_correction * speed_correction * clip_iifactor;
-
-    if (err_correction_factor < 0.05)
-      err_correction_factor = 0.05;
-    else if (err_correction_factor > 5.0)
-      err_correction_factor = 5.0;
-
-    bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, Q) + overhead_bits_per_mb;
-
-    bits_per_mb_at_this_q = (int)(.5 + err_correction_factor *
-                                  (double)bits_per_mb_at_this_q);
-
-    // Mode and motion overhead
-    // As Q rises in real encode loop rd code will force overhead down
-    // We make a crude adjustment for this here as *.98 per Q step.
-    // PGW TODO.. This code is broken for the extended Q range
-    //            for now overhead set to 0.
-    overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
-
-    if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
-      break;
-  }
-
-  // Clip value to range "best allowed to (worst allowed - 1)"
-  Q = select_cq_level(Q);
-  if (Q >= cpi->worst_quality)
-    Q = cpi->worst_quality - 1;
-  if (Q < cpi->best_quality)
-    Q = cpi->best_quality;
-
-  return Q;
-}
-
-
-extern void vp9_new_frame_rate(VP9_COMP *cpi, double framerate);
-
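-// One-time setup for the second pass: total the first pass stats, derive
-// the overall bit budget from the measured clip duration and target rate,
-// and compute the clip's average intra/inter error ratio and the modified
-// (bit-allocation) error totals.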
-void vp9_init_second_pass(VP9_COMP *cpi) {
-  FIRSTPASS_STATS this_frame;
-  FIRSTPASS_STATS *start_pos;
-
-  double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate;
-  double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
-                                      * cpi->oxcf.two_pass_vbrmin_section / 100);
-
-  if (two_pass_min_rate < lower_bounds_min_rate)
-    two_pass_min_rate = lower_bounds_min_rate;
-
-  zero_stats(cpi->twopass.total_stats);
-  zero_stats(cpi->twopass.total_left_stats);
-
-  if (!cpi->twopass.stats_in_end)
-    return;
-
-  *cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
-  *cpi->twopass.total_left_stats = *cpi->twopass.total_stats;
-
-  // Each frame can have a different duration, as the frame rate in the
-  // source isn't guaranteed to be constant. The frame rate prior to the
-  // first frame encoded in the second pass is a guess. However, the sum
-  // duration is not: it is calculated from the actual durations of all
-  // frames in the first pass.
-  vp9_new_frame_rate(cpi,
-                     10000000.0 * cpi->twopass.total_stats->count /
-                     cpi->twopass.total_stats->duration);
-
-  cpi->output_frame_rate = cpi->oxcf.frame_rate;
-  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration *
-                                     cpi->oxcf.target_bandwidth / 10000000.0);
-  cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration *
-                                      two_pass_min_rate / 10000000.0);
-
-  // Calculate a minimum intra value to be used in determining the IIratio
-  // scores used in the second pass. We have this minimum to make sure
-  // that clips that are static but "low complexity" in the intra domain
-  // are still boosted appropriately for KF/GF/ARF
-  cpi->twopass.kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
-  cpi->twopass.gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
-
-  // This variable monitors how far behind the second ref update is lagging
-  cpi->twopass.sr_update_lag = 1;
-
-  // Scan the first pass file and calculate an average Intra / Inter error score ratio for the sequence
-  {
-    double sum_iiratio = 0.0;
-    double IIRatio;
-
-    start_pos = cpi->twopass.stats_in;  // Note starting "file" position
-
-    while (input_stats(cpi, &this_frame) != EOF) {
-      IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
-      IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
-      sum_iiratio += IIRatio;
-    }
-
-    cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats->count);
-
-    // Reset file position
-    reset_fpf_position(cpi, start_pos);
-  }
-
-  // Scan the first pass file and calculate a modified total error based upon the bias/power function
-  // used to allocate bits
-  {
-    start_pos = cpi->twopass.stats_in;  // Note starting "file" position
-
-    cpi->twopass.modified_error_total = 0.0;
-    cpi->twopass.modified_error_used = 0.0;
-
-    while (input_stats(cpi, &this_frame) != EOF) {
-      cpi->twopass.modified_error_total += calculate_modified_err(cpi, &this_frame);
-    }
-    cpi->twopass.modified_error_left = cpi->twopass.modified_error_total;
-
-    reset_fpf_position(cpi, start_pos);  // Reset file position
-  }
-}
-
-void vp9_end_second_pass(VP9_COMP *cpi) {
-}
-
-// This function gives an estimate of how badly we believe
-// the prediction quality is decaying from frame to frame.
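-// The rate is the fraction of inter coded mbs (pcnt_inter), further capped
-// by a second-reference term sqrt(1.0 - mb_sr_err_diff / 512.0) that is
-// itself clamped to the range [0.85, 1.0].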
-static double get_prediction_decay_rate(VP9_COMP *cpi,
-                                        FIRSTPASS_STATS *next_frame) {
-  double prediction_decay_rate;
-  double second_ref_decay;
-  double mb_sr_err_diff;
-
-  // Initial basis is the % mbs inter coded
-  prediction_decay_rate = next_frame->pcnt_inter;
-
-  // Look at the observed drop in prediction quality between the last frame
-  // and the GF buffer (which contains an older frame).
-  mb_sr_err_diff =
-    (next_frame->sr_coded_error - next_frame->coded_error) /
-    (cpi->common.MBs);
-  second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
-  second_ref_decay = pow(second_ref_decay, 0.5);
-  if (second_ref_decay < 0.85)
-    second_ref_decay = 0.85;
-  else if (second_ref_decay > 1.0)
-    second_ref_decay = 1.0;
-
-  if (second_ref_decay < prediction_decay_rate)
-    prediction_decay_rate = second_ref_decay;
-
-  return prediction_decay_rate;
-}
-
-// Function to test for a condition where a complex transition is followed
-// by a static section. For example in slide shows where there is a fade
-// between slides. This is to help with more optimal kf and gf positioning.
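-// A transition to still is signalled when the decay rate jumps from below
-// 0.9 to 0.999+ and the zero motion fraction (pcnt_inter - pcnt_motion)
-// then stays at or above 0.999 for still_interval consecutive frames.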
-static int detect_transition_to_still(
-  VP9_COMP *cpi,
-  int frame_interval,
-  int still_interval,
-  double loop_decay_rate,
-  double last_decay_rate) {
-  BOOL trans_to_still = FALSE;
-
-  // Break clause to detect very still sections after motion
-  // For example a static image after a fade or other transition
-  // instead of a clean scene cut.
-  if ((frame_interval > MIN_GF_INTERVAL) &&
-      (loop_decay_rate >= 0.999) &&
-      (last_decay_rate < 0.9)) {
-    int j;
-    FIRSTPASS_STATS *position = cpi->twopass.stats_in;
-    FIRSTPASS_STATS tmp_next_frame;
-    double zz_inter;
-
-    // Look ahead a few frames to see if static condition
-    // persists...
-    for (j = 0; j < still_interval; j++) {
-      if (EOF == input_stats(cpi, &tmp_next_frame))
-        break;
-
-      zz_inter =
-        (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion);
-      if (zz_inter < 0.999)
-        break;
-    }
-    // Reset file position
-    reset_fpf_position(cpi, position);
-
-    // Only if it does do we signal a transition to still
-    if (j == still_interval)
-      trans_to_still = TRUE;
-  }
-
-  return trans_to_still;
-}
-
-// This function detects a flash through the high relative pcnt_second_ref
-// score in the frame following a flash frame. The offset passed in should
-// reflect this.
-static BOOL detect_flash(VP9_COMP *cpi, int offset) {
-  FIRSTPASS_STATS next_frame;
-
-  BOOL flash_detected = FALSE;
-
-  // Read the frame data.
-  // The return is FALSE (no flash detected) if not a valid frame
-  if (read_frame_stats(cpi, &next_frame, offset) != EOF) {
-    // What we are looking for here is a situation where there is a
-    // brief break in prediction (such as a flash) but subsequent frames
-    // are reasonably well predicted by an earlier (pre-flash) frame.
-    // The recovery after a flash is indicated by a high pcnt_second_ref
-    // compared to pcnt_inter.
-    if ((next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
-        (next_frame.pcnt_second_ref >= 0.5)) {
-      flash_detected = TRUE;
-    }
-  }
-
-  return flash_detected;
-}
-
-// Update the motion related elements to the GF arf boost calculation
-static void accumulate_frame_motion_stats(
-  VP9_COMP *cpi,
-  FIRSTPASS_STATS *this_frame,
-  double *this_frame_mv_in_out,
-  double *mv_in_out_accumulator,
-  double *abs_mv_in_out_accumulator,
-  double *mv_ratio_accumulator) {
-  // double this_frame_mv_in_out;
-  double this_frame_mvr_ratio;
-  double this_frame_mvc_ratio;
-  double motion_pct;
-
-  // Accumulate motion stats.
-  motion_pct = this_frame->pcnt_motion;
-
-  // Accumulate Motion In/Out of frame stats
-  *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct;
-  *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct;
-  *abs_mv_in_out_accumulator +=
-    fabs(this_frame->mv_in_out_count * motion_pct);
-
-  // Accumulate a measure of how uniform (or conversely how random)
-  // the motion field is. (A ratio of absmv / mv)
-  if (motion_pct > 0.05) {
-    this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
-                           DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr));
-
-    this_frame_mvc_ratio = fabs(this_frame->mvc_abs) /
-                           DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVc));
-
-    *mv_ratio_accumulator +=
-      (this_frame_mvr_ratio < this_frame->mvr_abs)
-      ? (this_frame_mvr_ratio * motion_pct)
-      : this_frame->mvr_abs * motion_pct;
-
-    *mv_ratio_accumulator +=
-      (this_frame_mvc_ratio < this_frame->mvc_abs)
-      ? (this_frame_mvc_ratio * motion_pct)
-      : this_frame->mvc_abs * motion_pct;
-
-  }
-}
-
-// Calculate a baseline boost number for the current frame.
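-// The underlying boost is IIFACTOR times the intra / coded error ratio,
-// with intra_error floored at gf_intra_err_min. A net flow of motion into
-// the frame (zoom out) scales the boost up by as much as 3x; in the extreme
-// zoom-in case it is halved. The result is capped at GF_RMAX.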
-static double calc_frame_boost(
-  VP9_COMP *cpi,
-  FIRSTPASS_STATS *this_frame,
-  double this_frame_mv_in_out) {
-  double frame_boost;
-
-  // Underlying boost factor is based on the inter / intra error ratio
-  if (this_frame->intra_error > cpi->twopass.gf_intra_err_min)
-    frame_boost = (IIFACTOR * this_frame->intra_error /
-                   DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
-  else
-    frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min /
-                   DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
-
-  // Increase boost for frames where new data is coming into the frame
-  // (e.g. zoom out). Slightly reduce boost if there is a net balance
-  // of motion out of the frame (zoom in).
-  // The range for this_frame_mv_in_out is -1.0 to +1.0.
-  if (this_frame_mv_in_out > 0.0)
-    frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
-  // In extreme case boost is halved
-  else
-    frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
-
-  // Clip to maximum
-  if (frame_boost > GF_RMAX)
-    frame_boost = GF_RMAX;
-
-  return frame_boost;
-}
-
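-// Estimate the boost for an arf at the given offset by accumulating decayed
-// per-frame boosts forwards over f_frames and backwards over b_frames,
-// with flash frames exempted from decay and a floor of 20 points per frame
-// spanned.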
-static int calc_arf_boost(
-  VP9_COMP *cpi,
-  int offset,
-  int f_frames,
-  int b_frames,
-  int *f_boost,
-  int *b_boost) {
-  FIRSTPASS_STATS this_frame;
-
-  int i;
-  double boost_score = 0.0;
-  double mv_ratio_accumulator = 0.0;
-  double decay_accumulator = 1.0;
-  double this_frame_mv_in_out = 0.0;
-  double mv_in_out_accumulator = 0.0;
-  double abs_mv_in_out_accumulator = 0.0;
-  int arf_boost;
-  BOOL flash_detected = FALSE;
-
-  // Search forward from the proposed arf/next gf position
-  for (i = 0; i < f_frames; i++) {
-    if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF)
-      break;
-
-    // Update the motion related elements to the boost calculation
-    accumulate_frame_motion_stats(cpi, &this_frame,
-                                  &this_frame_mv_in_out, &mv_in_out_accumulator,
-                                  &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-
-    // We want to discount the flash frame itself and the recovery
-    // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(cpi, (i + offset)) ||
-                     detect_flash(cpi, (i + offset + 1));
-
-    // Cumulative effect of prediction quality decay
-    if (!flash_detected) {
-      decay_accumulator =
-        decay_accumulator *
-        get_prediction_decay_rate(cpi, &this_frame);
-      decay_accumulator =
-        decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
-    }
-
-    boost_score += (decay_accumulator *
-                    calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
-  }
-
-  *f_boost = boost_score;
-
-  // Reset for backward looking loop
-  boost_score = 0.0;
-  mv_ratio_accumulator = 0.0;
-  decay_accumulator = 1.0;
-  this_frame_mv_in_out = 0.0;
-  mv_in_out_accumulator = 0.0;
-  abs_mv_in_out_accumulator = 0.0;
-
-  // Search backward towards last gf position
-  for (i = -1; i >= -b_frames; i--) {
-    if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF)
-      break;
-
-    // Update the motion related elements to the boost calculation
-    accumulate_frame_motion_stats(cpi, &this_frame,
-                                  &this_frame_mv_in_out, &mv_in_out_accumulator,
-                                  &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-
-    // We want to discount the flash frame itself and the recovery
-    // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(cpi, (i + offset)) ||
-                     detect_flash(cpi, (i + offset + 1));
-
-    // Cumulative effect of prediction quality decay
-    if (!flash_detected) {
-      decay_accumulator =
-        decay_accumulator *
-        get_prediction_decay_rate(cpi, &this_frame);
-      decay_accumulator =
-        decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
-    }
-
-    boost_score += (decay_accumulator *
-                    calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
-
-  }
-  *b_boost = boost_score;
-
-  arf_boost = (*f_boost + *b_boost);
-  if (arf_boost < ((b_frames + f_frames) * 20))
-    arf_boost = ((b_frames + f_frames) * 20);
-
-  return arf_boost;
-}
-
-static void configure_arnr_filter(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  int half_gf_int;
-  int frames_after_arf;
-  int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
-  int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
-
-  // Define the arnr filter width for this group of frames:
-  // We only filter frames that lie within a distance of half
-  // the GF interval from the ARF frame. We also have to trap
-  // cases where the filter extends beyond the end of the clip.
-  // Note: this_frame->frame has been updated in the loop
-  // so it now points at the ARF frame.
-  half_gf_int = cpi->baseline_gf_interval >> 1;
-  frames_after_arf = cpi->twopass.total_stats->count -
-                     this_frame->frame - 1;
-
-  switch (cpi->oxcf.arnr_type) {
-    case 1: // Backward filter
-      frames_fwd = 0;
-      if (frames_bwd > half_gf_int)
-        frames_bwd = half_gf_int;
-      break;
-
-    case 2: // Forward filter
-      if (frames_fwd > half_gf_int)
-        frames_fwd = half_gf_int;
-      if (frames_fwd > frames_after_arf)
-        frames_fwd = frames_after_arf;
-      frames_bwd = 0;
-      break;
-
-    case 3: // Centered filter
-    default:
-      frames_fwd >>= 1;
-      if (frames_fwd > frames_after_arf)
-        frames_fwd = frames_after_arf;
-      if (frames_fwd > half_gf_int)
-        frames_fwd = half_gf_int;
-
-      frames_bwd = frames_fwd;
-
-      // For even length filter there is one more frame backward
-      // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
-      if (frames_bwd < half_gf_int)
-        frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1;
-      break;
-  }
-
-  cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
-}
-
-// Analyse and define a gf/arf group.
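-// Scan forward through the stats to choose the length of the next gf/arf
-// group, decide whether an alt ref should be coded for it, and divide the
-// kf group bit budget between the boosted frame(s) and the rest of the
-// group.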
-static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  FIRSTPASS_STATS next_frame;
-  FIRSTPASS_STATS *start_pos;
-  int i;
-  double boost_score = 0.0;
-  double old_boost_score = 0.0;
-  double gf_group_err = 0.0;
-  double gf_first_frame_err = 0.0;
-  double mod_frame_err = 0.0;
-
-  double mv_ratio_accumulator = 0.0;
-  double decay_accumulator = 1.0;
-  double zero_motion_accumulator = 1.0;
-
-  double loop_decay_rate = 1.00;          // Starting decay rate
-  double last_loop_decay_rate = 1.00;
-
-  double this_frame_mv_in_out = 0.0;
-  double mv_in_out_accumulator = 0.0;
-  double abs_mv_in_out_accumulator = 0.0;
-
-  int max_bits = frame_max_bits(cpi);     // Max for a single frame
-
-  unsigned int allow_alt_ref =
-    cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
-
-  int f_boost = 0;
-  int b_boost = 0;
-  BOOL flash_detected;
-
-  cpi->twopass.gf_group_bits = 0;
-
-  vp9_clear_system_state();  // __asm emms;
-
-  start_pos = cpi->twopass.stats_in;
-
-  vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
-
-  // Load stats for the current frame.
-  mod_frame_err = calculate_modified_err(cpi, this_frame);
-
-  // Note the error of the frame at the start of the group (this will be
-  // the GF frame error if we code a normal gf).
-  gf_first_frame_err = mod_frame_err;
-
-  // Special treatment if the current frame is a key frame (which is also
-  // a gf). If it is, then its error score (and hence bit allocation) needs
-  // to be subtracted out from the calculation for the GF group.
-  if (cpi->common.frame_type == KEY_FRAME)
-    gf_group_err -= gf_first_frame_err;
-
-  // Scan forward to try and work out how many frames the next gf group
-  // should contain and what level of boost is appropriate for the GF
-  // or ARF that will be coded with the group
-  i = 0;
-
-  while (((i < cpi->twopass.static_scene_max_gf_interval) ||
-          ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)) &&
-         (i < cpi->twopass.frames_to_key)) {
-    i++;    // Increment the loop counter
-
-    // Accumulate error score of frames in this gf group
-    mod_frame_err = calculate_modified_err(cpi, this_frame);
-    gf_group_err += mod_frame_err;
-
-    if (EOF == input_stats(cpi, &next_frame))
-      break;
-
-    // Test for the case where there is a brief flash but the prediction
-    // quality back to an earlier frame is then restored.
-    flash_detected = detect_flash(cpi, 0);
-
-    // Update the motion related elements to the boost calculation
-    accumulate_frame_motion_stats(cpi, &next_frame,
-                                  &this_frame_mv_in_out, &mv_in_out_accumulator,
-                                  &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-
-    // Cumulative effect of prediction quality decay
-    if (!flash_detected) {
-      last_loop_decay_rate = loop_decay_rate;
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
-      decay_accumulator = decay_accumulator * loop_decay_rate;
-
-      // Monitor for static sections.
-      if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
-          zero_motion_accumulator) {
-        zero_motion_accumulator =
-          (next_frame.pcnt_inter - next_frame.pcnt_motion);
-      }
-
-      // Break clause to detect very still sections after motion
-      // (for example a static image after a fade or other transition).
-      if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
-                                     last_loop_decay_rate)) {
-        allow_alt_ref = FALSE;
-        break;
-      }
-    }
-
-    // Calculate a boost number for this frame
-    boost_score +=
-      (decay_accumulator *
-       calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out));
-
-    // Break out conditions.
-    if (
-      // Break at cpi->max_gf_interval unless almost totally static
-      (i >= cpi->max_gf_interval && (zero_motion_accumulator < 0.995)) ||
-      (
-        // Don't break out with a very short interval
-        (i > MIN_GF_INTERVAL) &&
-        // Don't break out very close to a key frame
-        ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
-        ((boost_score > 125.0) || (next_frame.pcnt_inter < 0.75)) &&
-        (!flash_detected) &&
-        ((mv_ratio_accumulator > 100.0) ||
-         (abs_mv_in_out_accumulator > 3.0) ||
-         (mv_in_out_accumulator < -2.0) ||
-         ((boost_score - old_boost_score) < 12.5))
-      )) {
-      boost_score = old_boost_score;
-      break;
-    }
-
-    vpx_memcpy(this_frame, &next_frame, sizeof(*this_frame));
-
-    old_boost_score = boost_score;
-  }
-
-  // Don't allow a gf too near the next kf
-  if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) {
-    while (i < cpi->twopass.frames_to_key) {
-      i++;
-
-      if (EOF == input_stats(cpi, this_frame))
-        break;
-
-      if (i < cpi->twopass.frames_to_key) {
-        mod_frame_err = calculate_modified_err(cpi, this_frame);
-        gf_group_err += mod_frame_err;
-      }
-    }
-  }
-
-  // Set the interval till the next gf or arf.
-  cpi->baseline_gf_interval = i;
-
-  // Should we use the alternate reference frame?
-  if (allow_alt_ref &&
-      (i < cpi->oxcf.lag_in_frames) &&
-      (i >= MIN_GF_INTERVAL) &&
-      // don't use an ARF very near the next kf
-      (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) &&
-      ((next_frame.pcnt_inter > 0.75) ||
-       (next_frame.pcnt_second_ref > 0.5)) &&
-      ((mv_in_out_accumulator / (double)i > -0.2) ||
-       (mv_in_out_accumulator > -2.0)) &&
-      (boost_score > 100)) {
-    // Alternative boost calculation for alt ref
-    cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
-    cpi->source_alt_ref_pending = TRUE;
-
-    configure_arnr_filter(cpi, this_frame);
-  } else {
-    cpi->gfu_boost = (int)boost_score;
-    cpi->source_alt_ref_pending = FALSE;
-  }
-
-  // Now decide how many bits should be allocated to the GF group as a
-  // proportion of those remaining in the kf group.
-  // The final key frame group in the clip is treated as a special case
-  // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
-  // This is also important for short clips where there may only be one
-  // key frame.
-  if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats->count -
-                                          cpi->common.current_video_frame)) {
-    cpi->twopass.kf_group_bits =
-      (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
-  }
-
-  // Calculate the bits to be allocated to the group as a whole
-  if ((cpi->twopass.kf_group_bits > 0) &&
-      (cpi->twopass.kf_group_error_left > 0)) {
-    cpi->twopass.gf_group_bits =
-      (int)((double)cpi->twopass.kf_group_bits *
-            (gf_group_err / (double)cpi->twopass.kf_group_error_left));
-  } else
-    cpi->twopass.gf_group_bits = 0;
-
-  cpi->twopass.gf_group_bits =
-    (cpi->twopass.gf_group_bits < 0)
-    ? 0
-    : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
-    ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits;
-
-  // Clip cpi->twopass.gf_group_bits based on user supplied data rate
-  // variability limit (cpi->oxcf.two_pass_vbrmax_section)
-  if (cpi->twopass.gf_group_bits > max_bits * cpi->baseline_gf_interval)
-    cpi->twopass.gf_group_bits = max_bits * cpi->baseline_gf_interval;
-
-  // Reset the file position
-  reset_fpf_position(cpi, start_pos);
-
-  // Update the record of error used so far (only done once per gf group)
-  cpi->twopass.modified_error_used += gf_group_err;
-
-  // Assign bits to the arf or gf.
-  for (i = 0; i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME); i++) {
-    int boost;
-    int allocation_chunks;
-    int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
-    int gf_bits;
-
-    boost = (cpi->gfu_boost * vp9_gfboost_qadjust(Q)) / 100;
-
-    // Set max and minimum boost and hence minimum allocation
-    if (boost > ((cpi->baseline_gf_interval + 1) * 200))
-      boost = ((cpi->baseline_gf_interval + 1) * 200);
-    else if (boost < 125)
-      boost = 125;
-
-    if (cpi->source_alt_ref_pending && i == 0)
-      allocation_chunks =
-        ((cpi->baseline_gf_interval + 1) * 100) + boost;
-    else
-      allocation_chunks =
-        (cpi->baseline_gf_interval * 100) + (boost - 100);
-
-    // Prevent overflow
-    if (boost > 1028) {
-      int divisor = boost >> 10;
-      boost /= divisor;
-      allocation_chunks /= divisor;
-    }
-
-    // Calculate the number of bits to be spent on the gf or arf based on
-    // the boost number
-    gf_bits = (int)((double)boost *
-                    (cpi->twopass.gf_group_bits /
-                     (double)allocation_chunks));
-
-    // If the frame that is to be boosted is simpler than the average for
-    // the gf/arf group then use an alternative calculation
-    // based on the error score of the frame itself
-    if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval) {
-      double  alt_gf_grp_bits;
-      int     alt_gf_bits;
-
-      alt_gf_grp_bits =
-        (double)cpi->twopass.kf_group_bits  *
-        (mod_frame_err * (double)cpi->baseline_gf_interval) /
-        DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left);
-
-      alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
-                                           (double)allocation_chunks));
-
-      if (gf_bits > alt_gf_bits) {
-        gf_bits = alt_gf_bits;
-      }
-    }
-    // Else if it is harder than other frames in the group make sure it at
-    // least receives an allocation in keeping with its relative error
-    // score, otherwise it may be worse off than an "un-boosted" frame
-    else {
-      int alt_gf_bits =
-        (int)((double)cpi->twopass.kf_group_bits *
-              mod_frame_err /
-              DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left));
-
-      if (alt_gf_bits > gf_bits) {
-        gf_bits = alt_gf_bits;
-      }
-    }
-
-    // Don't allow a negative value for gf_bits
-    if (gf_bits < 0)
-      gf_bits = 0;
-
-    gf_bits += cpi->min_frame_bandwidth;  // Add in the minimum for a frame
-
-    if (i == 0) {
-      cpi->twopass.gf_bits = gf_bits;
-    }
-    if (i == 1 || (!cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME))) {
-      cpi->per_frame_bandwidth = gf_bits;  // Per frame bit target for this frame
-    }
-  }
-
-  {
-    // Adjust KF group bits and error remaining
-    cpi->twopass.kf_group_error_left -= gf_group_err;
-    cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits;
-
-    if (cpi->twopass.kf_group_bits < 0)
-      cpi->twopass.kf_group_bits = 0;
-
-    // Note the error score left in the remaining frames of the group.
-    // For normal GFs we want to remove the error score for the first frame
-    // of the group (except in Key frame case where this has already
-    // happened)
-    if (!cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME)
-      cpi->twopass.gf_group_error_left = gf_group_err - gf_first_frame_err;
-    else
-      cpi->twopass.gf_group_error_left = gf_group_err;
-
-    cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits - cpi->min_frame_bandwidth;
-
-    if (cpi->twopass.gf_group_bits < 0)
-      cpi->twopass.gf_group_bits = 0;
-
-    // This condition could fail if there are two kfs very close together
-    // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
-    // calculation of cpi->twopass.alt_extra_bits.
-    if (cpi->baseline_gf_interval >= 3) {
-      int boost = (cpi->source_alt_ref_pending)
-                  ? b_boost : cpi->gfu_boost;
-
-      if (boost >= 150) {
-        int pct_extra;
-
-        pct_extra = (boost - 100) / 50;
-        pct_extra = (pct_extra > 20) ? 20 : pct_extra;
-
-        cpi->twopass.alt_extra_bits =
-          (cpi->twopass.gf_group_bits * pct_extra) / 100;
-        cpi->twopass.gf_group_bits -= cpi->twopass.alt_extra_bits;
-        cpi->twopass.alt_extra_bits /=
-          ((cpi->baseline_gf_interval - 1) >> 1);
-      } else
-        cpi->twopass.alt_extra_bits = 0;
-    } else
-      cpi->twopass.alt_extra_bits = 0;
-  }
-
-  if (cpi->common.frame_type != KEY_FRAME) {
-    FIRSTPASS_STATS sectionstats;
-
-    zero_stats(&sectionstats);
-    reset_fpf_position(cpi, start_pos);
-
-    for (i = 0; i < cpi->baseline_gf_interval; i++) {
-      input_stats(cpi, &next_frame);
-      accumulate_stats(&sectionstats, &next_frame);
-    }
-
-    avg_stats(&sectionstats);
-
-    cpi->twopass.section_intra_rating =
-      sectionstats.intra_error /
-      DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
-
-    reset_fpf_position(cpi, start_pos);
-  }
-}
-
-// Allocate bits to a normal frame that is neither a gf, an arf, nor a key frame.
-static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  int    target_frame_size;
-
-  double modified_err;
-  double err_fraction;   // Portion of the remaining GF group error used by this frame
-
-  int max_bits = frame_max_bits(cpi);    // Max for a single frame
-
-  // Calculate modified prediction error used in bit allocation
-  modified_err = calculate_modified_err(cpi, this_frame);
-
-  if (cpi->twopass.gf_group_error_left > 0)
-    err_fraction = modified_err / cpi->twopass.gf_group_error_left;
-  else
-    err_fraction = 0.0;
-
-  target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction);  // How many of the available bits should we give this frame?
-
-  // Clip the target size to the range 0 to max_bits (or cpi->twopass.gf_group_bits) at the top end.
-  if (target_frame_size < 0)
-    target_frame_size = 0;
-  else {
-    if (target_frame_size > max_bits)
-      target_frame_size = max_bits;
-
-    if (target_frame_size > cpi->twopass.gf_group_bits)
-      target_frame_size = cpi->twopass.gf_group_bits;
-  }
-
-  cpi->twopass.gf_group_error_left -= modified_err;  // Adjust error remaining
-  cpi->twopass.gf_group_bits -= target_frame_size;   // Adjust bits remaining
-
-  if (cpi->twopass.gf_group_bits < 0)
-    cpi->twopass.gf_group_bits = 0;
-
-  target_frame_size += cpi->min_frame_bandwidth;  // Add in the minimum number of bits set aside for every frame.
-
-
-  cpi->per_frame_bandwidth = target_frame_size;  // Per frame bit target for this frame
-}
-
-// Make a damped adjustment to the active max q.
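-// The damping moves only 1/8 of the way from the old value to the new one
-// in real q terms: target_q = (7 * old_q + new_q) / 8, mapped back to the
-// nearest qindex in the direction of the change.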
-static int adjust_active_maxq(int old_maxqi, int new_maxqi) {
-  int i;
-  int ret_val = new_maxqi;
-  double old_q;
-  double new_q;
-  double target_q;
-
-  old_q = vp9_convert_qindex_to_q(old_maxqi);
-  new_q = vp9_convert_qindex_to_q(new_maxqi);
-
-  target_q = ((old_q * 7.0) + new_q) / 8.0;
-
-  if (target_q > old_q) {
-    for (i = old_maxqi; i <= new_maxqi; i++) {
-      if (vp9_convert_qindex_to_q(i) >= target_q) {
-        ret_val = i;
-        break;
-      }
-    }
-  } else {
-    for (i = old_maxqi; i >= new_maxqi; i--) {
-      if (vp9_convert_qindex_to_q(i) <= target_q) {
-        ret_val = i;
-        break;
-      }
-    }
-  }
-
-  return ret_val;
-}
-
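-// Per frame second pass processing: read the next stats record, define new
-// kf and gf/arf groups as they fall due, set this frame's bit target and
-// nominal bandwidth, and update the active worst (max) quality estimate.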
-void vp9_second_pass(VP9_COMP *cpi) {
-  int tmp_q;
-  int frames_left = (int)(cpi->twopass.total_stats->count - cpi->common.current_video_frame);
-
-  FIRSTPASS_STATS this_frame;
-  FIRSTPASS_STATS this_frame_copy;
-
-  double this_frame_error;
-  double this_frame_intra_error;
-  double this_frame_coded_error;
-
-  FIRSTPASS_STATS *start_pos;
-
-  int overhead_bits;
-
-  if (!cpi->twopass.stats_in) {
-    return;
-  }
-
-  vp9_clear_system_state();
-
-  vpx_memset(&this_frame, 0, sizeof(FIRSTPASS_STATS));
-
-  if (EOF == input_stats(cpi, &this_frame))
-    return;
-
-  this_frame_error = this_frame.ssim_weighted_pred_err;
-  this_frame_intra_error = this_frame.intra_error;
-  this_frame_coded_error = this_frame.coded_error;
-
-  start_pos = cpi->twopass.stats_in;
-
-  // Keyframe and section processing.
-  if (cpi->twopass.frames_to_key == 0) {
-    // Define next KF group and assign bits to it
-    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
-    find_next_key_frame(cpi, &this_frame_copy);
-  }
-
-  // Is this a GF / ARF? (Note that a KF is always also a GF.)
-  if (cpi->frames_till_gf_update_due == 0) {
-    // Define next gf group and assign bits to it
-    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
-    define_gf_group(cpi, &this_frame_copy);
-
-    // If we are going to code an altref frame at the end of the group
-    // and the current frame is not a key frame:
-    // if the previous group used an arf, this frame has already benefited
-    // from that arf boost and should not be given extra bits; if the
-    // previous group was NOT coded using an arf we may want to apply some
-    // boost to this GF as well.
-    if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)) {
-      // Assign a standard frame's worth of bits from those allocated to the GF group
-      int bak = cpi->per_frame_bandwidth;
-      vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
-      assign_std_frame_bits(cpi, &this_frame_copy);
-      cpi->per_frame_bandwidth = bak;
-    }
-  }
-
-  // Otherwise this is an ordinary frame
-  else {
-    // Assign bits from those allocated to the GF group
-    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
-    assign_std_frame_bits(cpi, &this_frame_copy);
-  }
-
-  // Keep a globally available copy of this and the next frame's iiratio.
-  cpi->twopass.this_iiratio = this_frame_intra_error /
-                              DOUBLE_DIVIDE_CHECK(this_frame_coded_error);
-  {
-    FIRSTPASS_STATS next_frame;
-    if (lookup_next_frame_stats(cpi, &next_frame) != EOF) {
-      cpi->twopass.next_iiratio = next_frame.intra_error /
-                                  DOUBLE_DIVIDE_CHECK(next_frame.coded_error);
-    }
-  }
-
-  // Set nominal per second bandwidth for this frame
-  cpi->target_bandwidth = cpi->per_frame_bandwidth * cpi->output_frame_rate;
-  if (cpi->target_bandwidth < 0)
-    cpi->target_bandwidth = 0;
-
-
-  // Account for mv, mode and other overheads.
-  overhead_bits = estimate_modemvcost(
-                    cpi, cpi->twopass.total_left_stats);
-
-  // Special case code for first frame.
-  if (cpi->common.current_video_frame == 0) {
-    cpi->twopass.est_max_qcorrection_factor = 1.0;
-
-    // Set a cq_level in constrained quality mode.
-    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-      int est_cq;
-
-      est_cq =
-        estimate_cq(cpi,
-                    cpi->twopass.total_left_stats,
-                    (int)(cpi->twopass.bits_left / frames_left),
-                    overhead_bits);
-
-      cpi->cq_target_quality = cpi->oxcf.cq_level;
-      if (est_cq > cpi->cq_target_quality)
-        cpi->cq_target_quality = est_cq;
-    }
-
-    // guess at maxq needed in 2nd pass
-    cpi->twopass.maxq_max_limit = cpi->worst_quality;
-    cpi->twopass.maxq_min_limit = cpi->best_quality;
-
-    tmp_q = estimate_max_q(
-              cpi,
-              cpi->twopass.total_left_stats,
-              (int)(cpi->twopass.bits_left / frames_left),
-              overhead_bits);
-
-    cpi->active_worst_quality         = tmp_q;
-    cpi->ni_av_qi                     = tmp_q;
-    cpi->avg_q                        = vp9_convert_qindex_to_q(tmp_q);
-
-    // Limit the maxq value returned subsequently.
-    // This increases the risk of overspend or underspend if the initial
-    // estimate for the clip is bad, but helps prevent excessive
-    // variation in Q, especially near the end of a clip
-    // where for example a small overspend may cause Q to crash
-    adjust_maxq_qrange(cpi);
-  }
-
-  // The last few frames of a clip almost always have too few or too many
-  // bits and, for the sake of overly exact rate control, we don't want to
-  // make radical adjustments to the allowed quantizer range just to use up
-  // a few surplus bits or get beneath the target rate.
-  else if ((cpi->common.current_video_frame <
-            (((unsigned int)cpi->twopass.total_stats->count * 255) >> 8)) &&
-           ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
-            (unsigned int)cpi->twopass.total_stats->count)) {
-    if (frames_left < 1)
-      frames_left = 1;
-
-    tmp_q = estimate_max_q(
-              cpi,
-              cpi->twopass.total_left_stats,
-              (int)(cpi->twopass.bits_left / frames_left),
-              overhead_bits);
-
-    // Make a damped adjustment to active max Q
-    cpi->active_worst_quality =
-      adjust_active_maxq(cpi->active_worst_quality, tmp_q);
-  }
-
-  cpi->twopass.frames_to_key--;
-
-  // Update the total stats remaining structure.
-  subtract_stats(cpi->twopass.total_left_stats, &this_frame);
-}
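
An aside for readers tracing the rate-control logic: the "iiratio" kept above is simply the first-pass intra error divided by the coded (inter) error, with the denominator guarded against zero. A minimal standalone sketch, where divide_check() is a hypothetical stand-in for DOUBLE_DIVIDE_CHECK:

    #include <stdio.h>

    /* Hypothetical stand-in for DOUBLE_DIVIDE_CHECK: nudge the
     * denominator away from zero before dividing. */
    static double divide_check(double x) {
      return x < 0.0 ? x - 0.000001 : x + 0.000001;
    }

    int main(void) {
      double intra_error = 5000.0;  /* first-pass intra coding cost */
      double coded_error = 1250.0;  /* first-pass inter coding cost */
      double iiratio = intra_error / divide_check(coded_error);

      /* iiratio ~= 4.0: intra coding is four times as expensive here,
       * so inter prediction is doing useful work on this frame. */
      printf("iiratio = %.2f\n", iiratio);
      return 0;
    }
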
-
-
-static BOOL test_candidate_kf(VP9_COMP *cpi,  FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame) {
-  BOOL is_viable_kf = FALSE;
-
-  // Does the frame satisfy the primary criteria of a key frame
-  //      If so, then examine how well it predicts subsequent frames
-  if ((this_frame->pcnt_second_ref < 0.10) &&
-      (next_frame->pcnt_second_ref < 0.10) &&
-      ((this_frame->pcnt_inter < 0.05) ||
-       (
-         ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) &&
-         ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
-         ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
-          (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
-          ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5)
-         )
-       )
-      )
-     ) {
-    int i;
-    FIRSTPASS_STATS *start_pos;
-
-    FIRSTPASS_STATS local_next_frame;
-
-    double boost_score = 0.0;
-    double old_boost_score = 0.0;
-    double decay_accumulator = 1.0;
-    double next_iiratio;
-
-    vpx_memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
-
-    // Note the starting file position so we can reset to it
-    start_pos = cpi->twopass.stats_in;
-
-    // Examine how well the key frame predicts subsequent frames
-    for (i = 0; i < 16; i++) {
-      next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
-
-      if (next_iiratio > RMAX)
-        next_iiratio = RMAX;
-
-      // Cumulative effect of decay in prediction quality
-      if (local_next_frame.pcnt_inter > 0.85)
-        decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
-      else
-        decay_accumulator = decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
-
-      // decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
-
-      // Keep a running total
-      boost_score += (decay_accumulator * next_iiratio);
-
-      // Test various breakout clauses
-      if ((local_next_frame.pcnt_inter < 0.05) ||
-          (next_iiratio < 1.5) ||
-          (((local_next_frame.pcnt_inter -
-             local_next_frame.pcnt_neutral) < 0.20) &&
-           (next_iiratio < 3.0)) ||
-          ((boost_score - old_boost_score) < 3.0) ||
-          (local_next_frame.intra_error < 200)
-         ) {
-        break;
-      }
-
-      old_boost_score = boost_score;
-
-      // Get the next frame details
-      if (EOF == input_stats(cpi, &local_next_frame))
-        break;
-    }
-
-    // If there is tolerable prediction for at least the next 3 frames,
-    // break out; else discard this potential key frame and move on.
-    if (boost_score > 30.0 && (i > 3))
-      is_viable_kf = TRUE;
-    else {
-      // Reset the file position
-      reset_fpf_position(cpi, start_pos);
-
-      is_viable_kf = FALSE;
-    }
-  }
-
-  return is_viable_kf;
-}
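
The candidate test above is easier to follow in isolation: each prospective key frame earns a boost from up to 16 subsequent frames, with each frame's intra/inter ratio weighted by an accumulating prediction-decay factor. A toy version with invented per-frame stats (the decay rule, the 3.0 breakout, and the boost > 30 / i > 3 viability test mirror the code above; the cap value and the inputs are made up):

    #include <stdio.h>

    int main(void) {
      /* Invented per-frame first-pass stats. */
      double pcnt_inter[16] = { 0.95, 0.93, 0.90, 0.88, 0.86, 0.84, 0.80, 0.75,
                                0.70, 0.65, 0.60, 0.55, 0.50, 0.45, 0.40, 0.35 };
      double iiratio[16]    = { 9.0, 8.5, 8.0, 7.5, 7.0, 6.5, 6.0, 5.5,
                                5.0, 4.5, 4.0, 3.5, 3.0, 2.5, 2.0, 1.5 };
      double decay = 1.0, boost = 0.0, old_boost = 0.0;
      int i;

      for (i = 0; i < 16; i++) {
        double r = iiratio[i] > 14.0 ? 14.0 : iiratio[i];  /* RMAX-style cap */

        /* Good prediction (high pcnt_inter) decays slowly. */
        decay *= pcnt_inter[i] > 0.85 ? pcnt_inter[i]
                                      : (0.85 + pcnt_inter[i]) / 2.0;
        boost += decay * r;

        if (boost - old_boost < 3.0)  /* marginal gain: break out */
          break;
        old_boost = boost;
      }
      printf("frames = %d, boost = %.1f, viable kf = %s\n",
             i, boost, (boost > 30.0 && i > 3) ? "yes" : "no");
      return 0;
    }
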
-static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  int i, j;
-  FIRSTPASS_STATS last_frame;
-  FIRSTPASS_STATS first_frame;
-  FIRSTPASS_STATS next_frame;
-  FIRSTPASS_STATS *start_position;
-
-  double decay_accumulator = 1.0;
-  double zero_motion_accumulator = 1.0;
-  double boost_score = 0;
-  double old_boost_score = 0.0;
-  double loop_decay_rate;
-
-  double kf_mod_err = 0.0;
-  double kf_group_err = 0.0;
-  double kf_group_intra_err = 0.0;
-  double kf_group_coded_err = 0.0;
-  double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
-
-  vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
-
-  vp9_clear_system_state();  // __asm emms;
-  start_position = cpi->twopass.stats_in;
-
-  cpi->common.frame_type = KEY_FRAME;
-
-  // Is this a key frame forced by interval?
-  cpi->this_key_frame_forced = cpi->next_key_frame_forced;
-
-  // Clear the alt ref active flag as this can never be active on a key frame
-  cpi->source_alt_ref_active = FALSE;
-
-  // Kf is always a gf so clear frames till next gf counter
-  cpi->frames_till_gf_update_due = 0;
-
-  cpi->twopass.frames_to_key = 1;
-
-  // Take a copy of the initial frame details
-  vpx_memcpy(&first_frame, this_frame, sizeof(*this_frame));
-
-  cpi->twopass.kf_group_bits = 0;        // Total bits available to kf group
-  cpi->twopass.kf_group_error_left = 0;  // Group modified error score.
-
-  kf_mod_err = calculate_modified_err(cpi, this_frame);
-
-  // find the next keyframe
-  i = 0;
-  while (cpi->twopass.stats_in < cpi->twopass.stats_in_end) {
-    // Accumulate kf group error
-    kf_group_err += calculate_modified_err(cpi, this_frame);
-
-    // These figures keep intra and coded error counts for all frames including key frames in the group.
-    // The effect of the key frame itself can be subtracted out using the first_frame data collected above
-    kf_group_intra_err += this_frame->intra_error;
-    kf_group_coded_err += this_frame->coded_error;
-
-    // Load the next frame's stats.
-    vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
-    input_stats(cpi, this_frame);
-
-    // Provided that we are not at the end of the file...
-    if (cpi->oxcf.auto_key
-        && lookup_next_frame_stats(cpi, &next_frame) != EOF) {
-      // Normal scene cut check
-      if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) {
-        break;
-      }
-
-      // How fast is prediction quality decaying
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
-
-      // We want to know something about the recent past... rather than,
-      // as used elsewhere, where we are concerned with decay in prediction
-      // quality since the last GF or KF.
-      recent_loop_decay[i % 8] = loop_decay_rate;
-      decay_accumulator = 1.0;
-      for (j = 0; j < 8; j++) {
-        decay_accumulator = decay_accumulator * recent_loop_decay[j];
-      }
-
-      // Special check for a transition from high motion to a static scene.
-      if (detect_transition_to_still(cpi, i,
-                                     (cpi->key_frame_frequency - i),
-                                     loop_decay_rate,
-                                     decay_accumulator)) {
-        break;
-      }
-
-
-      // Step on to the next frame
-      cpi->twopass.frames_to_key++;
-
-      // If we don't have a real key frame within the next two
-      // key frame frequency intervals then break out of the loop.
-      if (cpi->twopass.frames_to_key >= 2 * (int)cpi->key_frame_frequency)
-        break;
-    } else
-      cpi->twopass.frames_to_key++;
-
-    i++;
-  }
-
-  // If there is a max kf interval set by the user we must obey it.
-  // We already break out of the loop above at 2x max.
-  // This code centers the extra kf if the actual natural
-  // interval is between 1x and 2x.
-  if (cpi->oxcf.auto_key
-      && cpi->twopass.frames_to_key > (int)cpi->key_frame_frequency) {
-    FIRSTPASS_STATS *current_pos = cpi->twopass.stats_in;
-    FIRSTPASS_STATS tmp_frame;
-
-    cpi->twopass.frames_to_key /= 2;
-
-    // Copy first frame details
-    vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
-
-    // Reset to the start of the group
-    reset_fpf_position(cpi, start_position);
-
-    kf_group_err = 0;
-    kf_group_intra_err = 0;
-    kf_group_coded_err = 0;
-
-    // Rescan to get the correct error data for the forced kf group
-    for (i = 0; i < cpi->twopass.frames_to_key; i++) {
-      // Accumulate kf group errors
-      kf_group_err += calculate_modified_err(cpi, &tmp_frame);
-      kf_group_intra_err += tmp_frame.intra_error;
-      kf_group_coded_err += tmp_frame.coded_error;
-
-      // Load the next frame's stats.
-      input_stats(cpi, &tmp_frame);
-    }
-
-    // Reset to the start of the group
-    reset_fpf_position(cpi, current_pos);
-
-    cpi->next_key_frame_forced = TRUE;
-  } else
-    cpi->next_key_frame_forced = FALSE;
-
-  // Special case for the last frame of the file
-  if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) {
-    // Accumulate kf group error
-    kf_group_err += calculate_modified_err(cpi, this_frame);
-
-    // These figures keep intra and coded error counts for all frames including key frames in the group.
-    // The effect of the key frame itself can be subtracted out using the first_frame data collected above
-    kf_group_intra_err += this_frame->intra_error;
-    kf_group_coded_err += this_frame->coded_error;
-  }
-
-  // Calculate the number of bits that should be assigned to the kf group.
-  if ((cpi->twopass.bits_left > 0) && (cpi->twopass.modified_error_left > 0.0)) {
-    // Max for a single normal frame (not key frame)
-    int max_bits = frame_max_bits(cpi);
-
-    // Maximum bits for the kf group
-    int64_t max_grp_bits;
-
-    // Default allocation based on bits left and relative
-    // complexity of the section
-    cpi->twopass.kf_group_bits = (int64_t)(cpi->twopass.bits_left *
-                                           (kf_group_err /
-                                            cpi->twopass.modified_error_left));
-
-    // Clip based on maximum per frame rate defined by the user.
-    max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key;
-    if (cpi->twopass.kf_group_bits > max_grp_bits)
-      cpi->twopass.kf_group_bits = max_grp_bits;
-  } else
-    cpi->twopass.kf_group_bits = 0;
-
-  // Reset the first pass file position
-  reset_fpf_position(cpi, start_position);
-
-  // Determine how big to make this key frame based on how well the
-  // subsequent frames use inter blocks.
-  decay_accumulator = 1.0;
-  boost_score = 0.0;
-  loop_decay_rate = 1.00;       // Starting decay rate
-
-  for (i = 0; i < cpi->twopass.frames_to_key; i++) {
-    double r;
-
-    if (EOF == input_stats(cpi, &next_frame))
-      break;
-
-    if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
-      r = (IIKFACTOR2 * next_frame.intra_error /
-           DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
-    else
-      r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min /
-           DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
-
-    if (r > RMAX)
-      r = RMAX;
-
-    // Monitor for static sections.
-    if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
-        zero_motion_accumulator) {
-      zero_motion_accumulator =
-        (next_frame.pcnt_inter - next_frame.pcnt_motion);
-    }
-
-    // How fast is prediction quality decaying
-    if (!detect_flash(cpi, 0)) {
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
-      decay_accumulator = decay_accumulator * loop_decay_rate;
-      decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
-    }
-
-    boost_score += (decay_accumulator * r);
-
-    if ((i > MIN_GF_INTERVAL) &&
-        ((boost_score - old_boost_score) < 6.25)) {
-      break;
-    }
-
-    old_boost_score = boost_score;
-  }
-
-  {
-    FIRSTPASS_STATS sectionstats;
-
-    zero_stats(&sectionstats);
-    reset_fpf_position(cpi, start_position);
-
-    for (i = 0; i < cpi->twopass.frames_to_key; i++) {
-      input_stats(cpi, &next_frame);
-      accumulate_stats(&sectionstats, &next_frame);
-    }
-
-    avg_stats(&sectionstats);
-
-    cpi->twopass.section_intra_rating =
-      sectionstats.intra_error
-      / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
-  }
-
-  // Reset the first pass file position
-  reset_fpf_position(cpi, start_position);
-
-  // Work out how many bits to allocate for the key frame itself
-  if (1) {
-    int kf_boost = boost_score;
-    int allocation_chunks;
-    int alt_kf_bits;
-
-    if (kf_boost < 300) {
-      kf_boost += (cpi->twopass.frames_to_key * 3);
-      if (kf_boost > 300)
-        kf_boost = 300;
-    }
-
-    if (kf_boost < 250)                                                      // Min KF boost
-      kf_boost = 250;
-
-    // Make a note of baseline boost and the zero motion
-    // accumulator value for use elsewhere.
-    cpi->kf_boost = kf_boost;
-    cpi->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
-
-    // We do three calculations for kf size.
-    // The first is based on the error score for the whole kf group.
-    // The second (optionally) on the key frame's own error if this is
-    // smaller than the average for the group.
-    // The final one ensures that the frame receives at least the
-    // allocation it would have received based on its own error score vs
-    // the error score remaining.
-    // Special case if the sequence appears almost totally static:
-    // in this case we want to spend almost all of the bits on the
-    // key frame.
-    // cpi->twopass.frames_to_key-1 because the key frame itself is taken
-    // care of by kf_boost.
-    if (zero_motion_accumulator >= 0.99) {
-      allocation_chunks =
-        ((cpi->twopass.frames_to_key - 1) * 10) + kf_boost;
-    } else {
-      allocation_chunks =
-        ((cpi->twopass.frames_to_key - 1) * 100) + kf_boost;
-    }
-
-    // Prevent overflow
-    if (kf_boost > 1028) {
-      int divisor = kf_boost >> 10;
-      kf_boost /= divisor;
-      allocation_chunks /= divisor;
-    }
-
-    cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
-
-    // Calculate the number of bits to be spent on the key frame
-    cpi->twopass.kf_bits  = (int)((double)kf_boost * ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
-
-    // If the key frame is actually easier than the average for the
-    // kf group (which does sometimes happen, e.g. a blank intro frame),
-    // then use an alternate calculation based on the kf error score,
-    // which should give a smaller key frame.
-    if (kf_mod_err < kf_group_err / cpi->twopass.frames_to_key) {
-      double  alt_kf_grp_bits =
-        ((double)cpi->twopass.bits_left *
-         (kf_mod_err * (double)cpi->twopass.frames_to_key) /
-         DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left));
-
-      alt_kf_bits = (int)((double)kf_boost *
-                          (alt_kf_grp_bits / (double)allocation_chunks));
-
-      if (cpi->twopass.kf_bits > alt_kf_bits) {
-        cpi->twopass.kf_bits = alt_kf_bits;
-      }
-    }
-    // Else if it is much harder than other frames in the group make sure
-    // it at least receives an allocation in keeping with its relative
-    // error score
-    else {
-      alt_kf_bits =
-        (int)((double)cpi->twopass.bits_left *
-              (kf_mod_err /
-               DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left)));
-
-      if (alt_kf_bits > cpi->twopass.kf_bits) {
-        cpi->twopass.kf_bits = alt_kf_bits;
-      }
-    }
-
-    cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
-    cpi->twopass.kf_bits += cpi->min_frame_bandwidth;                                          // Add in the minimum frame allowance
-
-    cpi->per_frame_bandwidth = cpi->twopass.kf_bits;                                           // Per frame bit target for this frame
-    cpi->target_bandwidth = cpi->twopass.kf_bits * cpi->output_frame_rate;                      // Convert to a per second bitrate
-  }
-
-  // Note the total error score of the kf group minus the key frame itself
-  cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err);
-
-  // Adjust the count of total modified error left.
-  // The count of bits left is adjusted elsewhere based on real coded frame sizes
-  cpi->twopass.modified_error_left -= kf_group_err;
-}
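
The bit-allocation arithmetic above rewards a worked example. Assume (invented numbers) a 50-frame key frame group, kf_boost = 300, and kf_group_bits = 2,000,000. In the non-static branch each ordinary frame counts as 100 chunks and the key frame as kf_boost chunks, so the key frame receives 300/5200 of the group budget:

    #include <stdio.h>
    #include <stdint.h>

    int main(void) {
      int frames_to_key = 50;
      int kf_boost = 300;                 /* each normal frame counts as 100 */
      int64_t kf_group_bits = 2000000;

      int allocation_chunks = (frames_to_key - 1) * 100 + kf_boost;  /* 5200 */
      int kf_bits = (int)((double)kf_boost *
                          ((double)kf_group_bits / allocation_chunks));

      /* 300 / 5200 of 2,000,000 bits ~= 115,384 bits for the key frame. */
      printf("allocation_chunks = %d, kf_bits = %d\n",
             allocation_chunks, kf_bits);
      return 0;
    }
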
--- a/vp8/encoder/firstpass.h
+++ /dev/null
@@ -1,23 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#if !defined __INC_FIRSTPASS_H
-#define      __INC_FIRSTPASS_H
-
-extern void vp9_init_first_pass(VP9_COMP *cpi);
-extern void vp9_first_pass(VP9_COMP *cpi);
-extern void vp9_end_first_pass(VP9_COMP *cpi);
-
-extern void vp9_init_second_pass(VP9_COMP *cpi);
-extern void vp9_second_pass(VP9_COMP *cpi);
-extern void vp9_end_second_pass(VP9_COMP *cpi);
-
-#endif
--- a/vp8/encoder/generic/csystemdependent.c
+++ /dev/null
@@ -1,48 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-
-void vp9_arch_x86_encoder_init(VP9_COMP *cpi);
-void vp9_arch_arm_encoder_init(VP9_COMP *cpi);
-
-void (*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc,
-                                        YV12_BUFFER_CONFIG *dst_ybc,
-                                        int fraction);
-extern void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
-                                        YV12_BUFFER_CONFIG *dst_ybc,
-                                        int fraction);
-
-void vp9_cmachine_specific_config(VP9_COMP *cpi) {
-#if CONFIG_RUNTIME_CPU_DETECT
-  cpi->rtcd.common                    = &cpi->common.rtcd;
-
-  cpi->rtcd.search.full_search             = vp9_full_search_sad;
-  cpi->rtcd.search.refining_search         = vp9_refining_search_sad;
-  cpi->rtcd.search.diamond_search          = vp9_diamond_search_sad;
-  cpi->rtcd.temporal.apply                 = vp9_temporal_filter_apply_c;
-#endif
-
-  vp9_yv12_copy_partial_frame_ptr = vp9_yv12_copy_partial_frame;
-
-#if ARCH_X86 || ARCH_X86_64
-  vp9_arch_x86_encoder_init(cpi);
-#endif
-
-#if ARCH_ARM
-  vp9_arch_arm_encoder_init(cpi);
-#endif
-
-
-}
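
vp9_cmachine_specific_config() is the usual runtime-dispatch pattern: install portable C defaults, then let the architecture init routines overwrite the function pointers when CPU detection permits. A stripped-down sketch of the same idea (the type and function names below are illustrative, not the library's):

    #include <stdio.h>

    typedef int (*sad_fn)(const unsigned char *a, const unsigned char *b, int n);

    static int sad_c(const unsigned char *a, const unsigned char *b, int n) {
      int i, sum = 0;
      for (i = 0; i < n; i++)
        sum += a[i] > b[i] ? a[i] - b[i] : b[i] - a[i];
      return sum;
    }

    /* In the real code, an arch init (e.g. vp9_arch_x86_encoder_init)
     * would swap in a SIMD version when CPU detection allows it. */
    static sad_fn sad = sad_c;

    int main(void) {
      unsigned char a[4] = { 1, 2, 3, 4 }, b[4] = { 4, 3, 2, 1 };
      printf("sad = %d\n", sad(a, b, 4));  /* 3 + 1 + 1 + 3 = 8 */
      return 0;
    }
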
--- a/vp8/encoder/lookahead.c
+++ /dev/null
@@ -1,191 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include <assert.h>
-#include <stdlib.h>
-#include "vpx_config.h"
-#include "lookahead.h"
-#include "vp8/common/extend.h"
-
-#define MAX_LAG_BUFFERS 25
-
-struct lookahead_ctx {
-  unsigned int max_sz;         /* Absolute size of the queue */
-  unsigned int sz;             /* Number of buffers currently in the queue */
-  unsigned int read_idx;       /* Read index */
-  unsigned int write_idx;      /* Write index */
-  struct lookahead_entry *buf; /* Buffer list */
-};
-
-
-/* Return the buffer at the given absolute index and increment the index */
-static struct lookahead_entry *
-pop(struct lookahead_ctx *ctx,
-    unsigned int         *idx) {
-  unsigned int            index = *idx;
-  struct lookahead_entry *buf = ctx->buf + index;
-
-  assert(index < ctx->max_sz);
-  if (++index >= ctx->max_sz)
-    index -= ctx->max_sz;
-  *idx = index;
-  return buf;
-}
-
-
-void
-vp9_lookahead_destroy(struct lookahead_ctx *ctx) {
-  if (ctx) {
-    if (ctx->buf) {
-      int i;
-
-      for (i = 0; i < ctx->max_sz; i++)
-        vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
-      free(ctx->buf);
-    }
-    free(ctx);
-  }
-}
-
-
-struct lookahead_ctx *
-vp9_lookahead_init(unsigned int width,
-                   unsigned int height,
-                   unsigned int depth) {
-  struct lookahead_ctx *ctx = NULL;
-  int i;
-
-  /* Clamp the lookahead queue depth */
-  if (depth < 1)
-    depth = 1;
-  else if (depth > MAX_LAG_BUFFERS)
-    depth = MAX_LAG_BUFFERS;
-
-  /* Align the buffer dimensions */
-  width = (width + 15) &~15;
-  height = (height + 15) &~15;
-
-  /* Allocate the lookahead structures */
-  ctx = calloc(1, sizeof(*ctx));
-  if (ctx) {
-    ctx->max_sz = depth;
-    ctx->buf = calloc(depth, sizeof(*ctx->buf));
-    if (!ctx->buf)
-      goto bail;
-    for (i = 0; i < depth; i++)
-      if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img,
-                                      width, height, VP8BORDERINPIXELS))
-        goto bail;
-  }
-  return ctx;
-bail:
-  vp9_lookahead_destroy(ctx);
-  return NULL;
-}
-
-
-int
-vp9_lookahead_push(struct lookahead_ctx *ctx,
-                   YV12_BUFFER_CONFIG   *src,
-                   int64_t               ts_start,
-                   int64_t               ts_end,
-                   unsigned int          flags,
-                   unsigned char        *active_map) {
-  struct lookahead_entry *buf;
-  int row, col, active_end;
-  int mb_rows = (src->y_height + 15) >> 4;
-  int mb_cols = (src->y_width + 15) >> 4;
-
-  if (ctx->sz + 1 > ctx->max_sz)
-    return 1;
-  ctx->sz++;
-  buf = pop(ctx, &ctx->write_idx);
-
-  // Only do this partial copy if the following conditions are all met:
-  // 1. The lookahead queue has a size of 1.
-  // 2. An active map is provided.
-  // 3. This is not a key frame, golden frame, or altref frame.
-  if (ctx->max_sz == 1 && active_map && !flags) {
-    for (row = 0; row < mb_rows; ++row) {
-      col = 0;
-
-      while (1) {
-        // Find the first active macroblock in this row.
-        for (; col < mb_cols; ++col) {
-          if (active_map[col])
-            break;
-        }
-
-        // No more active macroblocks in this row.
-        if (col == mb_cols)
-          break;
-
-        // Find the end of active region in this row.
-        active_end = col;
-
-        for (; active_end < mb_cols; ++active_end) {
-          if (!active_map[active_end])
-            break;
-        }
-
-        // Only copy this active region.
-        vp9_copy_and_extend_frame_with_rect(src, &buf->img,
-                                            row << 4,
-                                            col << 4, 16,
-                                            (active_end - col) << 4);
-
-        // Start again from the end of this active region.
-        col = active_end;
-      }
-
-      active_map += mb_cols;
-    }
-  } else {
-    vp9_copy_and_extend_frame(src, &buf->img);
-  }
-  buf->ts_start = ts_start;
-  buf->ts_end = ts_end;
-  buf->flags = flags;
-  return 0;
-}
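
The active-map branch above is a run-length scan over each macroblock row: find a maximal run of active macroblocks, copy just that rectangle, and continue from its end. The same scan in isolation, printing the rectangles it would copy (the map contents are invented):

    #include <stdio.h>

    int main(void) {
      /* One invented macroblock row: 1 = active, 0 = skipped. */
      unsigned char active_map[10] = { 0, 1, 1, 1, 0, 0, 1, 1, 0, 1 };
      int mb_cols = 10, col = 0;

      while (1) {
        int active_end;

        /* Find the first active macroblock from 'col' onward. */
        for (; col < mb_cols; ++col)
          if (active_map[col])
            break;
        if (col == mb_cols)
          break;  /* no more active macroblocks in this row */

        /* Find the end of this active run. */
        for (active_end = col; active_end < mb_cols; ++active_end)
          if (!active_map[active_end])
            break;

        /* The real code copies pixels (col << 4) .. (active_end << 4). */
        printf("copy mb columns [%d, %d)\n", col, active_end);
        col = active_end;
      }
      return 0;
    }
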
-
-
-struct lookahead_entry *
-vp9_lookahead_pop(struct lookahead_ctx *ctx,
-                  int                   drain) {
-  struct lookahead_entry *buf = NULL;
-
-  if (ctx->sz && (drain || ctx->sz == ctx->max_sz)) {
-    buf = pop(ctx, &ctx->read_idx);
-    ctx->sz--;
-  }
-  return buf;
-}
-
-
-struct lookahead_entry *
-vp9_lookahead_peek(struct lookahead_ctx *ctx,
-                   int                   index) {
-  struct lookahead_entry *buf = NULL;
-
-  assert(index < ctx->max_sz);
-  if (index < ctx->sz) {
-    index += ctx->read_idx;
-    if (index >= ctx->max_sz)
-      index -= ctx->max_sz;
-    buf = ctx->buf + index;
-  }
-  return buf;
-}
-
-
-unsigned int
-vp9_lookahead_depth(struct lookahead_ctx *ctx) {
-  return ctx->sz;
-}
--- a/vp8/encoder/lookahead.h
+++ /dev/null
@@ -1,105 +1,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#ifndef LOOKAHEAD_H
-#define LOOKAHEAD_H
-#include "vpx_scale/yv12config.h"
-#include "vpx/vpx_integer.h"
-
-struct lookahead_entry {
-  YV12_BUFFER_CONFIG  img;
-  int64_t             ts_start;
-  int64_t             ts_end;
-  unsigned int        flags;
-};
-
-
-struct lookahead_ctx;
-
-/**\brief Initializes the lookahead stage
- *
- * The lookahead stage is a queue of frame buffers on which some analysis
- * may be done when buffers are enqueued.
- *
- *
- */
-struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
-                                         unsigned int height,
-                                         unsigned int depth
-                                        );
-
-
-/**\brief Destroys the lookahead stage
- *
- */
-void vp9_lookahead_destroy(struct lookahead_ctx *ctx);
-
-
-/**\brief Enqueue a source buffer
- *
- * This function will copy the source image into a new framebuffer with
- * the expected stride/border.
- *
- * If active_map is non-NULL and there is only one frame in the queue, then copy
- * only active macroblocks.
- *
- * \param[in] ctx         Pointer to the lookahead context
- * \param[in] src         Pointer to the image to enqueue
- * \param[in] ts_start    Timestamp for the start of this frame
- * \param[in] ts_end      Timestamp for the end of this frame
- * \param[in] flags       Flags set on this frame
- * \param[in] active_map  Map that specifies which macroblock is active
- */
-int
-vp9_lookahead_push(struct lookahead_ctx *ctx,
-                   YV12_BUFFER_CONFIG   *src,
-                   int64_t               ts_start,
-                   int64_t               ts_end,
-                   unsigned int          flags,
-                   unsigned char        *active_map);
-
-
-/**\brief Get the next source buffer to encode
- *
- *
- * \param[in] ctx       Pointer to the lookahead context
- * \param[in] drain     Flag indicating the buffer should be drained
- *                      (return a buffer regardless of the current queue depth)
- *
- * \retval NULL, if drain set and queue is empty
- * \retval NULL, if drain not set and queue not of the configured depth
- *
- */
-struct lookahead_entry *
-vp9_lookahead_pop(struct lookahead_ctx *ctx,
-                  int                   drain);
-
-
-/**\brief Get a future source buffer to encode
- *
- * \param[in] ctx       Pointer to the lookahead context
- * \param[in] index     Index of the frame to be returned, 0 == next frame
- *
- * \retval NULL, if no buffer exists at the specified index
- *
- */
-struct lookahead_entry *
-vp9_lookahead_peek(struct lookahead_ctx *ctx,
-                   int                   index);
-
-
-/**\brief Get the number of frames currently in the lookahead queue
- *
- * \param[in] ctx       Pointer to the lookahead context
- */
-unsigned int
-vp9_lookahead_depth(struct lookahead_ctx *ctx);
-
-
-#endif
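
Putting the documented contract together: vp9_lookahead_push() fails once max_sz frames are queued, and vp9_lookahead_pop() returns NULL until the queue is full unless drain is set, which is how the encoder keeps a fixed lag of future frames. A toy model of that gating (plain counters stand in for the real context and buffers):

    #include <stdio.h>

    int main(void) {
      unsigned int sz = 0, max_sz = 3;  /* configured lookahead depth */
      int frame;

      for (frame = 0; frame < 5; frame++) {
        sz++;                            /* vp9_lookahead_push() */
        if (sz == max_sz) {              /* vp9_lookahead_pop(ctx, 0) */
          printf("encode frame %d (lag %u)\n",
                 frame - (int)max_sz + 1, max_sz - 1);
          sz--;
        }
      }
      while (sz--)                       /* vp9_lookahead_pop(ctx, 1) */
        printf("drain one remaining frame\n");
      return 0;
    }
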
--- a/vp8/encoder/mbgraph.c
+++ /dev/null
@@ -1,480 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <limits.h>
-#include <vp8/encoder/encodeintra.h>
-#include <vp8/encoder/rdopt.h>
-#include <vp8/common/setupintrarecon.h>
-#include <vp8/common/blockd.h>
-#include <vp8/common/reconinter.h>
-#include <vp8/common/systemdependent.h>
-#include <vpx_mem/vpx_mem.h>
-#include <vp8/encoder/segmentation.h>
-
-static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
-                                              int_mv *ref_mv,
-                                              int_mv *dst_mv) {
-  MACROBLOCK   *const x  = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *b  = &x->block[0];
-  BLOCKD *d = &xd->block[0];
-  vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
-  unsigned int best_err;
-  int step_param, further_steps;
-
-  int tmp_col_min = x->mv_col_min;
-  int tmp_col_max = x->mv_col_max;
-  int tmp_row_min = x->mv_row_min;
-  int tmp_row_max = x->mv_row_max;
-  int_mv ref_full;
-
-  // Further step/diamond searches as necessary
-  if (cpi->Speed < 8) {
-    step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0);
-    further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
-  } else {
-    step_param = cpi->sf.first_step + 2;
-    further_steps = 0;
-  }
-
-  vp9_clamp_mv_min_max(x, ref_mv);
-
-  ref_full.as_mv.col = ref_mv->as_mv.col >> 3;
-  ref_full.as_mv.row = ref_mv->as_mv.row >> 3;
-
-  /*cpi->sf.search_method == HEX*/
-  best_err = vp9_hex_search(
-      x, b, d,
-      &ref_full, dst_mv,
-      step_param,
-      x->errorperbit,
-      &v_fn_ptr,
-      NULLMVCOST,
-      NULLMVCOST,
-      ref_mv);
-
-  // Try sub-pixel MC
-  // if (bestsme > error_thresh && bestsme < INT_MAX)
-  {
-    int distortion;
-    unsigned int sse;
-    best_err = cpi->find_fractional_mv_step(
-        x, b, d,
-        dst_mv, ref_mv,
-        x->errorperbit, &v_fn_ptr,
-        NULLMVCOST,
-        & distortion, &sse);
-  }
-
-#if CONFIG_PRED_FILTER
-  // Disable the prediction filter
-  xd->mode_info_context->mbmi.pred_filter_enabled = 0;
-#endif
-
-  vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv);
-  vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
-  best_err = vp9_sad16x16(xd->dst.y_buffer, xd->dst.y_stride,
-                          xd->predictor, 16, INT_MAX);
-
-  /* restore UMV window */
-  x->mv_col_min = tmp_col_min;
-  x->mv_col_max = tmp_col_max;
-  x->mv_row_min = tmp_row_min;
-  x->mv_row_max = tmp_row_max;
-
-  return best_err;
-}
-
-static int do_16x16_motion_search
-(
-  VP9_COMP *cpi,
-  int_mv *ref_mv,
-  int_mv *dst_mv,
-  YV12_BUFFER_CONFIG *buf,
-  int buf_mb_y_offset,
-  YV12_BUFFER_CONFIG *ref,
-  int mb_y_offset
-) {
-  MACROBLOCK   *const x  = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  unsigned int err, tmp_err;
-  int_mv tmp_mv;
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    BLOCKD *d = &xd->block[n];
-    BLOCK *b  = &x->block[n];
-
-    b->base_src   = &buf->y_buffer;
-    b->src_stride = buf->y_stride;
-    b->src        = buf->y_stride * (n & 12) + (n & 3) * 4 + buf_mb_y_offset;
-
-    d->base_pre   = &ref->y_buffer;
-    d->pre_stride = ref->y_stride;
-    d->pre        = ref->y_stride * (n & 12) + (n & 3) * 4 + mb_y_offset;
-  }
-
-  // Try zero MV first
-  // FIXME should really use something like near/nearest MV and/or MV prediction
-  xd->pre.y_buffer = ref->y_buffer + mb_y_offset;
-  xd->pre.y_stride = ref->y_stride;
-  err = vp9_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride,
-                     xd->dst.y_buffer, xd->dst.y_stride, INT_MAX);
-  dst_mv->as_int = 0;
-
-  // Test last reference frame using the previous best mv as the
-  // starting point (best reference) for the search
-  tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv);
-  if (tmp_err < err) {
-    err            = tmp_err;
-    dst_mv->as_int = tmp_mv.as_int;
-  }
-
-  // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
-  if (ref_mv->as_int) {
-    int tmp_err;
-    int_mv zero_ref_mv, tmp_mv;
-
-    zero_ref_mv.as_int = 0;
-    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv);
-    if (tmp_err < err) {
-      dst_mv->as_int = tmp_mv.as_int;
-      err = tmp_err;
-    }
-  }
-
-  return err;
-}
-
-static int do_16x16_zerozero_search
-(
-  VP9_COMP *cpi,
-  int_mv *dst_mv,
-  YV12_BUFFER_CONFIG *buf,
-  int buf_mb_y_offset,
-  YV12_BUFFER_CONFIG *ref,
-  int mb_y_offset
-) {
-  MACROBLOCK   *const x  = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  unsigned int err;
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    BLOCKD *d = &xd->block[n];
-    BLOCK *b  = &x->block[n];
-
-    b->base_src   = &buf->y_buffer;
-    b->src_stride = buf->y_stride;
-    b->src        = buf->y_stride * (n & 12) + (n & 3) * 4 + buf_mb_y_offset;
-
-    d->base_pre   = &ref->y_buffer;
-    d->pre_stride = ref->y_stride;
-    d->pre        = ref->y_stride * (n & 12) + (n & 3) * 4 + mb_y_offset;
-  }
-
-  // Try zero MV first
-  // FIXME should really use something like near/nearest MV and/or MV prediction
-  xd->pre.y_buffer = ref->y_buffer + mb_y_offset;
-  xd->pre.y_stride = ref->y_stride;
-  // VARIANCE_INVOKE(&cpi->rtcd.variance, satd16x16)
-  err = vp9_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride,
-                     xd->dst.y_buffer, xd->dst.y_stride, INT_MAX);
-
-  dst_mv->as_int = 0;
-
-  return err;
-}
-static int find_best_16x16_intra
-(
-  VP9_COMP *cpi,
-  YV12_BUFFER_CONFIG *buf,
-  int mb_y_offset,
-  MB_PREDICTION_MODE *pbest_mode
-) {
-  MACROBLOCK   *const x  = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_PREDICTION_MODE best_mode = -1, mode;
-  int best_err = INT_MAX;
-
-  // calculate SAD for each intra prediction mode;
-  // we're intentionally not doing 4x4, we just want a rough estimate
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    unsigned int err;
-
-    xd->mode_info_context->mbmi.mode = mode;
-    vp9_build_intra_predictors_mby(xd);
-    err = vp9_sad16x16(xd->predictor, 16, buf->y_buffer + mb_y_offset,
-                       buf->y_stride, best_err);
-    // find best
-    if (err < best_err) {
-      best_err  = err;
-      best_mode = mode;
-    }
-  }
-
-  if (pbest_mode)
-    *pbest_mode = best_mode;
-
-  return best_err;
-}
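
One detail above: the running best_err is passed into vp9_sad16x16() as its final argument, which lets a SAD implementation give up early once the partial sum already exceeds the best score so far. A standalone sketch of that early-exit idea (the row-granularity cutoff is illustrative; real implementations pick their own check points):

    #include <limits.h>
    #include <stdio.h>

    /* SAD with an early-exit threshold: stop as soon as the partial
     * sum can no longer beat 'best_so_far'. */
    static unsigned int sad_rows(const unsigned char *a, const unsigned char *b,
                                 int rows, int cols, unsigned int best_so_far) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < rows; r++) {
        for (c = 0; c < cols; c++) {
          int d = a[r * cols + c] - b[r * cols + c];
          sad += d < 0 ? -d : d;
        }
        if (sad >= best_so_far)
          return UINT_MAX;  /* cannot improve on the current best */
      }
      return sad;
    }

    int main(void) {
      unsigned char a[4] = { 10, 10, 10, 10 }, b[4] = { 0, 0, 10, 10 };
      printf("%u\n", sad_rows(a, b, 2, 2, 15));  /* row 0 sum 20 >= 15: exit */
      return 0;
    }
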
-
-static void update_mbgraph_mb_stats
-(
-  VP9_COMP *cpi,
-  MBGRAPH_MB_STATS *stats,
-  YV12_BUFFER_CONFIG *buf,
-  int mb_y_offset,
-  YV12_BUFFER_CONFIG *golden_ref,
-  int_mv *prev_golden_ref_mv,
-  int gld_y_offset,
-  YV12_BUFFER_CONFIG *alt_ref,
-  int_mv *prev_alt_ref_mv,
-  int arf_y_offset
-) {
-  MACROBLOCK   *const x  = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int intra_error;
-
-  // FIXME in practice we're completely ignoring chroma here
-  xd->dst.y_buffer = buf->y_buffer + mb_y_offset;
-
-  // do intra 16x16 prediction
-  intra_error = find_best_16x16_intra(cpi, buf, mb_y_offset, &stats->ref[INTRA_FRAME].m.mode);
-  if (intra_error <= 0)
-    intra_error = 1;
-  stats->ref[INTRA_FRAME].err = intra_error;
-
-  // Golden frame MV search, if it exists and is different than last frame
-  if (golden_ref) {
-    int g_motion_error = do_16x16_motion_search(cpi, prev_golden_ref_mv,
-                                                &stats->ref[GOLDEN_FRAME].m.mv,
-                                                buf, mb_y_offset,
-                                                golden_ref, gld_y_offset);
-    stats->ref[GOLDEN_FRAME].err = g_motion_error;
-  } else {
-    stats->ref[GOLDEN_FRAME].err = INT_MAX;
-    stats->ref[GOLDEN_FRAME].m.mv.as_int = 0;
-  }
-
-  // Alt-ref frame MV search, if it exists and is different than last/golden frame
-  if (alt_ref) {
-    // int a_motion_error = do_16x16_motion_search(cpi, prev_alt_ref_mv,
-    //                                            &stats->ref[ALTREF_FRAME].m.mv,
-    //                                            buf, mb_y_offset,
-    //                                            alt_ref, arf_y_offset);
-
-    int a_motion_error =
-      do_16x16_zerozero_search(cpi,
-                               &stats->ref[ALTREF_FRAME].m.mv,
-                               buf, mb_y_offset,
-                               alt_ref, arf_y_offset);
-
-    stats->ref[ALTREF_FRAME].err = a_motion_error;
-  } else {
-    stats->ref[ALTREF_FRAME].err = INT_MAX;
-    stats->ref[ALTREF_FRAME].m.mv.as_int = 0;
-  }
-}
-
-static void update_mbgraph_frame_stats
-(
-  VP9_COMP *cpi,
-  MBGRAPH_FRAME_STATS *stats,
-  YV12_BUFFER_CONFIG *buf,
-  YV12_BUFFER_CONFIG *golden_ref,
-  YV12_BUFFER_CONFIG *alt_ref
-) {
-  MACROBLOCK   *const x  = &cpi->mb;
-  VP9_COMMON   *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int mb_col, mb_row, offset = 0;
-  int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
-  int_mv arf_top_mv, gld_top_mv;
-  MODE_INFO mi_local;
-
-  // Set up limit values for motion vectors to prevent them extending outside the UMV borders
-  arf_top_mv.as_int = 0;
-  gld_top_mv.as_int = 0;
-  x->mv_row_min     = -(VP8BORDERINPIXELS - 16 - INTERP_EXTEND);
-  x->mv_row_max     = (cm->mb_rows - 1) * 16 + VP8BORDERINPIXELS - 16 - INTERP_EXTEND;
-  xd->up_available  = 0;
-  xd->dst.y_stride  = buf->y_stride;
-  xd->pre.y_stride  = buf->y_stride;
-  xd->dst.uv_stride = buf->uv_stride;
-  xd->mode_info_context = &mi_local;
-
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    int_mv arf_left_mv, gld_left_mv;
-    int mb_y_in_offset  = mb_y_offset;
-    int arf_y_in_offset = arf_y_offset;
-    int gld_y_in_offset = gld_y_offset;
-
-    // Set up limit values for motion vectors to prevent them extending outside the UMV borders
-    arf_left_mv.as_int = arf_top_mv.as_int;
-    gld_left_mv.as_int = gld_top_mv.as_int;
-    x->mv_col_min      = -(VP8BORDERINPIXELS - 16 - INTERP_EXTEND);
-    x->mv_col_max      = (cm->mb_cols - 1) * 16 + VP8BORDERINPIXELS - 16 - INTERP_EXTEND;
-    xd->left_available = 0;
-
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
-
-      update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset,
-                              golden_ref, &gld_left_mv, gld_y_in_offset,
-                              alt_ref,    &arf_left_mv, arf_y_in_offset);
-      arf_left_mv.as_int = mb_stats->ref[ALTREF_FRAME].m.mv.as_int;
-      gld_left_mv.as_int = mb_stats->ref[GOLDEN_FRAME].m.mv.as_int;
-      if (mb_col == 0) {
-        arf_top_mv.as_int = arf_left_mv.as_int;
-        gld_top_mv.as_int = gld_left_mv.as_int;
-      }
-      xd->left_available = 1;
-      mb_y_in_offset    += 16;
-      gld_y_in_offset   += 16;
-      arf_y_in_offset   += 16;
-      x->mv_col_min     -= 16;
-      x->mv_col_max     -= 16;
-    }
-    xd->up_available = 1;
-    mb_y_offset     += buf->y_stride * 16;
-    gld_y_offset    += golden_ref->y_stride * 16;
-    if (alt_ref)
-      arf_y_offset    += alt_ref->y_stride * 16;
-    x->mv_row_min   -= 16;
-    x->mv_row_max   -= 16;
-    offset          += cm->mb_cols;
-  }
-}
-
-// void separate_arf_mbs_byzz
-static void separate_arf_mbs(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  int mb_col, mb_row, offset, i;
-  int ncnt[4];
-  int n_frames = cpi->mbgraph_n_frames;
-
-  int *arf_not_zz;
-
-  CHECK_MEM_ERROR(arf_not_zz,
-                  vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
-
-  vpx_memset(arf_not_zz, 0, cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz));
-
-  // We are not interested in results beyond the alt ref itself.
-  if (n_frames > cpi->frames_till_gf_update_due)
-    n_frames = cpi->frames_till_gf_update_due;
-
-  // defer cost to reference frames
-  for (i = n_frames - 1; i >= 0; i--) {
-    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
-
-    for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
-         offset += cm->mb_cols, mb_row++) {
-      for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-        MBGRAPH_MB_STATS *mb_stats =
-          &frame_stats->mb_stats[offset + mb_col];
-
-        int altref_err = mb_stats->ref[ALTREF_FRAME].err;
-        int intra_err  = mb_stats->ref[INTRA_FRAME ].err;
-        int golden_err = mb_stats->ref[GOLDEN_FRAME].err;
-
-        // Test for altref vs intra and gf and that its mv was 0,0.
-        if ((altref_err > 1000) ||
-            (altref_err > intra_err) ||
-            (altref_err > golden_err)) {
-          arf_not_zz[offset + mb_col]++;
-        }
-      }
-    }
-  }
-
-  vpx_memset(ncnt, 0, sizeof(ncnt));
-  for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
-       offset += cm->mb_cols, mb_row++) {
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      // If any of the blocks in the sequence failed then the MB
-      // goes in segment 0
-      if (arf_not_zz[offset + mb_col]) {
-        ncnt[0]++;
-        cpi->segmentation_map[offset + mb_col] = 0;
-      } else {
-        ncnt[1]++;
-        cpi->segmentation_map[offset + mb_col] = 1;
-      }
-    }
-  }
-
-  // Only bother with segmentation if over 10% of the MBs are in the
-  // static segment.
-  // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) )
-  if (1) {
-    // Note % of blocks that are marked as static
-    if (cm->MBs)
-      cpi->static_mb_pct = (ncnt[1] * 100) / cm->MBs;
-
-    // This error case should not be reachable as this function should
-    // never be called with the common data structure uninitialized.
-    else
-      cpi->static_mb_pct = 0;
-
-    cpi->seg0_cnt = ncnt[0];
-    vp9_enable_segmentation((VP9_PTR) cpi);
-  } else {
-    cpi->static_mb_pct = 0;
-    vp9_disable_segmentation((VP9_PTR) cpi);
-  }
-
-  // Free locally allocated storage.
-  vpx_free(arf_not_zz);
-}
-
-void vp9_update_mbgraph_stats
-(
-  VP9_COMP *cpi
-) {
-  VP9_COMMON *const cm = &cpi->common;
-  int i, n_frames = vp9_lookahead_depth(cpi->lookahead);
-  YV12_BUFFER_CONFIG *golden_ref = &cm->yv12_fb[cm->gld_fb_idx];
-
-  // we need to look ahead beyond where the ARF transitions into
-  // being a GF - so exit if we don't look ahead beyond that
-  if (n_frames <= cpi->frames_till_gf_update_due)
-    return;
-  if (n_frames > cpi->common.frames_till_alt_ref_frame)
-    n_frames = cpi->common.frames_till_alt_ref_frame;
-  if (n_frames > MAX_LAG_BUFFERS)
-    n_frames = MAX_LAG_BUFFERS;
-
-  cpi->mbgraph_n_frames = n_frames;
-  for (i = 0; i < n_frames; i++) {
-    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
-    vpx_memset(frame_stats->mb_stats, 0,
-               cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats));
-  }
-
-  // do motion search to find contribution of each reference to data
-  // later on in this GF group
-  // FIXME really, the GF/last MC search should be done forward, and
-  // the ARF MC search backwards, to get optimal results for MV caching
-  for (i = 0; i < n_frames; i++) {
-    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
-    struct lookahead_entry *q_cur =
-      vp9_lookahead_peek(cpi->lookahead, i);
-
-    assert(q_cur != NULL);
-
-    update_mbgraph_frame_stats(cpi, frame_stats, &q_cur->img,
-                               golden_ref, cpi->Source);
-  }
-
-  vp9_clear_system_state();  // __asm emms;
-
-  separate_arf_mbs(cpi);
-}
--- a/vp8/encoder/mbgraph.h
+++ /dev/null
@@ -1,16 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_MBGRAPH_H__
-#define __INC_MBGRAPH_H__ 1
-
-extern void vp9_update_mbgraph_stats(VP9_COMP *cpi);
-
-#endif /* __INC_MBGRAPH_H__ */
--- a/vp8/encoder/mcomp.c
+++ /dev/null
@@ -1,2203 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/onyx_int.h"
-#include "mcomp.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/config.h"
-#include <stdio.h>
-#include <limits.h>
-#include <math.h>
-#include "vp8/common/findnearmv.h"
-
-#ifdef ENTROPY_STATS
-static int mv_ref_ct [31] [4] [2];
-static int mv_mode_cts [4] [2];
-#endif
-
-void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) {
-  int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL +
-      ((ref_mv->as_mv.col & 7) ? 1 : 0);
-  int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL +
-      ((ref_mv->as_mv.row & 7) ? 1 : 0);
-  int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL;
-  int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL;
-
-  /* Get intersection of UMV window and valid MV window to reduce # of checks in diamond search. */
-  if (x->mv_col_min < col_min)
-    x->mv_col_min = col_min;
-  if (x->mv_col_max > col_max)
-    x->mv_col_max = col_max;
-  if (x->mv_row_min < row_min)
-    x->mv_row_min = row_min;
-  if (x->mv_row_max > row_max)
-    x->mv_row_max = row_max;
-}
-
-int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
-                    int Weight, int ishp) {
-  MV v;
-  v.row = (mv->as_mv.row - ref->as_mv.row);
-  v.col = (mv->as_mv.col - ref->as_mv.col);
-  return ((mvjcost[vp9_get_mv_joint(v)] +
-           mvcost[0][v.row] + mvcost[1][v.col]) *
-          Weight) >> 7;
-}
-
-static int mv_err_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
-                       int error_per_bit, int ishp) {
-  if (mvcost) {
-    MV v;
-    v.row = (mv->as_mv.row - ref->as_mv.row);
-    v.col = (mv->as_mv.col - ref->as_mv.col);
-    return ((mvjcost[vp9_get_mv_joint(v)] +
-             mvcost[0][v.row] + mvcost[1][v.col]) *
-            error_per_bit + 128) >> 8;
-  }
-  return 0;
-}
-
-static int mvsad_err_cost(int_mv *mv, int_mv *ref, DEC_MVSADCOSTS,
-                          int error_per_bit) {
-
-  if (mvsadcost) {
-    MV v;
-    v.row = (mv->as_mv.row - ref->as_mv.row);
-    v.col = (mv->as_mv.col - ref->as_mv.col);
-    return ((mvjsadcost[vp9_get_mv_joint(v)] +
-             mvsadcost[0][v.row] + mvsadcost[1][v.col]) *
-            error_per_bit + 128) >> 8;
-  }
-  return 0;
-}
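
All three cost helpers share the same shape: sum the joint and per-component table costs, scale by a weight or error_per_bit, and shift back down in fixed point. A worked example with invented table values, mirroring the (... * error_per_bit + 128) >> 8 rounding used above:

    #include <stdio.h>

    int main(void) {
      /* Invented cost-table lookups (rate estimates for one mv). */
      int joint_cost = 40;      /* mvjcost[joint]   */
      int row_cost = 120;       /* mvcost[0][v.row] */
      int col_cost = 96;        /* mvcost[1][v.col] */
      int error_per_bit = 50;   /* rate-distortion lambda */

      /* Adding 128 before the >> 8 rounds to nearest rather than
       * truncating: (256 * 50 + 128) >> 8 = 50. */
      int cost = ((joint_cost + row_cost + col_cost) * error_per_bit + 128) >> 8;

      printf("mv error cost = %d\n", cost);
      return 0;
    }
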
-
-void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) {
-  int Len;
-  int search_site_count = 0;
-
-
-  // Generate offsets for 4 search sites per step.
-  Len = MAX_FIRST_STEP;
-  x->ss[search_site_count].mv.col = 0;
-  x->ss[search_site_count].mv.row = 0;
-  x->ss[search_site_count].offset = 0;
-  search_site_count++;
-
-  while (Len > 0) {
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = -Len;
-    x->ss[search_site_count].offset = -Len * stride;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = Len;
-    x->ss[search_site_count].offset = Len * stride;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = -Len;
-    x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = -Len;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = Len;
-    x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = Len;
-    search_site_count++;
-
-    // Contract.
-    Len /= 2;
-  }
-
-  x->ss_count = search_site_count;
-  x->searches_per_step = 4;
-}
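
The search-site table built above is easiest to see printed out: after the null center site, each scale contributes a cross of four offsets (up, down, left, right), and the scale halves until it reaches zero. A sketch that generates the same pattern for a small, invented starting step and stride:

    #include <stdio.h>

    int main(void) {
      int stride = 32;  /* invented row stride of the search buffer */
      int len = 8;      /* invented starting step (MAX_FIRST_STEP)  */

      printf("mv=(%3d,%3d) offset=%5d\n", 0, 0, 0);  /* center site */
      while (len > 0) {
        printf("mv=(%3d,%3d) offset=%5d\n", -len, 0, -len * stride); /* up    */
        printf("mv=(%3d,%3d) offset=%5d\n",  len, 0,  len * stride); /* down  */
        printf("mv=(%3d,%3d) offset=%5d\n", 0, -len, -len);          /* left  */
        printf("mv=(%3d,%3d) offset=%5d\n", 0,  len,  len);          /* right */
        len /= 2;  /* contract to the next scale */
      }
      return 0;
    }
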
-
-void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
-  int Len;
-  int search_site_count = 0;
-
-  // Generate offsets for 8 search sites per step.
-  Len = MAX_FIRST_STEP;
-  x->ss[search_site_count].mv.col = 0;
-  x->ss[search_site_count].mv.row = 0;
-  x->ss[search_site_count].offset = 0;
-  search_site_count++;
-
-  while (Len > 0) {
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = -Len;
-    x->ss[search_site_count].offset = -Len * stride;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = Len;
-    x->ss[search_site_count].offset = Len * stride;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = -Len;
-    x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = -Len;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = Len;
-    x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = Len;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = -Len;
-    x->ss[search_site_count].mv.row = -Len;
-    x->ss[search_site_count].offset = -Len * stride - Len;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = Len;
-    x->ss[search_site_count].mv.row = -Len;
-    x->ss[search_site_count].offset = -Len * stride + Len;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = -Len;
-    x->ss[search_site_count].mv.row = Len;
-    x->ss[search_site_count].offset = Len * stride - Len;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = Len;
-    x->ss[search_site_count].mv.row = Len;
-    x->ss[search_site_count].offset = Len * stride + Len;
-    search_site_count++;
-
-    // Contract.
-    Len /= 2;
-  }
-
-  x->ss_count = search_site_count;
-  x->searches_per_step = 8;
-}
-
-/*
- * To avoid the penalty of cache-line-crossing reads, preload the reference
- * area into a small aligned buffer so that reads from this buffer never
- * cross a cache line. This reduces the CPU cycles spent reading ref data
- * in the sub-pixel filter functions.
- * TODO: Currently, since the sub-pixel search range here is -3 ~ 3, we copy
- * a 22-row x 32-col area, which is enough for a 16x16 macroblock. Later,
- * for SPLITMV, we could reduce the area.
- */
-
-/* estimated cost of a motion vector (r,c) */
-#define MVC(r, c)                                       \
-    (mvcost ?                                           \
-     ((mvjcost[((r) != rr) * 2 + ((c) != rc)] +         \
-       mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) * \
-      error_per_bit + 128) >> 8 : 0)
-
-#define SP(x) (((x) & 7) << 1)  // convert motion vector component to offset
-                                // for svf calc
-
-#define IFMVCV(r, c, s, e)                                \
-    if (c >= minc && c <= maxc && r >= minr && r <= maxr) \
-      s                                                   \
-    else                                                  \
-      e;
-
-/* pointer to the predictor base for a motion vector */
-#define PRE(r, c) (y + (((r) >> 3) * y_stride + ((c) >> 3) -(offset)))
-
-/* returns subpixel variance error function */
-#define DIST(r, c) \
-    vfp->svf(PRE(r, c), y_stride, SP(c), SP(r), z, b->src_stride, &sse)
-
-/* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER(v, r, c) \
-    IFMVCV(r, c, {                                                       \
-      thismse = (DIST(r, c));                                            \
-      if ((v = MVC(r, c) + thismse) < besterr) {                         \
-        besterr = v;                                                     \
-        br = r;                                                          \
-        bc = c;                                                          \
-        *distortion = thismse;                                           \
-        *sse1 = sse;                                                     \
-      }                                                                  \
-    },                                                                   \
-    v = INT_MAX;)
-
-#define MIN(x,y) (((x)<(y))?(x):(y))
-#define MAX(x,y) (((x)>(y))?(x):(y))
-
-int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                                             int_mv *bestmv, int_mv *ref_mv,
-                                             int error_per_bit,
-                                             const vp9_variance_fn_ptr_t *vfp,
-                                             DEC_MVCOSTS,
-                                             int *distortion,
-                                             unsigned int *sse1) {
-  unsigned char *z = (*(b->base_src) + b->src);
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  int rr, rc, br, bc, hstep;
-  int tr, tc;
-  unsigned int besterr = INT_MAX;
-  unsigned int left, right, up, down, diag;
-  unsigned int sse;
-  unsigned int whichdir;
-  unsigned int halfiters = 4;
-  unsigned int quarteriters = 4;
-  unsigned int eighthiters = 4;
-  int thismse;
-  int maxc, minc, maxr, minr;
-  int y_stride;
-  int offset;
-  int usehp = xd->allow_high_precision_mv;
-
-#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
-  unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  unsigned char *y;
-  int buf_r1, buf_r2, buf_c1, buf_c2;
-
-  // Clamping to avoid out-of-range data access
-  buf_r1 = ((bestmv->as_mv.row - INTERP_EXTEND) < x->mv_row_min) ?
-      (bestmv->as_mv.row - x->mv_row_min) : INTERP_EXTEND - 1;
-  buf_r2 = ((bestmv->as_mv.row + INTERP_EXTEND) > x->mv_row_max) ?
-      (x->mv_row_max - bestmv->as_mv.row) : INTERP_EXTEND - 1;
-  buf_c1 = ((bestmv->as_mv.col - INTERP_EXTEND) < x->mv_col_min) ?
-      (bestmv->as_mv.col - x->mv_col_min) : INTERP_EXTEND - 1;
-  buf_c2 = ((bestmv->as_mv.col + INTERP_EXTEND) > x->mv_col_max) ?
-      (x->mv_col_max - bestmv->as_mv.col) : INTERP_EXTEND - 1;
-  y_stride = 32;
-
-  /* Copy to intermediate buffer before searching. */
-  vfp->copymem(y0 - buf_c1 - d->pre_stride * buf_r1, d->pre_stride, xd->y_buf, y_stride, 16 + buf_r1 + buf_r2);
-  y = xd->y_buf + y_stride * buf_r1 + buf_c1;
-#else
-  unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  y_stride = d->pre_stride;
-#endif
-
-  rr = ref_mv->as_mv.row;
-  rc = ref_mv->as_mv.col;
-  br = bestmv->as_mv.row << 3;
-  bc = bestmv->as_mv.col << 3;
-  hstep = 4;
-  minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) - ((1 << MV_MAX_BITS) - 1));
-  maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) + ((1 << MV_MAX_BITS) - 1));
-  minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) - ((1 << MV_MAX_BITS) - 1));
-  maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) + ((1 << MV_MAX_BITS) - 1));
-
-  tr = br;
-  tc = bc;
-
-
-  offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
-
-  // central mv
-  bestmv->as_mv.row <<= 3;
-  bestmv->as_mv.col <<= 3;
-
-  // calculate central point error
-  besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1);
-  *distortion = besterr;
-  besterr += mv_err_cost(bestmv, ref_mv, MVCOSTS,
-                         error_per_bit, xd->allow_high_precision_mv);
-
-  // TODO: Each subsequent iteration checks at least one point in common
-  // with the last iteration (it could be two if the diagonal was selected).
-  while (--halfiters) {
-    // 1/2 pel
-    CHECK_BETTER(left, tr, tc - hstep);
-    CHECK_BETTER(right, tr, tc + hstep);
-    CHECK_BETTER(up, tr - hstep, tc);
-    CHECK_BETTER(down, tr + hstep, tc);
-
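-    // whichdir encodes the best quadrant: bit 0 is set when right beats left,
-    // bit 1 when down beats up, so the switch below tries the one diagonal
-    // that lies between the two winning directions.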
-    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-    switch (whichdir) {
-      case 0:
-        CHECK_BETTER(diag, tr - hstep, tc - hstep);
-        break;
-      case 1:
-        CHECK_BETTER(diag, tr - hstep, tc + hstep);
-        break;
-      case 2:
-        CHECK_BETTER(diag, tr + hstep, tc - hstep);
-        break;
-      case 3:
-        CHECK_BETTER(diag, tr + hstep, tc + hstep);
-        break;
-    }
-
-    // no reason to check the same one again.
-    if (tr == br && tc == bc)
-      break;
-
-    tr = br;
-    tc = bc;
-  }
-
-  // TODO: Each subsequent iteration checks at least one point in common
-  // with the last iteration (two if the diagonal was selected).
-  // 1/4 pel
-  hstep >>= 1;
-  while (--quarteriters) {
-    CHECK_BETTER(left, tr, tc - hstep);
-    CHECK_BETTER(right, tr, tc + hstep);
-    CHECK_BETTER(up, tr - hstep, tc);
-    CHECK_BETTER(down, tr + hstep, tc);
-
-    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-    switch (whichdir) {
-      case 0:
-        CHECK_BETTER(diag, tr - hstep, tc - hstep);
-        break;
-      case 1:
-        CHECK_BETTER(diag, tr - hstep, tc + hstep);
-        break;
-      case 2:
-        CHECK_BETTER(diag, tr + hstep, tc - hstep);
-        break;
-      case 3:
-        CHECK_BETTER(diag, tr + hstep, tc + hstep);
-        break;
-    }
-
-    // no reason to check the same one again.
-    if (tr == br && tc == bc)
-      break;
-
-    tr = br;
-    tc = bc;
-  }
-
-  if (xd->allow_high_precision_mv) {
-    usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
-  } else {
-    usehp = 0;
-  }
-
-  if (usehp) {
-    hstep >>= 1;
-    while (--eighthiters) {
-      CHECK_BETTER(left, tr, tc - hstep);
-      CHECK_BETTER(right, tr, tc + hstep);
-      CHECK_BETTER(up, tr - hstep, tc);
-      CHECK_BETTER(down, tr + hstep, tc);
-
-      whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-      switch (whichdir) {
-        case 0:
-          CHECK_BETTER(diag, tr - hstep, tc - hstep);
-          break;
-        case 1:
-          CHECK_BETTER(diag, tr - hstep, tc + hstep);
-          break;
-        case 2:
-          CHECK_BETTER(diag, tr + hstep, tc - hstep);
-          break;
-        case 3:
-          CHECK_BETTER(diag, tr + hstep, tc + hstep);
-          break;
-      }
-
-      // no reason to check the same one again.
-      if (tr == br && tc == bc)
-        break;
-
-      tr = br;
-      tc = bc;
-    }
-  }
-  bestmv->as_mv.row = br;
-  bestmv->as_mv.col = bc;
-
-  if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
-      (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
-    return INT_MAX;
-
-  return besterr;
-}
-#undef MVC
-#undef PRE
-#undef DIST
-#undef IFMVCV
-#undef CHECK_BETTER
-#undef MIN
-#undef MAX
-
-int vp9_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                                 int_mv *bestmv, int_mv *ref_mv,
-                                 int error_per_bit,
-                                 const vp9_variance_fn_ptr_t *vfp,
-                                 DEC_MVCOSTS, int *distortion,
-                                 unsigned int *sse1) {
-  int bestmse = INT_MAX;
-  int_mv startmv;
-  int_mv this_mv;
-  int_mv orig_mv;
-  int yrow_movedback = 0, ycol_movedback = 0;
-  unsigned char *z = (*(b->base_src) + b->src);
-  int left, right, up, down, diag;
-  unsigned int sse;
-  int whichdir;
-  int thismse;
-  int y_stride;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int usehp = xd->allow_high_precision_mv;
-
-#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
-  unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  unsigned char *y;
-
-  y_stride = 32;
-  /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
-  vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
-  y = xd->y_buf + y_stride + 1;
-#else
-  unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  y_stride = d->pre_stride;
-#endif
-
-  // central mv
-  bestmv->as_mv.row <<= 3;
-  bestmv->as_mv.col <<= 3;
-  startmv = *bestmv;
-  orig_mv = *bestmv;
-
-  // calculate central point error
-  bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
-  *distortion = bestmse;
-  bestmse += mv_err_cost(bestmv, ref_mv, MVCOSTS, error_per_bit,
-                         xd->allow_high_precision_mv);
-
-  // go left then right and check error
-  this_mv.as_mv.row = startmv.as_mv.row;
-  this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
-  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
-  left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (left < bestmse) {
-    *bestmv = this_mv;
-    bestmse = left;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  this_mv.as_mv.col += 8;
-  thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
-  right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                                xd->allow_high_precision_mv);
-
-  if (right < bestmse) {
-    *bestmv = this_mv;
-    bestmse = right;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  // go up then down and check error
-  this_mv.as_mv.col = startmv.as_mv.col;
-  this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
-  thismse =  vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
-  up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                             xd->allow_high_precision_mv);
-
-  if (up < bestmse) {
-    *bestmv = this_mv;
-    bestmse = up;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  this_mv.as_mv.row += 8;
-  thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
-  down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (down < bestmse) {
-    *bestmv = this_mv;
-    bestmse = down;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-
-  // now check 1 more diagonal
-  whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-  this_mv = startmv;
-
-  switch (whichdir) {
-    case 0:
-      this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
-      this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
-      break;
-    case 1:
-      this_mv.as_mv.col += 4;
-      this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
-      break;
-    case 2:
-      this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
-      this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
-      break;
-    case 3:
-    default:
-      this_mv.as_mv.col += 4;
-      this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
-      break;
-  }
-
-  diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (diag < bestmse) {
-    *bestmv = this_mv;
-    bestmse = diag;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  // time to check quarter pels.
-  if (bestmv->as_mv.row < startmv.as_mv.row) {
-    y -= y_stride;
-    yrow_movedback = 1;
-  }
-
-  if (bestmv->as_mv.col < startmv.as_mv.col) {
-    y--;
-    ycol_movedback = 1;
-  }
-
-  startmv = *bestmv;
-
-  // go left then right and check error
-  this_mv.as_mv.row = startmv.as_mv.row;
-
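-  // When a component sits on a full-pel boundary (low 3 bits zero), stepping
-  // 2/8 below it is done by moving the base pointer one pel back and using
-  // subpel offset 6: (v - 8) | 6 == v - 2 when v is a multiple of 8.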
-  if (startmv.as_mv.col & 7) {
-    this_mv.as_mv.col = startmv.as_mv.col - 2;
-    thismse = vfp->svf(y, y_stride,
-                       SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                       z, b->src_stride, &sse);
-  } else {
-    this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
-    thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
-                       b->src_stride, &sse);
-  }
-
-  left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (left < bestmse) {
-    *bestmv = this_mv;
-    bestmse = left;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  this_mv.as_mv.col += 4;
-  thismse = vfp->svf(y, y_stride,
-                     SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                     z, b->src_stride, &sse);
-  right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                                xd->allow_high_precision_mv);
-
-  if (right < bestmse) {
-    *bestmv = this_mv;
-    bestmse = right;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  // go up then down and check error
-  this_mv.as_mv.col = startmv.as_mv.col;
-
-  if (startmv.as_mv.row & 7) {
-    this_mv.as_mv.row = startmv.as_mv.row - 2;
-    thismse = vfp->svf(y, y_stride,
-                       SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                       z, b->src_stride, &sse);
-  } else {
-    this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
-    thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6),
-                       z, b->src_stride, &sse);
-  }
-
-  up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                             xd->allow_high_precision_mv);
-
-  if (up < bestmse) {
-    *bestmv = this_mv;
-    bestmse = up;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  this_mv.as_mv.row += 4;
-  thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                     z, b->src_stride, &sse);
-  down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (down < bestmse) {
-    *bestmv = this_mv;
-    bestmse = down;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-
-  // now check 1 more diagonal
-  whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-  this_mv = startmv;
-
-  switch (whichdir) {
-    case 0:
-
-      if (startmv.as_mv.row & 7) {
-        this_mv.as_mv.row -= 2;
-
-        if (startmv.as_mv.col & 7) {
-          this_mv.as_mv.col -= 2;
-          thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-        } else {
-          this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
-          thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-        }
-      } else {
-        this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
-
-        if (startmv.as_mv.col & 7) {
-          this_mv.as_mv.col -= 2;
-          thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), z, b->src_stride, &sse);
-        } else {
-          this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
-          thismse = vfp->svf(y - y_stride - 1, y_stride, SP(6), SP(6), z, b->src_stride, &sse);
-        }
-      }
-
-      break;
-    case 1:
-      this_mv.as_mv.col += 2;
-
-      if (startmv.as_mv.row & 7) {
-        this_mv.as_mv.row -= 2;
-        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-      } else {
-        this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
-        thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), z, b->src_stride, &sse);
-      }
-
-      break;
-    case 2:
-      this_mv.as_mv.row += 2;
-
-      if (startmv.as_mv.col & 7) {
-        this_mv.as_mv.col -= 2;
-        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                           z, b->src_stride, &sse);
-      } else {
-        this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
-        thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
-                           b->src_stride, &sse);
-      }
-
-      break;
-    case 3:
-      this_mv.as_mv.col += 2;
-      this_mv.as_mv.row += 2;
-      thismse = vfp->svf(y, y_stride,
-                         SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                         z, b->src_stride, &sse);
-      break;
-  }
-
-  diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (diag < bestmse) {
-    *bestmv = this_mv;
-    bestmse = diag;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  if (xd->allow_high_precision_mv) {
-    usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
-  } else {
-    usehp = 0;
-  }
-  if (!usehp)
-    return bestmse;
-
-  /* Now do 1/8th pixel */
-  if (bestmv->as_mv.row < orig_mv.as_mv.row && !yrow_movedback) {
-    y -= y_stride;
-    yrow_movedback = 1;
-  }
-
-  if (bestmv->as_mv.col < orig_mv.as_mv.col && !ycol_movedback) {
-    y--;
-    ycol_movedback = 1;
-  }
-
-  startmv = *bestmv;
-
-  // go left then right and check error
-  this_mv.as_mv.row = startmv.as_mv.row;
-
-  if (startmv.as_mv.col & 7) {
-    this_mv.as_mv.col = startmv.as_mv.col - 1;
-    thismse = vfp->svf(y, y_stride,
-                       SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                       z, b->src_stride, &sse);
-  } else {
-    this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
-    thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row),
-                       z, b->src_stride, &sse);
-  }
-
-  left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (left < bestmse) {
-    *bestmv = this_mv;
-    bestmse = left;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  this_mv.as_mv.col += 2;
-  thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                     z, b->src_stride, &sse);
-  right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                                xd->allow_high_precision_mv);
-
-  if (right < bestmse) {
-    *bestmv = this_mv;
-    bestmse = right;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  // go up then down and check error
-  this_mv.as_mv.col = startmv.as_mv.col;
-
-  if (startmv.as_mv.row & 7) {
-    this_mv.as_mv.row = startmv.as_mv.row - 1;
-    thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-  } else {
-    this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
-    thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
-  }
-
-  up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                             xd->allow_high_precision_mv);
-
-  if (up < bestmse) {
-    *bestmv = this_mv;
-    bestmse = up;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  this_mv.as_mv.row += 2;
-  thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-  down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (down < bestmse) {
-    *bestmv = this_mv;
-    bestmse = down;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  // now check 1 more diagonal
-  whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-  this_mv = startmv;
-
-  switch (whichdir) {
-    case 0:
-
-      if (startmv.as_mv.row & 7) {
-        this_mv.as_mv.row -= 1;
-
-        if (startmv.as_mv.col & 7) {
-          this_mv.as_mv.col -= 1;
-          thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-        } else {
-          this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
-          thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-        }
-      } else {
-        this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
-
-        if (startmv.as_mv.col & 7) {
-          this_mv.as_mv.col -= 1;
-          thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
-        } else {
-          this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
-          thismse = vfp->svf(y - y_stride - 1, y_stride, SP(7), SP(7), z, b->src_stride, &sse);
-        }
-      }
-
-      break;
-    case 1:
-      this_mv.as_mv.col += 1;
-
-      if (startmv.as_mv.row & 7) {
-        this_mv.as_mv.row -= 1;
-        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-      } else {
-        this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
-        thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
-      }
-
-      break;
-    case 2:
-      this_mv.as_mv.row += 1;
-
-      if (startmv.as_mv.col & 7) {
-        this_mv.as_mv.col -= 1;
-        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-      } else {
-        this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
-        thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-      }
-
-      break;
-    case 3:
-      this_mv.as_mv.col += 1;
-      this_mv.as_mv.row += 1;
-      thismse = vfp->svf(y, y_stride,  SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
-      break;
-  }
-
-  diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (diag < bestmse) {
-    *bestmv = this_mv;
-    bestmse = diag;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  return bestmse;
-}
-
-#undef SP
-
-int vp9_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                                  int_mv *bestmv, int_mv *ref_mv,
-                                  int error_per_bit,
-                                  const vp9_variance_fn_ptr_t *vfp,
-                                  DEC_MVCOSTS,
-                                  int *distortion,
-                                  unsigned int *sse1) {
-  int bestmse = INT_MAX;
-  int_mv startmv;
-  int_mv this_mv;
-  unsigned char *z = (*(b->base_src) + b->src);
-  int left, right, up, down, diag;
-  unsigned int sse;
-  int whichdir;
-  int thismse;
-  int y_stride;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
-  unsigned char *y0 = *(d->base_pre) + d->pre +
-      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  unsigned char *y;
-
-  y_stride = 32;
-  /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
-  vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
-  y = xd->y_buf + y_stride + 1;
-#else
-  unsigned char *y = *(d->base_pre) + d->pre +
-      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  y_stride = d->pre_stride;
-#endif
-
-  // central mv
-  bestmv->as_mv.row <<= 3;
-  bestmv->as_mv.col <<= 3;
-  startmv = *bestmv;
-
-  // calculate central point error
-  bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
-  *distortion = bestmse;
-  bestmse += mv_err_cost(bestmv, ref_mv, MVCOSTS, error_per_bit,
-                         xd->allow_high_precision_mv);
-
-  // go left then right and check error
-  this_mv.as_mv.row = startmv.as_mv.row;
-  this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
-  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
-  left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (left < bestmse) {
-    *bestmv = this_mv;
-    bestmse = left;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  this_mv.as_mv.col += 8;
-  thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
-  right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                                xd->allow_high_precision_mv);
-
-  if (right < bestmse) {
-    *bestmv = this_mv;
-    bestmse = right;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  // go up then down and check error
-  this_mv.as_mv.col = startmv.as_mv.col;
-  this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
-  thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
-  up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                             xd->allow_high_precision_mv);
-
-  if (up < bestmse) {
-    *bestmv = this_mv;
-    bestmse = up;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  this_mv.as_mv.row += 8;
-  thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
-  down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (down < bestmse) {
-    *bestmv = this_mv;
-    bestmse = down;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  // now check 1 more diagonal -
-  whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-  this_mv = startmv;
-
-  switch (whichdir) {
-    case 0:
-      this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
-      this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
-      break;
-    case 1:
-      this_mv.as_mv.col += 4;
-      this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
-      break;
-    case 2:
-      this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
-      this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
-      break;
-    case 3:
-    default:
-      this_mv.as_mv.col += 4;
-      this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
-      break;
-  }
-
-  diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
-                               xd->allow_high_precision_mv);
-
-  if (diag < bestmse) {
-    *bestmv = this_mv;
-    bestmse = diag;
-    *distortion = thismse;
-    *sse1 = sse;
-  }
-
-  return bestmse;
-}
-
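-/* Helper macros shared by the integer-pel searches below: CHECK_BOUNDS tests
-   whether every candidate within `range` of (br, bc) stays inside the MV
-   limits, CHECK_POINT skips a single out-of-range candidate, and CHECK_BETTER
-   keeps the best SAD/site with the MV cost folded in. */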
-#define CHECK_BOUNDS(range) \
-  {\
-    all_in = 1;\
-    all_in &= ((br-range) >= x->mv_row_min);\
-    all_in &= ((br+range) <= x->mv_row_max);\
-    all_in &= ((bc-range) >= x->mv_col_min);\
-    all_in &= ((bc+range) <= x->mv_col_max);\
-  }
-
-#define CHECK_POINT \
-  {\
-    if (this_mv.as_mv.col < x->mv_col_min) continue;\
-    if (this_mv.as_mv.col > x->mv_col_max) continue;\
-    if (this_mv.as_mv.row < x->mv_row_min) continue;\
-    if (this_mv.as_mv.row > x->mv_row_max) continue;\
-  }
-
-#define CHECK_BETTER \
-  {\
-    if (thissad < bestsad)\
-    {\
-      thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);\
-      if (thissad < bestsad)\
-      {\
-        bestsad = thissad;\
-        best_site = i;\
-      }\
-    }\
-  }
-
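-// After the search moves to hexagon vertex k, only 3 of the 6 points around
-// the new center are unvisited; next_chkpts[k] lists those 3 offsets so
-// already-evaluated points are not re-tested.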
-static const MV next_chkpts[6][3] = {
-  {{ -2, 0}, { -1, -2}, {1, -2}},
-  {{ -1, -2}, {1, -2}, {2, 0}},
-  {{1, -2}, {2, 0}, {1, 2}},
-  {{2, 0}, {1, 2}, { -1, 2}},
-  {{1, 2}, { -1, 2}, { -2, 0}},
-  {{ -1, 2}, { -2, 0}, { -1, -2}}
-};
-
-int vp9_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                   int_mv *ref_mv, int_mv *best_mv,
-                   int search_param, int sad_per_bit,
-                   const vp9_variance_fn_ptr_t *vfp,
-                   DEC_MVSADCOSTS, DEC_MVCOSTS,
-                   int_mv *center_mv) {
-  MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} };
-  MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
-  int i, j;
-
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  int in_what_stride = d->pre_stride;
-  int br, bc;
-  int_mv this_mv;
-  unsigned int bestsad = 0x7fffffff;
-  unsigned int thissad;
-  unsigned char *base_offset;
-  unsigned char *this_offset;
-  int k = -1;
-  int all_in;
-  int best_site = -1;
-
-  int_mv fcenter_mv;
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  // adjust ref_mv to make sure it is within MV range
-  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  br = ref_mv->as_mv.row;
-  bc = ref_mv->as_mv.col;
-
-  // Work out the start point for the search
-  base_offset = (unsigned char *)(*(d->base_pre) + d->pre);
-  this_offset = base_offset + (br * (d->pre_stride)) + bc;
-  this_mv.as_mv.row = br;
-  this_mv.as_mv.col = bc;
-  bestsad = vfp->sdf(what, what_stride, this_offset,
-                     in_what_stride, 0x7fffffff)
-            + mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
-  // hex search
-  // j=0
-  CHECK_BOUNDS(2)
-
-  if (all_in) {
-    for (i = 0; i < 6; i++) {
-      this_mv.as_mv.row = br + hex[i].row;
-      this_mv.as_mv.col = bc + hex[i].col;
-      this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
-      thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
-      CHECK_BETTER
-    }
-  } else {
-    for (i = 0; i < 6; i++) {
-      this_mv.as_mv.row = br + hex[i].row;
-      this_mv.as_mv.col = bc + hex[i].col;
-      CHECK_POINT
-      this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
-      thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
-      CHECK_BETTER
-    }
-  }
-
-  if (best_site == -1)
-    goto cal_neighbors;
-  else {
-    br += hex[best_site].row;
-    bc += hex[best_site].col;
-    k = best_site;
-  }
-
-  for (j = 1; j < 127; j++) {
-    best_site = -1;
-    CHECK_BOUNDS(2)
-
-    if (all_in) {
-      for (i = 0; i < 3; i++) {
-        this_mv.as_mv.row = br + next_chkpts[k][i].row;
-        this_mv.as_mv.col = bc + next_chkpts[k][i].col;
-        this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
-        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
-        CHECK_BETTER
-      }
-    } else {
-      for (i = 0; i < 3; i++) {
-        this_mv.as_mv.row = br + next_chkpts[k][i].row;
-        this_mv.as_mv.col = bc + next_chkpts[k][i].col;
-        CHECK_POINT
-        this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
-        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
-        CHECK_BETTER
-      }
-    }
-
-    if (best_site == -1)
-      break;
-    else {
-      br += next_chkpts[k][best_site].row;
-      bc += next_chkpts[k][best_site].col;
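-      // Re-orient the pattern without a modulo:
-      // k = (k + best_site + 5) % 6.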
-      k += 5 + best_site;
-      if (k >= 12) k -= 12;
-      else if (k >= 6) k -= 6;
-    }
-  }
-
-  // check 4 1-away neighbors
-cal_neighbors:
-  for (j = 0; j < 32; j++) {
-    best_site = -1;
-    CHECK_BOUNDS(1)
-
-    if (all_in) {
-      for (i = 0; i < 4; i++) {
-        this_mv.as_mv.row = br + neighbors[i].row;
-        this_mv.as_mv.col = bc + neighbors[i].col;
-        this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
-        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
-        CHECK_BETTER
-      }
-    } else {
-      for (i = 0; i < 4; i++) {
-        this_mv.as_mv.row = br + neighbors[i].row;
-        this_mv.as_mv.col = bc + neighbors[i].col;
-        CHECK_POINT
-        this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
-        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
-        CHECK_BETTER
-      }
-    }
-
-    if (best_site == -1)
-      break;
-    else {
-      br += neighbors[best_site].row;
-      bc += neighbors[best_site].col;
-    }
-  }
-
-  best_mv->as_mv.row = br;
-  best_mv->as_mv.col = bc;
-
-  return bestsad;
-}
-#undef CHECK_BOUNDS
-#undef CHECK_POINT
-#undef CHECK_BETTER
-
-int vp9_diamond_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                           int_mv *ref_mv, int_mv *best_mv,
-                           int search_param, int sad_per_bit, int *num00,
-                           vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
-                           int_mv *center_mv) {
-  int i, j, step;
-
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  unsigned char *in_what;
-  int in_what_stride = d->pre_stride;
-  unsigned char *best_address;
-
-  int tot_steps;
-  int_mv this_mv;
-
-  int bestsad = INT_MAX;
-  int best_site = 0;
-  int last_site = 0;
-
-  int ref_row, ref_col;
-  int this_row_offset, this_col_offset;
-  search_site *ss;
-
-  unsigned char *check_here;
-  int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  ref_row = ref_mv->as_mv.row;
-  ref_col = ref_mv->as_mv.col;
-  *num00 = 0;
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  // Work out the start point for the search
-  in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
-  best_address = in_what;
-
-  // Check the starting position
-  bestsad = fn_ptr->sdf(what, what_stride, in_what,
-                        in_what_stride, 0x7fffffff)
-            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
-  // search_param determines the length of the initial step and hence the
-  // number of iterations:
-  // 0 = initial step (MAX_FIRST_STEP) pel; 1 = (MAX_FIRST_STEP/2) pel;
-  // 2 = (MAX_FIRST_STEP/4) pel, etc.
-  ss = &x->ss[search_param * x->searches_per_step];
-  tot_steps = (x->ss_count / x->searches_per_step) - search_param;
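-  // x->ss holds the precomputed diamond candidate offsets for every step
-  // size, largest step first; offsetting by search_param skips the largest
-  // steps, trading search range for speed.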
-
-  i = 1;
-
-  for (step = 0; step < tot_steps; step++) {
-    for (j = 0; j < x->searches_per_step; j++) {
-      // Trap illegal vectors
-      this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
-      this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
-
-      if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
-          (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
-        check_here = ss[i].offset + best_address;
-        thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.row = this_row_offset;
-          this_mv.as_mv.col = this_col_offset;
-          thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                    MVSADCOSTS, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_site = i;
-          }
-        }
-      }
-
-      i++;
-    }
-
-    if (best_site != last_site) {
-      best_mv->as_mv.row += ss[best_site].mv.row;
-      best_mv->as_mv.col += ss[best_site].mv.col;
-      best_address += ss[best_site].offset;
-      last_site = best_site;
-    } else if (best_address == in_what)
-      (*num00)++;
-  }
-
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
-  if (bestsad == INT_MAX)
-    return INT_MAX;
-
-  return
-      fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                 (unsigned int *)(&thissad)) +
-      mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
-                  xd->allow_high_precision_mv);
-}
-
-int vp9_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                             int_mv *ref_mv, int_mv *best_mv, int search_param,
-                             int sad_per_bit, int *num00,
-                             vp9_variance_fn_ptr_t *fn_ptr,
-                             DEC_MVCOSTS, int_mv *center_mv) {
-  int i, j, step;
-
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  unsigned char *in_what;
-  int in_what_stride = d->pre_stride;
-  unsigned char *best_address;
-
-  int tot_steps;
-  int_mv this_mv;
-
-  int bestsad = INT_MAX;
-  int best_site = 0;
-  int last_site = 0;
-
-  int ref_row;
-  int ref_col;
-  int this_row_offset;
-  int this_col_offset;
-  search_site *ss;
-
-  unsigned char *check_here;
-  unsigned int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  ref_row = ref_mv->as_mv.row;
-  ref_col = ref_mv->as_mv.col;
-  *num00 = 0;
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  // Work out the start point for the search
-  in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
-  best_address = in_what;
-
-  // Check the starting position
-  bestsad = fn_ptr->sdf(what, what_stride,
-                        in_what, in_what_stride, 0x7fffffff)
-            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
-  // search_param determines the length of the initial step and hence the
-  // number of iterations:
-  // 0 = initial step (MAX_FIRST_STEP) pel; 1 = (MAX_FIRST_STEP/2) pel;
-  // 2 = (MAX_FIRST_STEP/4) pel, etc.
-  ss = &x->ss[search_param * x->searches_per_step];
-  tot_steps = (x->ss_count / x->searches_per_step) - search_param;
-
-  i = 1;
-
-  for (step = 0; step < tot_steps; step++) {
-    int all_in = 1, t;
-
-    // To know whether all the neighbor points are within the bounds, 4 bounds
-    // checks are enough instead of checking 4 bounds for each point.
-    all_in &= ((best_mv->as_mv.row + ss[i].mv.row) > x->mv_row_min);
-    all_in &= ((best_mv->as_mv.row + ss[i + 1].mv.row) < x->mv_row_max);
-    all_in &= ((best_mv->as_mv.col + ss[i + 2].mv.col) > x->mv_col_min);
-    all_in &= ((best_mv->as_mv.col + ss[i + 3].mv.col) < x->mv_col_max);
-
-    if (all_in) {
-      unsigned int sad_array[4];
-
-      for (j = 0; j < x->searches_per_step; j += 4) {
-        unsigned char *block_offset[4];
-
-        for (t = 0; t < 4; t++)
-          block_offset[t] = ss[i + t].offset + best_address;
-
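-        // sdx4df evaluates the SAD of all 4 candidate blocks in one call
-        // (typically a SIMD routine that shares the source-block loads).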
-        fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
-                       sad_array);
-
-        for (t = 0; t < 4; t++, i++) {
-          if (sad_array[t] < bestsad) {
-            this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row;
-            this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col;
-            sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                           MVSADCOSTS, sad_per_bit);
-
-            if (sad_array[t] < bestsad) {
-              bestsad = sad_array[t];
-              best_site = i;
-            }
-          }
-        }
-      }
-    } else {
-      for (j = 0; j < x->searches_per_step; j++) {
-        // Trap illegal vectors
-        this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
-        this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
-
-        if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
-            (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
-          check_here = ss[i].offset + best_address;
-          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
-          if (thissad < bestsad) {
-            this_mv.as_mv.row = this_row_offset;
-            this_mv.as_mv.col = this_col_offset;
-            thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                      MVSADCOSTS, sad_per_bit);
-
-            if (thissad < bestsad) {
-              bestsad = thissad;
-              best_site = i;
-            }
-          }
-        }
-        i++;
-      }
-    }
-
-    if (best_site != last_site) {
-      best_mv->as_mv.row += ss[best_site].mv.row;
-      best_mv->as_mv.col += ss[best_site].mv.col;
-      best_address += ss[best_site].offset;
-      last_site = best_site;
-    } else if (best_address == in_what)
-      (*num00)++;
-  }
-
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
-  if (bestsad == INT_MAX)
-    return INT_MAX;
-
-  return
-      fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                 (unsigned int *)(&thissad)) +
-      mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
-                  xd->allow_high_precision_mv);
-}
-
-/* do_refine: If last step (1-away) of n-step search doesn't pick the center
-              point as the best match, we will do a final 1-away diamond
-              refining search  */
-int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
-                           BLOCKD *d, int_mv *mvp_full, int step_param,
-                           int sadpb, int further_steps,
-                           int do_refine, vp9_variance_fn_ptr_t *fn_ptr,
-                           int_mv *ref_mv, int_mv *dst_mv) {
-  int_mv temp_mv;
-  int thissme, n, num00;
-  int bestsme = cpi->diamond_search_sad(x, b, d, mvp_full, &temp_mv,
-                                        step_param, sadpb, &num00,
-                                        fn_ptr, XMVCOST, ref_mv);
-  dst_mv->as_int = temp_mv.as_int;
-
-  n = num00;
-  num00 = 0;
-
-  /* If there won't be any more n-step searches, check whether the refining
-     search is needed. */
-  if (n > further_steps)
-    do_refine = 0;
-
-  while (n < further_steps) {
-    n++;
-
-    if (num00)
-      num00--;
-    else {
-      thissme = cpi->diamond_search_sad(x, b, d, mvp_full, &temp_mv,
-                                        step_param + n, sadpb, &num00,
-                                        fn_ptr, XMVCOST, ref_mv);
-
-      /* check to see if refining search is needed. */
-      if (num00 > (further_steps - n))
-        do_refine = 0;
-
-      if (thissme < bestsme) {
-        bestsme = thissme;
-        dst_mv->as_int = temp_mv.as_int;
-      }
-    }
-  }
-
-  /* final 1-away diamond refining search */
-  if (do_refine == 1) {
-    int search_range = 8;
-    int_mv best_mv;
-    best_mv.as_int = dst_mv->as_int;
-    thissme = cpi->refining_search_sad(x, b, d, &best_mv, sadpb, search_range,
-                                       fn_ptr, XMVCOST, ref_mv);
-
-    if (thissme < bestsme) {
-      bestsme = thissme;
-      dst_mv->as_int = best_mv.as_int;
-    }
-  }
-  return bestsme;
-}
-
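-/* Exhaustive search: evaluates every integer-pel candidate in a
-   (2 * distance) x (2 * distance) window around ref_mv, after the window is
-   clamped to the UMV borders. */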
-int vp9_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                        int sad_per_bit, int distance,
-                        vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
-                        int_mv *center_mv) {
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  unsigned char *in_what;
-  int in_what_stride = d->pre_stride;
-  int mv_stride = d->pre_stride;
-  unsigned char *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv.first;
-  int_mv this_mv;
-  int bestsad = INT_MAX;
-  int r, c;
-
-  unsigned char *check_here;
-  int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  int ref_row = ref_mv->as_mv.row;
-  int ref_col = ref_mv->as_mv.col;
-
-  int row_min = ref_row - distance;
-  int row_max = ref_row + distance;
-  int col_min = ref_col - distance;
-  int col_max = ref_col + distance;
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  // Work out the mid point for the search
-  in_what = *(d->base_pre) + d->pre;
-  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
-
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  // Baseline value at the centre
-  bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
-                        in_what_stride, 0x7fffffff)
-            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
-  // Apply further limits to prevent us from using vectors that stretch beyond the UMV border
-  if (col_min < x->mv_col_min)
-    col_min = x->mv_col_min;
-
-  if (col_max > x->mv_col_max)
-    col_max = x->mv_col_max;
-
-  if (row_min < x->mv_row_min)
-    row_min = x->mv_row_min;
-
-  if (row_max > x->mv_row_max)
-    row_max = x->mv_row_max;
-
-  for (r = row_min; r < row_max; r++) {
-    this_mv.as_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
-
-    for (c = col_min; c < col_max; c++) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
-      this_mv.as_mv.col = c;
-      thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                 MVSADCOSTS, sad_per_bit);
-
-      if (thissad < bestsad) {
-        bestsad = thissad;
-        best_mv->as_mv.row = r;
-        best_mv->as_mv.col = c;
-        bestaddress = check_here;
-      }
-
-      check_here++;
-    }
-  }
-
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
-  if (bestsad < INT_MAX)
-    return
-        fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
-                   (unsigned int *)(&thissad)) +
-        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
-                    xd->allow_high_precision_mv);
-  else
-    return INT_MAX;
-}
-
-int vp9_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                          int sad_per_bit, int distance,
-                          vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
-                          int_mv *center_mv) {
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  unsigned char *in_what;
-  int in_what_stride = d->pre_stride;
-  int mv_stride = d->pre_stride;
-  unsigned char *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv.first;
-  int_mv this_mv;
-  int bestsad = INT_MAX;
-  int r, c;
-
-  unsigned char *check_here;
-  unsigned int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  int ref_row = ref_mv->as_mv.row;
-  int ref_col = ref_mv->as_mv.col;
-
-  int row_min = ref_row - distance;
-  int row_max = ref_row + distance;
-  int col_min = ref_col - distance;
-  int col_max = ref_col + distance;
-
-  unsigned int sad_array[3];
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  // Work out the mid point for the search
-  in_what = *(d->base_pre) + d->pre;
-  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
-
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  // Baseline value at the centre
-  bestsad = fn_ptr->sdf(what, what_stride,
-                        bestaddress, in_what_stride, 0x7fffffff)
-            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
-  // Apply further limits to prevent us from using vectors that stretch beyond the UMV border
-  if (col_min < x->mv_col_min)
-    col_min = x->mv_col_min;
-
-  if (col_max > x->mv_col_max)
-    col_max = x->mv_col_max;
-
-  if (row_min < x->mv_row_min)
-    row_min = x->mv_row_min;
-
-  if (row_max > x->mv_row_max)
-    row_max = x->mv_row_max;
-
-  for (r = row_min; r < row_max; r++) {
-    this_mv.as_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
-    c = col_min;
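-    // Batch 3 columns per sdx3f call, then fall back to single-SAD calls for
-    // the leftover columns at the right edge of the row.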
-
-    while ((c + 2) < col_max) {
-      int i;
-
-      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
-
-      for (i = 0; i < 3; i++) {
-        thissad = sad_array[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                     MVSADCOSTS, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
-          }
-        }
-
-        check_here++;
-        c++;
-      }
-    }
-
-    while (c < col_max) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
-      if (thissad < bestsad) {
-        this_mv.as_mv.col = c;
-        thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                   MVSADCOSTS, sad_per_bit);
-
-        if (thissad < bestsad) {
-          bestsad = thissad;
-          best_mv->as_mv.row = r;
-          best_mv->as_mv.col = c;
-          bestaddress = check_here;
-        }
-      }
-
-      check_here++;
-      c++;
-    }
-  }
-
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
-  if (bestsad < INT_MAX)
-    return
-        fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
-                   (unsigned int *)(&thissad)) +
-        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
-                    xd->allow_high_precision_mv);
-  else
-    return INT_MAX;
-}
-
-int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                          int sad_per_bit, int distance,
-                          vp9_variance_fn_ptr_t *fn_ptr,
-                          DEC_MVCOSTS,
-                          int_mv *center_mv) {
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  unsigned char *in_what;
-  int in_what_stride = d->pre_stride;
-  int mv_stride = d->pre_stride;
-  unsigned char *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv.first;
-  int_mv this_mv;
-  int bestsad = INT_MAX;
-  int r, c;
-
-  unsigned char *check_here;
-  unsigned int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  int ref_row = ref_mv->as_mv.row;
-  int ref_col = ref_mv->as_mv.col;
-
-  int row_min = ref_row - distance;
-  int row_max = ref_row + distance;
-  int col_min = ref_col - distance;
-  int col_max = ref_col + distance;
-
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
-  unsigned int sad_array[3];
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  // Work out the mid point for the search
-  in_what = *(d->base_pre) + d->pre;
-  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
-
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  // Baseline value at the centre
-  bestsad = fn_ptr->sdf(what, what_stride,
-                        bestaddress, in_what_stride, 0x7fffffff)
-            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
-
-  // Apply further limits to prevent us from using vectors that stretch beyond the UMV border
-  if (col_min < x->mv_col_min)
-    col_min = x->mv_col_min;
-
-  if (col_max > x->mv_col_max)
-    col_max = x->mv_col_max;
-
-  if (row_min < x->mv_row_min)
-    row_min = x->mv_row_min;
-
-  if (row_max > x->mv_row_max)
-    row_max = x->mv_row_max;
-
-  for (r = row_min; r < row_max; r++) {
-    this_mv.as_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
-    c = col_min;
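-    // Widest batches first: 8 SADs per sdx8f call, then 3 per sdx3f call,
-    // then single-SAD calls for whatever remains in the row.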
-
-    while ((c + 7) < col_max) {
-      int i;
-
-      fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
-
-      for (i = 0; i < 8; i++) {
-        thissad = (unsigned int)sad_array8[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                     MVSADCOSTS, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
-          }
-        }
-
-        check_here++;
-        c++;
-      }
-    }
-
-    while ((c + 2) < col_max) {
-      int i;
-
-      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
-
-      for (i = 0; i < 3; i++) {
-        thissad = sad_array[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                     MVSADCOSTS, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
-          }
-        }
-
-        check_here++;
-        c++;
-      }
-    }
-
-    while (c < col_max) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
-      if (thissad < bestsad) {
-        this_mv.as_mv.col = c;
-        thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                   MVSADCOSTS, sad_per_bit);
-
-        if (thissad < bestsad) {
-          bestsad = thissad;
-          best_mv->as_mv.row = r;
-          best_mv->as_mv.col = c;
-          bestaddress = check_here;
-        }
-      }
-
-      check_here++;
-      c++;
-    }
-  }
-
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
-  if (bestsad < INT_MAX)
-    return
-        fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
-                   (unsigned int *)(&thissad)) +
-        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
-                    xd->allow_high_precision_mv);
-  else
-    return INT_MAX;
-}
-
-int vp9_refining_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                            int error_per_bit, int search_range,
-                            vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
-                            int_mv *center_mv) {
-  MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
-  int i, j;
-  short this_row_offset, this_col_offset;
-
-  int what_stride = b->src_stride;
-  int in_what_stride = d->pre_stride;
-  unsigned char *what = (*(b->base_src) + b->src);
-  unsigned char *best_address = (unsigned char *)(*(d->base_pre) + d->pre +
-                                                  (ref_mv->as_mv.row * (d->pre_stride)) + ref_mv->as_mv.col);
-  unsigned char *check_here;
-  unsigned int thissad;
-  int_mv this_mv;
-  unsigned int bestsad = INT_MAX;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) +
-      mvsad_err_cost(ref_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
-
-  for (i = 0; i < search_range; i++) {
-    int best_site = -1;
-
-    for (j = 0; j < 4; j++) {
-      this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
-      this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
-
-      if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
-          (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
-        check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
-        thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.row = this_row_offset;
-          this_mv.as_mv.col = this_col_offset;
-          thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_site = j;
-          }
-        }
-      }
-    }
-
-    if (best_site == -1)
-      break;
-    else {
-      ref_mv->as_mv.row += neighbors[best_site].row;
-      ref_mv->as_mv.col += neighbors[best_site].col;
-      best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
-    }
-  }
-
-  this_mv.as_mv.row = ref_mv->as_mv.row << 3;
-  this_mv.as_mv.col = ref_mv->as_mv.col << 3;
-
-  if (bestsad < INT_MAX)
-    return
-        fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                   (unsigned int *)(&thissad)) +
-        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
-                    xd->allow_high_precision_mv);
-  else
-    return INT_MAX;
-}
-
-int vp9_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                              int_mv *ref_mv, int error_per_bit,
-                              int search_range, vp9_variance_fn_ptr_t *fn_ptr,
-                              DEC_MVCOSTS, int_mv *center_mv) {
-  MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
-  int i, j;
-  short this_row_offset, this_col_offset;
-
-  int what_stride = b->src_stride;
-  int in_what_stride = d->pre_stride;
-  unsigned char *what = (*(b->base_src) + b->src);
-  unsigned char *best_address = (unsigned char *)(*(d->base_pre) + d->pre +
-                                                  (ref_mv->as_mv.row * (d->pre_stride)) + ref_mv->as_mv.col);
-  unsigned char *check_here;
-  unsigned int thissad;
-  int_mv this_mv;
-  unsigned int bestsad = INT_MAX;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int_mv fcenter_mv;
-
-  int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) +
-      mvsad_err_cost(ref_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
-
-  for (i = 0; i < search_range; i++) {
-    int best_site = -1;
-    int all_in = 1;
-
-    all_in &= ((ref_mv->as_mv.row - 1) > x->mv_row_min);
-    all_in &= ((ref_mv->as_mv.row + 1) < x->mv_row_max);
-    all_in &= ((ref_mv->as_mv.col - 1) > x->mv_col_min);
-    all_in &= ((ref_mv->as_mv.col + 1) < x->mv_col_max);
-
-    if (all_in) {
-      unsigned int sad_array[4];
-      unsigned char *block_offset[4];
-      block_offset[0] = best_address - in_what_stride;
-      block_offset[1] = best_address - 1;
-      block_offset[2] = best_address + 1;
-      block_offset[3] = best_address + in_what_stride;
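-      // The 4 offsets correspond, in order, to the neighbors[] entries:
-      // up (-stride), left (-1), right (+1), down (+stride).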
-
-      fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
-
-      for (j = 0; j < 4; j++) {
-        if (sad_array[j] < bestsad) {
-          this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row;
-          this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col;
-          sad_array[j] += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
-
-          if (sad_array[j] < bestsad) {
-            bestsad = sad_array[j];
-            best_site = j;
-          }
-        }
-      }
-    } else {
-      for (j = 0; j < 4; j++) {
-        this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
-        this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
-
-        if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
-            (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
-          check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
-          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
-
-          if (thissad < bestsad) {
-            this_mv.as_mv.row = this_row_offset;
-            this_mv.as_mv.col = this_col_offset;
-            thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
-
-            if (thissad < bestsad) {
-              bestsad = thissad;
-              best_site = j;
-            }
-          }
-        }
-      }
-    }
-
-    if (best_site == -1)
-      break;
-    else {
-      ref_mv->as_mv.row += neighbors[best_site].row;
-      ref_mv->as_mv.col += neighbors[best_site].col;
-      best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
-    }
-  }
-
-  this_mv.as_mv.row = ref_mv->as_mv.row << 3;
-  this_mv.as_mv.col = ref_mv->as_mv.col << 3;
-
-  if (bestsad < INT_MAX)
-    return
-        fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                   (unsigned int *)(&thissad)) +
-        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
-                    xd->allow_high_precision_mv);
-  else
-    return INT_MAX;
-}
-
-#ifdef ENTROPY_STATS
-void print_mode_context(void) {
-  FILE *f = fopen("modecont.c", "a");
-  int i, j;
-
-  fprintf(f, "#include \"entropy.h\"\n");
-  fprintf(f, "const int vp9_mode_contexts[6][4] =");
-  fprintf(f, "{\n");
-  for (j = 0; j < 6; j++) {
-    fprintf(f, "  {/* %d */ ", j);
-    fprintf(f, "    ");
-    for (i = 0; i < 4; i++) {
-      int this_prob;
-      int count;
-
-      // context probs
-      count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
-      if (count)
-        this_prob = 256 * mv_ref_ct[j][i][0] / count;
-      else
-        this_prob = 128;
-
-      if (this_prob == 0)
-        this_prob = 1;
-      fprintf(f, "%5d, ", this_prob);
-    }
-    fprintf(f, "  },\n");
-  }
-
-  fprintf(f, "};\n");
-  fclose(f);
-}
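-
-/* Editor's note: print_mode_context() above derives each probability as
- * 256 * count(event 0) / (count 0 + count 1), defaulting to 128 (an even
- * split) when no events were seen and clamping to a minimum of 1 so that no
- * symbol is ever coded as impossible. For example, counts of 60 vs 20 give
- * 256 * 60 / 80 = 192. */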
-
-/* MV ref count ENTROPY_STATS stats code */
-void init_mv_ref_counts() {
-  vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
-  vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
-}
-
-void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4]) {
-  if (m == ZEROMV) {
-    ++mv_ref_ct[ct[0]][0][0];
-    ++mv_mode_cts[0][0];
-  } else {
-    ++mv_ref_ct[ct[0]][0][1];
-    ++mv_mode_cts[0][1];
-
-    if (m == NEARESTMV) {
-      ++mv_ref_ct[ct[1]][1][0];
-      ++mv_mode_cts[1][0];
-    } else {
-      ++mv_ref_ct[ct[1]][1][1];
-      ++mv_mode_cts[1][1];
-
-      if (m == NEARMV) {
-        ++mv_ref_ct[ct[2]][2][0];
-        ++mv_mode_cts[2][0];
-      } else {
-        ++mv_ref_ct[ct[2]][2][1];
-        ++mv_mode_cts[2][1];
-
-        if (m == NEWMV) {
-          ++mv_ref_ct[ct[3]][3][0];
-          ++mv_mode_cts[3][0];
-        } else {
-          ++mv_ref_ct[ct[3]][3][1];
-          ++mv_mode_cts[3][1];
-        }
-      }
-    }
-  }
-}
-
-#endif  /* END MV ref count ENTROPY_STATS code */
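-
-/* Editor's note: accum_mv_refs() walks the inter-mode decision as a binary
- * tree -- ZEROMV vs the rest, then NEARESTMV, NEARMV, and finally NEWMV --
- * recording a hit/miss pair at each level, indexed by the matching context
- * count ct[n]. A sketch of the same logic as a loop (editorial, not in the
- * patch):
- *
- *   static const MB_PREDICTION_MODE order[4] =
- *       { ZEROMV, NEARESTMV, NEARMV, NEWMV };
- *   for (i = 0; i < 4; i++) {
- *     const int hit = (m == order[i]);
- *     ++mv_ref_ct[ct[i]][i][hit ? 0 : 1];
- *     ++mv_mode_cts[i][hit ? 0 : 1];
- *     if (hit) break;
- *   }
- */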
--- a/vp8/encoder/mcomp.h
+++ /dev/null
@@ -1,159 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_MCOMP_H
-#define __INC_MCOMP_H
-
-#include "block.h"
-#include "variance.h"
-
-#define MVCOSTS mvjcost, mvcost
-#define MVSADCOSTS mvjsadcost, mvsadcost
-#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
-#define DEC_MVSADCOSTS int *mvjsadcost, int *mvsadcost[2]
-#define NULLMVCOST NULL, NULL
-#define XMVCOST x->nmvjointcost, (x->e_mbd.allow_high_precision_mv?x->nmvcost_hp:x->nmvcost)
-
-#ifdef ENTROPY_STATS
-extern void init_mv_ref_counts();
-extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
-#endif
-
-
-#define MAX_MVSEARCH_STEPS 8                                    // The maximum number of steps in a step search given the largest allowed initial step
-#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)      // Max full pel mv specified in 1 pel units
-#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))            // Maximum size of the first step in full pel units
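-
-/* Editor's note: with MAX_MVSEARCH_STEPS at 8 these work out to
- * MAX_FULL_PEL_VAL = (1 << 8) - 1 = 255 full pels and
- * MAX_FIRST_STEP = 1 << 7 = 128 full pels. */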
-
-extern void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv);
-extern int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
-                           int Weight, int ishp);
-extern void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
-extern void vp9_init3smotion_compensation(MACROBLOCK *x,  int stride);
-// Runs sequence of diamond searches in smaller steps for RD
-struct VP9_COMP;
-int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
-                           BLOCKD *d, int_mv *mvp_full, int step_param,
-                           int sadpb, int further_steps, int do_refine,
-                           vp9_variance_fn_ptr_t *fn_ptr,
-                           int_mv *ref_mv, int_mv *dst_mv);
-
-extern int vp9_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                          int_mv *ref_mv, int_mv *best_mv,
-                          int search_param, int error_per_bit,
-                          const vp9_variance_fn_ptr_t *vf,
-                          DEC_MVSADCOSTS, DEC_MVCOSTS,
-                          int_mv *center_mv);
-
-typedef int (fractional_mv_step_fp)
-(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv,
- int error_per_bit, const vp9_variance_fn_ptr_t *vfp, DEC_MVCOSTS,
- int *distortion, unsigned int *sse);
-extern fractional_mv_step_fp vp9_find_best_sub_pixel_step_iteratively;
-extern fractional_mv_step_fp vp9_find_best_sub_pixel_step;
-extern fractional_mv_step_fp vp9_find_best_half_pixel_step;
-
-#define prototype_full_search_sad(sym)\
-  int (sym)\
-  (\
-   MACROBLOCK *x, \
-   BLOCK *b, \
-   BLOCKD *d, \
-   int_mv *ref_mv, \
-   int sad_per_bit, \
-   int distance, \
-   vp9_variance_fn_ptr_t *fn_ptr, \
-   DEC_MVSADCOSTS, \
-   int_mv *center_mv \
-  )
-
-#define prototype_refining_search_sad(sym)\
-  int (sym)\
-  (\
-   MACROBLOCK *x, \
-   BLOCK *b, \
-   BLOCKD *d, \
-   int_mv *ref_mv, \
-   int sad_per_bit, \
-   int distance, \
-   vp9_variance_fn_ptr_t *fn_ptr, \
-   DEC_MVSADCOSTS, \
-   int_mv *center_mv \
-  )
-
-#define prototype_diamond_search_sad(sym)\
-  int (sym)\
-  (\
-   MACROBLOCK *x, \
-   BLOCK *b, \
-   BLOCKD *d, \
-   int_mv *ref_mv, \
-   int_mv *best_mv, \
-   int search_param, \
-   int sad_per_bit, \
-   int *num00, \
-   vp9_variance_fn_ptr_t *fn_ptr, \
-   DEC_MVSADCOSTS, \
-   int_mv *center_mv \
-  )
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/mcomp_x86.h"
-#endif
-
-typedef prototype_full_search_sad(*vp9_full_search_fn_t);
-extern prototype_full_search_sad(vp9_full_search_sad);
-extern prototype_full_search_sad(vp9_full_search_sadx3);
-extern prototype_full_search_sad(vp9_full_search_sadx8);
-
-typedef prototype_refining_search_sad(*vp9_refining_search_fn_t);
-extern prototype_refining_search_sad(vp9_refining_search_sad);
-extern prototype_refining_search_sad(vp9_refining_search_sadx4);
-
-typedef prototype_diamond_search_sad(*vp9_diamond_search_fn_t);
-extern prototype_diamond_search_sad(vp9_diamond_search_sad);
-extern prototype_diamond_search_sad(vp9_diamond_search_sadx4);
-
-#ifndef vp9_search_full_search
-#define vp9_search_full_search vp9_full_search_sad
-#endif
-extern prototype_full_search_sad(vp9_search_full_search);
-
-#ifndef vp9_search_refining_search
-#define vp9_search_refining_search vp9_refining_search_sad
-#endif
-extern prototype_refining_search_sad(vp9_search_refining_search);
-
-#ifndef vp9_search_diamond_search
-#define vp9_search_diamond_search vp9_diamond_search_sad
-#endif
-extern prototype_diamond_search_sad(vp9_search_diamond_search);
-
-typedef struct {
-  prototype_full_search_sad(*full_search);
-  prototype_refining_search_sad(*refining_search);
-  prototype_diamond_search_sad(*diamond_search);
-} vp9_search_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define SEARCH_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define SEARCH_INVOKE(ctx,fn) vp9_search_##fn
-#endif
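-
-/* Editor's note: SEARCH_INVOKE mirrors the RTCD pattern used elsewhere in
- * the codec: with runtime CPU detection the search function comes from a
- * vtable filled in at init time, otherwise the macro collapses to a direct
- * call resolved at compile time. A hypothetical call site, following the
- * prototype_full_search_sad signature above:
- *
- *   int sad = SEARCH_INVOKE(rtcd, full_search)(x, b, d, &ref_mv,
- *                                              sad_per_bit, distance,
- *                                              fn_ptr, mvjsadcost,
- *                                              mvsadcost, &center_mv);
- */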
-
-#endif
--- a/vp8/encoder/modecosts.c
+++ /dev/null
@@ -1,49 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/blockd.h"
-#include "onyx_int.h"
-#include "treewriter.h"
-#include "vp8/common/entropymode.h"
-
-
-void vp9_init_mode_costs(VP9_COMP *c) {
-  VP9_COMMON *x = &c->common;
-  const vp9_tree_p T = vp9_bmode_tree;
-  int i, j;
-
-  for (i = 0; i < VP9_BINTRAMODES; i++) {
-    for (j = 0; j < VP9_BINTRAMODES; j++) {
-      vp9_cost_tokens((int *)c->mb.bmode_costs[i][j],
-                      x->kf_bmode_prob[i][j], T);
-    }
-  }
-
-  vp9_cost_tokens((int *)c->mb.inter_bmode_costs, x->fc.bmode_prob, T);
-  vp9_cost_tokens((int *)c->mb.inter_bmode_costs,
-                  x->fc.sub_mv_ref_prob[0], vp9_sub_mv_ref_tree);
-
-  vp9_cost_tokens(c->mb.mbmode_cost[1], x->fc.ymode_prob, vp9_ymode_tree);
-  vp9_cost_tokens(c->mb.mbmode_cost[0],
-                  x->kf_ymode_prob[c->common.kf_ymode_probs_index],
-                  vp9_kf_ymode_tree);
-  vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
-                  x->fc.uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
-  vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
-                  x->kf_uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
-  vp9_cost_tokens(c->mb.i8x8_mode_costs,
-                  x->fc.i8x8_mode_prob, vp9_i8x8_mode_tree);
-
-  for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i)
-    vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
-                    x->fc.switchable_interp_prob[i],
-                    vp9_switchable_interp_tree);
-}
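-
-/* Editor's note: each vp9_cost_tokens() call above converts a probability
- * tree into a per-symbol cost table, so at RD time the rate of a mode is a
- * single lookup; the costs are bit counts scaled by 256, i.e. roughly
- * 256 * log2(1/p) for a symbol of probability p. */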
--- a/vp8/encoder/modecosts.h
+++ /dev/null
@@ -1,17 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_MODECOSTS_H
-#define __INC_MODECOSTS_H
-
-void vp9_init_mode_costs(VP9_COMP *x);
-
-#endif
--- a/vp8/encoder/onyx_if.c
+++ /dev/null
@@ -1,4486 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8/common/onyxc_int.h"
-#include "onyx_int.h"
-#include "vp8/common/systemdependent.h"
-#include "quantize.h"
-#include "vp8/common/alloccommon.h"
-#include "mcomp.h"
-#include "firstpass.h"
-#include "psnr.h"
-#include "vpx_scale/vpxscale.h"
-#include "vp8/common/extend.h"
-#include "ratectrl.h"
-#include "vp8/common/quant_common.h"
-#include "segmentation.h"
-#include "vpx_scale/yv12extend.h"
-#if CONFIG_POSTPROC
-#include "vp8/common/postproc.h"
-#endif
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/swapyv12buffer.h"
-#include "vpx_ports/vpx_timer.h"
-#include "temporal_filter.h"
-
-#include "vp8/common/seg_common.h"
-#include "mbgraph.h"
-#include "vp8/common/pred_common.h"
-#include "vp8/encoder/rdopt.h"
-#include "bitstream.h"
-#include "ratectrl.h"
-
-#if CONFIG_NEWBESTREFMV
-#include "vp8/common/mvref_common.h"
-#endif
-
-#if ARCH_ARM
-#include "vpx_ports/arm.h"
-#endif
-
-#include <math.h>
-#include <stdio.h>
-#include <limits.h>
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#define RTCD(x) &cpi->common.rtcd.x
-#else
-#define IF_RTCD(x) NULL
-#define RTCD(x) NULL
-#endif
-
-extern void vp9_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi);
-
-extern void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val);
-
-extern void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi);
-
-extern void vp9_cmachine_specific_config(VP9_COMP *cpi);
-
-extern void vp9_deblock_frame(YV12_BUFFER_CONFIG *source,
-                              YV12_BUFFER_CONFIG *post,
-                              int filt_lvl, int low_var_thresh, int flag);
-
-extern void print_tree_update_probs();
-
-#if HAVE_ARMV7
-extern void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc,
-                                          YV12_BUFFER_CONFIG *dst_ybc);
-
-extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc,
-                                              YV12_BUFFER_CONFIG *dst_ybc);
-#endif
-
-int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
-
-extern void vp9_temporal_filter_prepare_c(VP9_COMP *cpi, int distance);
-
-static void set_default_lf_deltas(VP9_COMP *cpi);
-
-#define DEFAULT_INTERP_FILTER EIGHTTAP  /* SWITCHABLE for better performance */
-#define SEARCH_BEST_FILTER 0            /* to search exhaustively for
-                                           best filter */
-#define RESET_FOREACH_FILTER 0          /* whether to reset the encoder state
-                                           before trying each new filter */
-#define SHARP_FILTER_QTHRESH 0          /* Q threshold for 8-tap sharp filter */
-
-#define ALTREF_HIGH_PRECISION_MV 1      /* whether to use high precision mv
-                                           for altref computation */
-#define HIGH_PRECISION_MV_QTHRESH 200   /* Q threshold for use of high precision
-                                           mv. Choose a very high value for
-                                           now so that HIGH_PRECISION is always
-                                           chosen */
-
-#if CONFIG_INTERNAL_STATS
-#include "math.h"
-
-extern double vp9_calc_ssim(YV12_BUFFER_CONFIG *source,
-                            YV12_BUFFER_CONFIG *dest, int lumamask,
-                            double *weight);
-
-
-extern double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source,
-                             YV12_BUFFER_CONFIG *dest, double *ssim_y,
-                             double *ssim_u, double *ssim_v);
-
-
-#endif
-
-// #define OUTPUT_YUV_REC
-
-#ifdef OUTPUT_YUV_SRC
-FILE *yuv_file;
-#endif
-#ifdef OUTPUT_YUV_REC
-FILE *yuv_rec_file;
-#endif
-
-#if 0
-FILE *framepsnr;
-FILE *kf_list;
-FILE *keyfile;
-#endif
-
-#if 0
-extern int skip_true_count;
-extern int skip_false_count;
-#endif
-
-
-#ifdef ENTROPY_STATS
-extern int intra_mode_stats[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES];
-#endif
-
-#ifdef NMV_STATS
-extern void init_nmvstats();
-extern void print_nmvstats();
-#endif
-
-#ifdef SPEEDSTATS
-unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-#endif
-
-#if defined(SECTIONBITS_OUTPUT)
-extern unsigned __int64 Sectionbits[500];
-#endif
-#ifdef MODE_STATS
-extern INT64 Sectionbits[500];
-extern unsigned int y_modes[VP9_YMODES];
-extern unsigned int i8x8_modes[VP9_I8X8_MODES];
-extern unsigned int uv_modes[VP9_UV_MODES];
-extern unsigned int uv_modes_y[VP9_YMODES][VP9_UV_MODES];
-extern unsigned int b_modes[B_MODE_COUNT];
-extern unsigned int inter_y_modes[MB_MODE_COUNT];
-extern unsigned int inter_uv_modes[VP9_UV_MODES];
-extern unsigned int inter_b_modes[B_MODE_COUNT];
-#endif
-
-extern void vp9_init_quantizer(VP9_COMP *cpi);
-
-static int base_skip_false_prob[QINDEX_RANGE][3];
-
-// Tables relating active max Q to active min Q
-static int kf_low_motion_minq[QINDEX_RANGE];
-static int kf_high_motion_minq[QINDEX_RANGE];
-static int gf_low_motion_minq[QINDEX_RANGE];
-static int gf_high_motion_minq[QINDEX_RANGE];
-static int inter_minq[QINDEX_RANGE];
-
-// Functions to compute the active minq lookup table entries based on a
-// formulaic approach to facilitate easier adjustment of the Q tables.
-// The formulae were derived from computing a 3rd order polynomial best
-// fit to the original data (after plotting real maxq vs minq (not q index))
-static int calculate_minq_index(double maxq,
-                                double x3, double x2, double x, double c) {
-  int i;
-  double minqtarget;
-  double thisq;
-
-  minqtarget = ((x3 * maxq * maxq * maxq) +
-                (x2 * maxq * maxq) +
-                (x * maxq) +
-                c);
-
-  if (minqtarget > maxq)
-    minqtarget = maxq;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    thisq = vp9_convert_qindex_to_q(i);
-    if (minqtarget <= thisq)
-      return i;
-  }
-  return QINDEX_RANGE - 1;
-}
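-
-/* Editor's note: a worked example of the cubic fit, using the inter case
- * below (0.00000271, -0.00113, 0.697, 0.0) at maxq = 100:
- *   minqtarget = 0.00000271 * 1e6 - 0.00113 * 1e4 + 0.697 * 100
- *              = 2.71 - 11.3 + 69.7 = 61.11
- * and the loop then returns the first q index whose real Q reaches 61.11. */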
-
-static void init_minq_luts(void) {
-  int i;
-  double maxq;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    maxq = vp9_convert_qindex_to_q(i);
-
-
-    kf_low_motion_minq[i] = calculate_minq_index(maxq,
-                                                 0.0000003,
-                                                 -0.000015,
-                                                 0.074,
-                                                 0.0);
-    kf_high_motion_minq[i] = calculate_minq_index(maxq,
-                                                  0.0000004,
-                                                  -0.000125,
-                                                  0.14,
-                                                  0.0);
-    gf_low_motion_minq[i] = calculate_minq_index(maxq,
-                                                 0.0000015,
-                                                 -0.0009,
-                                                 0.33,
-                                                 0.0);
-    gf_high_motion_minq[i] = calculate_minq_index(maxq,
-                                                  0.0000021,
-                                                  -0.00125,
-                                                  0.45,
-                                                  0.0);
-    inter_minq[i] = calculate_minq_index(maxq,
-                                         0.00000271,
-                                         -0.00113,
-                                         0.697,
-                                         0.0);
-
-  }
-}
-
-static void init_base_skip_probs(void) {
-  int i;
-  double q;
-  int skip_prob, t;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    q = vp9_convert_qindex_to_q(i);
-
-    // Exponential decay calculation of baseline skip prob with clamping
-    // Based on crude best fit of old table.
-    t = (int)(564.25 * pow(2.71828, (-0.012 * q)));
-
-    skip_prob = t;
-    if (skip_prob < 1)
-      skip_prob = 1;
-    else if (skip_prob > 255)
-      skip_prob = 255;
-    base_skip_false_prob[i][1] = skip_prob;
-
-    skip_prob = t * 0.75;
-    if (skip_prob < 1)
-      skip_prob = 1;
-    else if (skip_prob > 255)
-      skip_prob = 255;
-    base_skip_false_prob[i][2] = skip_prob;
-
-    skip_prob = t * 1.25;
-    if (skip_prob < 1)
-      skip_prob = 1;
-    else if (skip_prob > 255)
-      skip_prob = 255;
-    base_skip_false_prob[i][0] = skip_prob;
-  }
-}
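-
-/* Editor's note: the three clamp blocks above are identical except for the
- * scale factor; a small helper would make that explicit (editorial sketch,
- * not part of the patch):
- *
- *   static int clamp_prob(double v) {
- *     return v < 1 ? 1 : (v > 255 ? 255 : (int)v);
- *   }
- *   base_skip_false_prob[i][1] = clamp_prob(t);
- *   base_skip_false_prob[i][2] = clamp_prob(t * 0.75);
- *   base_skip_false_prob[i][0] = clamp_prob(t * 1.25);
- */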
-
-static void update_base_skip_probs(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  if (cm->frame_type != KEY_FRAME) {
-    vp9_update_skip_probs(cpi);
-
-    if (cm->refresh_alt_ref_frame) {
-      int k;
-      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-        cpi->last_skip_false_probs[2][k] = cm->mbskip_pred_probs[k];
-      cpi->last_skip_probs_q[2] = cm->base_qindex;
-    } else if (cpi->common.refresh_golden_frame) {
-      int k;
-      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-        cpi->last_skip_false_probs[1][k] = cm->mbskip_pred_probs[k];
-      cpi->last_skip_probs_q[1] = cm->base_qindex;
-    } else {
-      int k;
-      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-        cpi->last_skip_false_probs[0][k] = cm->mbskip_pred_probs[k];
-      cpi->last_skip_probs_q[0] = cm->base_qindex;
-
-      // update the baseline table for the current q
-      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-        cpi->base_skip_false_prob[cm->base_qindex][k] =
-          cm->mbskip_pred_probs[k];
-    }
-  }
-
-}
-
-void vp9_initialize_enc() {
-  static int init_done = 0;
-
-  if (!init_done) {
-    vp8_scale_machine_specific_config();
-    vp9_initialize_common();
-    vp9_tokenize_initialize();
-    vp9_init_quant_tables();
-    vp9_init_me_luts();
-    init_minq_luts();
-    init_base_skip_probs();
-    init_done = 1;
-  }
-}
-#ifdef PACKET_TESTING
-extern FILE *vpxlogc;
-#endif
-
-static void setup_features(VP9_COMP *cpi) {
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
-  // Set up default state for MB feature flags
-
-  xd->segmentation_enabled = 0;   // Default segmentation disabled
-
-  xd->update_mb_segmentation_map = 0;
-  xd->update_mb_segmentation_data = 0;
-  vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
-
-  vp9_clearall_segfeatures(xd);
-
-  xd->mode_ref_lf_delta_enabled = 0;
-  xd->mode_ref_lf_delta_update = 0;
-  vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
-  vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
-  vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
-  vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
-
-  set_default_lf_deltas(cpi);
-
-}
-
-
-static void dealloc_compressor_data(VP9_COMP *cpi) {
-  vpx_free(cpi->tplist);
-  cpi->tplist = NULL;
-
-  // Delete last frame MV storage buffers
-  vpx_free(cpi->lfmv);
-  cpi->lfmv = 0;
-
-  vpx_free(cpi->lf_ref_frame_sign_bias);
-  cpi->lf_ref_frame_sign_bias = 0;
-
-  vpx_free(cpi->lf_ref_frame);
-  cpi->lf_ref_frame = 0;
-
-  // Delete segmentation map
-  vpx_free(cpi->segmentation_map);
-  cpi->segmentation_map = 0;
-  vpx_free(cpi->common.last_frame_seg_map);
-  cpi->common.last_frame_seg_map = 0;
-  vpx_free(cpi->coding_context.last_frame_seg_map_copy);
-  cpi->coding_context.last_frame_seg_map_copy = 0;
-
-  vpx_free(cpi->active_map);
-  cpi->active_map = 0;
-
-  vp9_de_alloc_frame_buffers(&cpi->common);
-
-  vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
-  vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
-#if VP9_TEMPORAL_ALT_REF
-  vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);
-#endif
-  vp9_lookahead_destroy(cpi->lookahead);
-
-  vpx_free(cpi->tok);
-  cpi->tok = 0;
-
-  // Structure used to monitor GF usage
-  vpx_free(cpi->gf_active_flags);
-  cpi->gf_active_flags = 0;
-
-  // Activity mask based per mb zbin adjustments
-  vpx_free(cpi->mb_activity_map);
-  cpi->mb_activity_map = 0;
-  vpx_free(cpi->mb_norm_activity_map);
-  cpi->mb_norm_activity_map = 0;
-
-  vpx_free(cpi->mb.pip);
-  cpi->mb.pip = 0;
-
-  vpx_free(cpi->twopass.total_stats);
-  cpi->twopass.total_stats = 0;
-
-  vpx_free(cpi->twopass.total_left_stats);
-  cpi->twopass.total_left_stats = 0;
-
-  vpx_free(cpi->twopass.this_frame_stats);
-  cpi->twopass.this_frame_stats = 0;
-}
-
-// Computes a q delta (in "q index" terms) to get from a starting q value
-// to a target q value
-static int compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
-  int i;
-  int start_index = cpi->worst_quality;
-  int target_index = cpi->worst_quality;
-
-  // Convert the average q value to an index.
-  for (i = cpi->best_quality; i < cpi->worst_quality; i++) {
-    start_index = i;
-    if (vp9_convert_qindex_to_q(i) >= qstart)
-      break;
-  }
-
-  // Convert the q target to an index
-  for (i = cpi->best_quality; i < cpi->worst_quality; i++) {
-    target_index = i;
-    if (vp9_convert_qindex_to_q(i) >= qtarget)
-      break;
-  }
-
-  return target_index - start_index;
-}
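-
-/* Editor's note: compute_qdelta() returns a signed offset in q-index units;
- * for example, init_seg_features() below uses
- *   qi_delta = compute_qdelta(cpi, cpi->avg_q, cpi->avg_q * 0.875);
- * to find how many index steps drop the real Q by 12.5%. */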
-
-static void init_seg_features(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
-  int high_q = (int)(cpi->avg_q > 48.0);
-  int qi_delta;
-
-  // Disable and clear down for KF
-  if (cm->frame_type == KEY_FRAME) {
-    // Clear down the global segmentation map
-    vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
-    xd->update_mb_segmentation_map = 0;
-    xd->update_mb_segmentation_data = 0;
-    cpi->static_mb_pct = 0;
-
-    // Disable segmentation
-    vp9_disable_segmentation((VP9_PTR)cpi);
-
-    // Clear down the segment features.
-    vp9_clearall_segfeatures(xd);
-  }
-
-  // If this is an alt ref frame
-  else if (cm->refresh_alt_ref_frame) {
-    // Clear down the global segmentation map
-    vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
-    xd->update_mb_segmentation_map = 0;
-    xd->update_mb_segmentation_data = 0;
-    cpi->static_mb_pct = 0;
-
-    // Disable segmentation and individual segment features by default
-    vp9_disable_segmentation((VP9_PTR)cpi);
-    vp9_clearall_segfeatures(xd);
-
-    // Scan frames from current to arf frame.
-    // This function re-enables segmentation if appropriate.
-    vp9_update_mbgraph_stats(cpi);
-
-    // If segmentation was enabled set those features needed for the
-    // arf itself.
-    if (xd->segmentation_enabled) {
-      xd->update_mb_segmentation_map = 1;
-      xd->update_mb_segmentation_data = 1;
-
-      qi_delta = compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875));
-      vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta - 2));
-      vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2);
-
-      vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
-      vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF);
-
-      // Where relevant assume segment data is delta data
-      xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
-
-    }
-  }
-  // All other frames if segmentation has been enabled
-  else if (xd->segmentation_enabled) {
-    // First normal frame in a valid gf or alt ref group
-    if (cpi->common.frames_since_golden == 0) {
-      // Set up segment features for normal frames in an arf group
-      if (cpi->source_alt_ref_active) {
-        xd->update_mb_segmentation_map = 0;
-        xd->update_mb_segmentation_data = 1;
-        xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
-
-        qi_delta = compute_qdelta(cpi, cpi->avg_q,
-                                  (cpi->avg_q * 1.125));
-        vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta + 2));
-        vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, 0);
-        vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
-
-        vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2);
-        vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF);
-
-        // Segment coding disabled for compred testing
-        if (high_q || (cpi->static_mb_pct == 100)) {
-          // set_segref(xd, 1, LAST_FRAME);
-          vp9_set_segref(xd, 1, ALTREF_FRAME);
-          vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
-
-          vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV);
-          vp9_enable_segfeature(xd, 1, SEG_LVL_MODE);
-
-          // EOB segment coding not fixed for 8x8 yet
-          vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0);
-          vp9_enable_segfeature(xd, 1, SEG_LVL_EOB);
-        }
-      }
-      // Disable segmentation and clear down features if alt ref
-      // is not active for this group
-      else {
-        vp9_disable_segmentation((VP9_PTR)cpi);
-
-        vpx_memset(cpi->segmentation_map, 0,
-                   (cm->mb_rows * cm->mb_cols));
-
-        xd->update_mb_segmentation_map = 0;
-        xd->update_mb_segmentation_data = 0;
-
-        vp9_clearall_segfeatures(xd);
-      }
-    }
-
-    // Special case where we are coding over the top of a previous
-    // alt ref frame
-    // Segment coding disabled for compred testing
-    else if (cpi->is_src_frame_alt_ref) {
-      // Enable mode and ref frame features for segment 0 as well
-      vp9_enable_segfeature(xd, 0, SEG_LVL_REF_FRAME);
-      vp9_enable_segfeature(xd, 0, SEG_LVL_MODE);
-      vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
-      vp9_enable_segfeature(xd, 1, SEG_LVL_MODE);
-
-      // All mbs should use ALTREF_FRAME, ZEROMV exclusively
-      vp9_clear_segref(xd, 0);
-      vp9_set_segref(xd, 0, ALTREF_FRAME);
-      vp9_clear_segref(xd, 1);
-      vp9_set_segref(xd, 1, ALTREF_FRAME);
-      vp9_set_segdata(xd, 0, SEG_LVL_MODE, ZEROMV);
-      vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV);
-
-      // Skip all MBs if high Q
-      if (high_q) {
-        vp9_enable_segfeature(xd, 0, SEG_LVL_EOB);
-        vp9_set_segdata(xd, 0, SEG_LVL_EOB, 0);
-        vp9_enable_segfeature(xd, 1, SEG_LVL_EOB);
-        vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0);
-      }
-      // Enable data update
-      xd->update_mb_segmentation_data = 1;
-    }
-    // All other frames.
-    else {
-      // No updates.. leave things as they are.
-      xd->update_mb_segmentation_map = 0;
-      xd->update_mb_segmentation_data = 0;
-    }
-  }
-}
-
-// DEBUG: Print out the segment id of each MB in the current frame.
-static void print_seg_map(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  int row, col;
-  int map_index = 0;
-  FILE *statsfile;
-
-  statsfile = fopen("segmap.stt", "a");
-
-  fprintf(statsfile, "%10d\n",
-          cm->current_video_frame);
-
-  for (row = 0; row < cpi->common.mb_rows; row++) {
-    for (col = 0; col < cpi->common.mb_cols; col++) {
-      fprintf(statsfile, "%10d",
-              cpi->segmentation_map[map_index]);
-      map_index++;
-    }
-    fprintf(statsfile, "\n");
-  }
-  fprintf(statsfile, "\n");
-
-  fclose(statsfile);
-}
-
-static void update_reference_segmentation_map(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  int row, col;
-  int sb_rows = (cm->mb_rows + 1) >> 1;
-  int sb_cols = (cm->mb_cols + 1) >> 1;
-  MODE_INFO *mi = cm->mi;
-  uint8_t *segmap = cpi->segmentation_map;
-  uint8_t *segcache = cm->last_frame_seg_map;
-
-  for (row = 0; row < sb_rows; row++) {
-    for (col = 0; col < sb_cols; col++) {
-      MODE_INFO *miptr = mi + col * 2;
-      uint8_t *cache = segcache + col * 2;
-#if CONFIG_SUPERBLOCKS
-      if (miptr->mbmi.encoded_as_sb) {
-        cache[0] = miptr->mbmi.segment_id;
-        if (!(cm->mb_cols & 1) || col < sb_cols - 1)
-          cache[1] = miptr->mbmi.segment_id;
-        if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
-          cache[cm->mb_cols] = miptr->mbmi.segment_id;
-          if (!(cm->mb_cols & 1) || col < sb_cols - 1)
-            cache[cm->mb_cols + 1] = miptr->mbmi.segment_id;
-        }
-      } else
-#endif
-      {
-        cache[0] = miptr[0].mbmi.segment_id;
-        if (!(cm->mb_cols & 1) || col < sb_cols - 1)
-          cache[1] = miptr[1].mbmi.segment_id;
-        if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
-          cache[cm->mb_cols] = miptr[cm->mode_info_stride].mbmi.segment_id;
-          if (!(cm->mb_cols & 1) || col < sb_cols - 1)
-            cache[cm->mb_cols + 1] =
-                miptr[cm->mode_info_stride + 1].mbmi.segment_id;
-        }
-      }
-    }
-    segmap += 2 * cm->mb_cols;
-    segcache += 2 * cm->mb_cols;
-    mi += 2 * cm->mode_info_stride;
-  }
-}
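-
-/* Editor's note: each iteration above copies one 2x2 block of macroblock
- * segment ids (one superblock) into the reference map, guarding the right
- * column and bottom row against frames with odd macroblock dimensions. */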
-
-static void set_default_lf_deltas(VP9_COMP *cpi) {
-  cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
-  cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
-
-  vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
-  vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
-
-  // Test of ref frame deltas
-  cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2;
-  cpi->mb.e_mbd.ref_lf_deltas[LAST_FRAME] = 0;
-  cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2;
-  cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2;
-
-  cpi->mb.e_mbd.mode_lf_deltas[0] = 4;               // BPRED
-  cpi->mb.e_mbd.mode_lf_deltas[1] = -2;              // Zero
-  cpi->mb.e_mbd.mode_lf_deltas[2] = 2;               // New mv
-  cpi->mb.e_mbd.mode_lf_deltas[3] = 4;               // Split mv
-}
-
-void vp9_set_speed_features(VP9_COMP *cpi) {
-  SPEED_FEATURES *sf = &cpi->sf;
-  int Mode = cpi->compressor_speed;
-  int Speed = cpi->Speed;
-  int i;
-  VP9_COMMON *cm = &cpi->common;
-
-  // Only modes 0 and 1 supported for now in experimental code base
-  if (Mode > 1)
-    Mode = 1;
-
-  // Initialise default mode frequency sampling variables
-  for (i = 0; i < MAX_MODES; i ++) {
-    cpi->mode_check_freq[i] = 0;
-    cpi->mode_test_hit_counts[i] = 0;
-    cpi->mode_chosen_counts[i] = 0;
-  }
-
-  // best quality defaults
-  sf->RD = 1;
-  sf->search_method = NSTEP;
-  sf->improved_dct = 1;
-  sf->auto_filter = 1;
-  sf->recode_loop = 1;
-  sf->quarter_pixel_search = 1;
-  sf->half_pixel_search = 1;
-  sf->iterative_sub_pixel = 1;
-#if CONFIG_LOSSLESS
-  sf->optimize_coefficients = 0;
-#else
-  sf->optimize_coefficients = 1;
-#endif
-  sf->no_skip_block4x4_search = 1;
-
-  sf->first_step = 0;
-  sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
-  sf->improved_mv_pred = 1;
-
-  // default thresholds to 0
-  for (i = 0; i < MAX_MODES; i++)
-    sf->thresh_mult[i] = 0;
-
-  switch (Mode) {
-    case 0: // best quality mode
-#if CONFIG_PRED_FILTER
-      sf->thresh_mult[THR_ZEROMV        ] = 0;
-      sf->thresh_mult[THR_ZEROMV_FILT   ] = 0;
-      sf->thresh_mult[THR_ZEROG         ] = 0;
-      sf->thresh_mult[THR_ZEROG_FILT    ] = 0;
-      sf->thresh_mult[THR_ZEROA         ] = 0;
-      sf->thresh_mult[THR_ZEROA_FILT    ] = 0;
-      sf->thresh_mult[THR_NEARESTMV     ] = 0;
-      sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
-      sf->thresh_mult[THR_NEARESTG      ] = 0;
-      sf->thresh_mult[THR_NEARESTG_FILT ] = 0;
-      sf->thresh_mult[THR_NEARESTA      ] = 0;
-      sf->thresh_mult[THR_NEARESTA_FILT ] = 0;
-      sf->thresh_mult[THR_NEARMV        ] = 0;
-      sf->thresh_mult[THR_NEARMV_FILT   ] = 0;
-      sf->thresh_mult[THR_NEARG         ] = 0;
-      sf->thresh_mult[THR_NEARG_FILT    ] = 0;
-      sf->thresh_mult[THR_NEARA         ] = 0;
-      sf->thresh_mult[THR_NEARA_FILT    ] = 0;
-
-      sf->thresh_mult[THR_DC       ] = 0;
-
-      sf->thresh_mult[THR_V_PRED   ] = 1000;
-      sf->thresh_mult[THR_H_PRED   ] = 1000;
-      sf->thresh_mult[THR_D45_PRED ] = 1000;
-      sf->thresh_mult[THR_D135_PRED] = 1000;
-      sf->thresh_mult[THR_D117_PRED] = 1000;
-      sf->thresh_mult[THR_D153_PRED] = 1000;
-      sf->thresh_mult[THR_D27_PRED ] = 1000;
-      sf->thresh_mult[THR_D63_PRED ] = 1000;
-      sf->thresh_mult[THR_B_PRED   ] = 2000;
-      sf->thresh_mult[THR_I8X8_PRED] = 2000;
-      sf->thresh_mult[THR_TM       ] = 1000;
-
-      sf->thresh_mult[THR_NEWMV    ] = 1000;
-      sf->thresh_mult[THR_NEWG     ] = 1000;
-      sf->thresh_mult[THR_NEWA     ] = 1000;
-      sf->thresh_mult[THR_NEWMV_FILT    ] = 1000;
-      sf->thresh_mult[THR_NEWG_FILT     ] = 1000;
-      sf->thresh_mult[THR_NEWA_FILT     ] = 1000;
-#else
-      sf->thresh_mult[THR_ZEROMV   ] = 0;
-      sf->thresh_mult[THR_ZEROG    ] = 0;
-      sf->thresh_mult[THR_ZEROA    ] = 0;
-      sf->thresh_mult[THR_NEARESTMV] = 0;
-      sf->thresh_mult[THR_NEARESTG ] = 0;
-      sf->thresh_mult[THR_NEARESTA ] = 0;
-      sf->thresh_mult[THR_NEARMV   ] = 0;
-      sf->thresh_mult[THR_NEARG    ] = 0;
-      sf->thresh_mult[THR_NEARA    ] = 0;
-
-      sf->thresh_mult[THR_DC       ] = 0;
-
-      sf->thresh_mult[THR_V_PRED   ] = 1000;
-      sf->thresh_mult[THR_H_PRED   ] = 1000;
-      sf->thresh_mult[THR_D45_PRED ] = 1000;
-      sf->thresh_mult[THR_D135_PRED] = 1000;
-      sf->thresh_mult[THR_D117_PRED] = 1000;
-      sf->thresh_mult[THR_D153_PRED] = 1000;
-      sf->thresh_mult[THR_D27_PRED ] = 1000;
-      sf->thresh_mult[THR_D63_PRED ] = 1000;
-      sf->thresh_mult[THR_B_PRED   ] = 2000;
-      sf->thresh_mult[THR_I8X8_PRED] = 2000;
-      sf->thresh_mult[THR_TM       ] = 1000;
-
-      sf->thresh_mult[THR_NEWMV    ] = 1000;
-      sf->thresh_mult[THR_NEWG     ] = 1000;
-      sf->thresh_mult[THR_NEWA     ] = 1000;
-#endif
-      sf->thresh_mult[THR_SPLITMV  ] = 2500;
-      sf->thresh_mult[THR_SPLITG   ] = 5000;
-      sf->thresh_mult[THR_SPLITA   ] = 5000;
-
-      sf->thresh_mult[THR_COMP_ZEROLG   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTLG] = 0;
-      sf->thresh_mult[THR_COMP_NEARLG   ] = 0;
-      sf->thresh_mult[THR_COMP_ZEROLA   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTLA] = 0;
-      sf->thresh_mult[THR_COMP_NEARLA   ] = 0;
-      sf->thresh_mult[THR_COMP_ZEROGA   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTGA] = 0;
-      sf->thresh_mult[THR_COMP_NEARGA   ] = 0;
-
-      sf->thresh_mult[THR_COMP_NEWLG    ] = 1000;
-      sf->thresh_mult[THR_COMP_NEWLA    ] = 1000;
-      sf->thresh_mult[THR_COMP_NEWGA    ] = 1000;
-
-      sf->thresh_mult[THR_COMP_SPLITLA  ] = 2500;
-      sf->thresh_mult[THR_COMP_SPLITGA  ] = 5000;
-      sf->thresh_mult[THR_COMP_SPLITLG  ] = 5000;
-
-      sf->first_step = 0;
-      sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
-      sf->search_best_filter = SEARCH_BEST_FILTER;
-      break;
-    case 1:
-#if CONFIG_PRED_FILTER
-      sf->thresh_mult[THR_NEARESTMV] = 0;
-      sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
-      sf->thresh_mult[THR_ZEROMV   ] = 0;
-      sf->thresh_mult[THR_ZEROMV_FILT   ] = 0;
-      sf->thresh_mult[THR_DC       ] = 0;
-      sf->thresh_mult[THR_NEARMV   ] = 0;
-      sf->thresh_mult[THR_NEARMV_FILT   ] = 0;
-      sf->thresh_mult[THR_V_PRED   ] = 1000;
-      sf->thresh_mult[THR_H_PRED   ] = 1000;
-      sf->thresh_mult[THR_D45_PRED ] = 1000;
-      sf->thresh_mult[THR_D135_PRED] = 1000;
-      sf->thresh_mult[THR_D117_PRED] = 1000;
-      sf->thresh_mult[THR_D153_PRED] = 1000;
-      sf->thresh_mult[THR_D27_PRED ] = 1000;
-      sf->thresh_mult[THR_D63_PRED ] = 1000;
-      sf->thresh_mult[THR_B_PRED   ] = 2500;
-      sf->thresh_mult[THR_I8X8_PRED] = 2500;
-      sf->thresh_mult[THR_TM       ] = 1000;
-
-      sf->thresh_mult[THR_NEARESTG ] = 1000;
-      sf->thresh_mult[THR_NEARESTG_FILT ] = 1000;
-      sf->thresh_mult[THR_NEARESTA ] = 1000;
-      sf->thresh_mult[THR_NEARESTA_FILT ] = 1000;
-
-      sf->thresh_mult[THR_ZEROG    ] = 1000;
-      sf->thresh_mult[THR_ZEROA    ] = 1000;
-      sf->thresh_mult[THR_NEARG    ] = 1000;
-      sf->thresh_mult[THR_NEARA    ] = 1000;
-      sf->thresh_mult[THR_ZEROG_FILT    ] = 1000;
-      sf->thresh_mult[THR_ZEROA_FILT    ] = 1000;
-      sf->thresh_mult[THR_NEARG_FILT    ] = 1000;
-      sf->thresh_mult[THR_NEARA_FILT    ] = 1000;
-
-      sf->thresh_mult[THR_ZEROMV   ] = 0;
-      sf->thresh_mult[THR_ZEROG    ] = 0;
-      sf->thresh_mult[THR_ZEROA    ] = 0;
-      sf->thresh_mult[THR_NEARESTMV] = 0;
-      sf->thresh_mult[THR_NEARESTG ] = 0;
-      sf->thresh_mult[THR_NEARESTA ] = 0;
-      sf->thresh_mult[THR_NEARMV   ] = 0;
-      sf->thresh_mult[THR_NEARG    ] = 0;
-      sf->thresh_mult[THR_NEARA    ] = 0;
-      sf->thresh_mult[THR_ZEROMV_FILT   ] = 0;
-      sf->thresh_mult[THR_ZEROG_FILT    ] = 0;
-      sf->thresh_mult[THR_ZEROA_FILT    ] = 0;
-      sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
-      sf->thresh_mult[THR_NEARESTG_FILT ] = 0;
-      sf->thresh_mult[THR_NEARESTA_FILT ] = 0;
-      sf->thresh_mult[THR_NEARMV_FILT   ] = 0;
-      sf->thresh_mult[THR_NEARG_FILT    ] = 0;
-      sf->thresh_mult[THR_NEARA_FILT    ] = 0;
-
-      sf->thresh_mult[THR_NEWMV    ] = 1000;
-      sf->thresh_mult[THR_NEWG     ] = 1000;
-      sf->thresh_mult[THR_NEWA     ] = 1000;
-      sf->thresh_mult[THR_NEWMV_FILT    ] = 1000;
-      sf->thresh_mult[THR_NEWG_FILT     ] = 1000;
-      sf->thresh_mult[THR_NEWA_FILT     ] = 1000;
-#else
-      sf->thresh_mult[THR_NEARESTMV] = 0;
-      sf->thresh_mult[THR_ZEROMV   ] = 0;
-      sf->thresh_mult[THR_DC       ] = 0;
-      sf->thresh_mult[THR_NEARMV   ] = 0;
-      sf->thresh_mult[THR_V_PRED   ] = 1000;
-      sf->thresh_mult[THR_H_PRED   ] = 1000;
-      sf->thresh_mult[THR_D45_PRED ] = 1000;
-      sf->thresh_mult[THR_D135_PRED] = 1000;
-      sf->thresh_mult[THR_D117_PRED] = 1000;
-      sf->thresh_mult[THR_D153_PRED] = 1000;
-      sf->thresh_mult[THR_D27_PRED ] = 1000;
-      sf->thresh_mult[THR_D63_PRED ] = 1000;
-      sf->thresh_mult[THR_B_PRED   ] = 2500;
-      sf->thresh_mult[THR_I8X8_PRED] = 2500;
-      sf->thresh_mult[THR_TM       ] = 1000;
-
-      sf->thresh_mult[THR_NEARESTG ] = 1000;
-      sf->thresh_mult[THR_NEARESTA ] = 1000;
-
-      sf->thresh_mult[THR_ZEROG    ] = 1000;
-      sf->thresh_mult[THR_ZEROA    ] = 1000;
-      sf->thresh_mult[THR_NEARG    ] = 1000;
-      sf->thresh_mult[THR_NEARA    ] = 1000;
-
-      sf->thresh_mult[THR_ZEROMV   ] = 0;
-      sf->thresh_mult[THR_ZEROG    ] = 0;
-      sf->thresh_mult[THR_ZEROA    ] = 0;
-      sf->thresh_mult[THR_NEARESTMV] = 0;
-      sf->thresh_mult[THR_NEARESTG ] = 0;
-      sf->thresh_mult[THR_NEARESTA ] = 0;
-      sf->thresh_mult[THR_NEARMV   ] = 0;
-      sf->thresh_mult[THR_NEARG    ] = 0;
-      sf->thresh_mult[THR_NEARA    ] = 0;
-
-      sf->thresh_mult[THR_NEWMV    ] = 1000;
-      sf->thresh_mult[THR_NEWG     ] = 1000;
-      sf->thresh_mult[THR_NEWA     ] = 1000;
-#endif
-      sf->thresh_mult[THR_SPLITMV  ] = 1700;
-      sf->thresh_mult[THR_SPLITG   ] = 4500;
-      sf->thresh_mult[THR_SPLITA   ] = 4500;
-
-      sf->thresh_mult[THR_COMP_ZEROLG   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTLG] = 0;
-      sf->thresh_mult[THR_COMP_NEARLG   ] = 0;
-      sf->thresh_mult[THR_COMP_ZEROLA   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTLA] = 0;
-      sf->thresh_mult[THR_COMP_NEARLA   ] = 0;
-      sf->thresh_mult[THR_COMP_ZEROGA   ] = 0;
-      sf->thresh_mult[THR_COMP_NEARESTGA] = 0;
-      sf->thresh_mult[THR_COMP_NEARGA   ] = 0;
-
-      sf->thresh_mult[THR_COMP_NEWLG    ] = 1000;
-      sf->thresh_mult[THR_COMP_NEWLA    ] = 1000;
-      sf->thresh_mult[THR_COMP_NEWGA    ] = 1000;
-
-      sf->thresh_mult[THR_COMP_SPLITLA  ] = 1700;
-      sf->thresh_mult[THR_COMP_SPLITGA  ] = 4500;
-      sf->thresh_mult[THR_COMP_SPLITLG  ] = 4500;
-
-      if (Speed > 0) {
-        /* Disable coefficient optimization above speed 0 */
-        sf->optimize_coefficients = 0;
-        sf->no_skip_block4x4_search = 0;
-
-        sf->first_step = 1;
-
-        cpi->mode_check_freq[THR_SPLITG] = 2;
-        cpi->mode_check_freq[THR_SPLITA] = 2;
-        cpi->mode_check_freq[THR_SPLITMV] = 0;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 2;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 2;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 0;
-      }
-
-      if (Speed > 1) {
-        cpi->mode_check_freq[THR_SPLITG] = 4;
-        cpi->mode_check_freq[THR_SPLITA] = 4;
-        cpi->mode_check_freq[THR_SPLITMV] = 2;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 4;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 4;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 2;
-
-        sf->thresh_mult[THR_TM       ] = 1500;
-        sf->thresh_mult[THR_V_PRED   ] = 1500;
-        sf->thresh_mult[THR_H_PRED   ] = 1500;
-        sf->thresh_mult[THR_D45_PRED ] = 1500;
-        sf->thresh_mult[THR_D135_PRED] = 1500;
-        sf->thresh_mult[THR_D117_PRED] = 1500;
-        sf->thresh_mult[THR_D153_PRED] = 1500;
-        sf->thresh_mult[THR_D27_PRED ] = 1500;
-        sf->thresh_mult[THR_D63_PRED ] = 1500;
-        sf->thresh_mult[THR_B_PRED   ] = 5000;
-        sf->thresh_mult[THR_I8X8_PRED] = 5000;
-
-        if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-          sf->thresh_mult[THR_NEWMV    ] = 2000;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEWMV_FILT    ] = 2000;
-#endif
-          sf->thresh_mult[THR_SPLITMV  ] = 10000;
-          sf->thresh_mult[THR_COMP_SPLITLG  ] = 20000;
-        }
-
-        if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-          sf->thresh_mult[THR_NEARESTG ] = 1500;
-          sf->thresh_mult[THR_ZEROG    ] = 1500;
-          sf->thresh_mult[THR_NEARG    ] = 1500;
-          sf->thresh_mult[THR_NEWG     ] = 2000;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEARESTG_FILT ] = 1500;
-          sf->thresh_mult[THR_ZEROG_FILT    ] = 1500;
-          sf->thresh_mult[THR_NEARG_FILT    ] = 1500;
-          sf->thresh_mult[THR_NEWG_FILT     ] = 2000;
-#endif
-          sf->thresh_mult[THR_SPLITG   ] = 20000;
-          sf->thresh_mult[THR_COMP_SPLITGA  ] = 20000;
-        }
-
-        if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
-          sf->thresh_mult[THR_NEARESTA ] = 1500;
-          sf->thresh_mult[THR_ZEROA    ] = 1500;
-          sf->thresh_mult[THR_NEARA    ] = 1500;
-          sf->thresh_mult[THR_NEWA     ] = 2000;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEARESTA_FILT ] = 1500;
-          sf->thresh_mult[THR_ZEROA_FILT    ] = 1500;
-          sf->thresh_mult[THR_NEARA_FILT    ] = 1500;
-          sf->thresh_mult[THR_NEWA_FILT     ] = 2000;
-#endif
-          sf->thresh_mult[THR_SPLITA   ] = 20000;
-          sf->thresh_mult[THR_COMP_SPLITLA  ] = 10000;
-        }
-
-        sf->thresh_mult[THR_COMP_ZEROLG   ] = 1500;
-        sf->thresh_mult[THR_COMP_NEARESTLG] = 1500;
-        sf->thresh_mult[THR_COMP_NEARLG   ] = 1500;
-        sf->thresh_mult[THR_COMP_ZEROLA   ] = 1500;
-        sf->thresh_mult[THR_COMP_NEARESTLA] = 1500;
-        sf->thresh_mult[THR_COMP_NEARLA   ] = 1500;
-        sf->thresh_mult[THR_COMP_ZEROGA   ] = 1500;
-        sf->thresh_mult[THR_COMP_NEARESTGA] = 1500;
-        sf->thresh_mult[THR_COMP_NEARGA   ] = 1500;
-
-        sf->thresh_mult[THR_COMP_NEWLG    ] = 2000;
-        sf->thresh_mult[THR_COMP_NEWLA    ] = 2000;
-        sf->thresh_mult[THR_COMP_NEWGA    ] = 2000;
-      }
-
-      if (Speed > 2) {
-        cpi->mode_check_freq[THR_SPLITG] = 15;
-        cpi->mode_check_freq[THR_SPLITA] = 15;
-        cpi->mode_check_freq[THR_SPLITMV] = 7;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 15;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 15;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 7;
-
-        sf->thresh_mult[THR_TM       ] = 2000;
-        sf->thresh_mult[THR_V_PRED   ] = 2000;
-        sf->thresh_mult[THR_H_PRED   ] = 2000;
-        sf->thresh_mult[THR_D45_PRED ] = 2000;
-        sf->thresh_mult[THR_D135_PRED] = 2000;
-        sf->thresh_mult[THR_D117_PRED] = 2000;
-        sf->thresh_mult[THR_D153_PRED] = 2000;
-        sf->thresh_mult[THR_D27_PRED ] = 2000;
-        sf->thresh_mult[THR_D63_PRED ] = 2000;
-        sf->thresh_mult[THR_B_PRED   ] = 7500;
-        sf->thresh_mult[THR_I8X8_PRED] = 7500;
-
-        if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-          sf->thresh_mult[THR_NEWMV    ] = 2000;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEWMV_FILT    ] = 2000;
-#endif
-          sf->thresh_mult[THR_SPLITMV  ] = 25000;
-          sf->thresh_mult[THR_COMP_SPLITLG  ] = 50000;
-        }
-
-        if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-          sf->thresh_mult[THR_NEARESTG ] = 2000;
-          sf->thresh_mult[THR_ZEROG    ] = 2000;
-          sf->thresh_mult[THR_NEARG    ] = 2000;
-          sf->thresh_mult[THR_NEWG     ] = 2500;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEARESTG_FILT ] = 2000;
-          sf->thresh_mult[THR_ZEROG_FILT    ] = 2000;
-          sf->thresh_mult[THR_NEARG_FILT    ] = 2000;
-          sf->thresh_mult[THR_NEWG_FILT     ] = 2500;
-#endif
-          sf->thresh_mult[THR_SPLITG   ] = 50000;
-          sf->thresh_mult[THR_COMP_SPLITGA  ] = 50000;
-        }
-
-        if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
-          sf->thresh_mult[THR_NEARESTA ] = 2000;
-          sf->thresh_mult[THR_ZEROA    ] = 2000;
-          sf->thresh_mult[THR_NEARA    ] = 2000;
-          sf->thresh_mult[THR_NEWA     ] = 2500;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEARESTA_FILT ] = 2000;
-          sf->thresh_mult[THR_ZEROA_FILT    ] = 2000;
-          sf->thresh_mult[THR_NEARA_FILT    ] = 2000;
-          sf->thresh_mult[THR_NEWA_FILT     ] = 2500;
-#endif
-          sf->thresh_mult[THR_SPLITA   ] = 50000;
-          sf->thresh_mult[THR_COMP_SPLITLA  ] = 25000;
-        }
-
-        sf->thresh_mult[THR_COMP_ZEROLG   ] = 2000;
-        sf->thresh_mult[THR_COMP_NEARESTLG] = 2000;
-        sf->thresh_mult[THR_COMP_NEARLG   ] = 2000;
-        sf->thresh_mult[THR_COMP_ZEROLA   ] = 2000;
-        sf->thresh_mult[THR_COMP_NEARESTLA] = 2000;
-        sf->thresh_mult[THR_COMP_NEARLA   ] = 2000;
-        sf->thresh_mult[THR_COMP_ZEROGA   ] = 2000;
-        sf->thresh_mult[THR_COMP_NEARESTGA] = 2000;
-        sf->thresh_mult[THR_COMP_NEARGA   ] = 2000;
-
-        sf->thresh_mult[THR_COMP_NEWLG    ] = 2500;
-        sf->thresh_mult[THR_COMP_NEWLA    ] = 2500;
-        sf->thresh_mult[THR_COMP_NEWGA    ] = 2500;
-
-        sf->improved_dct = 0;
-
-        // Only do recode loop on key frames, golden frames and
-        // alt ref frames
-        sf->recode_loop = 2;
-
-      }
-
-      break;
-
-  } /* switch */
-
-  /* disable frame modes if flags not set */
-  if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
-    sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
-    sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
-    sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
-#if CONFIG_PRED_FILTER
-    sf->thresh_mult[THR_NEWMV_FILT    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARESTMV_FILT] = INT_MAX;
-    sf->thresh_mult[THR_ZEROMV_FILT   ] = INT_MAX;
-    sf->thresh_mult[THR_NEARMV_FILT   ] = INT_MAX;
-#endif
-    sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
-  }
-
-  if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
-    sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
-    sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARG    ] = INT_MAX;
-    sf->thresh_mult[THR_NEWG     ] = INT_MAX;
-#if CONFIG_PRED_FILTER
-    sf->thresh_mult[THR_NEARESTG_FILT ] = INT_MAX;
-    sf->thresh_mult[THR_ZEROG_FILT    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARG_FILT    ] = INT_MAX;
-    sf->thresh_mult[THR_NEWG_FILT     ] = INT_MAX;
-#endif
-    sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
-  }
-
-  if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
-    sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
-    sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARA    ] = INT_MAX;
-    sf->thresh_mult[THR_NEWA     ] = INT_MAX;
-#if CONFIG_PRED_FILTER
-    sf->thresh_mult[THR_NEARESTA_FILT ] = INT_MAX;
-    sf->thresh_mult[THR_ZEROA_FILT    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARA_FILT    ] = INT_MAX;
-    sf->thresh_mult[THR_NEWA_FILT     ] = INT_MAX;
-#endif
-    sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
-  }
-
-  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) != (VP9_LAST_FLAG | VP9_GOLD_FLAG)) {
-    sf->thresh_mult[THR_COMP_ZEROLG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARESTLG] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARLG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEWLG    ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_SPLITLG  ] = INT_MAX;
-  }
-
-  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != (VP9_LAST_FLAG | VP9_ALT_FLAG)) {
-    sf->thresh_mult[THR_COMP_ZEROLA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARLA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEWLA    ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_SPLITLA  ] = INT_MAX;
-  }
-
-  if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != (VP9_GOLD_FLAG | VP9_ALT_FLAG)) {
-    sf->thresh_mult[THR_COMP_ZEROGA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARGA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEWGA    ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_SPLITGA  ] = INT_MAX;
-  }
-
-  // Slow quant, dct and trellis not worthwhile for first pass
-  // so make sure they are always turned off.
-  if (cpi->pass == 1) {
-    sf->optimize_coefficients = 0;
-    sf->improved_dct = 0;
-  }
-
-  if (cpi->sf.search_method == NSTEP) {
-    vp9_init3smotion_compensation(&cpi->mb,
-                                  cm->yv12_fb[cm->lst_fb_idx].y_stride);
-  } else if (cpi->sf.search_method == DIAMOND) {
-    vp9_init_dsmotion_compensation(&cpi->mb,
-                                   cm->yv12_fb[cm->lst_fb_idx].y_stride);
-  }
-
-  cpi->mb.vp9_short_fdct16x16 = vp9_short_fdct16x16;
-  cpi->mb.vp9_short_fdct8x8 = vp9_short_fdct8x8;
-  cpi->mb.vp9_short_fdct8x4 = vp9_short_fdct8x4;
-  cpi->mb.vp9_short_fdct4x4 = vp9_short_fdct4x4;
-  cpi->mb.short_walsh4x4 = vp9_short_walsh4x4;
-  cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2;
-
-#if CONFIG_LOSSLESS
-  if (cpi->oxcf.lossless) {
-    cpi->mb.vp9_short_fdct8x4 = vp9_short_walsh8x4_x8;
-    cpi->mb.vp9_short_fdct4x4 = vp9_short_walsh4x4_x8;
-    cpi->mb.short_walsh4x4 = vp9_short_walsh4x4;
-    cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2;
-    cpi->mb.short_walsh4x4 = vp9_short_walsh4x4_lossless;
-  }
-#endif
-
-
-
-  cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;
-  cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair;
-  cpi->mb.quantize_b_8x8      = vp9_regular_quantize_b_8x8;
-  cpi->mb.quantize_b_16x16    = vp9_regular_quantize_b_16x16;
-  cpi->mb.quantize_b_2x2      = vp9_regular_quantize_b_2x2;
-
-  vp9_init_quantizer(cpi);
-
-#if CONFIG_RUNTIME_CPU_DETECT
-  cpi->mb.e_mbd.rtcd = &cpi->common.rtcd;
-#endif
-
-  if (cpi->sf.iterative_sub_pixel == 1) {
-    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step_iteratively;
-  } else if (cpi->sf.quarter_pixel_search) {
-    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step;
-  } else if (cpi->sf.half_pixel_search) {
-    cpi->find_fractional_mv_step = vp9_find_best_half_pixel_step;
-  }
-
-  if (cpi->sf.optimize_coefficients == 1 && cpi->pass != 1)
-    cpi->mb.optimize = 1;
-  else
-    cpi->mb.optimize = 0;
-
-#ifdef SPEEDSTATS
-  frames_at_speed[cpi->Speed]++;
-#endif
-}
-static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
-  int width = (cpi->oxcf.Width + 15) & ~15;
-  int height = (cpi->oxcf.Height + 15) & ~15;
-
-  cpi->lookahead = vp9_lookahead_init(cpi->oxcf.Width, cpi->oxcf.Height,
-                                      cpi->oxcf.lag_in_frames);
-  if (!cpi->lookahead)
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate lag buffers");
-
-#if VP9_TEMPORAL_ALT_REF
-
-  if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer,
-                                  width, height, VP8BORDERINPIXELS))
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate altref buffer");
-
-#endif
-}
-
-static int alloc_partition_data(VP9_COMP *cpi) {
-  vpx_free(cpi->mb.pip);
-
-  cpi->mb.pip = vpx_calloc((cpi->common.mb_cols + 1) *
-                           (cpi->common.mb_rows + 1),
-                           sizeof(PARTITION_INFO));
-  if (!cpi->mb.pip)
-    return 1;
-
-  cpi->mb.pi = cpi->mb.pip + cpi->common.mode_info_stride + 1;
-
-  return 0;
-}
-
-void vp9_alloc_compressor_data(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  int width = cm->Width;
-  int height = cm->Height;
-
-  if (vp9_alloc_frame_buffers(cm, width, height))
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate frame buffers");
-
-  if (alloc_partition_data(cpi))
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate partition data");
-
-
-  if ((width & 0xf) != 0)
-    width += 16 - (width & 0xf);
-
-  if ((height & 0xf) != 0)
-    height += 16 - (height & 0xf);
-
-
-  if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf,
-                                  width, height, VP8BORDERINPIXELS))
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate last frame buffer");
-
-  if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source,
-                                  width, height, VP8BORDERINPIXELS))
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate scaled source buffer");
-
-
-  vpx_free(cpi->tok);
-
-  {
-    unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16;
-
-    CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
-  }
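-
-  /* Editor's note: the token pool is sized at 24 * 16 entries per
-   * macroblock -- 24 4x4 blocks (16 luma + 8 chroma) times 16 coefficients,
-   * i.e. a worst case of one token per coefficient position. */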
-
-  // Data used for real time vc mode to see if gf needs refreshing
-  cpi->inter_zz_count = 0;
-  cpi->gf_bad_count = 0;
-  cpi->gf_update_recommended = 0;
-
-
-  // Structures used to monitor GF usage
-  vpx_free(cpi->gf_active_flags);
-  CHECK_MEM_ERROR(cpi->gf_active_flags,
-                  vpx_calloc(1, cm->mb_rows * cm->mb_cols));
-  cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-
-  vpx_free(cpi->mb_activity_map);
-  CHECK_MEM_ERROR(cpi->mb_activity_map,
-                  vpx_calloc(sizeof(unsigned int),
-                             cm->mb_rows * cm->mb_cols));
-
-  vpx_free(cpi->mb_norm_activity_map);
-  CHECK_MEM_ERROR(cpi->mb_norm_activity_map,
-                  vpx_calloc(sizeof(unsigned int),
-                             cm->mb_rows * cm->mb_cols));
-
-  vpx_free(cpi->twopass.total_stats);
-
-  cpi->twopass.total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  vpx_free(cpi->twopass.total_left_stats);
-  cpi->twopass.total_left_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  vpx_free(cpi->twopass.this_frame_stats);
-
-  cpi->twopass.this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  if (!cpi->twopass.total_stats ||
-      !cpi->twopass.total_left_stats ||
-      !cpi->twopass.this_frame_stats)
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate firstpass stats");
-
-  vpx_free(cpi->tplist);
-
-  CHECK_MEM_ERROR(cpi->tplist,
-                  vpx_malloc(sizeof(TOKENLIST) * (cpi->common.mb_rows)));
-}
-
-
-// TODO: perhaps expose the number of steps to the outside world when setting
-// max and min limits. This will also likely need refining for the extended Q
-// range.
-//
-// Table that converts 0-63 Q range values passed in outside to the Qindex
-// range used internally.
-static const int q_trans[] = {
-  0,    4,   8,  12,  16,  20,  24,  28,
-  32,   36,  40,  44,  48,  52,  56,  60,
-  64,   68,  72,  76,  80,  84,  88,  92,
-  96,  100, 104, 108, 112, 116, 120, 124,
-  128, 132, 136, 140, 144, 148, 152, 156,
-  160, 164, 168, 172, 176, 180, 184, 188,
-  192, 196, 200, 204, 208, 212, 216, 220,
-  224, 228, 232, 236, 240, 244, 249, 255,
-};
-
-int vp9_reverse_trans(int x) {
-  int i;
-
-  for (i = 0; i < 64; i++)
-    if (q_trans[i] >= x)
-      return i;
-
-  return 63;
-}
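-// Worked example (for illustration): an external Q of 10 maps to
-// q_trans[10] == 40 internally, and vp9_reverse_trans(40) scans the
-// table for the first entry >= 40 and returns 10; in-between values
-// round up to the next external step.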
-void vp9_new_frame_rate(VP9_COMP *cpi, double framerate) {
-  if (framerate < .1)
-    framerate = 30;
-
-  cpi->oxcf.frame_rate = framerate;
-  cpi->output_frame_rate = cpi->oxcf.frame_rate;
-  cpi->per_frame_bandwidth =
-      (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
-  cpi->av_per_frame_bandwidth =
-      (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
-  cpi->min_frame_bandwidth =
-      (int)(cpi->av_per_frame_bandwidth *
-            cpi->oxcf.two_pass_vbrmin_section / 100);
-
-  if (cpi->min_frame_bandwidth < FRAME_OVERHEAD_BITS)
-    cpi->min_frame_bandwidth = FRAME_OVERHEAD_BITS;
-
-  // Set Maximum gf/arf interval
-  cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2);
-
-  if (cpi->max_gf_interval < 12)
-    cpi->max_gf_interval = 12;
-
-  // Extended interval for genuinely static scenes
-  cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1;
-
-  // Special conditions when alt ref frame enabled in lagged compress mode
-  if (cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames) {
-    if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1)
-      cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1;
-
-    if (cpi->twopass.static_scene_max_gf_interval > cpi->oxcf.lag_in_frames - 1)
-      cpi->twopass.static_scene_max_gf_interval = cpi->oxcf.lag_in_frames - 1;
-  }
-
-  if (cpi->max_gf_interval > cpi->twopass.static_scene_max_gf_interval)
-    cpi->max_gf_interval = cpi->twopass.static_scene_max_gf_interval;
-}
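-// For illustration (hypothetical numbers): at 30 fps with a 256 kbit/s
-// target, per_frame_bandwidth = 256000 / 30 ~= 8533 bits and
-// max_gf_interval = (int)(30 / 2.0) + 2 = 17, before the static scene
-// and lag limits above are applied.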
-
-
-static int rescale(int val, int num, int denom) {
-  int64_t llnum = num;
-  int64_t llden = denom;
-  int64_t llval = val;
-
-  // Do the arithmetic in 64 bits to avoid intermediate overflow.
-  return (int)(llval * llnum / llden);
-}
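-// For illustration (hypothetical values): a starting_buffer_level of
-// 5000 (ms) at a target_bandwidth of 800000 bit/s rescales to
-// 5000 * 800000 / 1000 = 4000000 bits; the 64-bit intermediates matter
-// because the raw product already exceeds INT_MAX.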
-
-
-static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-  VP9_COMMON *cm = &cpi->common;
-
-  cpi->oxcf = *oxcf;
-
-  cpi->goldfreq = 7;
-
-  cm->version = oxcf->Version;
-  vp9_setup_version(cm);
-
-  // change includes all joint functionality
-  vp9_change_config(ptr, oxcf);
-
-  // Initialize active best and worst q and average q values.
-  cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;
-  cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
-  cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
-
-  // Initialise the starting buffer levels
-  cpi->buffer_level                 = cpi->oxcf.starting_buffer_level;
-  cpi->bits_off_target              = cpi->oxcf.starting_buffer_level;
-
-  cpi->rolling_target_bits          = cpi->av_per_frame_bandwidth;
-  cpi->rolling_actual_bits          = cpi->av_per_frame_bandwidth;
-  cpi->long_rolling_target_bits     = cpi->av_per_frame_bandwidth;
-  cpi->long_rolling_actual_bits     = cpi->av_per_frame_bandwidth;
-
-  cpi->total_actual_bits            = 0;
-  cpi->total_target_vs_actual       = 0;
-
-  cpi->static_mb_pct = 0;
-
-#if VP9_TEMPORAL_ALT_REF
-  {
-    int i;
-
-    cpi->fixed_divide[0] = 0;
-
-    for (i = 1; i < 512; i++)
-      cpi->fixed_divide[i] = 0x80000 / i;
-  }
-#endif
-}
-
-
-void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-  VP9_COMMON *cm = &cpi->common;
-
-  if (!cpi)
-    return;
-
-  if (!oxcf)
-    return;
-
-  if (cm->version != oxcf->Version) {
-    cm->version = oxcf->Version;
-    vp9_setup_version(cm);
-  }
-
-  cpi->oxcf = *oxcf;
-
-  switch (cpi->oxcf.Mode) {
-      // Real time and one pass modes are deprecated in this test code base
-    case MODE_FIRSTPASS:
-      cpi->pass = 1;
-      cpi->compressor_speed = 1;
-      break;
-
-    case MODE_SECONDPASS:
-      cpi->pass = 2;
-      cpi->compressor_speed = 1;
-
-      if (cpi->oxcf.cpu_used < -5) {
-        cpi->oxcf.cpu_used = -5;
-      }
-
-      if (cpi->oxcf.cpu_used > 5)
-        cpi->oxcf.cpu_used = 5;
-
-      break;
-
-    case MODE_SECONDPASS_BEST:
-      cpi->pass = 2;
-      cpi->compressor_speed = 0;
-      break;
-  }
-
-  cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
-  cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
-  cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
-
-#if CONFIG_LOSSLESS
-  cpi->oxcf.lossless = oxcf->lossless;
-  if (cpi->oxcf.lossless) {
-    cpi->common.rtcd.idct.idct1        = vp9_short_inv_walsh4x4_1_x8_c;
-    cpi->common.rtcd.idct.idct16       = vp9_short_inv_walsh4x4_x8_c;
-    cpi->common.rtcd.idct.idct1_scalar_add  = vp9_dc_only_inv_walsh_add_c;
-    cpi->common.rtcd.idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
-    cpi->common.rtcd.idct.iwalsh16     = vp9_short_inv_walsh4x4_lossless_c;
-  }
-#endif
-
-  cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
-
-  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
-
-  // cpi->use_golden_frame_only = 0;
-  // cpi->use_last_frame_only = 0;
-  cm->refresh_golden_frame = 0;
-  cm->refresh_last_frame = 1;
-  cm->refresh_entropy_probs = 1;
-
-  setup_features(cpi);
-  cpi->mb.e_mbd.allow_high_precision_mv = 0;   // Default mv precision adaptation
-
-  {
-    int i;
-
-    for (i = 0; i < MAX_MB_SEGMENTS; i++)
-      cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
-  }
-
-  // At the moment the first order values may not be > MAXQ
-  if (cpi->oxcf.fixed_q > MAXQ)
-    cpi->oxcf.fixed_q = MAXQ;
-
-  // local file playback mode == really big buffer
-  if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) {
-    cpi->oxcf.starting_buffer_level   = 60000;
-    cpi->oxcf.optimal_buffer_level    = 60000;
-    cpi->oxcf.maximum_buffer_size     = 240000;
-  }
-
-  // Convert target bandwidth from Kbit/s to Bit/s
-  cpi->oxcf.target_bandwidth       *= 1000;
-
-  cpi->oxcf.starting_buffer_level =
-    rescale(cpi->oxcf.starting_buffer_level,
-            cpi->oxcf.target_bandwidth, 1000);
-
-  // Set or reset optimal and maximum buffer levels.
-  if (cpi->oxcf.optimal_buffer_level == 0)
-    cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
-  else
-    cpi->oxcf.optimal_buffer_level =
-      rescale(cpi->oxcf.optimal_buffer_level,
-              cpi->oxcf.target_bandwidth, 1000);
-
-  if (cpi->oxcf.maximum_buffer_size == 0)
-    cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
-  else
-    cpi->oxcf.maximum_buffer_size =
-      rescale(cpi->oxcf.maximum_buffer_size,
-              cpi->oxcf.target_bandwidth, 1000);
-
-  // Set up frame rate and related parameters rate control values.
-  vp9_new_frame_rate(cpi, cpi->oxcf.frame_rate);
-
-  // Set absolute upper and lower quality limits
-  cpi->worst_quality               = cpi->oxcf.worst_allowed_q;
-  cpi->best_quality                = cpi->oxcf.best_allowed_q;
-
-  // active values should only be modified if out of new range
-  if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q) {
-    cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
-  }
-  // less likely
-  else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q) {
-    cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
-  }
-  if (cpi->active_best_quality < cpi->oxcf.best_allowed_q) {
-    cpi->active_best_quality = cpi->oxcf.best_allowed_q;
-  }
-  // less likely
-  else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q) {
-    cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
-  }
-
-  cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
-
-  cpi->cq_target_quality = cpi->oxcf.cq_level;
-
-  if (!cm->use_bilinear_mc_filter)
-    cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
-  else
-    cm->mcomp_filter_type = BILINEAR;
-
-  cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
-
-  cm->Width       = cpi->oxcf.Width;
-  cm->Height      = cpi->oxcf.Height;
-
-  cm->horiz_scale  = cpi->horiz_scale;
-  cm->vert_scale   = cpi->vert_scale;
-
-  // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
-  if (cpi->oxcf.Sharpness > 7)
-    cpi->oxcf.Sharpness = 7;
-
-  cm->sharpness_level = cpi->oxcf.Sharpness;
-
-  if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) {
-    int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
-    int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
-
-    Scale2Ratio(cm->horiz_scale, &hr, &hs);
-    Scale2Ratio(cm->vert_scale, &vr, &vs);
-
-    // always go to the next whole number
-    cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
-    cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
-  }
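-  // E.g. (illustrative) a 641 pixel width at a 1/2 horizontal scale:
-  // (hs - 1 + 641 * hr) / hs = (2 - 1 + 641) / 2 = 321, a ceiling
-  // divide, so no source pixels are dropped.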
-
-  if (((cm->Width + 15) & 0xfffffff0) !=
-      cm->yv12_fb[cm->lst_fb_idx].y_width ||
-      ((cm->Height + 15) & 0xfffffff0) !=
-      cm->yv12_fb[cm->lst_fb_idx].y_height ||
-      cm->yv12_fb[cm->lst_fb_idx].y_width == 0) {
-    alloc_raw_frame_buffers(cpi);
-    vp9_alloc_compressor_data(cpi);
-  }
-
-  if (cpi->oxcf.fixed_q >= 0) {
-    cpi->last_q[0] = cpi->oxcf.fixed_q;
-    cpi->last_q[1] = cpi->oxcf.fixed_q;
-    cpi->last_boosted_qindex = cpi->oxcf.fixed_q;
-  }
-
-  cpi->Speed = cpi->oxcf.cpu_used;
-
-  // Force allow_lag to 0 if lag_in_frames is 0
-  if (cpi->oxcf.lag_in_frames == 0) {
-    cpi->oxcf.allow_lag = 0;
-  }
-  // Limit on lag buffers as these are not currently dynamically allocated
-  else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
-    cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
-
-  // YX Temp
-  cpi->alt_ref_source = NULL;
-  cpi->is_src_frame_alt_ref = 0;
-
-#if 0
-  // Experimental RD Code
-  cpi->frame_distortion = 0;
-  cpi->last_frame_distortion = 0;
-#endif
-
-}
-
-// Note: despite the name, this constant is ln(2) (i.e. M_LN2), so the
-// macro below correctly computes log2(x) = log(x) / ln(2).
-#define M_LOG2_E 0.693147180559945309417
-#define log2f(x) (log(x) / (float) M_LOG2_E)
-
-static void cal_nmvjointsadcost(int *mvjointsadcost) {
-  mvjointsadcost[0] = 600;
-  mvjointsadcost[1] = 300;
-  mvjointsadcost[2] = 300;
-  mvjointsadcost[3] = 300;
-}
-
-static void cal_nmvsadcosts(int *mvsadcost[2]) {
-  int i = 1;
-
-  mvsadcost[0][0] = 0;
-  mvsadcost[1][0] = 0;
-
-  do {
-    double z = 256 * (2 * (log2f(8 * i) + .6));
-    mvsadcost[0][i] = (int)z;
-    mvsadcost[1][i] = (int)z;
-    mvsadcost[0][-i] = (int)z;
-    mvsadcost[1][-i] = (int)z;
-  } while (++i <= MV_MAX);
-}
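-// The curve above (shared by the _hp variant below) is logarithmic in
-// the MV magnitude: e.g. i == 1 gives z = 256 * (2 * (log2(8) + .6))
-// = 256 * 7.2 = 1843, and each doubling of i adds a constant 512.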
-
-static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
-  int i = 1;
-
-  mvsadcost[0][0] = 0;
-  mvsadcost[1][0] = 0;
-
-  do {
-    double z = 256 * (2 * (log2f(8 * i) + .6));
-    mvsadcost[0][i] = (int)z;
-    mvsadcost[1][i] = (int)z;
-    mvsadcost[0][-i] = (int)z;
-    mvsadcost[1][-i] = (int)z;
-  } while (++i <= MV_MAX);
-}
-
-VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
-  int i;
-  volatile union {
-    VP9_COMP *cpi;
-    VP9_PTR   ptr;
-  } ctx;
-
-  VP9_COMP *cpi;
-  VP9_COMMON *cm;
-
-  cpi = ctx.cpi = vpx_memalign(32, sizeof(VP9_COMP));
-  // Check that allocation of the compressor instance succeeded
-  if (!cpi)
-    return 0;
-
-  cm = &cpi->common;
-
-  vpx_memset(cpi, 0, sizeof(VP9_COMP));
-
-  if (setjmp(cm->error.jmp)) {
-    VP9_PTR ptr = ctx.ptr;
-
-    ctx.cpi->common.error.setjmp = 0;
-    vp9_remove_compressor(&ptr);
-    return 0;
-  }
-
-  cpi->common.error.setjmp = 1;
-
-  CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
-
-  vp9_create_common(&cpi->common);
-  vp9_cmachine_specific_config(cpi);
-
-  init_config((VP9_PTR)cpi, oxcf);
-
-  memcpy(cpi->base_skip_false_prob, base_skip_false_prob, sizeof(base_skip_false_prob));
-  cpi->common.current_video_frame   = 0;
-  cpi->kf_overspend_bits            = 0;
-  cpi->kf_bitrate_adjustment        = 0;
-  cpi->frames_till_gf_update_due      = 0;
-  cpi->gf_overspend_bits            = 0;
-  cpi->non_gf_bitrate_adjustment     = 0;
-  cm->prob_last_coded               = 128;
-  cm->prob_gf_coded                 = 128;
-  cm->prob_intra_coded              = 63;
-#if CONFIG_SUPERBLOCKS
-  cm->sb_coded                      = 200;
-#endif
-  for (i = 0; i < COMP_PRED_CONTEXTS; i++)
-    cm->prob_comppred[i]         = 128;
-  for (i = 0; i < TX_SIZE_MAX - 1; i++)
-    cm->prob_tx[i]               = 128;
-
-  // Prime the recent reference frame usage counters.
-  // Hereafter they will be maintained as a sort of moving average
-  cpi->recent_ref_frame_usage[INTRA_FRAME]  = 1;
-  cpi->recent_ref_frame_usage[LAST_FRAME]   = 1;
-  cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
-  cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
-
-  // Set reference frame sign bias for ALTREF frame to 1 (for now)
-  cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
-
-  cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
-
-  cpi->gold_is_last = 0;
-  cpi->alt_is_last  = 0;
-  cpi->gold_is_alt  = 0;
-
-  // allocate memory for storing last frame's MVs for MV prediction.
-  CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int_mv)));
-  CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int)));
-  CHECK_MEM_ERROR(cpi->lf_ref_frame, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int)));
-
-  // Create the encoder segmentation map and set all entries to 0
-  CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
-
-  // And a copy in common for temporal coding
-  CHECK_MEM_ERROR(cm->last_frame_seg_map,
-                  vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
-
-  // And a placeholder structure in the coding context
-  // for use if we want to save and restore it
-  CHECK_MEM_ERROR(cpi->coding_context.last_frame_seg_map_copy,
-                  vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
-
-  CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
-  vpx_memset(cpi->active_map, 1, (cpi->common.mb_rows * cpi->common.mb_cols));
-  cpi->active_map_enabled = 0;
-
-  for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
-                   sizeof(cpi->mbgraph_stats[0])); i++) {
-    CHECK_MEM_ERROR(cpi->mbgraph_stats[i].mb_stats,
-                    vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols *
-                               sizeof(*cpi->mbgraph_stats[i].mb_stats),
-                               1));
-  }
-
-#ifdef ENTROPY_STATS
-  if (cpi->pass != 1)
-    init_context_counters();
-#endif
-#ifdef MODE_STATS
-  vp9_zero(y_modes);
-  vp9_zero(i8x8_modes);
-  vp9_zero(uv_modes);
-  vp9_zero(uv_modes_y);
-  vp9_zero(b_modes);
-  vp9_zero(inter_y_modes);
-  vp9_zero(inter_uv_modes);
-  vp9_zero(inter_b_modes);
-#endif
-#ifdef NMV_STATS
-  init_nmvstats();
-#endif
-
-  /* Initialize the feed-forward activity masking. */
-  cpi->activity_avg = 90 << 12;
-
-  cpi->frames_since_key = 8;        // Give a sensible default for the first frame.
-  cpi->key_frame_frequency = cpi->oxcf.key_freq;
-  cpi->this_key_frame_forced = FALSE;
-  cpi->next_key_frame_forced = FALSE;
-
-  cpi->source_alt_ref_pending = FALSE;
-  cpi->source_alt_ref_active = FALSE;
-  cpi->common.refresh_alt_ref_frame = 0;
-
-  cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
-#if CONFIG_INTERNAL_STATS
-  cpi->b_calculate_ssimg = 0;
-
-  cpi->count = 0;
-  cpi->bytes = 0;
-
-  if (cpi->b_calculate_psnr) {
-    cpi->total_sq_error = 0.0;
-    cpi->total_sq_error2 = 0.0;
-    cpi->total_y = 0.0;
-    cpi->total_u = 0.0;
-    cpi->total_v = 0.0;
-    cpi->total = 0.0;
-    cpi->totalp_y = 0.0;
-    cpi->totalp_u = 0.0;
-    cpi->totalp_v = 0.0;
-    cpi->totalp = 0.0;
-    cpi->tot_recode_hits = 0;
-    cpi->summed_quality = 0;
-    cpi->summed_weights = 0;
-  }
-
-  if (cpi->b_calculate_ssimg) {
-    cpi->total_ssimg_y = 0;
-    cpi->total_ssimg_u = 0;
-    cpi->total_ssimg_v = 0;
-    cpi->total_ssimg_all = 0;
-  }
-
-#endif
-
-#ifndef LLONG_MAX
-#define LLONG_MAX  9223372036854775807LL
-#endif
-  cpi->first_time_stamp_ever = LLONG_MAX;
-
-  cpi->frames_till_gf_update_due      = 0;
-  cpi->key_frame_count              = 1;
-
-  cpi->ni_av_qi                     = cpi->oxcf.worst_allowed_q;
-  cpi->ni_tot_qi                    = 0;
-  cpi->ni_frames                   = 0;
-  cpi->tot_q = 0.0;
-  cpi->avg_q = vp9_convert_qindex_to_q(cpi->oxcf.worst_allowed_q);
-  cpi->total_byte_count             = 0;
-
-  cpi->rate_correction_factor         = 1.0;
-  cpi->key_frame_rate_correction_factor = 1.0;
-  cpi->gf_rate_correction_factor  = 1.0;
-  cpi->twopass.est_max_qcorrection_factor  = 1.0;
-
-  cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
-  cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX];
-  cpi->mb.nmvcost[1] = &cpi->mb.nmvcosts[1][MV_MAX];
-  cpi->mb.nmvsadcost[0] = &cpi->mb.nmvsadcosts[0][MV_MAX];
-  cpi->mb.nmvsadcost[1] = &cpi->mb.nmvsadcosts[1][MV_MAX];
-  cal_nmvsadcosts(cpi->mb.nmvsadcost);
-
-  cpi->mb.nmvcost_hp[0] = &cpi->mb.nmvcosts_hp[0][MV_MAX];
-  cpi->mb.nmvcost_hp[1] = &cpi->mb.nmvcosts_hp[1][MV_MAX];
-  cpi->mb.nmvsadcost_hp[0] = &cpi->mb.nmvsadcosts_hp[0][MV_MAX];
-  cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
-  cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
-
-  for (i = 0; i < KEY_FRAME_CONTEXT; i++) {
-    cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
-  }
-
-#ifdef OUTPUT_YUV_SRC
-  yuv_file = fopen("bd.yuv", "ab");
-#endif
-#ifdef OUTPUT_YUV_REC
-  yuv_rec_file = fopen("rec.yuv", "wb");
-#endif
-
-#if 0
-  framepsnr = fopen("framepsnr.stt", "a");
-  kf_list = fopen("kf_list.stt", "w");
-#endif
-
-  cpi->output_pkt_list = oxcf->output_pkt_list;
-
-  if (cpi->pass == 1) {
-    vp9_init_first_pass(cpi);
-  } else if (cpi->pass == 2) {
-    size_t packet_sz = sizeof(FIRSTPASS_STATS);
-    int packets = oxcf->two_pass_stats_in.sz / packet_sz;
-
-    cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
-    cpi->twopass.stats_in = cpi->twopass.stats_in_start;
-    cpi->twopass.stats_in_end = (void *)((char *)cpi->twopass.stats_in
-                                         + (packets - 1) * packet_sz);
-    vp9_init_second_pass(cpi);
-  }
-
-  vp9_set_speed_features(cpi);
-
-  // Set starting values of RD threshold multipliers (128 = *1)
-  for (i = 0; i < MAX_MODES; i++) {
-    cpi->rd_thresh_mult[i] = 128;
-  }
-
-#ifdef ENTROPY_STATS
-  init_mv_ref_counts();
-#endif
-
-#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \
-    cpi->fn_ptr[BT].sdf            = SDF; \
-    cpi->fn_ptr[BT].vf             = VF; \
-    cpi->fn_ptr[BT].svf            = SVF; \
-    cpi->fn_ptr[BT].svf_halfpix_h  = SVFHH; \
-    cpi->fn_ptr[BT].svf_halfpix_v  = SVFHV; \
-    cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
-    cpi->fn_ptr[BT].sdx3f          = SDX3F; \
-    cpi->fn_ptr[BT].sdx8f          = SDX8F; \
-    cpi->fn_ptr[BT].sdx4df         = SDX4DF;
-
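-// BFP fills in the per-block-size SAD/variance function table, so e.g.
-// the BLOCK_16X16 line below points cpi->fn_ptr[BLOCK_16X16].sdf at
-// vp9_sad16x16; the half-pixel variance slots are left NULL for the
-// sizes that do not provide them.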
-
-#if CONFIG_SUPERBLOCKS
-  BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
-      vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v,
-      vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
-      vp9_sad32x32x4d)
-#endif
-
-  BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
-       vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v,
-       vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
-       vp9_sad16x16x4d)
-
-  BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
-      NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
-
-  BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
-      NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
-
-  BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
-      NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
-
-  BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
-      NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
-
-#if ARCH_X86 || ARCH_X86_64
-  cpi->fn_ptr[BLOCK_16X16].copymem  = vp9_copy32xn;
-  cpi->fn_ptr[BLOCK_16X8].copymem   = vp9_copy32xn;
-  cpi->fn_ptr[BLOCK_8X16].copymem   = vp9_copy32xn;
-  cpi->fn_ptr[BLOCK_8X8].copymem    = vp9_copy32xn;
-  cpi->fn_ptr[BLOCK_4X4].copymem    = vp9_copy32xn;
-#endif
-
-  cpi->full_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, full_search);
-  cpi->diamond_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, diamond_search);
-  cpi->refining_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, refining_search);
-
-  // make sure frame 1 is okay
-  cpi->error_bins[0] = cpi->common.MBs;
-
-  /* vp9_init_quantizer() is first called here. Add check in
-   * vp9_frame_init_quantizer() so that vp9_init_quantizer is only
-   * called later when needed. This will avoid unnecessary calls of
-   * vp9_init_quantizer() for every frame.
-   */
-  vp9_init_quantizer(cpi);
-
-  vp9_loop_filter_init(cm);
-
-  cpi->common.error.setjmp = 0;
-
-  vp9_zero(cpi->y_uv_mode_count)
-
-  return (VP9_PTR) cpi;
-}
-
-void vp9_remove_compressor(VP9_PTR *ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)(*ptr);
-  int i;
-
-  if (!cpi)
-    return;
-
-  if (cpi && (cpi->common.current_video_frame > 0)) {
-    if (cpi->pass == 2) {
-      vp9_end_second_pass(cpi);
-    }
-
-#ifdef ENTROPY_STATS
-    if (cpi->pass != 1) {
-      print_context_counters();
-      print_tree_update_probs();
-      print_mode_context();
-    }
-#endif
-#ifdef NMV_STATS
-    if (cpi->pass != 1)
-      print_nmvstats();
-#endif
-
-#if CONFIG_INTERNAL_STATS
-
-    vp9_clear_system_state();
-
-    // printf("\n8x8-4x4:%d-%d\n", cpi->t8x8_count, cpi->t4x4_count);
-    if (cpi->pass != 1) {
-      FILE *f = fopen("opsnr.stt", "a");
-      double time_encoded = (cpi->last_end_time_stamp_seen
-                             - cpi->first_time_stamp_ever) / 10000000.000;
-      double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data)   / 1000.000;
-      double dr = (double)cpi->bytes * (double) 8 / (double)1000  / time_encoded;
-#if defined(MODE_STATS)
-      print_mode_contexts(&cpi->common);
-#endif
-      if (cpi->b_calculate_psnr) {
-        YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
-        double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height;
-        double total_psnr = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error);
-        double total_psnr2 = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error2);
-        double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
-
-        fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\tVPXSSIM\t  Time(ms)\n");
-        fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
-                dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
-                total_encode_time);
-//                fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f %10ld\n",
-//                        dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
-//                        total_encode_time, cpi->tot_recode_hits);
-      }
-
-      if (cpi->b_calculate_ssimg) {
-        fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t  Time(ms)\n");
-        fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
-                cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
-                cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time);
-//                fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f  %10ld\n", dr,
-//                        cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
-//                        cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time, cpi->tot_recode_hits);
-      }
-
-      fclose(f);
-    }
-
-#endif
-
-
-#ifdef MODE_STATS
-    {
-      extern int count_mb_seg[4];
-      char modes_stats_file[250];
-      FILE *f;
-      double dr = (double)cpi->oxcf.frame_rate * (double)cpi->bytes * (double)8 / (double)cpi->count / (double)1000;
-      sprintf(modes_stats_file, "modes_q%03d.stt", cpi->common.base_qindex);
-      f = fopen(modes_stats_file, "w");
-      fprintf(f, "intra_mode in Intra Frames:\n");
-      {
-        int i;
-        fprintf(f, "Y: ");
-        for (i = 0; i < VP9_YMODES; i++) fprintf(f, " %8d,", y_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i;
-        fprintf(f, "I8: ");
-        for (i = 0; i < VP9_I8X8_MODES; i++) fprintf(f, " %8d,", i8x8_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i;
-        fprintf(f, "UV: ");
-        for (i = 0; i < VP9_UV_MODES; i++) fprintf(f, " %8d,", uv_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i, j;
-        fprintf(f, "KeyFrame Y-UV:\n");
-        for (i = 0; i < VP9_YMODES; i++) {
-          fprintf(f, "%2d:", i);
-          for (j = 0; j < VP9_UV_MODES; j++) fprintf(f, "%8d, ", uv_modes_y[i][j]);
-          fprintf(f, "\n");
-        }
-      }
-      {
-        int i, j;
-        fprintf(f, "Inter Y-UV:\n");
-        for (i = 0; i < VP9_YMODES; i++) {
-          fprintf(f, "%2d:", i);
-          for (j = 0; j < VP9_UV_MODES; j++) fprintf(f, "%8d, ", cpi->y_uv_mode_count[i][j]);
-          fprintf(f, "\n");
-        }
-      }
-      {
-        int i;
-
-        fprintf(f, "B: ");
-        for (i = 0; i < VP9_BINTRAMODES; i++)
-          fprintf(f, "%8d, ", b_modes[i]);
-
-        fprintf(f, "\n");
-
-      }
-
-      fprintf(f, "Modes in Inter Frames:\n");
-      {
-        int i;
-        fprintf(f, "Y: ");
-        for (i = 0; i < MB_MODE_COUNT; i++) fprintf(f, " %8d,", inter_y_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i;
-        fprintf(f, "UV: ");
-        for (i = 0; i < VP9_UV_MODES; i++) fprintf(f, " %8d,", inter_uv_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i;
-        fprintf(f, "B: ");
-        for (i = 0; i < B_MODE_COUNT; i++) fprintf(f, "%8d, ", inter_b_modes[i]);
-        fprintf(f, "\n");
-      }
-      fprintf(f, "P:%8d, %8d, %8d, %8d\n", count_mb_seg[0], count_mb_seg[1], count_mb_seg[2], count_mb_seg[3]);
-      fprintf(f, "PB:%8d, %8d, %8d, %8d\n", inter_b_modes[LEFT4X4], inter_b_modes[ABOVE4X4], inter_b_modes[ZERO4X4], inter_b_modes[NEW4X4]);
-      fclose(f);
-    }
-#endif
-
-#ifdef ENTROPY_STATS
-    {
-      int i, j, k;
-      FILE *fmode = fopen("modecontext.c", "w");
-
-      fprintf(fmode, "\n#include \"entropymode.h\"\n\n");
-      fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts ");
-      fprintf(fmode, "[VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES] =\n{\n");
-
-      for (i = 0; i < 10; i++) {
-
-        fprintf(fmode, "    { // Above Mode :  %d\n", i);
-
-        for (j = 0; j < 10; j++) {
-
-          fprintf(fmode, "        {");
-
-          for (k = 0; k < VP9_BINTRAMODES; k++) {
-            if (!intra_mode_stats[i][j][k])
-              fprintf(fmode, " %5d, ", 1);
-            else
-              fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]);
-          }
-
-          fprintf(fmode, "}, // left_mode %d\n", j);
-
-        }
-
-        fprintf(fmode, "    },\n");
-
-      }
-
-      fprintf(fmode, "};\n");
-      fclose(fmode);
-    }
-#endif
-
-
-#if defined(SECTIONBITS_OUTPUT)
-
-    if (0) {
-      int i;
-      FILE *f = fopen("tokenbits.stt", "a");
-
-      for (i = 0; i < 28; i++)
-        fprintf(f, "%8d", (int)(Sectionbits[i] / 256));
-
-      fprintf(f, "\n");
-      fclose(f);
-    }
-
-#endif
-
-#if 0
-    {
-      printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
-      printf("\n_frames recive_data encod_mb_row compress_frame  Total\n");
-      printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000);
-    }
-#endif
-
-  }
-
-  dealloc_compressor_data(cpi);
-  vpx_free(cpi->mb.ss);
-  vpx_free(cpi->tok);
-
-  for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]); i++) {
-    vpx_free(cpi->mbgraph_stats[i].mb_stats);
-  }
-
-  vp9_remove_common(&cpi->common);
-  vpx_free(cpi);
-  *ptr = 0;
-
-#ifdef OUTPUT_YUV_SRC
-  fclose(yuv_file);
-#endif
-#ifdef OUTPUT_YUV_REC
-  fclose(yuv_rec_file);
-#endif
-
-#if 0
-
-  if (keyfile)
-    fclose(keyfile);
-
-  if (framepsnr)
-    fclose(framepsnr);
-
-  if (kf_list)
-    fclose(kf_list);
-
-#endif
-
-}
-
-
-static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
-                                 unsigned char *recon, int recon_stride,
-                                 unsigned int cols, unsigned int rows) {
-  unsigned int row, col;
-  uint64_t total_sse = 0;
-  int diff;
-
-  for (row = 0; row + 16 <= rows; row += 16) {
-    for (col = 0; col + 16 <= cols; col += 16) {
-      unsigned int sse;
-
-      vp9_mse16x16(orig + col, orig_stride, recon + col, recon_stride, &sse);
-      total_sse += sse;
-    }
-
-    /* Handle odd-sized width */
-    if (col < cols) {
-      unsigned int   border_row, border_col;
-      unsigned char *border_orig = orig;
-      unsigned char *border_recon = recon;
-
-      for (border_row = 0; border_row < 16; border_row++) {
-        for (border_col = col; border_col < cols; border_col++) {
-          diff = border_orig[border_col] - border_recon[border_col];
-          total_sse += diff * diff;
-        }
-
-        border_orig += orig_stride;
-        border_recon += recon_stride;
-      }
-    }
-
-    orig += orig_stride * 16;
-    recon += recon_stride * 16;
-  }
-
-  /* Handle odd-sized height */
-  for (; row < rows; row++) {
-    for (col = 0; col < cols; col++) {
-      diff = orig[col] - recon[col];
-      total_sse += diff * diff;
-    }
-
-    orig += orig_stride;
-    recon += recon_stride;
-  }
-
-  return total_sse;
-}
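-// The plane error is accumulated in 16x16 tiles via vp9_mse16x16, with
-// scalar mop-up for the borders; e.g. (illustrative) a 24x20 plane uses
-// one 16x16 tile, an 8 pixel wide right border over those 16 rows, and
-// then 4 full scalar rows at the bottom.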
-
-
-static void generate_psnr_packet(VP9_COMP *cpi) {
-  YV12_BUFFER_CONFIG      *orig = cpi->Source;
-  YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
-  struct vpx_codec_cx_pkt  pkt;
-  uint64_t                 sse;
-  int                      i;
-  unsigned int             width = cpi->common.Width;
-  unsigned int             height = cpi->common.Height;
-
-  pkt.kind = VPX_CODEC_PSNR_PKT;
-  sse = calc_plane_error(orig->y_buffer, orig->y_stride,
-                         recon->y_buffer, recon->y_stride,
-                         width, height);
-  pkt.data.psnr.sse[0] = sse;
-  pkt.data.psnr.sse[1] = sse;
-  pkt.data.psnr.samples[0] = width * height;
-  pkt.data.psnr.samples[1] = width * height;
-
-  width = (width + 1) / 2;
-  height = (height + 1) / 2;
-
-  sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
-                         recon->u_buffer, recon->uv_stride,
-                         width, height);
-  pkt.data.psnr.sse[0] += sse;
-  pkt.data.psnr.sse[2] = sse;
-  pkt.data.psnr.samples[0] += width * height;
-  pkt.data.psnr.samples[2] = width * height;
-
-  sse = calc_plane_error(orig->v_buffer, orig->uv_stride,
-                         recon->v_buffer, recon->uv_stride,
-                         width, height);
-  pkt.data.psnr.sse[0] += sse;
-  pkt.data.psnr.sse[3] = sse;
-  pkt.data.psnr.samples[0] += width * height;
-  pkt.data.psnr.samples[3] = width * height;
-
-  for (i = 0; i < 4; i++)
-    pkt.data.psnr.psnr[i] = vp9_mse2psnr(pkt.data.psnr.samples[i], 255.0,
-                                         pkt.data.psnr.sse[i]);
-
-  vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
-}
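-// sse[0] / samples[0] hold the whole-frame totals and indices 1..3 the
-// Y, U and V planes; assuming vp9_mse2psnr computes the usual
-// 10 * log10(samples * 255^2 / sse), a frame with sse == samples (an
-// MSE of 1) reports roughly 48.13 dB.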
-
-
-int vp9_use_as_reference(VP9_PTR ptr, int ref_frame_flags) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  if (ref_frame_flags > 7)
-    return -1;
-
-  cpi->ref_frame_flags = ref_frame_flags;
-  return 0;
-}
-int vp9_update_reference(VP9_PTR ptr, int ref_frame_flags) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  if (ref_frame_flags > 7)
-    return -1;
-
-  cpi->common.refresh_golden_frame = 0;
-  cpi->common.refresh_alt_ref_frame = 0;
-  cpi->common.refresh_last_frame   = 0;
-
-  if (ref_frame_flags & VP9_LAST_FLAG)
-    cpi->common.refresh_last_frame = 1;
-
-  if (ref_frame_flags & VP9_GOLD_FLAG)
-    cpi->common.refresh_golden_frame = 1;
-
-  if (ref_frame_flags & VP9_ALT_FLAG)
-    cpi->common.refresh_alt_ref_frame = 1;
-
-  return 0;
-}
-
-int vp9_get_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
-                          YV12_BUFFER_CONFIG *sd) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-  VP9_COMMON *cm = &cpi->common;
-  int ref_fb_idx;
-
-  if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_idx = cm->lst_fb_idx;
-  else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_idx = cm->gld_fb_idx;
-  else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_idx = cm->alt_fb_idx;
-  else
-    return -1;
-
-  vp8_yv12_copy_frame_ptr(&cm->yv12_fb[ref_fb_idx], sd);
-
-  return 0;
-}
-
-int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
-                          YV12_BUFFER_CONFIG *sd) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-  VP9_COMMON *cm = &cpi->common;
-
-  int ref_fb_idx;
-
-  if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_idx = cm->lst_fb_idx;
-  else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_idx = cm->gld_fb_idx;
-  else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_idx = cm->alt_fb_idx;
-  else
-    return -1;
-
-  vp8_yv12_copy_frame_ptr(sd, &cm->yv12_fb[ref_fb_idx]);
-
-  return 0;
-}
-int vp9_update_entropy(VP9_PTR comp, int update) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
-  VP9_COMMON *cm = &cpi->common;
-  cm->refresh_entropy_probs = update;
-
-  return 0;
-}
-
-
-#ifdef OUTPUT_YUV_SRC
-void vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s) {
-  unsigned char *src = s->y_buffer;
-  int h = s->y_height;
-
-  do {
-    fwrite(src, s->y_width, 1,  yuv_file);
-    src += s->y_stride;
-  } while (--h);
-
-  src = s->u_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1,  yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-
-  src = s->v_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1, yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-}
-#endif
-
-#ifdef OUTPUT_YUV_REC
-void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
-  YV12_BUFFER_CONFIG *s = cm->frame_to_show;
-  unsigned char *src = s->y_buffer;
-  int h = cm->Height;
-
-  do {
-    fwrite(src, s->y_width, 1,  yuv_rec_file);
-    src += s->y_stride;
-  } while (--h);
-
-  src = s->u_buffer;
-  h = (cm->Height + 1) / 2;
-
-  do {
-    fwrite(src, s->uv_width, 1,  yuv_rec_file);
-    src += s->uv_stride;
-  } while (--h);
-
-  src = s->v_buffer;
-  h = (cm->Height + 1) / 2;
-
-  do {
-    fwrite(src, s->uv_width, 1, yuv_rec_file);
-    src += s->uv_stride;
-  } while (--h);
-}
-#endif
-
-static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  // Update data structure that monitors level of reference to last GF
-  vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
-  cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-
-  // This frame's refresh means subsequent frames don't refresh unless the user requests it
-  cpi->common.frames_since_golden = 0;
-
-  // Clear the alternate reference update pending flag.
-  cpi->source_alt_ref_pending = FALSE;
-
-  // Set the alternate reference frame active flag
-  cpi->source_alt_ref_active = TRUE;
-}
-
-static void update_golden_frame_stats(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  // Update the Golden frame usage counts.
-  if (cm->refresh_golden_frame) {
-    // Update data structure that monitors level of reference to last GF
-    vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
-    cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-
-    // This frame's refresh means subsequent frames don't refresh unless the user requests it
-    cm->refresh_golden_frame = 0;
-    cpi->common.frames_since_golden = 0;
-
-    // if ( cm->frame_type == KEY_FRAME )
-    // {
-    cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
-    cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
-    cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
-    cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
-    // }
-    // else
-    // {
-    //  // Carry a portion of count over to beginning of next gf sequence
-    //  cpi->recent_ref_frame_usage[INTRA_FRAME] >>= 5;
-    //  cpi->recent_ref_frame_usage[LAST_FRAME] >>= 5;
-    //  cpi->recent_ref_frame_usage[GOLDEN_FRAME] >>= 5;
-    //  cpi->recent_ref_frame_usage[ALTREF_FRAME] >>= 5;
-    // }
-
-    // ******** Fixed Q test code only ************
-    // If we are going to use the ALT reference for the next group of
-    // frames, set a flag to say so.
-    if (cpi->oxcf.fixed_q >= 0 &&
-        cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame) {
-      cpi->source_alt_ref_pending = TRUE;
-      cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
-    }
-
-    if (!cpi->source_alt_ref_pending)
-      cpi->source_alt_ref_active = FALSE;
-
-    // Decrement count down till next gf
-    if (cpi->frames_till_gf_update_due > 0)
-      cpi->frames_till_gf_update_due--;
-
-  } else if (!cpi->common.refresh_alt_ref_frame) {
-    // Decrement count down till next gf
-    if (cpi->frames_till_gf_update_due > 0)
-      cpi->frames_till_gf_update_due--;
-
-    if (cpi->common.frames_till_alt_ref_frame)
-      cpi->common.frames_till_alt_ref_frame--;
-
-    cpi->common.frames_since_golden++;
-
-    if (cpi->common.frames_since_golden > 1) {
-      cpi->recent_ref_frame_usage[INTRA_FRAME] += cpi->count_mb_ref_frame_usage[INTRA_FRAME];
-      cpi->recent_ref_frame_usage[LAST_FRAME] += cpi->count_mb_ref_frame_usage[LAST_FRAME];
-      cpi->recent_ref_frame_usage[GOLDEN_FRAME] += cpi->count_mb_ref_frame_usage[GOLDEN_FRAME];
-      cpi->recent_ref_frame_usage[ALTREF_FRAME] += cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
-    }
-  }
-}
-
-static int find_fp_qindex(void) {
-  int i;
-
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    if (vp9_convert_qindex_to_q(i) >= 30.0) {
-      break;
-    }
-  }
-
-  if (i == QINDEX_RANGE)
-    i--;
-
-  return i;
-}
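-// I.e. the first pass runs at the lowest qindex whose real Q reaches
-// 30.0; since vp9_convert_qindex_to_q() is monotonic a linear scan
-// suffices, clamped to the top of QINDEX_RANGE if never reached.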
-
-static void Pass1Encode(VP9_COMP *cpi, unsigned long *size,
-                        unsigned char *dest, unsigned int *frame_flags) {
-  (void) size;
-  (void) dest;
-  (void) frame_flags;
-
-  vp9_set_quantizer(cpi, find_fp_qindex());
-  vp9_first_pass(cpi);
-}
-
-#define WRITE_RECON_BUFFER 0
-#if WRITE_RECON_BUFFER
-void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-
-  // write the frame
-  FILE *yframe;
-  int i;
-  char filename[255];
-
-  sprintf(filename, "cx\\y%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->y_height; i++)
-    fwrite(frame->y_buffer + i * frame->y_stride,
-           frame->y_width, 1, yframe);
-
-  fclose(yframe);
-  sprintf(filename, "cx\\u%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->u_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-  sprintf(filename, "cx\\v%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->v_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-}
-#endif
-
-static double compute_edge_pixel_proportion(YV12_BUFFER_CONFIG *frame) {
-#define EDGE_THRESH 128
-  int i, j;
-  int num_edge_pels = 0;
-  int num_pels = (frame->y_height - 2) * (frame->y_width - 2);
-  unsigned char *prev = frame->y_buffer + 1;
-  unsigned char *curr = frame->y_buffer + 1 + frame->y_stride;
-  unsigned char *next = frame->y_buffer + 1 + 2 * frame->y_stride;
-  for (i = 1; i < frame->y_height - 1; i++) {
-    for (j = 1; j < frame->y_width - 1; j++) {
-      /* Sobel hor and ver gradients */
-      int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) + (next[1] - next[-1]);
-      int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) + (prev[-1] - next[-1]);
-      h = (h < 0 ? -h : h);
-      v = (v < 0 ? -v : v);
-      if (h > EDGE_THRESH || v > EDGE_THRESH) num_edge_pels++;
-      curr++;
-      prev++;
-      next++;
-    }
-    curr += frame->y_stride - frame->y_width + 2;
-    prev += frame->y_stride - frame->y_width + 2;
-    next += frame->y_stride - frame->y_width + 2;
-  }
-  return (double)num_edge_pels / (double)num_pels;
-}
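-// h and v above are 3x3 Sobel responses. As a sanity check
-// (illustrative): an ideal vertical step of amplitude a yields
-// |v| = 2a + a + a = 4a, so with EDGE_THRESH 128 a step must exceed
-// 32 grey levels for the pixel to count as an edge.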
-
-// Function to test for conditions that indicate we should loop
-// back and recode a frame.
-static BOOL recode_loop_test(VP9_COMP *cpi,
-                             int high_limit, int low_limit,
-                             int q, int maxq, int minq) {
-  BOOL    force_recode = FALSE;
-  VP9_COMMON *cm = &cpi->common;
-
-  // Is frame recode allowed at all?
-  // Yes if either recode mode 1 is selected, or if mode 2 is selected
-  // and the frame is a key frame, golden frame or alt_ref frame
-  if ((cpi->sf.recode_loop == 1) ||
-      ((cpi->sf.recode_loop == 2) &&
-       ((cm->frame_type == KEY_FRAME) ||
-        cm->refresh_golden_frame ||
-        cm->refresh_alt_ref_frame))) {
-    // General over and under shoot tests
-    if (((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
-        ((cpi->projected_frame_size < low_limit) && (q > minq))) {
-      force_recode = TRUE;
-    }
-    // Special Constrained quality tests
-    else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-      // Undershoot and below auto cq level
-      if ((q > cpi->cq_target_quality) &&
-          (cpi->projected_frame_size <
-           ((cpi->this_frame_target * 7) >> 3))) {
-        force_recode = TRUE;
-      }
-      // Severe undershoot and between auto and user cq level
-      else if ((q > cpi->oxcf.cq_level) &&
-               (cpi->projected_frame_size < cpi->min_frame_bandwidth) &&
-               (cpi->active_best_quality > cpi->oxcf.cq_level)) {
-        force_recode = TRUE;
-        cpi->active_best_quality = cpi->oxcf.cq_level;
-      }
-    }
-  }
-
-  return force_recode;
-}
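-// For illustration (hypothetical numbers): with a 12000 bit target and
-// bounds of roughly +/-25%, a projected size of 20000 bits while
-// q < maxq trips the overshoot test and the frame is re-encoded at a
-// higher q.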
-
-static void update_reference_frames(VP9_COMMON *cm) {
-  YV12_BUFFER_CONFIG *yv12_fb = cm->yv12_fb;
-
-  // At this point the new frame has been encoded.
-  // If any buffer copy / swapping is signaled it should be done here.
-
-  if (cm->frame_type == KEY_FRAME) {
-    yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG | VP9_ALT_FLAG;
-
-    yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
-    yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
-
-    cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx;
-  } else { /* For non key frames */
-    if (cm->refresh_alt_ref_frame) {
-      assert(!cm->copy_buffer_to_arf);
-
-      cm->yv12_fb[cm->new_fb_idx].flags |= VP9_ALT_FLAG;
-      cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
-      cm->alt_fb_idx = cm->new_fb_idx;
-    } else if (cm->copy_buffer_to_arf) {
-      assert(!(cm->copy_buffer_to_arf & ~0x3));
-
-      if (cm->copy_buffer_to_arf == 1) {
-        if (cm->alt_fb_idx != cm->lst_fb_idx) {
-          yv12_fb[cm->lst_fb_idx].flags |= VP9_ALT_FLAG;
-          yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
-          cm->alt_fb_idx = cm->lst_fb_idx;
-        }
-      } else { /* if (cm->copy_buffer_to_arf == 2) */
-        if (cm->alt_fb_idx != cm->gld_fb_idx) {
-          yv12_fb[cm->gld_fb_idx].flags |= VP9_ALT_FLAG;
-          yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
-          cm->alt_fb_idx = cm->gld_fb_idx;
-        }
-      }
-    }
-
-    if (cm->refresh_golden_frame) {
-      assert(!cm->copy_buffer_to_gf);
-
-      cm->yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG;
-      cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
-      cm->gld_fb_idx = cm->new_fb_idx;
-    } else if (cm->copy_buffer_to_gf) {
-      assert(!(cm->copy_buffer_to_gf & ~0x3));
-
-      if (cm->copy_buffer_to_gf == 1) {
-        if (cm->gld_fb_idx != cm->lst_fb_idx) {
-          yv12_fb[cm->lst_fb_idx].flags |= VP9_GOLD_FLAG;
-          yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
-          cm->gld_fb_idx = cm->lst_fb_idx;
-        }
-      } else { /* if (cm->copy_buffer_to_gf == 2) */
-        if (cm->alt_fb_idx != cm->gld_fb_idx) {
-          yv12_fb[cm->alt_fb_idx].flags |= VP9_GOLD_FLAG;
-          yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
-          cm->gld_fb_idx = cm->alt_fb_idx;
-        }
-      }
-    }
-  }
-
-  if (cm->refresh_last_frame) {
-    cm->yv12_fb[cm->new_fb_idx].flags |= VP9_LAST_FLAG;
-    cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP9_LAST_FLAG;
-    cm->lst_fb_idx = cm->new_fb_idx;
-  }
-}
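-// Note the bookkeeping above only swaps indices and per-buffer flags;
-// e.g. on a key frame new_fb_idx picks up VP9_GOLD_FLAG | VP9_ALT_FLAG
-// and both gld_fb_idx and alt_fb_idx are pointed at it, with no frame
-// data copied.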
-
-static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
-  if (cm->no_lpf) {
-    cm->filter_level = 0;
-  }
-#if CONFIG_LOSSLESS
-  else if (cpi->oxcf.lossless) {
-    cm->filter_level = 0;
-  }
-#endif
-  else {
-    struct vpx_usec_timer timer;
-
-    vp9_clear_system_state();
-
-    vpx_usec_timer_start(&timer);
-    if (cpi->sf.auto_filter == 0)
-      vp9_pick_filter_level_fast(cpi->Source, cpi);
-    else
-      vp9_pick_filter_level(cpi->Source, cpi);
-
-    vpx_usec_timer_mark(&timer);
-    cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
-  }
-
-  if (cm->filter_level > 0) {
-    vp9_set_alt_lf_level(cpi, cm->filter_level);
-    vp9_loop_filter_frame(cm, &cpi->mb.e_mbd);
-  }
-
-  vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
-}
-
-#if CONFIG_PRED_FILTER
-void select_pred_filter_mode(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  int prob_pred_filter_off = cm->prob_pred_filter_off;
-
-  // Force filter on/off if probability is extreme
-  if (prob_pred_filter_off >= 255 * 0.95)
-    cm->pred_filter_mode = 0;   // Off at the frame level
-  else if (prob_pred_filter_off <= 255 * 0.05)
-    cm->pred_filter_mode = 1;   // On at the frame level
-  else
-    cm->pred_filter_mode = 2;   // Selectable at the MB level
-}
-
-void update_pred_filt_prob(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  int prob_pred_filter_off;
-
-  // Based on the selection in the previous frame determine what mode
-  // to use for the current frame and work out the signaling probability
-  if (cpi->pred_filter_on_count + cpi->pred_filter_off_count) {
-    prob_pred_filter_off = cpi->pred_filter_off_count * 256 /
-                           (cpi->pred_filter_on_count + cpi->pred_filter_off_count);
-
-    if (prob_pred_filter_off < 1)
-      prob_pred_filter_off = 1;
-
-    if (prob_pred_filter_off > 255)
-      prob_pred_filter_off = 255;
-
-    cm->prob_pred_filter_off = prob_pred_filter_off;
-  } else
-    cm->prob_pred_filter_off = 128;
-  /*
-      {
-        FILE *fp = fopen("filt_use.txt", "a");
-        fprintf (fp, "%d %d prob=%d\n", cpi->pred_filter_off_count,
-                 cpi->pred_filter_on_count, cm->prob_pred_filter_off);
-        fclose(fp);
-      }
-  */
-}
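-// E.g. (illustrative) 30 "off" and 10 "on" selections give
-// 30 * 256 / 40 = 192, clamped to [1, 255]; with no history the
-// probability falls back to a neutral 128.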
-#endif
-
-static void encode_frame_to_data_rate(VP9_COMP *cpi,
-                                      unsigned long *size,
-                                      unsigned char *dest,
-                                      unsigned int *frame_flags) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
-  int Q;
-  int frame_over_shoot_limit;
-  int frame_under_shoot_limit;
-
-  int Loop = FALSE;
-  int loop_count;
-  int this_q;
-  int last_zbin_oq;
-
-  int q_low;
-  int q_high;
-  int zbin_oq_high;
-  int zbin_oq_low = 0;
-
-  int top_index;
-  int bottom_index;
-  int active_worst_qchanged = FALSE;
-
-  int overshoot_seen = FALSE;
-  int undershoot_seen = FALSE;
-
-  int loop_size_estimate = 0;
-
-  SPEED_FEATURES *sf = &cpi->sf;
-#if RESET_FOREACH_FILTER
-  int q_low0;
-  int q_high0;
-  int zbin_oq_high0;
-  int zbin_oq_low0 = 0;
-  int Q0;
-  int last_zbin_oq0;
-  int active_best_quality0;
-  int active_worst_quality0;
-  double rate_correction_factor0;
-  double gf_rate_correction_factor0;
-#endif
-
-  /* list of filters to search over */
-  int mcomp_filters_to_search[] = {
-    EIGHTTAP, EIGHTTAP_SHARP, SIXTAP, SWITCHABLE
-  };
-  int mcomp_filters = sizeof(mcomp_filters_to_search) /
-      sizeof(*mcomp_filters_to_search);
-  int mcomp_filter_index = 0;
-  INT64 mcomp_filter_cost[4];
-
-  // Clear down mmx registers to allow floating point in what follows
-  vp9_clear_system_state();
-
-  // For an alt ref frame in 2 pass we skip the call to the second
-  // pass function that sets the target bandwidth so must set it here
-  if (cpi->common.refresh_alt_ref_frame) {
-    // Per frame bit target for the alt ref frame
-    cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
-    // Per second target bitrate
-    cpi->target_bandwidth = cpi->twopass.gf_bits * cpi->output_frame_rate;
-  }
-
-  // Default turn off buffer to buffer copying
-  cm->copy_buffer_to_gf = 0;
-  cm->copy_buffer_to_arf = 0;
-
-  // Clear zbin over-quant value and mode boost values.
-  cpi->zbin_over_quant = 0;
-  cpi->zbin_mode_boost = 0;
-
-  // Enable or disable mode based tweaking of the zbin.
-  // For 2 pass this is only used where GF/ARF prediction quality
-  // is above a threshold.
-  cpi->zbin_mode_boost = 0;
-#if CONFIG_LOSSLESS
-  cpi->zbin_mode_boost_enabled = FALSE;
-#else
-  cpi->zbin_mode_boost_enabled = TRUE;
-#endif
-  if (cpi->gfu_boost <= 400) {
-    cpi->zbin_mode_boost_enabled = FALSE;
-  }
-
-  // Current default encoder behaviour for the altref sign bias
-  if (cpi->source_alt_ref_active)
-    cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
-  else
-    cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0;
-
-  // Check to see if a key frame is signalled
-  // For two pass with auto key frame enabled, cm->frame_type may already
-  // be set, but not for one pass.
-  if ((cm->current_video_frame == 0) ||
-      (cm->frame_flags & FRAMEFLAGS_KEY) ||
-      (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0))) {
-    // Key frame from VFW/auto-keyframe/first frame
-    cm->frame_type = KEY_FRAME;
-  }
-
-  // Set default state for segment based loop filter update flags
-  xd->mode_ref_lf_delta_update = 0;
-
-  // Set various flags etc to special state if it is a key frame
-  if (cm->frame_type == KEY_FRAME) {
-    int i;
-
-    // Reset the loop filter deltas and segmentation map
-    setup_features(cpi);
-
-    // If segmentation is enabled force a map update for key frames
-    if (xd->segmentation_enabled) {
-      xd->update_mb_segmentation_map = 1;
-      xd->update_mb_segmentation_data = 1;
-    }
-
-    // The alternate reference frame cannot be active for a key frame
-    cpi->source_alt_ref_active = FALSE;
-
-    // Reset the RD threshold multipliers to default of * 1 (128)
-    for (i = 0; i < MAX_MODES; i++) {
-      cpi->rd_thresh_mult[i] = 128;
-    }
-  }
-
-  // Test code for new segment features
-  init_seg_features(cpi);
-
-  // Decide how big to make the frame
-  vp9_pick_frame_size(cpi);
-
-  vp9_clear_system_state();
-
-  // Set an active best quality and if necessary active worst quality
-  Q = cpi->active_worst_quality;
-
-  if (cm->frame_type == KEY_FRAME) {
-    int high = 2000;
-    int low = 400;
-
-    if (cpi->kf_boost > high)
-      cpi->active_best_quality = kf_low_motion_minq[Q];
-    else if (cpi->kf_boost < low)
-      cpi->active_best_quality = kf_high_motion_minq[Q];
-    else {
-      int gap = high - low;
-      int offset = high - cpi->kf_boost;
-      int qdiff = kf_high_motion_minq[Q] - kf_low_motion_minq[Q];
-      int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
-
-      cpi->active_best_quality = kf_low_motion_minq[Q] + adjustment;
-    }
-
-    // Make an adjustment based on the % of the image that is static.
-    // The main impact of this is at lower Q to prevent overly large key
-    // frames unless a lot of the image is static.
-    if (cpi->kf_zeromotion_pct < 64)
-      cpi->active_best_quality += 4 - (cpi->kf_zeromotion_pct >> 4);
-
-    // Special case for key frames forced because we have reached
-    // the maximum key frame interval. Here force the Q to a range
-    // based on the ambient Q to reduce the risk of popping
-    if (cpi->this_key_frame_forced) {
-      int delta_qindex;
-      int qindex = cpi->last_boosted_qindex;
-
-      delta_qindex = compute_qdelta(cpi, qindex,
-                                    (qindex * 0.75));
-
-      cpi->active_best_quality = qindex + delta_qindex;
-      if (cpi->active_best_quality < cpi->best_quality)
-        cpi->active_best_quality = cpi->best_quality;
-    }
-  } else if (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame) {
-    int high = 2000;
-    int low = 400;
-
-    // Use the lower of cpi->active_worst_quality and recent
-    // average Q as basis for GF/ARF Q limit unless last frame was
-    // a key frame.
-    if ((cpi->frames_since_key > 1) &&
-        (cpi->avg_frame_qindex < cpi->active_worst_quality)) {
-      Q = cpi->avg_frame_qindex;
-    }
-
-    // For constrained quality don't allow Q less than the cq level
-    if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-        (Q < cpi->cq_target_quality)) {
-      Q = cpi->cq_target_quality;
-    }
-
-    if (cpi->gfu_boost > high)
-      cpi->active_best_quality = gf_low_motion_minq[Q];
-    else if (cpi->gfu_boost < low)
-      cpi->active_best_quality = gf_high_motion_minq[Q];
-    else {
-      int gap = high - low;
-      int offset = high - cpi->gfu_boost;
-      int qdiff = gf_high_motion_minq[Q] - gf_low_motion_minq[Q];
-      int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
-
-      cpi->active_best_quality = gf_low_motion_minq[Q] + adjustment;
-    }
-
-    // Constrained quality uses a slightly lower active best.
-    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-      cpi->active_best_quality =
-        cpi->active_best_quality * 15 / 16;
-    }
-  } else {
-    cpi->active_best_quality = inter_minq[Q];
-
-    // For the constant/constrained quality mode we don't want
-    // q to fall below the cq level.
-    if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-        (cpi->active_best_quality < cpi->cq_target_quality)) {
-      // If we are strongly undershooting the target rate in the last
-      // frames then use the user passed in cq value not the auto
-      // cq value.
-      if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth)
-        cpi->active_best_quality = cpi->oxcf.cq_level;
-      else
-        cpi->active_best_quality = cpi->cq_target_quality;
-    }
-  }
-
-  // Clip the active best and worst quality values to limits
-  if (cpi->active_worst_quality > cpi->worst_quality)
-    cpi->active_worst_quality = cpi->worst_quality;
-
-  if (cpi->active_best_quality < cpi->best_quality)
-    cpi->active_best_quality = cpi->best_quality;
-
-  if (cpi->active_best_quality > cpi->worst_quality)
-    cpi->active_best_quality = cpi->worst_quality;
-
-  if (cpi->active_worst_quality < cpi->active_best_quality)
-    cpi->active_worst_quality = cpi->active_best_quality;
-
-  // Special case code to try and match quality with forced key frames
-  if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
-    Q = cpi->last_boosted_qindex;
-  } else {
-    // Determine initial Q to try
-    Q = vp9_regulate_q(cpi, cpi->this_frame_target);
-  }
-  last_zbin_oq = cpi->zbin_over_quant;
-
-  // Set highest allowed value for Zbin over quant
-  if (cm->frame_type == KEY_FRAME)
-    zbin_oq_high = 0; // ZBIN_OQ_MAX/16
-  else if (cm->refresh_alt_ref_frame || (cm->refresh_golden_frame && !cpi->source_alt_ref_active))
-    zbin_oq_high = 16;
-  else
-    zbin_oq_high = ZBIN_OQ_MAX;
-
-  vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
-                                &frame_over_shoot_limit);
-
-  // Limit Q range for the adaptive loop.
-  bottom_index = cpi->active_best_quality;
-  top_index    = cpi->active_worst_quality;
-  q_low  = cpi->active_best_quality;
-  q_high = cpi->active_worst_quality;
-
-  loop_count = 0;
-
-  if (cm->frame_type != KEY_FRAME) {
-    /* TODO: Decide this more intelligently */
-    if (sf->search_best_filter) {
-      cm->mcomp_filter_type = mcomp_filters_to_search[0];
-      mcomp_filter_index = 0;
-    } else {
-      cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
-    }
-    /* TODO: Decide this more intelligently */
-    xd->allow_high_precision_mv = (Q < HIGH_PRECISION_MV_QTHRESH);
-  }
-
-#if CONFIG_POSTPROC
-
-  if (cpi->oxcf.noise_sensitivity > 0) {
-    unsigned char *src;
-    int l = 0;
-
-    switch (cpi->oxcf.noise_sensitivity) {
-      case 1:
-        l = 20;
-        break;
-      case 2:
-        l = 40;
-        break;
-      case 3:
-        l = 60;
-        break;
-      case 4:  // fall through: 4 and 5 share the same level
-      case 5:
-        l = 100;
-        break;
-      case 6:
-        l = 150;
-        break;
-    }
-
-    if (cm->frame_type == KEY_FRAME) {
-      vp9_de_noise(cpi->Source, cpi->Source, l, 1,  0, RTCD(postproc));
-    } else {
-      vp9_de_noise(cpi->Source, cpi->Source, l, 1,  0, RTCD(postproc));
-
-      src = cpi->Source->y_buffer;
-
-      if (cpi->Source->y_stride < 0) {
-        src += cpi->Source->y_stride * (cpi->Source->y_height - 1);
-      }
-    }
-  }
-
-#endif
-
-#ifdef OUTPUT_YUV_SRC
-  vp9_write_yuv_frame(cpi->Source);
-#endif
-
-#if RESET_FOREACH_FILTER
-  if (sf->search_best_filter) {
-    q_low0 = q_low;
-    q_high0 = q_high;
-    Q0 = Q;
-    zbin_oq_low0 = zbin_oq_low;
-    zbin_oq_high0 = zbin_oq_high;
-    last_zbin_oq0 = last_zbin_oq;
-    rate_correction_factor0 = cpi->rate_correction_factor;
-    gf_rate_correction_factor0 = cpi->gf_rate_correction_factor;
-    active_best_quality0 = cpi->active_best_quality;
-    active_worst_quality0 = cpi->active_worst_quality;
-  }
-#endif
-  do {
-    vp9_clear_system_state();  // __asm emms;
-
-    vp9_set_quantizer(cpi, Q);
-    this_q = Q;
-
-    if (loop_count == 0) {
-
-      // setup skip prob for costing in mode/mv decision
-      if (cpi->common.mb_no_coeff_skip) {
-        int k;
-        for (k = 0; k < MBSKIP_CONTEXTS; k++)
-          cm->mbskip_pred_probs[k] = cpi->base_skip_false_prob[Q][k];
-
-        if (cm->frame_type != KEY_FRAME) {
-          if (cpi->common.refresh_alt_ref_frame) {
-            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
-              if (cpi->last_skip_false_probs[2][k] != 0)
-                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[2][k];
-            }
-          } else if (cpi->common.refresh_golden_frame) {
-            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
-              if (cpi->last_skip_false_probs[1][k] != 0)
-                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[1][k];
-            }
-          } else {
-            int k;
-            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
-              if (cpi->last_skip_false_probs[0][k] != 0)
-                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[0][k];
-            }
-          }
-
-          // As this is for the cost estimate, make sure it does not
-          // get extreme either way.
-          {
-            int k;
-            for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
-              if (cm->mbskip_pred_probs[k] < 5)
-                cm->mbskip_pred_probs[k] = 5;
-
-              if (cm->mbskip_pred_probs[k] > 250)
-                cm->mbskip_pred_probs[k] = 250;
-
-              if (cpi->is_src_frame_alt_ref)
-                cm->mbskip_pred_probs[k] = 1;
-            }
-          }
-        }
-      }
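-      // [Editor's note] The clamp of mbskip_pred_probs to [5, 250]
-      // above keeps the skip-flag cost model away from near-certain
-      // probabilities: an 8-bit probability of 1 or 255 would make the
-      // estimated cost of the unlikely branch enormous, skewing mode
-      // decisions off a single stale count. The alt-ref-source special
-      // case pins the not-skipped probability to 1, presumably because
-      // almost every MB is expected to be skipped for such frames.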
-
-      // Set up entropy depending on frame type.
-      if (cm->frame_type == KEY_FRAME)
-        vp9_setup_key_frame(cpi);
-      else
-        vp9_setup_inter_frame(cpi);
-    }
-
-    // Transform / motion compensation: build the reconstruction frame.
-
-    vp9_encode_frame(cpi);
-
-    // Update the skip mb flag probabilities based on the distribution
-    // seen in the last encoder iteration.
-    update_base_skip_probs(cpi);
-
-    vp9_clear_system_state();  // __asm emms;
-
-#if CONFIG_PRED_FILTER
-    // Update prediction filter on/off probability based on
-    // selection made for the current frame
-    if (cm->frame_type != KEY_FRAME)
-      update_pred_filt_prob(cpi);
-#endif
-
-    // Dummy pack of the bitstream using up to date stats to get an
-    // accurate estimate of output frame size to determine if we need
-    // to recode.
-    vp9_save_coding_context(cpi);
-    cpi->dummy_packing = 1;
-    vp9_pack_bitstream(cpi, dest, size);
-    cpi->projected_frame_size = (*size) << 3;
-    vp9_restore_coding_context(cpi);
-
-    if (frame_over_shoot_limit == 0)
-      frame_over_shoot_limit = 1;
-    active_worst_qchanged = FALSE;
-
-    // Special case handling for forced key frames
-    if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
-      int last_q = Q;
-      int kf_err = vp9_calc_ss_err(cpi->Source,
-                                   &cm->yv12_fb[cm->new_fb_idx]);
-
-      int high_err_target = cpi->ambient_err;
-      int low_err_target = (cpi->ambient_err >> 1);
-
-      // Prevent possible divide by zero error below for perfect KF
-      kf_err += (!kf_err);
-
-      // The key frame is not good enough or we can afford
-      // to make it better without undue risk of popping.
-      if (((kf_err > high_err_target) &&
-           (cpi->projected_frame_size <= frame_over_shoot_limit)) ||
-          ((kf_err > low_err_target) &&
-           (cpi->projected_frame_size <= frame_under_shoot_limit))) {
-        // Lower q_high
-        q_high = (Q > q_low) ? (Q - 1) : q_low;
-
-        // Adjust Q
-        Q = (Q * high_err_target) / kf_err;
-        if (Q < ((q_high + q_low) >> 1))
-          Q = (q_high + q_low) >> 1;
-      }
-      // The key frame is much better than the previous frame
-      else if ((kf_err < low_err_target) &&
-               (cpi->projected_frame_size >= frame_under_shoot_limit)) {
-        // Raise q_low
-        q_low = (Q < q_high) ? (Q + 1) : q_high;
-
-        // Adjust Q
-        Q = (Q * low_err_target) / kf_err;
-        if (Q > ((q_high + q_low + 1) >> 1))
-          Q = (q_high + q_low + 1) >> 1;
-      }
-
-      // Clamp Q to upper and lower limits:
-      if (Q > q_high)
-        Q = q_high;
-      else if (Q < q_low)
-        Q = q_low;
-
-      Loop = (Q != last_q) ? TRUE : FALSE;
-    }
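-    // [Editor's note] Worked example of the forced-key-frame search
-    // above (editor's illustration, not code from the tree): with
-    // high_err_target = 4000, low_err_target = 2000, Q = 40,
-    // q_low = 20, q_high = 60, and a measured kf_err = 8000 on a frame
-    // inside the overshoot limit, q_high drops to 39, the scaled guess
-    // is Q = 40 * 4000 / 8000 = 20, and since 20 < (39 + 20) >> 1 = 29
-    // Q is raised to 29. The proportional step is thus bounded by an
-    // interval-halving step, so the loop converges even when kf_err is
-    // noisy.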
-
-    // Is the projected frame size out of range, and are we allowed to
-    // attempt a recode?
-    else if (recode_loop_test(cpi,
-                              frame_over_shoot_limit, frame_under_shoot_limit,
-                              Q, top_index, bottom_index)) {
-      int last_q = Q;
-      int Retries = 0;
-
-      // Frame size out of permitted range:
-      // Update correction factor & compute new Q to try...
-
-      // Frame is too large
-      if (cpi->projected_frame_size > cpi->this_frame_target) {
-        // Raise q_low to just above the current Q.
-        q_low = (Q < q_high) ? (Q + 1) : q_high;
-
-        // If we are using over quant, do the same for zbin_oq_low.
-        if (cpi->zbin_over_quant > 0)
-          zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ?
-              (cpi->zbin_over_quant + 1) : zbin_oq_high;
-
-        if (undershoot_seen || (loop_count > 1)) {
-          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
-          if (!active_worst_qchanged)
-            vp9_update_rate_correction_factors(cpi, 1);
-
-          Q = (q_high + q_low + 1) / 2;
-
-          // Adjust cpi->zbin_over_quant (only allowed when Q is max)
-          if (Q < MAXQ)
-            cpi->zbin_over_quant = 0;
-          else {
-            zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
-            cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
-          }
-        } else {
-          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
-          if (!active_worst_qchanged)
-            vp9_update_rate_correction_factors(cpi, 0);
-
-          Q = vp9_regulate_q(cpi, cpi->this_frame_target);
-
-          while (((Q < q_low) || (cpi->zbin_over_quant < zbin_oq_low)) && (Retries < 10)) {
-            vp9_update_rate_correction_factors(cpi, 0);
-            Q = vp9_regulate_q(cpi, cpi->this_frame_target);
-            Retries++;
-          }
-        }
-
-        overshoot_seen = TRUE;
-      }
-      // Frame is too small
-      else {
-        // Lower q_high if not using over quant; otherwise lower zbin_oq_high.
-        if (cpi->zbin_over_quant == 0)
-          q_high = (Q > q_low) ? (Q - 1) : q_low;
-        else
-          zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ?
-              (cpi->zbin_over_quant - 1) : zbin_oq_low;
-
-        if (overshoot_seen || (loop_count > 1)) {
-          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
-          if (!active_worst_qchanged)
-            vp9_update_rate_correction_factors(cpi, 1);
-
-          Q = (q_high + q_low) / 2;
-
-          // Adjust cpi->zbin_over_quant (only allowed when Q is max)
-          if (Q < MAXQ)
-            cpi->zbin_over_quant = 0;
-          else
-            cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
-        } else {
-          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
-          if (!active_worst_qchanged)
-            vp9_update_rate_correction_factors(cpi, 0);
-
-          Q = vp9_regulate_q(cpi, cpi->this_frame_target);
-
-          // Special case reset for q_low for constrained quality.
-          // This should only trigger where there is very substantial
-          // undershoot on a frame and the auto cq level is above
-          // the user-passed-in value.
-          if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-              (Q < q_low)) {
-            q_low = Q;
-          }
-
-          while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10)) {
-            vp9_update_rate_correction_factors(cpi, 0);
-            Q = vp9_regulate_q(cpi, cpi->this_frame_target);
-            Retries++;
-          }
-        }
-
-        undershoot_seen = TRUE;
-      }
-
-      // Clamp Q to upper and lower limits:
-      if (Q > q_high)
-        Q = q_high;
-      else if (Q < q_low)
-        Q = q_low;
-
-      // Clamp cpi->zbin_over_quant
-      cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ?
-          zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ?
-          zbin_oq_high : cpi->zbin_over_quant;
-
-      // Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE;
-      Loop = (Q != last_q) ? TRUE : FALSE;
-      last_zbin_oq = cpi->zbin_over_quant;
-    } else
-      Loop = FALSE;
-
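-    // [Editor's note] The overshoot/undershoot handling above is in
-    // effect a guarded binary search on Q: the first miss in each
-    // direction re-derives Q from the rate-correction model via
-    // vp9_regulate_q(), and only once both directions (or a second
-    // pass) have been seen does it fall back to plain interval
-    // halving. A skeleton sketch with hypothetical names (editor's
-    // illustration only):
-    //
-    //   if (too_big) {
-    //     q_low = Q + 1;                              // bits must drop
-    //     Q = undershoot_seen ? (q_high + q_low + 1) / 2
-    //                         : model_q(target);      // re-model
-    //     overshoot_seen = 1;
-    //   } else {
-    //     q_high = Q - 1;                             // bits may rise
-    //     Q = overshoot_seen ? (q_high + q_low) / 2
-    //                        : model_q(target);
-    //     undershoot_seen = 1;
-    //   }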
-    if (cpi->is_src_frame_alt_ref)
-      Loop = FALSE;
-
-    if (cm->frame_type != KEY_FRAME &&
-        !sf->search_best_filter &&
-        cm->mcomp_filter_type == SWITCHABLE) {
-      int interp_factor = Q / 3;  /* denominator is 256 */
-      int count[VP9_SWITCHABLE_FILTERS];
-      int tot_count = 0, c = 0, thr;
-      int i, j;
-      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-        count[i] = 0;
-        for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
-          count[i] += cpi->switchable_interp_count[j][i];
-        }
-        tot_count += count[i];
-      }
-
-      thr = ((tot_count * interp_factor + 128) >> 8);
-      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-        c += (count[i] >= thr);
-      }
-      if (c == 1) {
-        /* Predominantly one filter is used, so set it at the frame level */
-        for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-          if (count[i]) {
-            cm->mcomp_filter_type = vp9_switchable_interp[i];
-            Loop = TRUE;  /* Make sure to loop since the filter changed */
-            break;
-          }
-        }
-      }
-    }
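-    // [Editor's note] thr above is a fixed-point fraction of total
-    // filter usage: interp_factor = Q / 3 out of a denominator of 256,
-    // so at Q = 96 a filter must account for at least 32/256 = 12.5%
-    // of all selections to clear the bar. If exactly one filter does,
-    // the frame is re-encoded with that filter fixed at the frame
-    // level, which is cheaper to signal than per-MB switching.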
-
-    if (Loop == FALSE && cm->frame_type != KEY_FRAME && sf->search_best_filter) {
-      if (mcomp_filter_index < mcomp_filters) {
-        INT64 err = vp9_calc_ss_err(cpi->Source,
-                                    &cm->yv12_fb[cm->new_fb_idx]);
-        INT64 rate = cpi->projected_frame_size << 8;
-        mcomp_filter_cost[mcomp_filter_index] =
-          (RDCOST(cpi->RDMULT, cpi->RDDIV, rate, err));
-        mcomp_filter_index++;
-        if (mcomp_filter_index < mcomp_filters) {
-          cm->mcomp_filter_type = mcomp_filters_to_search[mcomp_filter_index];
-          loop_count = -1;
-          Loop = TRUE;
-        } else {
-          int f;
-          INT64 best_cost = mcomp_filter_cost[0];
-          int mcomp_best_filter = mcomp_filters_to_search[0];
-          for (f = 1; f < mcomp_filters; f++) {
-            if (mcomp_filter_cost[f] < best_cost) {
-              mcomp_best_filter = mcomp_filters_to_search[f];
-              best_cost = mcomp_filter_cost[f];
-            }
-          }
-          if (mcomp_best_filter != mcomp_filters_to_search[mcomp_filters - 1]) {
-            loop_count = -1;
-            Loop = TRUE;
-            cm->mcomp_filter_type = mcomp_best_filter;
-          }
-          /*
-          printf("  best filter = %d, ( ", mcomp_best_filter);
-          for (f=0;f<mcomp_filters; f++) printf("%d ",  mcomp_filter_cost[f]);
-          printf(")\n");
-          */
-        }
-#if RESET_FOREACH_FILTER
-        if (Loop == TRUE) {
-          overshoot_seen = FALSE;
-          undershoot_seen = FALSE;
-          zbin_oq_low = zbin_oq_low0;
-          zbin_oq_high = zbin_oq_high0;
-          q_low = q_low0;
-          q_high = q_high0;
-          Q = Q0;
-          cpi->zbin_over_quant = last_zbin_oq = last_zbin_oq0;
-          cpi->rate_correction_factor = rate_correction_factor0;
-          cpi->gf_rate_correction_factor = gf_rate_correction_factor0;
-          cpi->active_best_quality = active_best_quality0;
-          cpi->active_worst_quality = active_worst_quality0;
-        }
-#endif
-      }
-    }
-
-    if (Loop == TRUE) {
-      loop_count++;
-#if CONFIG_INTERNAL_STATS
-      cpi->tot_recode_hits++;
-#endif
-    }
-  } while (Loop == TRUE);
-
-  // Special case code to reduce pulsing when key frames are forced at a
-  // fixed interval. Note the reconstruction error if it is the frame before
-  // the forced key frame.
-  if (cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0)) {
-    cpi->ambient_err = vp9_calc_ss_err(cpi->Source,
-                                       &cm->yv12_fb[cm->new_fb_idx]);
-  }
-
-  // This frame's MVs are saved and will be used in the next frame's MV
-  // prediction. Last frame has one more line (added to the bottom) and one
-  // more column (added to the right) than cm->mip. The edge elements are
-  // initialized to 0.
-  if (cm->show_frame) { // do not save for altref frame
-    int mb_row;
-    int mb_col;
-    MODE_INFO *tmp = cm->mip;
-
-    if (cm->frame_type != KEY_FRAME) {
-      for (mb_row = 0; mb_row < cm->mb_rows + 1; mb_row++) {
-        for (mb_col = 0; mb_col < cm->mb_cols + 1; mb_col++) {
-          const int mi = mb_col + mb_row * (cm->mode_info_stride + 1);
-
-          if (tmp->mbmi.ref_frame != INTRA_FRAME)
-            cpi->lfmv[mi].as_int = tmp->mbmi.mv[0].as_int;
-
-          cpi->lf_ref_frame_sign_bias[mi] =
-              cm->ref_frame_sign_bias[tmp->mbmi.ref_frame];
-          cpi->lf_ref_frame[mi] = tmp->mbmi.ref_frame;
-          tmp++;
-        }
-      }
-    }
-  }
-
-  // Update the GF usage maps.
-  // This is done after completing the compression of a frame when all modes
-  // etc. are finalized, but before the loop filter.
-  vp9_update_gf_useage_maps(cpi, cm, &cpi->mb);
-
-  if (cm->frame_type == KEY_FRAME)
-    cm->refresh_last_frame = 1;
-
-#if 0
-  {
-    FILE *f = fopen("gfactive.stt", "a");
-    fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame,
-            (100 * cpi->gf_active_count) /
-                (cpi->common.mb_rows * cpi->common.mb_cols),
-            cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame);
-    fclose(f);
-  }
-#endif
-
-  cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
-
-#if WRITE_RECON_BUFFER
-  if (cm->show_frame)
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame);
-  else
-    write_cx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 1000);
-#endif
-
-  // Pick the loop filter level for the frame.
-  loopfilter_frame(cpi, cm);
-
-  // build the bitstream
-  cpi->dummy_packing = 0;
-  vp9_pack_bitstream(cpi, dest, size);
-
-  if (cpi->mb.e_mbd.update_mb_segmentation_map) {
-    update_reference_segmentation_map(cpi);
-  }
-
-#if CONFIG_PRED_FILTER
-  // Select the prediction filtering mode to use for the
-  // next frame based on the current frame selections
-  if (cm->frame_type != KEY_FRAME)
-    select_pred_filter_mode(cpi);
-#endif
-
-  update_reference_frames(cm);
-  vp9_copy(cpi->common.fc.coef_counts, cpi->coef_counts);
-  vp9_copy(cpi->common.fc.hybrid_coef_counts, cpi->hybrid_coef_counts);
-  vp9_copy(cpi->common.fc.coef_counts_8x8, cpi->coef_counts_8x8);
-  vp9_copy(cpi->common.fc.hybrid_coef_counts_8x8, cpi->hybrid_coef_counts_8x8);
-  vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16);
-  vp9_copy(cpi->common.fc.hybrid_coef_counts_16x16,
-           cpi->hybrid_coef_counts_16x16);
-  vp9_adapt_coef_probs(&cpi->common);
-  if (cpi->common.frame_type != KEY_FRAME) {
-    vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
-    vp9_copy(cpi->common.fc.uv_mode_counts, cpi->y_uv_mode_count);
-    vp9_copy(cpi->common.fc.bmode_counts, cpi->bmode_count);
-    vp9_copy(cpi->common.fc.i8x8_mode_counts, cpi->i8x8_mode_count);
-    vp9_copy(cpi->common.fc.sub_mv_ref_counts, cpi->sub_mv_ref_count);
-    vp9_copy(cpi->common.fc.mbsplit_counts, cpi->mbsplit_count);
-    vp9_adapt_mode_probs(&cpi->common);
-
-    cpi->common.fc.NMVcount = cpi->NMVcount;
-    vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
-    vp9_update_mode_context(&cpi->common);
-  }
-
-  /* Move storing frame_type out of the above loop since it is also
-   * needed in motion search besides loopfilter */
-  cm->last_frame_type = cm->frame_type;
-
-  // Keep a copy of the size estimate used in the loop
-  loop_size_estimate = cpi->projected_frame_size;
-
-  // Update rate control heuristics
-  cpi->total_byte_count += (*size);
-  cpi->projected_frame_size = (*size) << 3;
-
-  if (!active_worst_qchanged)
-    vp9_update_rate_correction_factors(cpi, 2);
-
-  cpi->last_q[cm->frame_type] = cm->base_qindex;
-
-  // Keep a record of the last boosted (KF/GF/ARF) Q value.
-  // If the current frame is coded at a lower Q we also update it.
-  // If all MBs in this group are skipped, only update if the Q value is
-  // better than that already stored.
-  // This is used to help set quality in forced key frames to reduce popping.
-  if ((cm->base_qindex < cpi->last_boosted_qindex) ||
-      ((cpi->static_mb_pct < 100) &&
-       ((cm->frame_type == KEY_FRAME) ||
-        cm->refresh_alt_ref_frame ||
-        (cm->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {
-    cpi->last_boosted_qindex = cm->base_qindex;
-  }
-
-  if (cm->frame_type == KEY_FRAME) {
-    vp9_adjust_key_frame_context(cpi);
-  }
-
-  // Keep a record of ambient average Q.
-  if (cm->frame_type != KEY_FRAME)
-    cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
-
-  // Keep a record from which we can calculate the average Q excluding
-  // GF updates and key frames.
-  if ((cm->frame_type != KEY_FRAME) &&
-      !cm->refresh_golden_frame && !cm->refresh_alt_ref_frame) {
-    cpi->ni_frames++;
-    cpi->tot_q += vp9_convert_qindex_to_q(Q);
-    cpi->avg_q = cpi->tot_q / (double)cpi->ni_frames;
-
-    // Calculate the average Q for normal inter frames (not key or GFU
-    // frames).
-    cpi->ni_tot_qi += Q;
-    cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
-  }
-
-  // Update the buffer level variable.
-  // Non-viewable frames are a special case and are treated as pure overhead.
-  if (!cm->show_frame)
-    cpi->bits_off_target -= cpi->projected_frame_size;
-  else
-    cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
-
-  // Clip the buffer level at the maximum buffer size
-  if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
-    cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
-
-  // Rolling monitors of whether we are over- or under-spending, used to
-  // help regulate min and max Q in two-pass mode.
-  cpi->rolling_target_bits =
-      ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
-  cpi->rolling_actual_bits =
-      ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;
-  cpi->long_rolling_target_bits =
-      ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;
-  cpi->long_rolling_actual_bits =
-      ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32;
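-  // [Editor's note] Each rolling monitor above is an exponentially
-  // weighted moving average, with weight 3/4 (or 31/32 for the long
-  // variants) on history and rounding to nearest. A standalone sketch
-  // with a hypothetical helper (not part of libvpx):
-  //
-  //   static int ema_update(int avg, int sample, int num, int den) {
-  //     // Valid when den - num == 1, as in the 3/4 and 31/32 cases:
-  //     // new_avg = (num * avg + sample) / den, rounded to nearest.
-  //     return (num * avg + sample + den / 2) / den;
-  //   }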
-
-  // Actual bits spent
-  cpi->total_actual_bits    += cpi->projected_frame_size;
-
-  // Debug stats
-  cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
-
-  cpi->buffer_level = cpi->bits_off_target;
-
-  // Update bits left to the kf and gf groups to account for overshoot
-  // or undershoot on these frames.
-  if (cm->frame_type == KEY_FRAME) {
-    cpi->twopass.kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
-
-    if (cpi->twopass.kf_group_bits < 0)
-      cpi->twopass.kf_group_bits = 0;
-  } else if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame) {
-    cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
-
-    if (cpi->twopass.gf_group_bits < 0)
-      cpi->twopass.gf_group_bits = 0;
-  }
-
-  // Update the skip mb flag probabilities based on the distribution seen
-  // in this frame.
-  update_base_skip_probs(cpi);
-
-#if 0  // CONFIG_NEW_MVREF && CONFIG_INTERNAL_STATS
-  {
-    FILE *f = fopen("mv_ref_dist.stt", "a");
-    unsigned int i;
-    for (i = 0; i < MAX_MV_REFS; ++i) {
-      fprintf(f, "%10d", cpi->best_ref_index_counts[0][i]);
-    }
-    fprintf(f, "\n" );
-
-    fclose(f);
-  }
-#endif
-
-#if 0  // 1 && CONFIG_INTERNAL_STATS
-  {
-    FILE *f = fopen("tmp.stt", "a");
-    int recon_err;
-
-    vp9_clear_system_state();  // __asm emms;
-
-    recon_err = vp9_calc_ss_err(cpi->Source,
-                                &cm->yv12_fb[cm->new_fb_idx]);
-
-    if (cpi->twopass.total_left_stats->coded_error != 0.0)
-      fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
-              "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
-              "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"
-              "%10.3f %8d %10d %10d %10d\n",
-              cpi->common.current_video_frame, cpi->this_frame_target,
-              cpi->projected_frame_size, loop_size_estimate,
-              (cpi->projected_frame_size - cpi->this_frame_target),
-              (int)cpi->total_target_vs_actual,
-              (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
-              (int)cpi->total_actual_bits,
-              vp9_convert_qindex_to_q(cm->base_qindex),
-              (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
-              vp9_convert_qindex_to_q(cpi->active_best_quality),
-              vp9_convert_qindex_to_q(cpi->active_worst_quality),
-              cpi->avg_q,
-              vp9_convert_qindex_to_q(cpi->ni_av_qi),
-              vp9_convert_qindex_to_q(cpi->cq_target_quality),
-              cpi->zbin_over_quant,
-              // cpi->avg_frame_qindex, cpi->zbin_over_quant,
-              cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
-              cm->frame_type, cpi->gfu_boost,
-              cpi->twopass.est_max_qcorrection_factor,
-              (int)cpi->twopass.bits_left,
-              cpi->twopass.total_left_stats->coded_error,
-              (double)cpi->twopass.bits_left /
-              cpi->twopass.total_left_stats->coded_error,
-              cpi->tot_recode_hits, recon_err, cpi->kf_boost,
-              cpi->kf_zeromotion_pct);
-    else
-      fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
-              "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
-              "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"
-              "%8d %10d %10d %10d\n",
-              cpi->common.current_video_frame,
-              cpi->this_frame_target, cpi->projected_frame_size,
-              loop_size_estimate,
-              (cpi->projected_frame_size - cpi->this_frame_target),
-              (int)cpi->total_target_vs_actual,
-              (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
-              (int)cpi->total_actual_bits,
-              vp9_convert_qindex_to_q(cm->base_qindex),
-              (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
-              vp9_convert_qindex_to_q(cpi->active_best_quality),
-              vp9_convert_qindex_to_q(cpi->active_worst_quality),
-              cpi->avg_q,
-              vp9_convert_qindex_to_q(cpi->ni_av_qi),
-              vp9_convert_qindex_to_q(cpi->cq_target_quality),
-              cpi->zbin_over_quant,
-              // cpi->avg_frame_qindex, cpi->zbin_over_quant,
-              cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
-              cm->frame_type, cpi->gfu_boost,
-              cpi->twopass.est_max_qcorrection_factor,
-              (int)cpi->twopass.bits_left,
-              cpi->twopass.total_left_stats->coded_error,
-              cpi->tot_recode_hits, recon_err, cpi->kf_boost,
-              cpi->kf_zeromotion_pct);
-
-    fclose(f);
-
-    if (0) {
-      FILE *fmodes = fopen("Modes.stt", "a");
-      int i;
-
-      fprintf(fmodes, "%6d:%1d:%1d:%1d ",
-              cpi->common.current_video_frame,
-              cm->frame_type, cm->refresh_golden_frame,
-              cm->refresh_alt_ref_frame);
-
-      for (i = 0; i < MAX_MODES; i++)
-        fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
-
-      fprintf(fmodes, "\n");
-
-      fclose(fmodes);
-    }
-  }
-
-#endif
-
-#if 0
-  // Debug stats for segment feature experiments.
-  print_seg_map(cpi);
-#endif
-
-  // If this was a KF or GF, note the Q.
-  if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
-    cm->last_kf_gf_q = cm->base_qindex;
-
-  if (cm->refresh_golden_frame == 1)
-    cm->frame_flags |= FRAMEFLAGS_GOLDEN;
-  else
-    cm->frame_flags &= ~FRAMEFLAGS_GOLDEN;
-
-  if (cm->refresh_alt_ref_frame == 1)
-    cm->frame_flags |= FRAMEFLAGS_ALTREF;
-  else
-    cm->frame_flags &= ~FRAMEFLAGS_ALTREF;
-
-
-  if (cm->refresh_last_frame & cm->refresh_golden_frame)  // both refreshed
-    cpi->gold_is_last = 1;
-  else if (cm->refresh_last_frame ^ cm->refresh_golden_frame)
-    cpi->gold_is_last = 0;  // one refreshed but not the other
-
-  if (cm->refresh_last_frame & cm->refresh_alt_ref_frame)  // both refreshed
-    cpi->alt_is_last = 1;
-  else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame)
-    cpi->alt_is_last = 0;  // one refreshed but not the other
-
-  if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame)  // both refreshed
-    cpi->gold_is_alt = 1;
-  else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame)
-    cpi->gold_is_alt = 0;  // one refreshed but not the other
-
-  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
-
-  if (cpi->gold_is_last)
-    cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
-
-  if (cpi->alt_is_last)
-    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
-
-  if (cpi->gold_is_alt)
-    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
-
-  if (cpi->oxcf.play_alternate && cm->refresh_alt_ref_frame &&
-      (cm->frame_type != KEY_FRAME))
-    // Update the alternate reference frame stats as appropriate.
-    update_alt_ref_frame_stats(cpi);
-  else
-    // Update the Golden frame stats as appropriate.
-    update_golden_frame_stats(cpi);
-
-  if (cm->frame_type == KEY_FRAME) {
-    // Tell the caller that the frame was coded as a key frame
-    *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY;
-
-    // As this frame is a key frame, the next defaults to an inter frame.
-    cm->frame_type = INTER_FRAME;
-  } else {
-    *frame_flags = cm->frame_flags & ~FRAMEFLAGS_KEY;
-  }
-
-  // Clear the one-shot update flags for the segmentation map and the
-  // mode/ref loop filter deltas.
-  xd->update_mb_segmentation_map = 0;
-  xd->update_mb_segmentation_data = 0;
-  xd->mode_ref_lf_delta_update = 0;
-
-
-  // Don't increment frame counters if this was an altref buffer update,
-  // not a real frame.
-  if (cm->show_frame) {
-    cm->current_video_frame++;
-    cpi->frames_since_key++;
-  }
-
-  // Reset to normal state now that we are done.
-
-#if 0
-  {
-    char filename[512];
-    FILE *recon_file;
-    sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
-    recon_file = fopen(filename, "wb");
-    fwrite(cm->yv12_fb[cm->lst_fb_idx].buffer_alloc,
-           cm->yv12_fb[cm->lst_fb_idx].frame_size, 1, recon_file);
-    fclose(recon_file);
-  }
-#endif
-#ifdef OUTPUT_YUV_REC
-  vp9_write_yuv_rec_frame(cm);
-#endif
-
-  if (cm->show_frame) {
-    vpx_memcpy(cm->prev_mip, cm->mip,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
-  } else {
-    vpx_memset(cm->prev_mip, 0,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
-  }
-}
-
-static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
-                        unsigned char *dest, unsigned int *frame_flags) {
-
-  if (!cpi->common.refresh_alt_ref_frame)
-    vp9_second_pass(cpi);
-
-  encode_frame_to_data_rate(cpi, size, dest, frame_flags);
-  cpi->twopass.bits_left -= 8 * *size;
-
-  if (!cpi->common.refresh_alt_ref_frame) {
-    double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate;
-    double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
-                                        * cpi->oxcf.two_pass_vbrmin_section / 100);
-
-    if (two_pass_min_rate < lower_bounds_min_rate)
-      two_pass_min_rate = lower_bounds_min_rate;
-
-    cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.frame_rate);
-  }
-}
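-// [Editor's note] The bits_left top-up above credits each non-ARF frame
-// with the guaranteed minimum rate, so the running two-pass budget never
-// assumes less than two_pass_vbrmin_section percent of the target
-// bandwidth. Example (editor's illustration): at a 1,000,000 bps target,
-// 30 fps and two_pass_vbrmin_section = 5, each such frame adds back
-// 50000 / 30 = 1666 bits, unless the FRAME_OVERHEAD_BITS floor works
-// out larger.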
-
-// For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.
-#if HAVE_ARMV7
-extern void vp9_push_neon(int64_t *store);
-extern void vp9_pop_neon(int64_t *store);
-#endif
-
-
-int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags,
-                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
-                          int64_t end_time) {
-#if HAVE_ARMV7
-  int64_t store_reg[8];
-#endif
-  VP9_COMP              *cpi = (VP9_COMP *) ptr;
-  VP9_COMMON            *cm = &cpi->common;
-  struct vpx_usec_timer  timer;
-  int                    res = 0;
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (cm->rtcd.flags & HAS_NEON)
-#endif
-  {
-    vp9_push_neon(store_reg);
-  }
-#endif
-
-  vpx_usec_timer_start(&timer);
-  if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
-                         cpi->active_map_enabled ? cpi->active_map : NULL))
-    res = -1;
-  cm->clr_type = sd->clrtype;
-  vpx_usec_timer_mark(&timer);
-  cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (cm->rtcd.flags & HAS_NEON)
-#endif
-  {
-    vp9_pop_neon(store_reg);
-  }
-#endif
-
-  return res;
-}
-
-
-static int frame_is_reference(const VP9_COMP *cpi) {
-  const VP9_COMMON *cm = &cpi->common;
-  const MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
-  return cm->frame_type == KEY_FRAME || cm->refresh_last_frame
-         || cm->refresh_golden_frame || cm->refresh_alt_ref_frame
-         || cm->copy_buffer_to_gf || cm->copy_buffer_to_arf
-         || cm->refresh_entropy_probs
-         || xd->mode_ref_lf_delta_update
-         || xd->update_mb_segmentation_map || xd->update_mb_segmentation_data;
-}
-
-
-int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
-                            unsigned long *size, unsigned char *dest,
-                            int64_t *time_stamp, int64_t *time_end, int flush) {
-#if HAVE_ARMV7
-  int64_t store_reg[8];
-#endif
-  VP9_COMP *cpi = (VP9_COMP *) ptr;
-  VP9_COMMON *cm = &cpi->common;
-  struct vpx_usec_timer  cmptimer;
-  YV12_BUFFER_CONFIG    *force_src_buffer = NULL;
-
-  if (!cpi)
-    return -1;
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (cm->rtcd.flags & HAS_NEON)
-#endif
-  {
-    vp9_push_neon(store_reg);
-  }
-#endif
-
-  vpx_usec_timer_start(&cmptimer);
-
-  cpi->source = NULL;
-
-  cpi->mb.e_mbd.allow_high_precision_mv = ALTREF_HIGH_PRECISION_MV;
-  // Should we code an alternate reference frame?
-  if (cpi->oxcf.play_alternate &&
-      cpi->source_alt_ref_pending) {
-    if ((cpi->source = vp9_lookahead_peek(cpi->lookahead,
-                                          cpi->frames_till_gf_update_due))) {
-      cpi->alt_ref_source = cpi->source;
-      if (cpi->oxcf.arnr_max_frames > 0) {
-        vp9_temporal_filter_prepare_c(cpi,
-                                      cpi->frames_till_gf_update_due);
-        force_src_buffer = &cpi->alt_ref_buffer;
-      }
-      cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
-      cm->refresh_alt_ref_frame = 1;
-      cm->refresh_golden_frame = 0;
-      cm->refresh_last_frame = 0;
-      cm->show_frame = 0;
-      cpi->source_alt_ref_pending = FALSE;   // Clear pending alt ref flag.
-      cpi->is_src_frame_alt_ref = 0;
-    }
-  }
-
-  if (!cpi->source) {
-    if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) {
-      cm->show_frame = 1;
-
-      cpi->is_src_frame_alt_ref = cpi->alt_ref_source
-                                  && (cpi->source == cpi->alt_ref_source);
-
-      if (cpi->is_src_frame_alt_ref)
-        cpi->alt_ref_source = NULL;
-    }
-  }
-
-  if (cpi->source) {
-    cpi->un_scaled_source =
-      cpi->Source = force_src_buffer ? force_src_buffer : &cpi->source->img;
-    *time_stamp = cpi->source->ts_start;
-    *time_end = cpi->source->ts_end;
-    *frame_flags = cpi->source->flags;
-  } else {
-    *size = 0;
-    if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done) {
-      vp9_end_first_pass(cpi);    /* get last stats packet */
-      cpi->twopass.first_pass_done = 1;
-    }
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->rtcd.flags & HAS_NEON)
-#endif
-    {
-      vp9_pop_neon(store_reg);
-    }
-#endif
-    return -1;
-  }
-
-  if (cpi->source->ts_start < cpi->first_time_stamp_ever) {
-    cpi->first_time_stamp_ever = cpi->source->ts_start;
-    cpi->last_end_time_stamp_seen = cpi->source->ts_start;
-  }
-
-  // Adjust the frame rate based on the timestamps given.
-  if (!cm->refresh_alt_ref_frame) {
-    int64_t this_duration;
-    int step = 0;
-
-    if (cpi->source->ts_start == cpi->first_time_stamp_ever) {
-      this_duration = cpi->source->ts_end - cpi->source->ts_start;
-      step = 1;
-    } else {
-      int64_t last_duration;
-
-      this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
-      last_duration = cpi->last_end_time_stamp_seen
-                      - cpi->last_time_stamp_seen;
-      // do a step update if the duration changes by 10%
-      if (last_duration)
-        step = ((this_duration - last_duration) * 10 / last_duration);
-    }
-
-    if (this_duration) {
-      if (step)
-        vp9_new_frame_rate(cpi, 10000000.0 / this_duration);
-      else {
-        double avg_duration, interval;
-
-        /* Average this frame's rate into the last second's average
-         * frame rate. If we haven't seen 1 second yet, then average
-         * over the whole interval seen.
-         */
-        interval = cpi->source->ts_end - cpi->first_time_stamp_ever;
-        if (interval > 10000000.0)
-          interval = 10000000;
-
-        avg_duration = 10000000.0 / cpi->oxcf.frame_rate;
-        avg_duration *= (interval - avg_duration + this_duration);
-        avg_duration /= interval;
-
-        vp9_new_frame_rate(cpi, 10000000.0 / avg_duration);
-      }
-    }
-
-    cpi->last_time_stamp_seen = cpi->source->ts_start;
-    cpi->last_end_time_stamp_seen = cpi->source->ts_end;
-  }
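-  // [Editor's note] Timestamps here are in 1/10000000-second units, so
-  // 10000000.0 / duration converts a frame duration straight to a rate.
-  // Example (editor's illustration): a steady 333333-tick duration is
-  // ~30 fps; if one frame then arrives with this_duration = 400000
-  // (25 fps), step = (400000 - 333333) * 10 / 333333 = 2 (a >= 10%
-  // change), so the rate is stepped immediately rather than averaged
-  // into the last second's window.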
-
-  // start with a 0 size frame
-  *size = 0;
-
-  // Clear down mmx registers
-  vp9_clear_system_state();  // __asm emms;
-
-  cm->frame_type = INTER_FRAME;
-  cm->frame_flags = *frame_flags;
-
-#if 0
-
-  if (cm->refresh_alt_ref_frame) {
-    // cm->refresh_golden_frame = 1;
-    cm->refresh_golden_frame = 0;
-    cm->refresh_last_frame = 0;
-  } else {
-    cm->refresh_golden_frame = 0;
-    cm->refresh_last_frame = 1;
-  }
-
-#endif
-  /* find a free buffer for the new frame */
-  {
-    int i = 0;
-    for (; i < NUM_YV12_BUFFERS; i++) {
-      if (!cm->yv12_fb[i].flags) {
-        cm->new_fb_idx = i;
-        break;
-      }
-    }
-
-    assert(i < NUM_YV12_BUFFERS);
-  }
-  if (cpi->pass == 1) {
-    Pass1Encode(cpi, size, dest, frame_flags);
-  } else if (cpi->pass == 2) {
-    Pass2Encode(cpi, size, dest, frame_flags);
-  } else {
-    encode_frame_to_data_rate(cpi, size, dest, frame_flags);
-  }
-
-  if (cm->refresh_entropy_probs) {
-    if (cm->refresh_alt_ref_frame)
-      vpx_memcpy(&cm->lfc_a, &cm->fc, sizeof(cm->fc));
-    else
-      vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
-  }
-
-  // If it's a dropped frame, honor the requests on subsequent frames.
-  if (*size > 0) {
-    cpi->droppable = !frame_is_reference(cpi);
-
-    // return to normal state
-    cm->refresh_entropy_probs = 1;
-    cm->refresh_alt_ref_frame = 0;
-    cm->refresh_golden_frame = 0;
-    cm->refresh_last_frame = 1;
-    cm->frame_type = INTER_FRAME;
-
-  }
-
-  vpx_usec_timer_mark(&cmptimer);
-  cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
-
-  if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) {
-    generate_psnr_packet(cpi);
-  }
-
-#if CONFIG_INTERNAL_STATS
-
-  if (cpi->pass != 1) {
-    cpi->bytes += *size;
-
-    if (cm->show_frame) {
-
-      cpi->count++;
-
-      if (cpi->b_calculate_psnr) {
-        double ye, ue, ve;
-        double frame_psnr;
-        YV12_BUFFER_CONFIG      *orig = cpi->Source;
-        YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
-        YV12_BUFFER_CONFIG      *pp = &cm->post_proc_buffer;
-        int y_samples = orig->y_height * orig->y_width;
-        int uv_samples = orig->uv_height * orig->uv_width;
-        int t_samples = y_samples + 2 * uv_samples;
-        int64_t sq_error;
-
-        ye = calc_plane_error(orig->y_buffer, orig->y_stride,
-                              recon->y_buffer, recon->y_stride, orig->y_width,
-                              orig->y_height);
-
-        ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
-                              recon->u_buffer, recon->uv_stride, orig->uv_width,
-                              orig->uv_height);
-
-        ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
-                              recon->v_buffer, recon->uv_stride, orig->uv_width,
-                              orig->uv_height);
-
-        sq_error = ye + ue + ve;
-
-        frame_psnr = vp9_mse2psnr(t_samples, 255.0, sq_error);
-
-        cpi->total_y += vp9_mse2psnr(y_samples, 255.0, ye);
-        cpi->total_u += vp9_mse2psnr(uv_samples, 255.0, ue);
-        cpi->total_v += vp9_mse2psnr(uv_samples, 255.0, ve);
-        cpi->total_sq_error += sq_error;
-        cpi->total  += frame_psnr;
-        {
-          double frame_psnr2, frame_ssim2 = 0;
-          double weight = 0;
-#if CONFIG_POSTPROC
-          vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc));
-#endif
-          vp9_clear_system_state();
-
-          ye = calc_plane_error(orig->y_buffer, orig->y_stride,
-                                pp->y_buffer, pp->y_stride, orig->y_width,
-                                orig->y_height);
-
-          ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
-                                pp->u_buffer, pp->uv_stride, orig->uv_width,
-                                orig->uv_height);
-
-          ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
-                                pp->v_buffer, pp->uv_stride, orig->uv_width,
-                                orig->uv_height);
-
-          sq_error = ye + ue + ve;
-
-          frame_psnr2 = vp9_mse2psnr(t_samples, 255.0, sq_error);
-
-          cpi->totalp_y += vp9_mse2psnr(y_samples, 255.0, ye);
-          cpi->totalp_u += vp9_mse2psnr(uv_samples, 255.0, ue);
-          cpi->totalp_v += vp9_mse2psnr(uv_samples, 255.0, ve);
-          cpi->total_sq_error2 += sq_error;
-          cpi->totalp  += frame_psnr2;
-
-          frame_ssim2 = vp9_calc_ssim(cpi->Source,
-                                      &cm->post_proc_buffer, 1, &weight);
-
-          cpi->summed_quality += frame_ssim2 * weight;
-          cpi->summed_weights += weight;
-#if 0
-          {
-            FILE *f = fopen("q_used.stt", "a");
-            fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
-                    cpi->common.current_video_frame, y2, u2, v2,
-                    frame_psnr2, frame_ssim2);
-            fclose(f);
-          }
-#endif
-        }
-      }
-
-      if (cpi->b_calculate_ssimg) {
-        double y, u, v, frame_all;
-        frame_all =  vp9_calc_ssimg(cpi->Source, cm->frame_to_show,
-                                    &y, &u, &v);
-        cpi->total_ssimg_y += y;
-        cpi->total_ssimg_u += u;
-        cpi->total_ssimg_v += v;
-        cpi->total_ssimg_all += frame_all;
-      }
-
-    }
-  }
-
-#endif
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (cm->rtcd.flags & HAS_NEON)
-#endif
-  {
-    vp9_pop_neon(store_reg);
-  }
-#endif
-
-  return 0;
-}
-
-int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
-                              vp9_ppflags_t *flags) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
-
-  if (cpi->common.refresh_alt_ref_frame)
-    return -1;
-  else {
-    int ret;
-#if CONFIG_POSTPROC
-    ret = vp9_post_proc_frame(&cpi->common, dest, flags);
-#else
-
-    if (cpi->common.frame_to_show) {
-      *dest = *cpi->common.frame_to_show;
-      dest->y_width = cpi->common.Width;
-      dest->y_height = cpi->common.Height;
-      dest->uv_height = cpi->common.Height / 2;
-      ret = 0;
-    } else {
-      ret = -1;
-    }
-
-#endif // !CONFIG_POSTPROC
-    vp9_clear_system_state();
-    return ret;
-  }
-}
-
-int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
-                   unsigned int cols, int delta_q[4], int delta_lf[4],
-                   unsigned int threshold[4]) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
-  signed char feature_data[SEG_LVL_MAX][MAX_MB_SEGMENTS];
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  int i;
-
-  if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
-    return -1;
-
-  if (!map) {
-    vp9_disable_segmentation((VP9_PTR)cpi);
-    return 0;
-  }
-
-  // Set the segmentation Map
-  vp9_set_segmentation_map((VP9_PTR)cpi, map);
-
-  // Activate segmentation.
-  vp9_enable_segmentation((VP9_PTR)cpi);
-
-  // Set up the quant segment data
-  feature_data[SEG_LVL_ALT_Q][0] = delta_q[0];
-  feature_data[SEG_LVL_ALT_Q][1] = delta_q[1];
-  feature_data[SEG_LVL_ALT_Q][2] = delta_q[2];
-  feature_data[SEG_LVL_ALT_Q][3] = delta_q[3];
-
-  // Set up the loop filter segment data.
-  feature_data[SEG_LVL_ALT_LF][0] = delta_lf[0];
-  feature_data[SEG_LVL_ALT_LF][1] = delta_lf[1];
-  feature_data[SEG_LVL_ALT_LF][2] = delta_lf[2];
-  feature_data[SEG_LVL_ALT_LF][3] = delta_lf[3];
-
-  cpi->segment_encode_breakout[0] = threshold[0];
-  cpi->segment_encode_breakout[1] = threshold[1];
-  cpi->segment_encode_breakout[2] = threshold[2];
-  cpi->segment_encode_breakout[3] = threshold[3];
-
-  // Enable the loop and quant changes in the feature mask
-  for (i = 0; i < 4; i++) {
-    if (delta_q[i])
-      vp9_enable_segfeature(xd, i, SEG_LVL_ALT_Q);
-    else
-      vp9_disable_segfeature(xd, i, SEG_LVL_ALT_Q);
-
-    if (delta_lf[i])
-      vp9_enable_segfeature(xd, i, SEG_LVL_ALT_LF);
-    else
-      vp9_disable_segfeature(xd, i, SEG_LVL_ALT_LF);
-  }
-
-  // Initialise the feature data structure
-  // (SEGMENT_DELTADATA = 0, SEGMENT_ABSDATA = 1).
-  vp9_set_segment_data((VP9_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
-
-  return 0;
-}
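-// [Editor's note] Minimal usage sketch for vp9_set_roimap (editor's
-// illustration; `encoder` is a hypothetical VP9_PTR and the dimensions
-// must match the encoder's mb_rows/mb_cols; map entries are segment
-// ids 0..3, one byte per MB):
-//
-//   unsigned char map[rows * cols];
-//   memset(map, 0, rows * cols);         // everything in segment 0 ...
-//   map[0] = 1;                          // ... except one MB in segment 1
-//   int delta_q[4]  = { 0, -10, 0, 0 };  // code segment 1 at lower Q
-//   int delta_lf[4] = { 0, 0, 0, 0 };
-//   unsigned int threshold[4] = { 0, 0, 0, 0 };
-//   vp9_set_roimap(encoder, map, rows, cols, delta_q, delta_lf,
-//                  threshold);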
-
-int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
-                       unsigned int rows, unsigned int cols) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
-
-  if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
-    if (map) {
-      vpx_memcpy(cpi->active_map, map, rows * cols);
-      cpi->active_map_enabled = 1;
-    } else
-      cpi->active_map_enabled = 0;
-
-    return 0;
-  } else {
-    // cpi->active_map_enabled = 0;
-    return -1;
-  }
-}
-
-int vp9_set_internal_size(VP9_PTR comp,
-                          VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
-
-  if (horiz_mode <= ONETWO)
-    cpi->common.horiz_scale = horiz_mode;
-  else
-    return -1;
-
-  if (vert_mode <= ONETWO)
-    cpi->common.vert_scale  = vert_mode;
-  else
-    return -1;
-
-  return 0;
-}
-
-
-
-int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) {
-  int i, j;
-  int Total = 0;
-
-  unsigned char *src = source->y_buffer;
-  unsigned char *dst = dest->y_buffer;
-
-  // Loop through the Y plane of the raw and reconstructed data, summing
-  // squared differences.
-  for (i = 0; i < source->y_height; i += 16) {
-    for (j = 0; j < source->y_width; j += 16) {
-      unsigned int sse;
-      Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,
-                            &sse);
-    }
-
-    src += 16 * source->y_stride;
-    dst += 16 * dest->y_stride;
-  }
-
-  return Total;
-}
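-// [Editor's note] The value returned above is a raw sum of squared
-// differences over the Y plane. To express such a sum as PSNR (as the
-// internal stats code does via vp9_mse2psnr), the standard conversion
-// is, as a sketch:
-//
-//   double sse_to_psnr(double samples, double peak, double sse) {
-//     // PSNR = 10 * log10(peak^2 / MSE), with MSE = sse / samples.
-//     return sse > 0.0 ? 10.0 * log10(peak * peak * samples / sse)
-//                      : 99.0;  // arbitrary cap for a perfect match
-//   }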
-
-
-int vp9_get_quantizer(VP9_PTR c) {
-  VP9_COMP   *cpi = (VP9_COMP *) c;
-  return cpi->common.base_qindex;
-}
--- a/vp8/encoder/onyx_int.h
+++ /dev/null
@@ -1,788 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ONYX_INT_H
-#define __INC_ONYX_INT_H
-
-#include <stdio.h>
-#include "vpx_ports/config.h"
-#include "vp8/common/onyx.h"
-#include "treewriter.h"
-#include "tokenize.h"
-#include "vp8/common/onyxc_int.h"
-#include "variance.h"
-#include "encodemb.h"
-#include "quantize.h"
-#include "vp8/common/entropy.h"
-#include "vp8/common/entropymode.h"
-#include "vpx_ports/mem.h"
-#include "vpx/internal/vpx_codec_internal.h"
-#include "mcomp.h"
-#include "temporal_filter.h"
-#include "vp8/common/findnearmv.h"
-#include "lookahead.h"
-
-// #define SPEEDSTATS 1
-#define MIN_GF_INTERVAL             4
-#define DEFAULT_GF_INTERVAL         7
-
-#define KEY_FRAME_CONTEXT 5
-
-#define MAX_LAG_BUFFERS 25
-
-#define AF_THRESH   25
-#define AF_THRESH2  100
-#define ARF_DECAY_THRESH 12
-
-#if CONFIG_PRED_FILTER
-#define MAX_MODES 54
-#else  // CONFIG_PRED_FILTER
-#define MAX_MODES 42
-#endif  // CONFIG_PRED_FILTER
-
-#define MIN_THRESHMULT  32
-#define MAX_THRESHMULT  512
-
-#define GF_ZEROMV_ZBIN_BOOST 12
-#define LF_ZEROMV_ZBIN_BOOST 6
-#define MV_ZBIN_BOOST        4
-#define ZBIN_OQ_MAX 192
-
-#define VP9_TEMPORAL_ALT_REF 1
-
-typedef struct {
-  nmv_context nmvc;
-  int nmvjointcost[MV_JOINTS];
-  int nmvcosts[2][MV_VALS];
-  int nmvcosts_hp[2][MV_VALS];
-
-#ifdef MODE_STATS
-  // Stats
-  int y_modes[VP9_YMODES];
-  int uv_modes[VP9_UV_MODES];
-  int i8x8_modes[VP9_I8X8_MODES];
-  int b_modes[B_MODE_COUNT];
-  int inter_y_modes[MB_MODE_COUNT];
-  int inter_uv_modes[VP9_UV_MODES];
-  int inter_b_modes[B_MODE_COUNT];
-#endif
-
-  vp9_prob segment_pred_probs[PREDICTION_PROBS];
-  unsigned char ref_pred_probs_update[PREDICTION_PROBS];
-  vp9_prob ref_pred_probs[PREDICTION_PROBS];
-  vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
-
-  unsigned char *last_frame_seg_map_copy;
-
-  // 0 = Intra, Last, GF, ARF
-  signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
-  // 0 = BPRED, ZERO_MV, MV, SPLIT
-  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
-
-  vp9_prob coef_probs[BLOCK_TYPES]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs[BLOCK_TYPES]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-
-  vp9_prob coef_probs_8x8[BLOCK_TYPES_8X8]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-
-  vp9_prob coef_probs_16x16[BLOCK_TYPES_16X16]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-
-  vp9_prob ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
-  vp9_prob uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
-  vp9_prob bmode_prob [VP9_BINTRAMODES - 1];
-  vp9_prob i8x8_mode_prob [VP9_I8X8_MODES - 1];
-  vp9_prob sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-  vp9_prob mbsplit_prob [VP9_NUMMBSPLITS - 1];
-
-  vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
-                                 [VP9_SWITCHABLE_FILTERS - 1];
-
-  int mv_ref_ct[6][4][2];
-  int mode_context[6][4];
-  int mv_ref_ct_a[6][4][2];
-  int mode_context_a[6][4];
-
-} CODING_CONTEXT;
-
-typedef struct {
-  double frame;
-  double intra_error;
-  double coded_error;
-  double sr_coded_error;
-  double ssim_weighted_pred_err;
-  double pcnt_inter;
-  double pcnt_motion;
-  double pcnt_second_ref;
-  double pcnt_neutral;
-  double MVr;
-  double mvr_abs;
-  double MVc;
-  double mvc_abs;
-  double MVrv;
-  double MVcv;
-  double mv_in_out_count;
-  double new_mv_count;
-  double duration;
-  double count;
-}
-FIRSTPASS_STATS;
-
-typedef struct {
-  int frames_so_far;
-  double frame_intra_error;
-  double frame_coded_error;
-  double frame_pcnt_inter;
-  double frame_pcnt_motion;
-  double frame_mvr;
-  double frame_mvr_abs;
-  double frame_mvc;
-  double frame_mvc_abs;
-
-} ONEPASS_FRAMESTATS;
-
-typedef struct {
-  struct {
-    int err;
-    union {
-      int_mv mv;
-      MB_PREDICTION_MODE mode;
-    } m;
-  } ref[MAX_REF_FRAMES];
-} MBGRAPH_MB_STATS;
-
-typedef struct {
-  MBGRAPH_MB_STATS *mb_stats;
-} MBGRAPH_FRAME_STATS;
-
-#if CONFIG_PRED_FILTER
-typedef enum {
-  THR_ZEROMV,
-  THR_ZEROMV_FILT,
-  THR_DC,
-
-  THR_NEARESTMV,
-  THR_NEARESTMV_FILT,
-  THR_NEARMV,
-  THR_NEARMV_FILT,
-
-  THR_ZEROG,
-  THR_ZEROG_FILT,
-  THR_NEARESTG,
-  THR_NEARESTG_FILT,
-
-  THR_ZEROA,
-  THR_ZEROA_FILT,
-  THR_NEARESTA,
-  THR_NEARESTA_FILT,
-
-  THR_NEARG,
-  THR_NEARG_FILT,
-  THR_NEARA,
-  THR_NEARA_FILT,
-
-  THR_V_PRED,
-  THR_H_PRED,
-  THR_D45_PRED,
-  THR_D135_PRED,
-  THR_D117_PRED,
-  THR_D153_PRED,
-  THR_D27_PRED,
-  THR_D63_PRED,
-  THR_TM,
-
-  THR_NEWMV,
-  THR_NEWMV_FILT,
-  THR_NEWG,
-  THR_NEWG_FILT,
-  THR_NEWA,
-  THR_NEWA_FILT,
-
-  THR_SPLITMV,
-  THR_SPLITG,
-  THR_SPLITA,
-
-  THR_B_PRED,
-  THR_I8X8_PRED,
-
-  THR_COMP_ZEROLG,
-  THR_COMP_NEARESTLG,
-  THR_COMP_NEARLG,
-
-  THR_COMP_ZEROLA,
-  THR_COMP_NEARESTLA,
-  THR_COMP_NEARLA,
-
-  THR_COMP_ZEROGA,
-  THR_COMP_NEARESTGA,
-  THR_COMP_NEARGA,
-
-  THR_COMP_NEWLG,
-  THR_COMP_NEWLA,
-  THR_COMP_NEWGA,
-
-  THR_COMP_SPLITLG,
-  THR_COMP_SPLITLA,
-  THR_COMP_SPLITGA,
-}
-THR_MODES;
-#else
-typedef enum {
-  THR_ZEROMV,
-  THR_DC,
-
-  THR_NEARESTMV,
-  THR_NEARMV,
-
-  THR_ZEROG,
-  THR_NEARESTG,
-
-  THR_ZEROA,
-  THR_NEARESTA,
-
-  THR_NEARG,
-  THR_NEARA,
-
-  THR_V_PRED,
-  THR_H_PRED,
-  THR_D45_PRED,
-  THR_D135_PRED,
-  THR_D117_PRED,
-  THR_D153_PRED,
-  THR_D27_PRED,
-  THR_D63_PRED,
-  THR_TM,
-
-  THR_NEWMV,
-  THR_NEWG,
-  THR_NEWA,
-
-  THR_SPLITMV,
-  THR_SPLITG,
-  THR_SPLITA,
-
-  THR_B_PRED,
-  THR_I8X8_PRED,
-
-  THR_COMP_ZEROLG,
-  THR_COMP_NEARESTLG,
-  THR_COMP_NEARLG,
-
-  THR_COMP_ZEROLA,
-  THR_COMP_NEARESTLA,
-  THR_COMP_NEARLA,
-
-  THR_COMP_ZEROGA,
-  THR_COMP_NEARESTGA,
-  THR_COMP_NEARGA,
-
-  THR_COMP_NEWLG,
-  THR_COMP_NEWLA,
-  THR_COMP_NEWGA,
-
-  THR_COMP_SPLITLG,
-  THR_COMP_SPLITLA,
-  THR_COMP_SPLITGA
-}
-THR_MODES;
-#endif
-
-typedef enum {
-  DIAMOND = 0,
-  NSTEP = 1,
-  HEX = 2
-} SEARCH_METHODS;
-
-typedef struct {
-  int RD;
-  SEARCH_METHODS search_method;
-  int improved_dct;
-  int auto_filter;
-  int recode_loop;
-  int iterative_sub_pixel;
-  int half_pixel_search;
-  int quarter_pixel_search;
-  int thresh_mult[MAX_MODES];
-  int max_step_search_steps;
-  int first_step;
-  int optimize_coefficients;
-  int no_skip_block4x4_search;
-  int improved_mv_pred;
-  int search_best_filter;
-
-} SPEED_FEATURES;
-
-typedef struct {
-  MACROBLOCK  mb;
-  int totalrate;
-} MB_ROW_COMP;
-
-typedef struct {
-  TOKENEXTRA *start;
-  TOKENEXTRA *stop;
-} TOKENLIST;
-
-typedef struct {
-  int ithread;
-  void *ptr1;
-  void *ptr2;
-} ENCODETHREAD_DATA;
-typedef struct {
-  int ithread;
-  void *ptr1;
-} LPFTHREAD_DATA;
-
-
-typedef struct VP9_ENCODER_RTCD {
-  VP9_COMMON_RTCD            *common;
-  vp9_search_rtcd_vtable_t    search;
-  vp9_temporal_rtcd_vtable_t  temporal;
-} VP9_ENCODER_RTCD;
-
-enum BlockSize {
-  BLOCK_16X8 = PARTITIONING_16X8,
-  BLOCK_8X16 = PARTITIONING_8X16,
-  BLOCK_8X8 = PARTITIONING_8X8,
-  BLOCK_4X4 = PARTITIONING_4X4,
-  BLOCK_16X16,
-  BLOCK_MAX_SEGMENTS,
-  BLOCK_32X32 = BLOCK_MAX_SEGMENTS,
-  BLOCK_MAX_SB_SEGMENTS,
-};
-
-typedef struct VP9_COMP {
-
-  DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
-
-  DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);
-
-  DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
-
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
-
-  DECLARE_ALIGNED(64, short, Y1zbin_8x8[QINDEX_RANGE][64]);
-  DECLARE_ALIGNED(64, short, Y2zbin_8x8[QINDEX_RANGE][64]);
-  DECLARE_ALIGNED(64, short, UVzbin_8x8[QINDEX_RANGE][64]);
-  DECLARE_ALIGNED(64, short, zrun_zbin_boost_y1_8x8[QINDEX_RANGE][64]);
-  DECLARE_ALIGNED(64, short, zrun_zbin_boost_y2_8x8[QINDEX_RANGE][64]);
-  DECLARE_ALIGNED(64, short, zrun_zbin_boost_uv_8x8[QINDEX_RANGE][64]);
-
-  DECLARE_ALIGNED(16, short, Y1zbin_16x16[QINDEX_RANGE][256]);
-  DECLARE_ALIGNED(16, short, Y2zbin_16x16[QINDEX_RANGE][256]);
-  DECLARE_ALIGNED(16, short, UVzbin_16x16[QINDEX_RANGE][256]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_16x16[QINDEX_RANGE][256]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_16x16[QINDEX_RANGE][256]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_16x16[QINDEX_RANGE][256]);
-
-  MACROBLOCK mb;
-  VP9_COMMON common;
-  VP9_CONFIG oxcf;
-
-  struct lookahead_ctx    *lookahead;
-  struct lookahead_entry  *source;
-  struct lookahead_entry  *alt_ref_source;
-
-  YV12_BUFFER_CONFIG *Source;
-  YV12_BUFFER_CONFIG *un_scaled_source;
-  YV12_BUFFER_CONFIG scaled_source;
-
-  int source_alt_ref_pending; // frame in src_buffers identified for coding as an alt ref
-  int source_alt_ref_active;  // an alt ref frame has been encoded and is usable
-
-  int is_src_frame_alt_ref;   // source frame is an exact copy of an alt ref frame
-
-  int gold_is_last; // golden frame same as last frame (short-circuit gold searches)
-  int alt_is_last;  // alt reference frame same as last (short-circuit altref search)
-  int gold_is_alt;  // don't do both alt and gold search (just do gold)
-
-  // int refresh_alt_ref_frame;
-  YV12_BUFFER_CONFIG last_frame_uf;
-
-  TOKENEXTRA *tok;
-  unsigned int tok_count;
-
-
-  unsigned int frames_since_key;
-  unsigned int key_frame_frequency;
-  unsigned int this_key_frame_forced;
-  unsigned int next_key_frame_forced;
-
-  // Ambient reconstruction err target for force key frames
-  int ambient_err;
-
-  unsigned int mode_check_freq[MAX_MODES];
-  unsigned int mode_test_hit_counts[MAX_MODES];
-  unsigned int mode_chosen_counts[MAX_MODES];
-
-  int rd_thresh_mult[MAX_MODES];
-  int rd_baseline_thresh[MAX_MODES];
-  int rd_threshes[MAX_MODES];
-  int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
-  int rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
-  int comp_pred_count[COMP_PRED_CONTEXTS];
-  int single_pred_count[COMP_PRED_CONTEXTS];
-  // FIXME contextualize
-  int txfm_count[TX_SIZE_MAX];
-  int txfm_count_8x8p[TX_SIZE_MAX - 1];
-  int64_t rd_tx_select_diff[NB_TXFM_MODES];
-  int rd_tx_select_threshes[4][NB_TXFM_MODES];
-
-  int RDMULT;
-  int RDDIV;
-
-  CODING_CONTEXT coding_context;
-
-  // Rate targeting variables
-  int64_t prediction_error;
-  int64_t last_prediction_error;
-  int64_t intra_error;
-  int64_t last_intra_error;
-
-  int this_frame_target;
-  int projected_frame_size;
-  int last_q[2];                   // Separate values for Intra/Inter
-  int last_boosted_qindex;         // Last boosted GF/KF/ARF q
-
-  double rate_correction_factor;
-  double key_frame_rate_correction_factor;
-  double gf_rate_correction_factor;
-
-  int frames_till_gf_update_due;      // Count down till next GF
-  int current_gf_interval;          // GF interval chosen when we coded the last GF
-
-  int gf_overspend_bits;            // Total bits overspent because of GF boost (cumulative)
-
-  int non_gf_bitrate_adjustment;     // Used in the few frames following a GF to recover the extra bits spent in that GF
-
-  int kf_overspend_bits;            // Extra bits spent on key frames that need to be recovered on inter frames
-  int kf_bitrate_adjustment;        // Current number of bits to try to recover on each inter frame.
-  int max_gf_interval;
-  int baseline_gf_interval;
-  int active_arnr_frames;           // <= cpi->oxcf.arnr_max_frames
-
-  int64_t key_frame_count;
-  int prior_key_frame_distance[KEY_FRAME_CONTEXT];
-  int per_frame_bandwidth;          // Current section per frame bandwidth target
-  int av_per_frame_bandwidth;        // Average frame size target for clip
-  int min_frame_bandwidth;          // Minimum allocation that should be used for any frame
-  int inter_frame_target;
-  double output_frame_rate;
-  int64_t last_time_stamp_seen;
-  int64_t last_end_time_stamp_seen;
-  int64_t first_time_stamp_ever;
-
-  int ni_av_qi;
-  int ni_tot_qi;
-  int ni_frames;
-  int avg_frame_qindex;
-  double tot_q;
-  double avg_q;
-
-  int zbin_over_quant;
-  int zbin_mode_boost;
-  int zbin_mode_boost_enabled;
-
-  int64_t total_byte_count;
-
-  int buffered_mode;
-
-  int buffer_level;
-  int bits_off_target;
-
-  int rolling_target_bits;
-  int rolling_actual_bits;
-
-  int long_rolling_target_bits;
-  int long_rolling_actual_bits;
-
-  int64_t total_actual_bits;
-  int total_target_vs_actual;        // debug stats
-
-  int worst_quality;
-  int active_worst_quality;
-  int best_quality;
-  int active_best_quality;
-
-  int cq_target_quality;
-
-#if CONFIG_SUPERBLOCKS
-  int sb_count;
-  int sb_ymode_count [VP9_I32X32_MODES];
-#endif
-  int ymode_count [VP9_YMODES];        /* intra MB type cts this frame */
-  int bmode_count [VP9_BINTRAMODES];
-  int i8x8_mode_count [VP9_I8X8_MODES];
-  int sub_mv_ref_count [SUBMVREF_COUNT][VP9_SUBMVREFS];
-  int mbsplit_count [VP9_NUMMBSPLITS];
-  // int uv_mode_count[VP9_UV_MODES];       /* intra MB type cts this frame */
-  int y_uv_mode_count[VP9_YMODES][VP9_UV_MODES];
-
-  nmv_context_counts NMVcount;
-
-  unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-  unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_hybrid_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-
-  unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_branch_ct_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-  unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_hybrid_branch_ct_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-
-  unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-  unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_hybrid_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-
-  int gfu_boost;
-  int last_boost;
-  int kf_boost;
-  int kf_zeromotion_pct;
-
-  int target_bandwidth;
-  struct vpx_codec_pkt_list  *output_pkt_list;
-
-#if 0
-  // Experimental code for lagged and one pass
-  ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
-  int one_pass_frame_index;
-#endif
-  MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
-  int mbgraph_n_frames;             // number of frames filled in the above
-  int static_mb_pct;                // % forced skip mbs by segmentation
-  int seg0_progress, seg0_idx, seg0_cnt;
-  int ref_pred_count[3][2];
-
-  int decimation_factor;
-  int decimation_count;
-
-  // for real time encoding
-  int avg_encode_time;              // microsecond
-  int avg_pick_mode_time;            // microsecond
-  int Speed;
-  unsigned int cpu_freq;           // MHz
-  int compressor_speed;
-
-  int interquantizer;
-  int goldfreq;
-  int auto_worst_q;
-  int cpu_used;
-  int horiz_scale;
-  int vert_scale;
-  int pass;
-
-  vp9_prob last_skip_false_probs[3][MBSKIP_CONTEXTS];
-  int last_skip_probs_q[3];
-
-  int recent_ref_frame_usage[MAX_REF_FRAMES];
-  int count_mb_ref_frame_usage[MAX_REF_FRAMES];
-  int ref_frame_flags;
-
-  unsigned char ref_pred_probs_update[PREDICTION_PROBS];
-
-  SPEED_FEATURES sf;
-  int error_bins[1024];
-
-  // Data used for real time conferencing mode to help determine if it would be good to update the gf
-  int inter_zz_count;
-  int gf_bad_count;
-  int gf_update_recommended;
-  int skip_true_count[3];
-  int skip_false_count[3];
-
-  unsigned char *segmentation_map;
-
-  // segment threshold for encode breakout
-  int  segment_encode_breakout[MAX_MB_SEGMENTS];
-
-  unsigned char *active_map;
-  unsigned int active_map_enabled;
-
-  TOKENLIST *tplist;
-
-  fractional_mv_step_fp *find_fractional_mv_step;
-  vp9_full_search_fn_t full_search_sad;
-  vp9_refining_search_fn_t refining_search_sad;
-  vp9_diamond_search_fn_t diamond_search_sad;
-  vp9_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SB_SEGMENTS];
-  uint64_t time_receive_data;
-  uint64_t time_compress_data;
-  uint64_t time_pick_lpf;
-  uint64_t time_encode_mb_row;
-
-  int base_skip_false_prob[QINDEX_RANGE][3];
-
-  struct twopass_rc {
-    unsigned int section_intra_rating;
-    unsigned int next_iiratio;
-    unsigned int this_iiratio;
-    FIRSTPASS_STATS *total_stats;
-    FIRSTPASS_STATS *this_frame_stats;
-    FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
-    FIRSTPASS_STATS *total_left_stats;
-    int first_pass_done;
-    int64_t bits_left;
-    int64_t clip_bits_total;
-    double avg_iiratio;
-    double modified_error_total;
-    double modified_error_used;
-    double modified_error_left;
-    double kf_intra_err_min;
-    double gf_intra_err_min;
-    int frames_to_key;
-    int maxq_max_limit;
-    int maxq_min_limit;
-    int static_scene_max_gf_interval;
-    int kf_bits;
-    int gf_group_error_left;           // Remaining error from uncoded frames in a gf group. Two pass use only
-
-    // Projected total bits available for a key frame group of frames
-    int64_t kf_group_bits;
-
-    // Error score of frames still to be coded in kf group
-    int64_t kf_group_error_left;
-
-    int gf_group_bits;                // Projected Bits available for a group of frames including 1 GF or ARF
-    int gf_bits;                     // Bits for the golden frame or ARF - 2 pass only
-    int alt_extra_bits;
-
-    int sr_update_lag;
-    double est_max_qcorrection_factor;
-  } twopass;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-  VP9_ENCODER_RTCD            rtcd;
-#endif
-#if VP9_TEMPORAL_ALT_REF
-  YV12_BUFFER_CONFIG alt_ref_buffer;
-  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
-  int fixed_divide[512];
-#endif
-
-#if CONFIG_INTERNAL_STATS
-  int    count;
-  double total_y;
-  double total_u;
-  double total_v;
-  double total;
-  double total_sq_error;
-  double totalp_y;
-  double totalp_u;
-  double totalp_v;
-  double totalp;
-  double total_sq_error2;
-  int    bytes;
-  double summed_quality;
-  double summed_weights;
-  unsigned int tot_recode_hits;
-
-
-  double total_ssimg_y;
-  double total_ssimg_u;
-  double total_ssimg_v;
-  double total_ssimg_all;
-
-  int b_calculate_ssimg;
-#endif
-  int b_calculate_psnr;
-
-  // Per MB activity measurement
-  unsigned int activity_avg;
-  unsigned int *mb_activity_map;
-  int *mb_norm_activity_map;
-
-  // Record of which MBs still refer to last golden frame either
-  // directly or through 0,0
-  unsigned char *gf_active_flags;
-  int gf_active_count;
-
-  int output_partition;
-
-  // Store last frame's MV info for next frame MV prediction
-  int_mv *lfmv;
-  int *lf_ref_frame_sign_bias;
-  int *lf_ref_frame;
-
-  /* force next frame to intra when kf_auto says so */
-  int force_next_frame_intra;
-
-  int droppable;
-
-  // TODO Do we still need this??
-  int update_context;
-
-  int dummy_packing;    /* flag to indicate if packing is dummy */
-
-#if CONFIG_PRED_FILTER
-  int pred_filter_on_count;
-  int pred_filter_off_count;
-#endif
-  unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1]
-                                      [VP9_SWITCHABLE_FILTERS];
-
-#if CONFIG_NEW_MVREF
-  unsigned int best_ref_index_counts[MAX_REF_FRAMES][MAX_MV_REFS];
-#endif
-
-} VP9_COMP;
-
-void vp9_encode_frame(VP9_COMP *cpi);
-
-void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
-                        unsigned long *size);
-
-void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x);
-
-void vp9_tokenize_mb(VP9_COMP *, MACROBLOCKD *, TOKENEXTRA **, int dry_run);
-void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run);
-
-void vp9_set_speed_features(VP9_COMP *cpi);
-
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval,expr) do {\
-    lval = (expr); \
-    if(!lval) \
-      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
-                         "Failed to allocate "#lval" at %s:%d", \
-                         __FILE__,__LINE__);\
-  } while(0)
-#else
-#define CHECK_MEM_ERROR(lval,expr) do {\
-    lval = (expr); \
-    if(!lval) \
-      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
-                         "Failed to allocate "#lval);\
-  } while(0)
-#endif
-#endif  // __INC_ONYX_INT_H
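
A minimal usage sketch of the CHECK_MEM_ERROR macro above, assuming a VP9_COMP *cpi is in scope (the macro body references cpi->common.error); the count n_tokens is a hypothetical value for illustration only:

    /* Hypothetical allocation guarded by CHECK_MEM_ERROR: the macro assigns,
     * then reports through vpx_internal_error() if the result is NULL,
     * naming the lvalue in the error message. */
    CHECK_MEM_ERROR(cpi->tok, vpx_calloc(n_tokens, sizeof(*cpi->tok)));
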
--- a/vp8/encoder/picklpf.c
+++ /dev/null
@@ -1,420 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/onyxc_int.h"
-#include "onyx_int.h"
-#include "quantize.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/yv12extend.h"
-#include "vpx_scale/vpxscale.h"
-#include "vp8/common/alloccommon.h"
-#include "vp8/common/loopfilter.h"
-#if ARCH_ARM
-#include "vpx_ports/arm.h"
-#endif
-
-extern int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source,
-                           YV12_BUFFER_CONFIG *dest);
-#if HAVE_ARMV7
-extern void vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
-#endif
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
-extern void(*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc,
-                                              YV12_BUFFER_CONFIG *dst_ybc,
-                                              int fraction);
-
-void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
-                                 YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
-  unsigned char *src_y, *dst_y;
-  int yheight;
-  int ystride;
-  int border;
-  int yoffset;
-  int linestocopy;
-
-  border   = src_ybc->border;
-  yheight  = src_ybc->y_height;
-  ystride  = src_ybc->y_stride;
-
-  linestocopy = (yheight >> (Fraction + 4));
-
-  if (linestocopy < 1)
-    linestocopy = 1;
-
-  linestocopy <<= 4;
-
-  yoffset  = ystride * ((yheight >> 5) * 16 - 8);
-  src_y = src_ybc->y_buffer + yoffset;
-  dst_y = dst_ybc->y_buffer + yoffset;
-
-  vpx_memcpy(dst_y, src_y, ystride * (linestocopy + 16));
-}
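
The Fraction argument above controls how large a horizontal band gets copied: linestocopy is rounded down to a multiple of 16 rows with a floor of one macroblock row, and yoffset places the band around the middle of the luma plane. A self-contained worked example of that arithmetic, with assumed frame dimensions:

    /* Worked example of the partial-copy arithmetic (values assumed). */
    #include <stdio.h>

    int main(void) {
      int yheight = 720, ystride = 768, Fraction = 3;
      int linestocopy = (yheight >> (Fraction + 4));      /* 720 >> 7 = 5 */
      if (linestocopy < 1)
        linestocopy = 1;
      linestocopy <<= 4;                                  /* 80 rows */
      int yoffset = ystride * ((yheight >> 5) * 16 - 8);  /* starts at row 344 */
      printf("rows=%d offset=%d bytes=%d\n",
             linestocopy, yoffset, ystride * (linestocopy + 16));
      return 0;  /* prints rows=80 offset=264192 bytes=73728 */
    }
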
-
-static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
-                                YV12_BUFFER_CONFIG *dest, int Fraction) {
-  int i, j;
-  int Total = 0;
-  int srcoffset, dstoffset;
-  unsigned char *src = source->y_buffer;
-  unsigned char *dst = dest->y_buffer;
-
-  int linestocopy = (source->y_height >> (Fraction + 4));
-
-  if (linestocopy < 1)
-    linestocopy = 1;
-
-  linestocopy <<= 4;
-
-
-  srcoffset = source->y_stride   * (dest->y_height >> 5) * 16;
-  dstoffset = dest->y_stride     * (dest->y_height >> 5) * 16;
-
-  src += srcoffset;
-  dst += dstoffset;
-
-  // Loop through the Y plane raw and reconstruction data, summing squared differences
-  for (i = 0; i < linestocopy; i += 16) {
-    for (j = 0; j < source->y_width; j += 16) {
-      unsigned int sse;
-      Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,
-                            &sse);
-    }
-
-    src += 16 * source->y_stride;
-    dst += 16 * dest->y_stride;
-  }
-
-  return Total;
-}
-
-// Enforce a minimum filter level based upon baseline Q
-static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
-  int min_filter_level;
-  /*int q = (int) vp9_convert_qindex_to_q(base_qindex);
-
-  if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && !cpi->common.refresh_alt_ref_frame)
-      min_filter_level = 0;
-  else
-  {
-      if (q <= 10)
-          min_filter_level = 0;
-      else if (q <= 64)
-          min_filter_level = 1;
-      else
-          min_filter_level = (q >> 6);
-  }
-  */
-  min_filter_level = 0;
-
-  return min_filter_level;
-}
-
-// Enforce a maximum filter level based upon baseline Q
-static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) {
-  // PGW August 2006: The highest filter values are almost always a bad idea
-
-  // jbb chg: 20100118 - not so any more with this overquant stuff; allow high
-  // values when lots of intra is coming in.
-  int max_filter_level = MAX_LOOP_FILTER;// * 3 / 4;
-  (void)base_qindex;
-
-  if (cpi->twopass.section_intra_rating > 8)
-    max_filter_level = MAX_LOOP_FILTER * 3 / 4;
-
-  return max_filter_level;
-}
-
-void vp9_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  int best_err = 0;
-  int filt_err = 0;
-  int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
-  int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
-  int filt_val;
-  int best_filt_val = cm->filter_level;
-
-  //  Make a copy of the unfiltered / processed recon buffer
-  vp9_yv12_copy_partial_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf, 3);
-
-  if (cm->frame_type == KEY_FRAME)
-    cm->sharpness_level = 0;
-  else
-    cm->sharpness_level = cpi->oxcf.Sharpness;
-
-  if (cm->sharpness_level != cm->last_sharpness_level) {
-    vp9_loop_filter_update_sharpness(&cm->lf_info, cm->sharpness_level);
-    cm->last_sharpness_level = cm->sharpness_level;
-  }
-
-  // Start the search at the previous frame filter level unless it is now out of range.
-  if (cm->filter_level < min_filter_level)
-    cm->filter_level = min_filter_level;
-  else if (cm->filter_level > max_filter_level)
-    cm->filter_level = max_filter_level;
-
-  filt_val = cm->filter_level;
-  best_filt_val = filt_val;
-
-  // Get the err using the previous frame's filter value.
-  vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
-
-  best_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
-
-  //  Re-instate the unfiltered frame
-  vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
-
-  filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
-
-  // Search lower filter levels
-  while (filt_val >= min_filter_level) {
-    // Apply the loop filter
-    vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
-
-    // Get the err for filtered frame
-    filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
-
-    //  Re-instate the unfiltered frame
-    vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
-
-
-    // Update the best case record or exit loop.
-    if (filt_err < best_err) {
-      best_err = filt_err;
-      best_filt_val = filt_val;
-    } else
-      break;
-
-    // Adjust filter level
-    filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
-  }
-
-  // Search up (note that we have already done filt_val = cm->filter_level)
-  filt_val = cm->filter_level + (1 + ((filt_val > 10) ? 1 : 0));
-
-  if (best_filt_val == cm->filter_level) {
-    // Resist raising filter level for very small gains
-    best_err -= (best_err >> 10);
-
-    while (filt_val < max_filter_level) {
-      // Apply the loop filter
-      vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
-
-      // Get the err for filtered frame
-      filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
-
-      //  Re-instate the unfiltered frame
-      vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf,
-                                      cm->frame_to_show, 3);
-
-      // Update the best case record or exit loop.
-      if (filt_err < best_err) {
-        // Do not raise filter level if improvement is < 1 part in 1024
-        best_err = filt_err - (filt_err >> 10);
-
-        best_filt_val = filt_val;
-      } else
-        break;
-
-      // Adjust filter level
-      filt_val += (1 + ((filt_val > 10) ? 1 : 0));
-    }
-  }
-
-  cm->filter_level = best_filt_val;
-
-  if (cm->filter_level < min_filter_level)
-    cm->filter_level = min_filter_level;
-
-  if (cm->filter_level > max_filter_level)
-    cm->filter_level = max_filter_level;
-}
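
The function above is a hill climb: evaluate the previous frame's level, walk downward while the partial-frame error keeps improving, and only when nothing below wins, walk upward with a penalty that resists raising the level for marginal gains. A control-flow sketch under those assumptions (err() stands in for the filter/measure/restore sequence; it is not the project API):

    /* Hedged sketch of the fast level search; err(level) abstracts
     * "loop-filter the partial frame, measure SSE vs. source, restore". */
    static int lf_step(int v) { return 1 + (v > 10); }

    static int pick_level_fast(int prev, int lo, int hi, int (*err)(int)) {
      int best = prev, best_err = err(prev), v;
      for (v = prev - lf_step(prev); v >= lo; v -= lf_step(v)) {  /* down */
        int e = err(v);
        if (e >= best_err) break;
        best_err = e;
        best = v;
      }
      if (best == prev) {                                         /* up */
        best_err -= best_err >> 10;  /* resist raising for tiny gains */
        for (v = prev + lf_step(prev); v < hi; v += lf_step(v)) {
          int e = err(v);
          if (e >= best_err) break;
          best_err = e - (e >> 10);  /* demand > 1/1024 improvement */
          best = v;
        }
      }
      return best;
    }
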
-
-// Stub function for now Alt LF not used
-void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) {
-}
-
-void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  int best_err = 0;
-  int filt_err = 0;
-  int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
-  int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
-
-  int filter_step;
-  int filt_high = 0;
-  int filt_mid = cm->filter_level;      // Start search at previous frame filter level
-  int filt_low = 0;
-  int filt_best;
-  int filt_direction = 0;
-
-  int Bias = 0;                       // Bias against raising loop filter and in favour of lowering it
-
-  //  Make a copy of the unfiltered / processed recon buffer
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (cm->rtcd.flags & HAS_NEON)
-#endif
-  {
-    vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(cm->frame_to_show, &cpi->last_frame_uf);
-  }
-#if CONFIG_RUNTIME_CPU_DETECT
-  else
-#endif
-#endif
-#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
-  {
-    vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf);
-  }
-#endif
-
-  if (cm->frame_type == KEY_FRAME)
-    cm->sharpness_level = 0;
-  else
-    cm->sharpness_level = cpi->oxcf.Sharpness;
-
-  // Start the search at the previous frame filter level unless it is now out of range.
-  filt_mid = cm->filter_level;
-
-  if (filt_mid < min_filter_level)
-    filt_mid = min_filter_level;
-  else if (filt_mid > max_filter_level)
-    filt_mid = max_filter_level;
-
-  // Define the initial step size
-  filter_step = (filt_mid < 16) ? 4 : filt_mid / 4;
-
-  // Get baseline error score
-  vp9_set_alt_lf_level(cpi, filt_mid);
-  vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid);
-
-  best_err = vp9_calc_ss_err(sd, cm->frame_to_show);
-  filt_best = filt_mid;
-
-  //  Re-instate the unfiltered frame
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-  if (cm->rtcd.flags & HAS_NEON)
-#endif
-  {
-    vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
-  }
-#if CONFIG_RUNTIME_CPU_DETECT
-  else
-#endif
-#endif
-#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
-  {
-    vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
-  }
-#endif
-
-  while (filter_step > 0) {
-    Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; // PGW change 12/12/06 for small images
-
-    // jbb chg: 20100118 - in sections with lots of new material coming in, don't bias as much towards a low filter value
-    if (cpi->twopass.section_intra_rating < 20)
-      Bias = Bias * cpi->twopass.section_intra_rating / 20;
-
-    // yx, bias less for large block size
-    if (cpi->common.txfm_mode != ONLY_4X4)
-      Bias >>= 1;
-
-    filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
-    filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step);
-
-    if ((filt_direction <= 0) && (filt_low != filt_mid)) {
-      // Get Low filter error score
-      vp9_set_alt_lf_level(cpi, filt_low);
-      vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);
-
-      filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
-
-      //  Re-instate the unfiltered frame
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-      if (cm->rtcd.flags & HAS_NEON)
-#endif
-      {
-        vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
-      }
-#if CONFIG_RUNTIME_CPU_DETECT
-      else
-#endif
-#endif
-#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
-      {
-        vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
-      }
-#endif
-
-      // If value is close to the best so far then bias towards a lower loop filter value.
-      if ((filt_err - Bias) < best_err) {
-        // Was it actually better than the previous best?
-        if (filt_err < best_err)
-          best_err = filt_err;
-
-        filt_best = filt_low;
-      }
-    }
-
-    // Now look at filt_high
-    if ((filt_direction >= 0) && (filt_high != filt_mid)) {
-      vp9_set_alt_lf_level(cpi, filt_high);
-      vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);
-
-      filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
-
-      //  Re-instate the unfiltered frame
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-      if (cm->rtcd.flags & HAS_NEON)
-#endif
-      {
-        vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
-      }
-#if CONFIG_RUNTIME_CPU_DETECT
-      else
-#endif
-#endif
-#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
-      {
-        vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
-      }
-#endif
-
-      // Was it better than the previous best?
-      if (filt_err < (best_err - Bias)) {
-        best_err = filt_err;
-        filt_best = filt_high;
-      }
-    }
-
-    // Halve the step distance if the best filter value was the same as last time
-    if (filt_best == filt_mid) {
-      filter_step = filter_step / 2;
-      filt_direction = 0;
-    } else {
-      filt_direction = (filt_best < filt_mid) ? -1 : 1;
-      filt_mid = filt_best;
-    }
-  }
-
-  cm->filter_level = filt_best;
-}
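
The loop above is a bisection around filt_mid with an asymmetric acceptance rule: a lower level may be up to Bias worse and still win, while a higher level must beat the best by more than Bias; when the midpoint survives, the step halves. A worked Bias computation with assumed values:

    /* Illustrative Bias arithmetic from the search loop (values assumed). */
    #include <stdio.h>

    int main(void) {
      int best_err = 400000, filt_mid = 24, filter_step = 6;
      int Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
      /* (400000 >> 12) * 6 = 97 * 6 = 582 */
      Bias = Bias * 10 / 20;  /* section_intra_rating = 10 (< 20): 291 */
      Bias >>= 1;             /* txfm_mode != ONLY_4X4: 145 */
      printf("Bias = %d\n", Bias);
      return 0;
    }
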
-
--- a/vp8/encoder/ppc/csystemdependent.c
+++ /dev/null
@@ -1,155 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-SADFunction *vp9_sad16x16;
-SADFunction *vp9_sad16x8;
-SADFunction *vp9_sad8x16;
-SADFunction *vp9_sad8x8;
-SADFunction *vp9_sad4x4;
-
-variance_function *vp9_variance4x4;
-variance_function *vp9_variance8x8;
-variance_function *vp9_variance8x16;
-variance_function *vp9_variance16x8;
-variance_function *vp9_variance16x16;
-
-variance_function *vp9_mse16x16;
-
-sub_pixel_variance_function *vp9_sub_pixel_variance4x4;
-sub_pixel_variance_function *vp9_sub_pixel_variance8x8;
-sub_pixel_variance_function *vp9_sub_pixel_variance8x16;
-sub_pixel_variance_function *vp9_sub_pixel_variance16x8;
-sub_pixel_variance_function *vp9_sub_pixel_variance16x16;
-
-int (*vp9_block_error)(short *coeff, short *dqcoeff);
-int (*vp9_mbblock_error)(MACROBLOCK *mb, int dc);
-
-int (*vp9_mbuverror)(MACROBLOCK *mb);
-unsigned int (*vp9_get_mb_ss)(short *);
-void (*vp9_short_fdct4x4)(short *input, short *output, int pitch);
-void (*vp9_short_fdct8x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
-void (*short_walsh4x4)(short *input, short *output, int pitch);
-
-void (*vp9_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
-void (*vp9_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
-void (*vp9_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
-
-// c imports
-extern int block_error_c(short *coeff, short *dqcoeff);
-extern int vp9_mbblock_error_c(MACROBLOCK *mb, int dc);
-
-extern int vp9_mbuverror_c(MACROBLOCK *mb);
-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-extern void short_fdct4x4_c(short *input, short *output, int pitch);
-extern void short_fdct8x4_c(short *input, short *output, int pitch);
-extern void vp9_short_walsh4x4_c(short *input, short *output, int pitch);
-
-extern void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
-extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
-
-extern SADFunction sad16x16_c;
-extern SADFunction sad16x8_c;
-extern SADFunction sad8x16_c;
-extern SADFunction sad8x8_c;
-extern SADFunction sad4x4_c;
-
-extern variance_function variance16x16_c;
-extern variance_function variance8x16_c;
-extern variance_function variance16x8_c;
-extern variance_function variance8x8_c;
-extern variance_function variance4x4_c;
-extern variance_function mse16x16_c;
-
-extern sub_pixel_variance_function sub_pixel_variance4x4_c;
-extern sub_pixel_variance_function sub_pixel_variance8x8_c;
-extern sub_pixel_variance_function sub_pixel_variance8x16_c;
-extern sub_pixel_variance_function sub_pixel_variance16x8_c;
-extern sub_pixel_variance_function sub_pixel_variance16x16_c;
-
-extern unsigned int vp9_get_mb_ss_c(short *);
-
-// ppc
-extern int vp9_block_error_ppc(short *coeff, short *dqcoeff);
-
-extern void vp9_short_fdct4x4_ppc(short *input, short *output, int pitch);
-extern void vp9_short_fdct8x4_ppc(short *input, short *output, int pitch);
-
-extern void vp9_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void vp9_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-
-extern SADFunction vp9_sad16x16_ppc;
-extern SADFunction vp9_sad16x8_ppc;
-extern SADFunction vp9_sad8x16_ppc;
-extern SADFunction vp9_sad8x8_ppc;
-extern SADFunction vp9_sad4x4_ppc;
-
-extern variance_function vp9_variance16x16_ppc;
-extern variance_function vp9_variance8x16_ppc;
-extern variance_function vp9_variance16x8_ppc;
-extern variance_function vp9_variance8x8_ppc;
-extern variance_function vp9_variance4x4_ppc;
-extern variance_function vp9_mse16x16_ppc;
-
-extern sub_pixel_variance_function vp9_sub_pixel_variance4x4_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance8x8_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance8x16_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance16x8_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance16x16_ppc;
-
-extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-
-void vp9_cmachine_specific_config(void) {
-  // Pure C:
-  vp9_mbuverror               = vp9_mbuverror_c;
-  vp8_fast_quantize_b           = vp8_fast_quantize_b_c;
-  vp9_short_fdct4x4            = vp9_short_fdct4x4_ppc;
-  vp9_short_fdct8x4            = vp9_short_fdct8x4_ppc;
-  vp8_fast_fdct4x4             = vp9_short_fdct4x4_ppc;
-  vp8_fast_fdct8x4             = vp9_short_fdct8x4_ppc;
-  short_walsh4x4               = vp9_short_walsh4x4_c;
-
-  vp9_variance4x4             = vp9_variance4x4_ppc;
-  vp9_variance8x8             = vp9_variance8x8_ppc;
-  vp9_variance8x16            = vp9_variance8x16_ppc;
-  vp9_variance16x8            = vp9_variance16x8_ppc;
-  vp9_variance16x16           = vp9_variance16x16_ppc;
-  vp9_mse16x16                = vp9_mse16x16_ppc;
-
-  vp9_sub_pixel_variance4x4     = vp9_sub_pixel_variance4x4_ppc;
-  vp9_sub_pixel_variance8x8     = vp9_sub_pixel_variance8x8_ppc;
-  vp9_sub_pixel_variance8x16    = vp9_sub_pixel_variance8x16_ppc;
-  vp9_sub_pixel_variance16x8    = vp9_sub_pixel_variance16x8_ppc;
-  vp9_sub_pixel_variance16x16   = vp9_sub_pixel_variance16x16_ppc;
-
-  vp9_get_mb_ss                 = vp9_get_mb_ss_c;
-
-  vp9_sad16x16                = vp9_sad16x16_ppc;
-  vp9_sad16x8                 = vp9_sad16x8_ppc;
-  vp9_sad8x16                 = vp9_sad8x16_ppc;
-  vp9_sad8x8                  = vp9_sad8x8_ppc;
-  vp9_sad4x4                  = vp9_sad4x4_ppc;
-
-  vp9_block_error              = vp9_block_error_ppc;
-  vp9_mbblock_error            = vp9_mbblock_error_c;
-
-  vp9_subtract_b               = vp9_subtract_b_c;
-  vp9_subtract_mby             = vp9_subtract_mby_ppc;
-  vp9_subtract_mbuv            = vp9_subtract_mbuv_ppc;
-}
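
csystemdependent.c is the pre-RTCD dispatch pattern: one global function pointer per SIMD-able primitive, pointed at the best available implementation by a single config call. A stripped-down sketch of the same pattern with hypothetical names:

    /* Hedged sketch of the dispatch pattern; block_error_ptr and
     * machine_config_sketch are illustrative names, not project symbols. */
    typedef int (*block_error_fn)(short *coeff, short *dqcoeff);

    static int block_error_ref(short *coeff, short *dqcoeff) {
      int i, err = 0;
      for (i = 0; i < 16; i++) {       /* one 4x4 block of coefficients */
        int d = coeff[i] - dqcoeff[i];
        err += d * d;
      }
      return err;
    }

    block_error_fn block_error_ptr;

    void machine_config_sketch(void) {
      /* On PPC this slot would get vp9_block_error_ppc instead. */
      block_error_ptr = block_error_ref;
    }
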
--- a/vp8/encoder/ppc/encodemb_altivec.asm
+++ /dev/null
@@ -1,153 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_subtract_mbuv_ppc
-    .globl vp8_subtract_mby_ppc
-
-;# r3 short *diff
-;# r4 unsigned char *usrc
-;# r5 unsigned char *vsrc
-;# r6 unsigned char *pred
-;# r7 int stride
-vp8_subtract_mbuv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf000
-    mtspr   256, r12            ;# set VRSAVE
-
-    li      r9, 256
-    add     r3, r3, r9
-    add     r3, r3, r9
-    add     r6, r6, r9
-
-    li      r10, 16
-    li      r9,  4
-    mtctr   r9
-
-    vspltisw v0, 0
-
-mbu_loop:
-    lvsl    v5, 0, r4           ;# permutation value for alignment
-    lvx     v1, 0, r4           ;# src
-    lvx     v2, 0, r6           ;# pred
-
-    add     r4, r4, r7
-    addi    r6, r6, 16
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrghb  v4, v0, v2          ;# unpack high pred to short
-
-    lvsl    v5, 0, r4           ;# permutation value for alignment
-    lvx     v1, 0, r4           ;# src
-
-    add     r4, r4, r7
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, 0, r3           ;# store out diff
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrglb  v4, v0, v2          ;# unpack high pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, r10, r3         ;# store out diff
-
-    addi    r3, r3, 32
-
-    bdnz    mbu_loop
-
-    mtctr   r9
-
-mbv_loop:
-    lvsl    v5, 0, r5           ;# permutation value for alignment
-    lvx     v1, 0, r5           ;# src
-    lvx     v2, 0, r6           ;# pred
-
-    add     r5, r5, r7
-    addi    r6, r6, 16
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrghb  v4, v0, v2          ;# unpack high pred to short
-
-    lvsl    v5, 0, r5           ;# permutation value for alignment
-    lvx     v1, 0, r5           ;# src
-
-    add     r5, r5, r7
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, 0, r3           ;# store out diff
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrglb  v4, v0, v2          ;# unpack high pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, r10, r3         ;# store out diff
-
-    addi    r3, r3, 32
-
-    bdnz    mbv_loop
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# r3 short *diff
-;# r4 unsigned char *src
-;# r5 unsigned char *pred
-;# r6 int stride
-vp8_subtract_mby_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf800
-    mtspr   256, r12            ;# set VRSAVE
-
-    li      r10, 16
-    mtctr   r10
-
-    vspltisw v0, 0
-
-mby_loop:
-    lvx     v1, 0, r4           ;# src
-    lvx     v2, 0, r5           ;# pred
-
-    add     r4, r4, r6
-    addi    r5, r5, 16
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrghb  v4, v0, v2          ;# unpack high pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, 0, r3           ;# store out diff
-
-    vmrglb  v3, v0, v1          ;# unpack low src  to short
-    vmrglb  v4, v0, v2          ;# unpack low pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, r10, r3         ;# store out diff
-
-    addi    r3, r3, 32
-
-    bdnz    mby_loop
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
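
Both routines above compute the prediction residual with widening byte subtracts (vmrghb/vmrglb against a zero vector, then vsubshs). A plain-C reference of the luma case, matching the calling convention read off the register comments (the prediction buffer is packed 16 bytes per row):

    /* Scalar reference for vp8_subtract_mby_ppc: diff = src - pred over a
     * 16x16 luma macroblock, widened to 16-bit (my reading of the asm). */
    void subtract_mby_ref(short *diff, const unsigned char *src,
                          const unsigned char *pred, int stride) {
      int r, c;
      for (r = 0; r < 16; r++) {
        for (c = 0; c < 16; c++)
          diff[c] = (short)(src[c] - pred[c]);
        diff += 16;
        src += stride;
        pred += 16;   /* prediction buffer advances 16 bytes per row */
      }
    }
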
--- a/vp8/encoder/ppc/fdct_altivec.asm
+++ /dev/null
@@ -1,205 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_short_fdct4x4_ppc
-    .globl vp8_short_fdct8x4_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-;# Forward and inverse DCTs are nearly identical; only differences are
-;#   in normalization (fwd is twice unitary, inv is half unitary)
-;#   and that they are of course transposes of each other.
-;#
-;#   The following three accomplish most of the implementation and
-;#   are used only by ppc_idct.c and ppc_fdct.c.
-.macro prologue
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfffc
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    li      r6, 16
-
-    load_c v0, dct_tab, 0, r9, r10
-    lvx     v1,   r6, r10
-    addi    r10, r10, 32
-    lvx     v2,    0, r10
-    lvx     v3,   r6, r10
-
-    load_c v4, ppc_dctperm_tab,  0, r9, r10
-    load_c v5, ppc_dctperm_tab, r6, r9, r10
-
-    load_c v6, round_tab, 0, r10, r9
-.endm
-
-.macro epilogue
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-.endm
-
-;# Do horiz xf on two rows of coeffs  v8 = a0 a1 a2 a3  b0 b1 b2 b3.
-;#   a/A are the even rows 0,2   b/B are the odd rows 1,3
-;#   For fwd transform, indices are horizontal positions, then frequencies.
-;#   For inverse transform, frequencies then positions.
-;#   The two resulting  A0..A3  B0..B3  are later combined
-;#   and vertically transformed.
-
-.macro two_rows_horiz Dst
-    vperm   v9, v8, v8, v4      ;# v9 = a2 a3 a0 a1  b2 b3 b0 b1
-
-    vmsumshm v10, v0, v8, v6
-    vmsumshm v10, v1, v9, v10
-    vsraw   v10, v10, v7        ;# v10 = A0 A1  B0 B1
-
-    vmsumshm v11, v2, v8, v6
-    vmsumshm v11, v3, v9, v11
-    vsraw   v11, v11, v7        ;# v11 = A2 A3  B2 B3
-
-    vpkuwum v10, v10, v11       ;# v10  = A0 A1  B0 B1  A2 A3  B2 B3
-    vperm   \Dst, v10, v10, v5  ;# Dest = A0 B0  A1 B1  A2 B2  A3 B3
-.endm
-
-;# Vertical xf on two rows. DCT values in comments are for inverse transform;
-;#   forward transform uses transpose.
-
-.macro two_rows_vert Ceven, Codd
-    vspltw  v8, \Ceven, 0       ;# v8 = c00 c10  or  c02 c12 four times
-    vspltw  v9, \Codd,  0       ;# v9 = c20 c30  or  c22 c32 ""
-    vmsumshm v8, v8, v12, v6
-    vmsumshm v8, v9, v13, v8
-    vsraw   v10, v8, v7
-
-    vspltw  v8, \Codd,  1       ;# v8 = c01 c11  or  c03 c13
-    vspltw  v9, \Ceven, 1       ;# v9 = c21 c31  or  c23 c33
-    vmsumshm v8, v8, v12, v6
-    vmsumshm v8, v9, v13, v8
-    vsraw   v8, v8, v7
-
-    vpkuwum v8, v10, v8         ;# v8 = rows 0,1  or 2,3
-.endm
-
-.macro two_rows_h Dest
-    stw     r0,  0(r8)
-    lwz     r0,  4(r3)
-    stw     r0,  4(r8)
-    lwzux   r0, r3,r5
-    stw     r0,  8(r8)
-    lwz     r0,  4(r3)
-    stw     r0, 12(r8)
-    lvx     v8,  0,r8
-    two_rows_horiz \Dest
-.endm
-
-    .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct4x4_ppc:
-
-    prologue
-
-    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
-    addi    r8, r1, 0
-
-
-    lwz     r0, 0(r3)
-    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
-
-    lwzux   r0, r3, r5
-    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
-
-    lvx     v6, r6, r9          ;# v6 = Vround
-    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
-
-    two_rows_vert v0, v1
-    stvx    v8, 0, r4
-    two_rows_vert v2, v3
-    stvx    v8, r6, r4
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct8x4_ppc:
-    prologue
-
-    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
-    addi    r8,  r1, 0
-    addi    r10, r3, 0
-
-    lwz     r0, 0(r3)
-    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
-
-    lwzux   r0, r3, r5
-    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
-
-    lvx     v6, r6, r9          ;# v6 = Vround
-    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
-
-    two_rows_vert v0, v1
-    stvx    v8, 0, r4
-    two_rows_vert v2, v3
-    stvx    v8, r6, r4
-
-    ;# Next block
-    addi    r3, r10, 8
-    addi    r4, r4, 32
-    lvx     v6, 0, r9           ;# v6 = Hround
-
-    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
-    addi    r8, r1, 0
-
-    lwz     r0, 0(r3)
-    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
-
-    lwzux   r0, r3, r5
-    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
-
-    lvx     v6, r6, r9          ;# v6 = Vround
-    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
-
-    two_rows_vert v0, v1
-    stvx    v8, 0, r4
-    two_rows_vert v2, v3
-    stvx    v8, r6, r4
-
-    epilogue
-
-    blr
-
-    .data
-    .align 4
-ppc_dctperm_tab:
-    .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
-    .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
-
-    .align 4
-dct_tab:
-    .short  23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
-    .short  23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
-
-    .short  23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
-    .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
-
-    .align 4
-round_tab:
-    .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
-    .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
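
The tables read as Q15 fixed-point DCT basis values: 23170 ≈ cos(π/4)·2^15, 30274 ≈ cos(π/8)·2^15, 12540 ≈ sin(π/8)·2^15, and round_tab holds the half-LSB offsets 1<<(14-1) and 1<<(16-1) matching the 14-bit horizontal and 16-bit vertical downshifts. A quick check of that reading (my interpretation of the table, not project documentation):

    /* Recompute the Q15 constants in dct_tab; link with -lm. */
    #include <math.h>
    #include <stdio.h>

    int main(void) {
      printf("%ld %ld %ld\n",
             lround(cos(M_PI / 4) * 32768),   /* 23170 */
             lround(cos(M_PI / 8) * 32768),   /* 30274 */
             lround(sin(M_PI / 8) * 32768));  /* 12540 */
      return 0;
    }
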
--- a/vp8/encoder/ppc/rdopt_altivec.asm
+++ /dev/null
@@ -1,51 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_block_error_ppc
-
-    .align 2
-;# r3 short *Coeff
-;# r4 short *dqcoeff
-vp8_block_error_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf800
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    stw     r5, 12(r1)          ;# transfer dc to vector register
-
-    lvx     v0, 0, r3           ;# Coeff
-    lvx     v1, 0, r4           ;# dqcoeff
-
-    li      r10, 16
-
-    vspltisw v3, 0
-
-    vsubshs v0, v0, v1
-
-    vmsumshm v2, v0, v0, v3     ;# multiply differences
-
-    lvx     v0, r10, r3         ;# Coeff
-    lvx     v1, r10, r4         ;# dqcoeff
-
-    vsubshs v0, v0, v1
-
-    vmsumshm v1, v0, v0, v2     ;# multiply differences
-    vsumsws v1, v1, v3          ;# sum up
-
-    stvx    v1, 0, r1
-    lwz     r3, 12(r1)          ;# return value
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
--- a/vp8/encoder/ppc/sad_altivec.asm
+++ /dev/null
@@ -1,277 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_sad16x16_ppc
-    .globl vp8_sad16x8_ppc
-    .globl vp8_sad8x16_ppc
-    .globl vp8_sad8x8_ppc
-    .globl vp8_sad4x4_ppc
-
-.macro load_aligned_16 V R O
-    lvsl    v3,  0, \R          ;# permutation value for alignment
-
-    lvx     v1,  0, \R
-    lvx     v2, \O, \R
-
-    vperm   \V, v1, v2, v3
-.endm
-
-.macro prologue
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    li      r10, 16             ;# load offset and loop counter
-
-    vspltisw v8, 0              ;# zero out total to start
-.endm
-
-.macro epilogue
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-.endm
-
-.macro SAD_16
-    ;# v6 = abs (v4 - v5)
-    vsububs v6, v4, v5
-    vsububs v7, v5, v4
-    vor     v6, v6, v7
-
-    ;# v8 += abs (v4 - v5)
-    vsum4ubs v8, v6, v8
-.endm
-
-.macro sad_16_loop loop_label
-    lvsl    v3,  0, r5          ;# only needs to be done once per block
-
-    ;# preload a line of data before getting into the loop
-    lvx     v4, 0, r3
-    lvx     v1,  0, r5
-    lvx     v2, r10, r5
-
-    add     r5, r5, r6
-    add     r3, r3, r4
-
-    vperm   v5, v1, v2, v3
-
-    .align 4
-\loop_label:
-    ;# compute difference on first row
-    vsububs v6, v4, v5
-    vsububs v7, v5, v4
-
-    ;# load up next set of data
-    lvx     v9, 0, r3
-    lvx     v1,  0, r5
-    lvx     v2, r10, r5
-
-    ;# perform abs() of difference
-    vor     v6, v6, v7
-    add     r3, r3, r4
-
-    ;# add to the running tally
-    vsum4ubs v8, v6, v8
-
-    ;# now onto the next line
-    vperm   v5, v1, v2, v3
-    add     r5, r5, r6
-    lvx     v4, 0, r3
-
-    ;# compute difference on second row
-    vsububs v6, v9, v5
-    lvx     v1,  0, r5
-    vsububs v7, v5, v9
-    lvx     v2, r10, r5
-    vor     v6, v6, v7
-    add     r3, r3, r4
-    vsum4ubs v8, v6, v8
-    vperm   v5, v1, v2, v3
-    add     r5, r5, r6
-
-    bdnz    \loop_label
-
-    vspltisw v7, 0
-
-    vsumsws v8, v8, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-.endm
-
-.macro sad_8_loop loop_label
-    .align 4
-\loop_label:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v6, r3, r10
-    load_aligned_16 v7, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    vmrghb  v4, v4, v6
-    vmrghb  v5, v5, v7
-
-    SAD_16
-
-    bdnz    \loop_label
-
-    vspltisw v7, 0
-
-    vsumsws v8, v8, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad16x16_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    sad_16_loop sad16x16_loop
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad16x8_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    sad_16_loop sad16x8_loop
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad8x16_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    sad_8_loop sad8x16_loop
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad8x8_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    sad_8_loop sad8x8_loop
-
-    epilogue
-
-    blr
-
-.macro transfer_4x4 I P
-    lwz     r0, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r7, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r8, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r9, 0(\I)
-
-    stw     r0,  0(r1)
-    stw     r7,  4(r1)
-    stw     r8,  8(r1)
-    stw     r9, 12(r1)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad4x4_ppc:
-
-    prologue
-
-    transfer_4x4 r3, r4
-    lvx     v4, 0, r1
-
-    transfer_4x4 r5, r6
-    lvx     v5, 0, r1
-
-    vspltisw v8, 0              ;# zero out total to start
-
-    ;# v6 = abs (v4 - v5)
-    vsububs v6, v4, v5
-    vsububs v7, v5, v4
-    vor     v6, v6, v7
-
-    ;# v8 += abs (v4 - v5)
-    vsum4ubs v7, v6, v8
-    vsumsws v7, v7, v8
-
-    stvx    v7, 0, r1
-    lwz     r3, 12(r1)
-
-    epilogue
-
-    blr
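
Every entry point above computes the same sum of absolute differences; the vector trick is that |a-b| for unsigned bytes falls out of two saturating subtracts OR'd together (vsububs both ways, then vor), accumulated by vsum4ubs. A scalar reference of the 16x16 case:

    /* Scalar reference for vp8_sad16x16_ppc above. */
    unsigned int sad16x16_ref(const unsigned char *src, int src_stride,
                              const unsigned char *ref, int ref_stride) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < 16; r++, src += src_stride, ref += ref_stride)
        for (c = 0; c < 16; c++)
          sad += src[c] > ref[c] ? src[c] - ref[c] : ref[c] - src[c];
      return sad;
    }
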
--- a/vp8/encoder/ppc/variance_altivec.asm
+++ /dev/null
@@ -1,375 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_get8x8var_ppc
-    .globl vp8_get16x16var_ppc
-    .globl vp8_mse16x16_ppc
-    .globl vp9_variance16x16_ppc
-    .globl vp9_variance16x8_ppc
-    .globl vp9_variance8x16_ppc
-    .globl vp9_variance8x8_ppc
-    .globl vp9_variance4x4_ppc
-
-.macro load_aligned_16 V R O
-    lvsl    v3,  0, \R          ;# permutation value for alignment
-
-    lvx     v1,  0, \R
-    lvx     v2, \O, \R
-
-    vperm   \V, v1, v2, v3
-.endm
-
-.macro prologue
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    li      r10, 16             ;# load offset and loop counter
-
-    vspltisw v7, 0              ;# zero for merging
-    vspltisw v8, 0              ;# zero out total to start
-    vspltisw v9, 0              ;# zero out total for dif^2
-.endm
-
-.macro epilogue
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-.endm
-
-.macro compute_sum_sse
-    ;# Compute sum first.  Unpack so that a signed subtract
-    ;#  can be used.  Only a half-word signed subtract
-    ;#  is available.  Do high, then low.
-    vmrghb  v2, v7, v4
-    vmrghb  v3, v7, v5
-    vsubshs v2, v2, v3
-    vsum4shs v8, v2, v8
-
-    vmrglb  v2, v7, v4
-    vmrglb  v3, v7, v5
-    vsubshs v2, v2, v3
-    vsum4shs v8, v2, v8
-
-    ;# Now compute sse.
-    vsububs v2, v4, v5
-    vsububs v3, v5, v4
-    vor     v2, v2, v3
-
-    vmsumubm v9, v2, v2, v9
-.endm
-
-.macro variance_16 DS loop_label store_sum
-\loop_label:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    compute_sum_sse
-
-    bdnz    \loop_label
-
-    vsumsws v8, v8, v7
-    vsumsws v9, v9, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r4, 12(r1)
-
-.if \store_sum
-    stw     r3, 0(r8)           ;# sum
-.endif
-    stw     r4, 0(r7)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
-.endm
-
-.macro variance_8 DS loop_label store_sum
-\loop_label:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v6, r3, r10
-    load_aligned_16 v0, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    vmrghb  v4, v4, v6
-    vmrghb  v5, v5, v0
-
-    compute_sum_sse
-
-    bdnz    \loop_label
-
-    vsumsws v8, v8, v7
-    vsumsws v9, v9, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r4, 12(r1)
-
-.if \store_sum
-    stw     r3, 0(r8)           ;# sum
-.endif
-    stw     r4, 0(r7)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get8x8var_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    variance_8 6, get8x8var_loop, 1
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get16x16var_ppc:
-
-    prologue
-
-    mtctr   r10
-
-    variance_16 8, get16x16var_loop, 1
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_mse16x16_ppc:
-    prologue
-
-    mtctr   r10
-
-mse16x16_loop:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    ;# Now compute sse.
-    vsububs v2, v4, v5
-    vsububs v3, v5, v4
-    vor     v2, v2, v3
-
-    vmsumubm v9, v2, v2, v9
-
-    bdnz    mse16x16_loop
-
-    vsumsws v9, v9, v7
-
-    stvx    v9, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r3, 12(r1)
-
-    stw     r3, 0(r7)           ;# sse
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance16x16_ppc:
-
-    prologue
-
-    mtctr   r10
-
-    variance_16 8, variance16x16_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance16x8_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    variance_16 7, variance16x8_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance8x16_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    variance_8 7, variance8x16_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance8x8_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    variance_8 6, variance8x8_loop, 0
-
-    epilogue
-
-    blr
-
-.macro transfer_4x4 I P
-    lwz     r0, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r10,0(\I)
-    add     \I, \I, \P
-
-    lwz     r8, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r9, 0(\I)
-
-    stw     r0,  0(r1)
-    stw     r10, 4(r1)
-    stw     r8,  8(r1)
-    stw     r9, 12(r1)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance4x4_ppc:
-
-    prologue
-
-    transfer_4x4 r3, r4
-    lvx     v4, 0, r1
-
-    transfer_4x4 r5, r6
-    lvx     v5, 0, r1
-
-    compute_sum_sse
-
-    vsumsws v8, v8, v7
-    vsumsws v9, v9, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r4, 12(r1)
-
-    stw     r4, 0(r7)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, 4           ;# (sum*sum) >> 4
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)
-
-    epilogue
-
-    blr
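
Each variance entry point returns sse - ((sum*sum) >> DS), i.e. the sum of squared differences minus the squared mean scaled by the pixel count, with DS = log2(w*h): 8 for 16x16, 7 for 16x8 and 8x16, 6 for 8x8, and 4 for 4x4. A scalar reference (using a 64-bit product where the asm's 32-bit mullw could wrap for extreme sums):

    /* Scalar reference for the vp9_variance*_ppc routines above. */
    #include <stdint.h>

    unsigned int variance_ref(const unsigned char *src, int src_stride,
                              const unsigned char *ref, int ref_stride,
                              int w, int h, unsigned int *sse) {
      int64_t sum = 0;
      uint32_t sse_acc = 0;
      int r, c, ds = 0;
      while ((1 << ds) < w * h)
        ds++;                                   /* DS = log2(w * h) */
      for (r = 0; r < h; r++, src += src_stride, ref += ref_stride)
        for (c = 0; c < w; c++) {
          int d = src[c] - ref[c];
          sum += d;
          sse_acc += (uint32_t)(d * d);
        }
      *sse = sse_acc;
      return sse_acc - (unsigned int)((sum * sum) >> ds);
    }
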
--- a/vp8/encoder/ppc/variance_subpixel_altivec.asm
+++ /dev/null
@@ -1,865 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp9_sub_pixel_variance4x4_ppc
-    .globl vp9_sub_pixel_variance8x8_ppc
-    .globl vp9_sub_pixel_variance8x16_ppc
-    .globl vp9_sub_pixel_variance16x8_ppc
-    .globl vp9_sub_pixel_variance16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
-    load_c \V0, vfilter_b, r6, r12, r10
-
-    addi    r6,  r6, 16
-    lvx     \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
-    ;# load up horizontal filter
-    slwi.   r5, r5, 4           ;# index into horizontal filter array
-
-    ;# index to the next set of vectors in the row.
-    li      r10, 16
-
-    ;# downshift by 7 (divide by 128) at the end
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq     \jump_label
-
-    load_c v20, hfilter_b, r5, r12, r0
-
-    ;# setup constants
-    ;# v14 permutation value for alignment
-    load_c v28, b_hperm_b, 0, r12, r0
-
-    ;# index to the next set of vectors in the row.
-    li      r12, 32
-
-    ;# rounding added in on the multiply
-    vspltisw v21, 8
-    vspltisw v18, 3
-    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040
-
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-.endm
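
HProlog selects the first-pass (horizontal) taps by x offset and the second-pass (vertical) taps by y offset; each pair of taps sums to 128, with +64 rounding folded into the multiply and a downshift of 7. A scalar sketch of one tap pair, assuming the usual VP8-style bilinear table {128-16k, 16k} indexed by the offset k in eighths (the actual hfilter_b/vfilter_b contents are not shown in this patch):

    /* Hedged scalar sketch of one bilinear filter step; the tap table
     * {128 - 16*k, 16*k} for k = 0..7 is assumed, not read from hfilter_b. */
    static unsigned char bilinear_tap(unsigned char a, unsigned char b,
                                      int k) {
      int tap0 = 128 - 16 * k;
      int tap1 = 16 * k;
      return (unsigned char)((a * tap0 + b * tap1 + 64) >> 7);
    }
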
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm input
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-
-.macro hfilter_8 V, hp, lp, increment_counter
-    lvsl    v17,  0, r3         ;# permutation value for alignment
-
-    ;# input to filter is 9 bytes wide, output is 8 bytes.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-
-    vperm   v24, v21, v21, \hp  ;# v24 = 0123 1234 2345 3456
-    vperm   v25, v21, v21, \lp  ;# v25 = 4567 5678 6789 789A
-
-    vmsummbm v24, v20, v24, v18
-    vmsummbm v25, v20, v25, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
-    vsrh    v24, v24, v19       ;# divide v0, v1 by 128
-
-    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
-.endm
-
-.macro vfilter_16 P0 P1
-    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
-    vadduhm v22, v18, v22
-    vmuloub v23, \P0, v20
-    vadduhm v23, v18, v23
-
-    vmuleub v24, \P1, v21
-    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
-    vmuloub v25, \P1, v21
-    vadduhm v23, v23, v25       ;# Ro = odds
-
-    vsrh    v22, v22, v19       ;# divide by 128
-    vsrh    v23, v23, v19       ;# v22 v23 = evens, odds
-    vmrghh  \P0, v22, v23       ;# merge to 16-bit result in order
-    vmrglh  v23, v22, v23
-    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
-.endm
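
Both hfilter_8 above and vfilter_16 implement the same two-tap bilinear step; a scalar model of one output pixel, assuming taps (128 - f, f) as laid out in hfilter_b/vfilter_b further below, with the +64 rounding constant and shift-by-7 the code builds in v18/v19 (function name is illustrative):

    /* Scalar model of one bilinear tap: (a*(128-f) + b*f + 64) >> 7. */
    static unsigned char bilinear_tap_model(unsigned char a, unsigned char b,
                                            int f) {
      int v = (a * (128 - f) + b * f + 64) >> 7;
      return v > 255 ? 255 : (unsigned char)v;  /* vpkuhus/vpkswus saturate */
    }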
-
-.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
-    ;# Compute sum first.  Unpack so a signed subtract
-    ;#  can be used.  Only a halfword signed subtract
-    ;#  is available.  Do high, then low.
-    vmrghb  \t1, \z0, \src
-    vmrghb  \t2, \z0, \ref
-    vsubshs \t1, \t1, \t2
-    vsum4shs \sum, \t1, \sum
-
-    vmrglb  \t1, \z0, \src
-    vmrglb  \t2, \z0, \ref
-    vsubshs \t1, \t1, \t2
-    vsum4shs \sum, \t1, \sum
-
-    ;# Now compute sse.
-    vsububs \t1, \src, \ref
-    vsububs \t2, \ref, \src
-    vor     \t1, \t1, \t2
-
-    vmsumubm \sse, \t1, \t1, \sse
-.endm
-
-.macro variance_final sum, sse, z0, DS
-    vsumsws \sum, \sum, \z0
-    vsumsws \sse, \sse, \z0
-
-    stvx    \sum, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    \sse, 0, r1
-    lwz     r4, 12(r1)
-
-    stw     r4, 0(r9)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
-.endm
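
The DS argument of variance_final is log2 of the block's pixel count, so the subtraction implements the usual variance identity with a shift in place of a division:

    variance = sse - (sum * sum) >> DS,   DS = log2(width * height)

The call sites below pass 4 for 4x4 (16 pixels), 6 for 8x8 (64), 7 for 8x16 and 16x8 (128), and 8 for 16x16 (256).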
-
-.macro compute_sum_sse_16 V, increment_counter
-    load_and_align_16  v16, r7, r8, \increment_counter
-    compute_sum_sse \V, v16, v18, v19, v20, v21, v23
-.endm
-
-.macro load_and_align_16 V, R, P, increment_counter
-    lvsl    v17,  0, \R         ;# permute vector for alignment
-
-    ;# input is 16 bytes wide and can span two vectors
-    ;#  if not aligned correctly.
-    lvx     v21,   0, \R
-    lvx     v22, r10, \R
-
-.if \increment_counter
-    add     \R, \R, \P
-.endif
-
-    vperm   \V, v21, v22, v17
-.endm
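
load_and_align_16 is the classic AltiVec unaligned-load idiom: lvx ignores the low four address bits, so the macro fetches the two enclosing aligned vectors and merges them with the byte-shift pattern lvsl produces. A scalar C model (illustrative, not from the tree):

    #include <stdint.h>
    #include <string.h>

    static void load_and_align_16_model(const uint8_t *p, uint8_t out[16]) {
      const uint8_t *base = (const uint8_t *)((uintptr_t)p & ~(uintptr_t)15);
      unsigned shift = (unsigned)((uintptr_t)p & 15); /* lvsl v17, 0, \R  */
      uint8_t lo[16], hi[16];
      memcpy(lo, base, 16);                           /* lvx v21,   0, \R */
      memcpy(hi, base + 16, 16);                      /* lvx v22, r10, \R */
      for (int i = 0; i < 16; i++)                    /* vperm \V, v21, v22, v17 */
        out[i] = (shift + i < 16) ? lo[shift + i] : hi[shift + i - 16];
    }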
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance4x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf830
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_4x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r12, r0
-    load_c v11, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0, v10, v11, 1
-    hfilter_8 v1, v10, v11, 1
-    hfilter_8 v2, v10, v11, 1
-    hfilter_8 v3, v10, v11, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_4x4_b
-
-    hfilter_8 v4, v10, v11, 0
-
-    b   second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16 v0, r3, r4, 1
-    load_and_align_16 v1, r3, r4, 1
-    load_and_align_16 v2, r3, r4, 1
-    load_and_align_16 v3, r3, r4, 1
-    load_and_align_16 v4, r3, r4, 0
-
-second_pass_4x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-
-compute_sum_sse_4x4_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    load_and_align_16 v4, r7, r8, 1
-    load_and_align_16 v5, r7, r8, 1
-    load_and_align_16 v6, r7, r8, 1
-    load_and_align_16 v7, r7, r8, 1
-
-    vmrghb  v0, v0, v1
-    vmrghb  v1, v2, v3
-
-    vmrghb  v2, v4, v5
-    vmrghb  v3, v6, v7
-
-    load_c v10, b_hilo_b, 0, r12, r0
-
-    vperm   v0, v0, v1, v10
-    vperm   v1, v2, v3, v10
-
-    compute_sum_sse v0, v1, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 4
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance8x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff0
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x8_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r12, r0
-    load_c v11, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0, v10, v11, 1
-    hfilter_8 v1, v10, v11, 1
-    hfilter_8 v2, v10, v11, 1
-    hfilter_8 v3, v10, v11, 1
-    hfilter_8 v4, v10, v11, 1
-    hfilter_8 v5, v10, v11, 1
-    hfilter_8 v6, v10, v11, 1
-    hfilter_8 v7, v10, v11, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_8x8_b
-
-    hfilter_8 v8, v10, v11, 0
-
-    b   second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16 v0, r3, r4, 1
-    load_and_align_16 v1, r3, r4, 1
-    load_and_align_16 v2, r3, r4, 1
-    load_and_align_16 v3, r3, r4, 1
-    load_and_align_16 v4, r3, r4, 1
-    load_and_align_16 v5, r3, r4, 1
-    load_and_align_16 v6, r3, r4, 1
-    load_and_align_16 v7, r3, r4, 1
-    load_and_align_16 v8, r3, r4, 0
-
-    beq     compute_sum_sse_8x8_b
-
-second_pass_8x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0, v1
-    vfilter_16 v1, v2
-    vfilter_16 v2, v3
-    vfilter_16 v3, v4
-    vfilter_16 v4, v5
-    vfilter_16 v5, v6
-    vfilter_16 v6, v7
-    vfilter_16 v7, v8
-
-compute_sum_sse_8x8_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    vmrghb  v0, v0, v1
-    vmrghb  v1, v2, v3
-    vmrghb  v2, v4, v5
-    vmrghb  v3, v6, v7
-
-    load_and_align_16 v4,  r7, r8, 1
-    load_and_align_16 v5,  r7, r8, 1
-    load_and_align_16 v6,  r7, r8, 1
-    load_and_align_16 v7,  r7, r8, 1
-    load_and_align_16 v8,  r7, r8, 1
-    load_and_align_16 v9,  r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 0
-
-    vmrghb  v4, v4,  v5
-    vmrghb  v5, v6,  v7
-    vmrghb  v6, v8,  v9
-    vmrghb  v7, v10, v11
-
-    compute_sum_sse v0, v4, v18, v19, v20, v21, v23
-    compute_sum_sse v1, v5, v18, v19, v20, v21, v23
-    compute_sum_sse v2, v6, v18, v19, v20, v21, v23
-    compute_sum_sse v3, v7, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 6
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-    blr
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance8x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfffc
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x16_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v29, b_0123_b, 0, r12, r0
-    load_c v30, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0,  v29, v30, 1
-    hfilter_8 v1,  v29, v30, 1
-    hfilter_8 v2,  v29, v30, 1
-    hfilter_8 v3,  v29, v30, 1
-    hfilter_8 v4,  v29, v30, 1
-    hfilter_8 v5,  v29, v30, 1
-    hfilter_8 v6,  v29, v30, 1
-    hfilter_8 v7,  v29, v30, 1
-    hfilter_8 v8,  v29, v30, 1
-    hfilter_8 v9,  v29, v30, 1
-    hfilter_8 v10, v29, v30, 1
-    hfilter_8 v11, v29, v30, 1
-    hfilter_8 v12, v29, v30, 1
-    hfilter_8 v13, v29, v30, 1
-    hfilter_8 v14, v29, v30, 1
-    hfilter_8 v15, v29, v30, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_8x16_b
-
-    hfilter_8 v16, v29, v30, 0
-
-    b   second_pass_8x16_b
-
-second_pass_8x16_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16 v0,  r3, r4, 1
-    load_and_align_16 v1,  r3, r4, 1
-    load_and_align_16 v2,  r3, r4, 1
-    load_and_align_16 v3,  r3, r4, 1
-    load_and_align_16 v4,  r3, r4, 1
-    load_and_align_16 v5,  r3, r4, 1
-    load_and_align_16 v6,  r3, r4, 1
-    load_and_align_16 v7,  r3, r4, 1
-    load_and_align_16 v8,  r3, r4, 1
-    load_and_align_16 v9,  r3, r4, 1
-    load_and_align_16 v10, r3, r4, 1
-    load_and_align_16 v11, r3, r4, 1
-    load_and_align_16 v12, r3, r4, 1
-    load_and_align_16 v13, r3, r4, 1
-    load_and_align_16 v14, r3, r4, 1
-    load_and_align_16 v15, r3, r4, 1
-    load_and_align_16 v16, r3, r4, 0
-
-    beq     compute_sum_sse_8x16_b
-
-second_pass_8x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-    vfilter_16 v8,  v9
-    vfilter_16 v9,  v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-compute_sum_sse_8x16_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    vmrghb  v0, v0,  v1
-    vmrghb  v1, v2,  v3
-    vmrghb  v2, v4,  v5
-    vmrghb  v3, v6,  v7
-    vmrghb  v4, v8,  v9
-    vmrghb  v5, v10, v11
-    vmrghb  v6, v12, v13
-    vmrghb  v7, v14, v15
-
-    load_and_align_16 v8,  r7, r8, 1
-    load_and_align_16 v9,  r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 1
-    load_and_align_16 v12, r7, r8, 1
-    load_and_align_16 v13, r7, r8, 1
-    load_and_align_16 v14, r7, r8, 1
-    load_and_align_16 v15, r7, r8, 1
-
-    vmrghb  v8,  v8,  v9
-    vmrghb  v9,  v10, v11
-    vmrghb  v10, v12, v13
-    vmrghb  v11, v14, v15
-
-    compute_sum_sse v0, v8,  v18, v19, v20, v21, v23
-    compute_sum_sse v1, v9,  v18, v19, v20, v21, v23
-    compute_sum_sse v2, v10, v18, v19, v20, v21, v23
-    compute_sum_sse v3, v11, v18, v19, v20, v21, v23
-
-    load_and_align_16 v8,  r7, r8, 1
-    load_and_align_16 v9,  r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 1
-    load_and_align_16 v12, r7, r8, 1
-    load_and_align_16 v13, r7, r8, 1
-    load_and_align_16 v14, r7, r8, 1
-    load_and_align_16 v15, r7, r8, 0
-
-    vmrghb  v8,  v8,  v9
-    vmrghb  v9,  v10, v11
-    vmrghb  v10, v12, v13
-    vmrghb  v11, v14, v15
-
-    compute_sum_sse v4, v8,  v18, v19, v20, v21, v23
-    compute_sum_sse v5, v9,  v18, v19, v20, v21, v23
-    compute_sum_sse v6, v10, v18, v19, v20, v21, v23
-    compute_sum_sse v7, v11, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 7
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-    blr
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm input
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
-    lvsl    v17,  0, r3         ;# permute vector for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-    lvx     v23, r12, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-    vperm   v22, v22, v23, v17  ;# v21 v22 = 21 input pixels left-justified
-
-    ;# set 0
-    vmsummbm v24, v20, v21, v18 ;# taps times elements
-
-    ;# set 1
-    vsldoi  v23, v21, v22, 1
-    vmsummbm v25, v20, v23, v18
-
-    ;# set 2
-    vsldoi  v23, v21, v22, 2
-    vmsummbm v26, v20, v23, v18
-
-    ;# set 3
-    vsldoi  v23, v21, v22, 3
-    vmsummbm v27, v20, v23, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F
-
-    vsrh    v24, v24, v19       ;# divide by 128
-    vsrh    v25, v25, v19
-
-    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
-    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
-.endm
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance16x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    HProlog second_pass_16x8_pre_copy_b
-
-    hfilter_16 v0, 1
-    hfilter_16 v1, 1
-    hfilter_16 v2, 1
-    hfilter_16 v3, 1
-    hfilter_16 v4, 1
-    hfilter_16 v5, 1
-    hfilter_16 v6, 1
-    hfilter_16 v7, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_16x8_b
-
-    hfilter_16 v8, 0
-
-    b   second_pass_16x8_b
-
-second_pass_16x8_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16  v0,  r3, r4, 1
-    load_and_align_16  v1,  r3, r4, 1
-    load_and_align_16  v2,  r3, r4, 1
-    load_and_align_16  v3,  r3, r4, 1
-    load_and_align_16  v4,  r3, r4, 1
-    load_and_align_16  v5,  r3, r4, 1
-    load_and_align_16  v6,  r3, r4, 1
-    load_and_align_16  v7,  r3, r4, 1
-    load_and_align_16  v8,  r3, r4, 1
-
-    beq     compute_sum_sse_16x8_b
-
-second_pass_16x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-
-compute_sum_sse_16x8_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    compute_sum_sse_16 v0, 1
-    compute_sum_sse_16 v1, 1
-    compute_sum_sse_16 v2, 1
-    compute_sum_sse_16 v3, 1
-    compute_sum_sse_16 v4, 1
-    compute_sum_sse_16 v5, 1
-    compute_sum_sse_16 v6, 1
-    compute_sum_sse_16 v7, 0
-
-    variance_final v18, v19, v23, 7
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    HProlog second_pass_16x16_pre_copy_b
-
-    hfilter_16 v0,  1
-    hfilter_16 v1,  1
-    hfilter_16 v2,  1
-    hfilter_16 v3,  1
-    hfilter_16 v4,  1
-    hfilter_16 v5,  1
-    hfilter_16 v6,  1
-    hfilter_16 v7,  1
-    hfilter_16 v8,  1
-    hfilter_16 v9,  1
-    hfilter_16 v10, 1
-    hfilter_16 v11, 1
-    hfilter_16 v12, 1
-    hfilter_16 v13, 1
-    hfilter_16 v14, 1
-    hfilter_16 v15, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_16x16_b
-
-    hfilter_16 v16, 0
-
-    b   second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16  v0,  r3, r4, 1
-    load_and_align_16  v1,  r3, r4, 1
-    load_and_align_16  v2,  r3, r4, 1
-    load_and_align_16  v3,  r3, r4, 1
-    load_and_align_16  v4,  r3, r4, 1
-    load_and_align_16  v5,  r3, r4, 1
-    load_and_align_16  v6,  r3, r4, 1
-    load_and_align_16  v7,  r3, r4, 1
-    load_and_align_16  v8,  r3, r4, 1
-    load_and_align_16  v9,  r3, r4, 1
-    load_and_align_16  v10, r3, r4, 1
-    load_and_align_16  v11, r3, r4, 1
-    load_and_align_16  v12, r3, r4, 1
-    load_and_align_16  v13, r3, r4, 1
-    load_and_align_16  v14, r3, r4, 1
-    load_and_align_16  v15, r3, r4, 1
-    load_and_align_16  v16, r3, r4, 0
-
-    beq     compute_sum_sse_16x16_b
-
-second_pass_16x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-    vfilter_16 v8,  v9
-    vfilter_16 v9,  v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-compute_sum_sse_16x16_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    compute_sum_sse_16 v0,  1
-    compute_sum_sse_16 v1,  1
-    compute_sum_sse_16 v2,  1
-    compute_sum_sse_16 v3,  1
-    compute_sum_sse_16 v4,  1
-    compute_sum_sse_16 v5,  1
-    compute_sum_sse_16 v6,  1
-    compute_sum_sse_16 v7,  1
-    compute_sum_sse_16 v8,  1
-    compute_sum_sse_16 v9,  1
-    compute_sum_sse_16 v10, 1
-    compute_sum_sse_16 v11, 1
-    compute_sum_sse_16 v12, 1
-    compute_sum_sse_16 v13, 1
-    compute_sum_sse_16 v14, 1
-    compute_sum_sse_16 v15, 0
-
-    variance_final v18, v19, v23, 8
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-    .align 4
-hfilter_b:
-    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
-    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
-    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
-    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
-    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
-    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
-    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
-    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
-
-    .align 4
-vfilter_b:
-    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
-    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
-    .align 4
-b_hperm_b:
-    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-
-    .align 4
-b_0123_b:
-    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-
-    .align 4
-b_4567_b:
-    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-
-b_hilo_b:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
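
The tables above are indexed by the subpel offsets: HProlog shifts xoffset left by 4 to select one 16-byte row of hfilter_b, and yoffset left by 5 to select one 32-byte (two-vector) row of vfilter_b. As a C sketch (helper names are illustrative):

    static const unsigned char *hfilter_row(const unsigned char *tab,
                                            int xoffset) {
      return tab + (xoffset << 4);   /* slwi. r5, r5, 4 */
    }

    static const unsigned char *vfilter_row(const unsigned char *tab,
                                            int yoffset) {
      return tab + (yoffset << 5);   /* slwi. r6, r6, 5 */
    }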
--- a/vp8/encoder/psnr.c
+++ /dev/null
@@ -1,30 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_scale/yv12config.h"
-#include "math.h"
-#include "vp8/common/systemdependent.h" /* for vp9_clear_system_state() */
-
-#define MAX_PSNR 100
-
-double vp9_mse2psnr(double Samples, double Peak, double Mse) {
-  double psnr;
-
-  if ((double)Mse > 0.0)
-    psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
-  else
-    psnr = MAX_PSNR;      // Limit to prevent division by zero
-
-  if (psnr > MAX_PSNR)
-    psnr = MAX_PSNR;
-
-  return psnr;
-}
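
Note that Mse here is the total squared error and Samples the sample count, so Peak * Peak * Samples / Mse reduces to Peak^2 over the per-sample mean squared error. A worked call (values are illustrative):

    /* 64x64 plane, mean squared error of 25 per sample, 8-bit peak: */
    double psnr = vp9_mse2psnr(64 * 64, 255.0, 25.0 * 64 * 64);
    /* = 10 * log10(65025 / 25) = 10 * log10(2601) ~= 34.2 dB */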
--- a/vp8/encoder/psnr.h
+++ /dev/null
@@ -1,17 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_PSNR_H
-#define __INC_PSNR_H
-
-extern double vp9_mse2psnr(double Samples, double Peak, double Mse);
-
-#endif
--- a/vp8/encoder/quantize.c
+++ /dev/null
@@ -1,716 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-#include "vpx_mem/vpx_mem.h"
-
-#include "onyx_int.h"
-#include "quantize.h"
-#include "vp8/common/quant_common.h"
-
-#include "vp8/common/seg_common.h"
-
-#ifdef ENC_DEBUG
-extern int enc_debug;
-#endif
-
-void vp9_ht_quantize_b_4x4(BLOCK *b, BLOCKD *d, TX_TYPE tx_type) {
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  short *zbin_boost_ptr  = b->zrun_zbin_boost;
-  short *coeff_ptr       = b->coeff;
-  short *zbin_ptr        = b->zbin;
-  short *round_ptr       = b->round;
-  short *quant_ptr       = b->quant;
-  unsigned char *quant_shift_ptr = b->quant_shift;
-  short *qcoeff_ptr      = d->qcoeff;
-  short *dqcoeff_ptr     = d->dqcoeff;
-  short *dequant_ptr     = d->dequant;
-  short zbin_oq_value    = b->zbin_extra;
-
-  const int *pt_scan;
-
-  switch (tx_type) {
-    case ADST_DCT :
-      pt_scan = vp9_row_scan;
-      break;
-
-    case DCT_ADST :
-      pt_scan = vp9_col_scan;
-      break;
-
-    default :
-      pt_scan = vp9_default_zig_zag1d;
-      break;
-  }
-
-  vpx_memset(qcoeff_ptr, 0, 32);
-  vpx_memset(dqcoeff_ptr, 0, 32);
-
-  eob = -1;
-
-  for (i = 0; i < b->eob_max_offset; i++) {
-    rc   = pt_scan[i];
-    z    = coeff_ptr[rc];
-
-    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
-    zbin_boost_ptr ++;
-
-    sz = (z >> 31);                                 // sign of z
-    x  = (z ^ sz) - sz;                             // x = abs(z)
-
-    if (x >= zbin) {
-      x += round_ptr[rc];
-      y  = (((x * quant_ptr[rc]) >> 16) + x)
-           >> quant_shift_ptr[rc];                // quantize (x)
-      x  = (y ^ sz) - sz;                         // get the sign back
-      qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
-
-      if (y) {
-        eob = i;                                // last nonzero coeffs
-        zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
-      }
-    }
-  }
-
-  d->eob = eob + 1;
-}
-
-void vp9_regular_quantize_b_4x4(BLOCK *b, BLOCKD *d) {
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  short *zbin_boost_ptr  = b->zrun_zbin_boost;
-  short *coeff_ptr       = b->coeff;
-  short *zbin_ptr        = b->zbin;
-  short *round_ptr       = b->round;
-  short *quant_ptr       = b->quant;
-  unsigned char *quant_shift_ptr = b->quant_shift;
-  short *qcoeff_ptr      = d->qcoeff;
-  short *dqcoeff_ptr     = d->dqcoeff;
-  short *dequant_ptr     = d->dequant;
-  short zbin_oq_value    = b->zbin_extra;
-
-  vpx_memset(qcoeff_ptr, 0, 32);
-  vpx_memset(dqcoeff_ptr, 0, 32);
-
-  eob = -1;
-
-  for (i = 0; i < b->eob_max_offset; i++) {
-    rc   = vp9_default_zig_zag1d[i];
-    z    = coeff_ptr[rc];
-
-    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
-    zbin_boost_ptr ++;
-
-    sz = (z >> 31);                                 // sign of z
-    x  = (z ^ sz) - sz;                             // x = abs(z)
-
-    if (x >= zbin) {
-      x += round_ptr[rc];
-
-      y  = (((x * quant_ptr[rc]) >> 16) + x)
-           >> quant_shift_ptr[rc];                // quantize (x)
-      x  = (y ^ sz) - sz;                         // get the sign back
-      qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
-
-      if (y) {
-        eob = i;                                // last nonzero coeffs
-        zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
-      }
-    }
-  }
-
-  d->eob = eob + 1;
-}
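
A worked pass through the loop above, with illustrative values: z = -53, zbin = 24, round = 4, quant = 1, quant_shift = 3, dequant = 8 (this quant/shift pair is what invert_quant, further below, derives for a divisor of 8). Then sz = -1 and x = 53 >= 24, so x += 4 gives 57; y = (((57 * 1) >> 16) + 57) >> 3 = 57 >> 3 = 7; restoring the sign yields qcoeff = -7 and dqcoeff = -7 * 8 = -56. The net effect is y ~= abs(z) / dequant, applied only outside the zero bin, with the run-length zbin boost raising the bin after each zero.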
-
-void vp9_quantize_mby_4x4_c(MACROBLOCK *x) {
-  int i;
-  int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
-
-  for (i = 0; i < 16; i++)
-    x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]);
-
-  if (has_2nd_order)
-    x->quantize_b_4x4(&x->block[24], &x->e_mbd.block[24]);
-}
-
-void vp9_quantize_mbuv_4x4_c(MACROBLOCK *x) {
-  int i;
-
-  for (i = 16; i < 24; i++)
-    x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]);
-}
-
-void vp9_quantize_mb_4x4_c(MACROBLOCK *x) {
-  vp9_quantize_mby_4x4_c(x);
-  vp9_quantize_mbuv_4x4_c(x);
-}
-
-void vp9_regular_quantize_b_2x2(BLOCK *b, BLOCKD *d) {
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  short *zbin_boost_ptr = b->zrun_zbin_boost;
-  int zbin_zrun_index = 0;
-  short *coeff_ptr  = b->coeff;
-  short *zbin_ptr   = b->zbin;
-  short *round_ptr  = b->round;
-  short *quant_ptr  = b->quant;
-  unsigned char *quant_shift_ptr = b->quant_shift;
-  short *qcoeff_ptr = d->qcoeff;
-  short *dqcoeff_ptr = d->dqcoeff;
-  short *dequant_ptr = d->dequant;
-  short zbin_oq_value = b->zbin_extra;
-  // double q2nd = 4;
-  vpx_memset(qcoeff_ptr, 0, 32);
-  vpx_memset(dqcoeff_ptr, 0, 32);
-
-  eob = -1;
-
-  for (i = 0; i < b->eob_max_offset_8x8; i++) {
-    rc   = vp9_default_zig_zag1d[i];
-    z    = coeff_ptr[rc];
-
-    zbin_boost_ptr = &b->zrun_zbin_boost[zbin_zrun_index];
-    zbin_zrun_index += 4;
-    zbin = (zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value);
-
-    sz = (z >> 31);                               // sign of z
-    x  = (z ^ sz) - sz;                           // x = abs(z)
-
-    if (x >= zbin) {
-      x += (round_ptr[rc]);
-      y  = ((int)((int)(x * quant_ptr[rc]) >> 16) + x)
-           >> quant_shift_ptr[rc];                // quantize (x)
-      x  = (y ^ sz) - sz;                         // get the sign back
-      qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
-
-      if (y) {
-        eob = i;                                  // last nonzero coeffs
-        zbin_zrun_index = 0;
-      }
-    }
-  }
-
-  d->eob = eob + 1;
-}
-
-void vp9_regular_quantize_b_8x8(BLOCK *b, BLOCKD *d) {
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  short *zbin_boost_ptr = b->zrun_zbin_boost_8x8;
-  short *coeff_ptr  = b->coeff;
-  short *zbin_ptr   = b->zbin_8x8;
-  short *round_ptr  = b->round;
-  short *quant_ptr  = b->quant;
-  unsigned char *quant_shift_ptr = b->quant_shift;
-  short *qcoeff_ptr = d->qcoeff;
-  short *dqcoeff_ptr = d->dqcoeff;
-  short *dequant_ptr = d->dequant;
-  short zbin_oq_value = b->zbin_extra;
-
-  vpx_memset(qcoeff_ptr, 0, 64 * sizeof(short));
-  vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(short));
-
-  eob = -1;
-
-  for (i = 0; i < b->eob_max_offset_8x8; i++) {
-    rc   = vp9_default_zig_zag1d_8x8[i];
-    z    = coeff_ptr[rc];
-
-    zbin = (zbin_ptr[rc != 0] + *zbin_boost_ptr + zbin_oq_value);
-    zbin_boost_ptr++;
-
-    sz = (z >> 31);                               // sign of z
-    x  = (z ^ sz) - sz;                           // x = abs(z)
-
-    if (x >= zbin) {
-      x += (round_ptr[rc != 0]);
-      y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
-           >> quant_shift_ptr[rc != 0];            // quantize (x)
-      x  = (y ^ sz) - sz;                         // get the sign back
-      qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value
-
-      if (y) {
-        eob = i;                                  // last nonzero coeffs
-        zbin_boost_ptr = b->zrun_zbin_boost_8x8;
-      }
-    }
-  }
-
-  d->eob = eob + 1;
-}
-
-void vp9_quantize_mby_8x8(MACROBLOCK *x) {
-  int i;
-  int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
-
-  for (i = 0; i < 16; i ++) {
-    x->e_mbd.block[i].eob = 0;
-  }
-  x->e_mbd.block[24].eob = 0;
-  for (i = 0; i < 16; i += 4)
-    x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);
-
-  if (has_2nd_order)
-    x->quantize_b_2x2(&x->block[24], &x->e_mbd.block[24]);
-}
-
-void vp9_quantize_mbuv_8x8(MACROBLOCK *x) {
-  int i;
-
-  for (i = 16; i < 24; i ++)
-    x->e_mbd.block[i].eob = 0;
-  for (i = 16; i < 24; i += 4)
-    x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);
-}
-
-void vp9_quantize_mb_8x8(MACROBLOCK *x) {
-  vp9_quantize_mby_8x8(x);
-  vp9_quantize_mbuv_8x8(x);
-}
-
-void vp9_quantize_mby_16x16(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 16; i++)
-    x->e_mbd.block[i].eob = 0;
-  x->e_mbd.block[24].eob = 0;
-  x->quantize_b_16x16(&x->block[0], &x->e_mbd.block[0]);
-}
-
-void vp9_quantize_mb_16x16(MACROBLOCK *x) {
-  vp9_quantize_mby_16x16(x);
-  vp9_quantize_mbuv_8x8(x);
-}
-
-void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  short *zbin_boost_ptr = b->zrun_zbin_boost_16x16;
-  short *coeff_ptr  = b->coeff;
-  short *zbin_ptr   = b->zbin_16x16;
-  short *round_ptr  = b->round;
-  short *quant_ptr  = b->quant;
-  unsigned char *quant_shift_ptr = b->quant_shift;
-  short *qcoeff_ptr = d->qcoeff;
-  short *dqcoeff_ptr = d->dqcoeff;
-  short *dequant_ptr = d->dequant;
-  short zbin_oq_value = b->zbin_extra;
-
-  vpx_memset(qcoeff_ptr, 0, 256*sizeof(short));
-  vpx_memset(dqcoeff_ptr, 0, 256*sizeof(short));
-
-  eob = -1;
-  for (i = 0; i < b->eob_max_offset_16x16; i++) {
-    rc   = vp9_default_zig_zag1d_16x16[i];
-    z    = coeff_ptr[rc];
-
-    zbin = (zbin_ptr[rc!=0] + *zbin_boost_ptr + zbin_oq_value);
-    zbin_boost_ptr ++;
-
-    sz = (z >> 31);                               // sign of z
-    x  = (z ^ sz) - sz;                           // x = abs(z)
-
-    if (x >= zbin) {
-      x += (round_ptr[rc!=0]);
-      y  = ((int)(((int)(x * quant_ptr[rc!=0]) >> 16) + x))
-          >> quant_shift_ptr[rc!=0];              // quantize (x)
-      x  = (y ^ sz) - sz;                         // get the sign back
-      qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc!=0];   // dequantized value
-
-      if (y) {
-        eob = i;                                  // last nonzero coeffs
-        zbin_boost_ptr = b->zrun_zbin_boost_16x16;
-      }
-    }
-  }
-
-  d->eob = eob + 1;
-}
-
-/* The quantize_b_pair function pointer in the MACROBLOCK structure is set to
- * this C function if the corresponding optimized routine is not available.
- * The NEON optimized version currently implements fast quantization for a
- * pair of blocks. */
-void vp9_regular_quantize_b_4x4_pair(BLOCK *b1, BLOCK *b2,
-                                     BLOCKD *d1, BLOCKD *d2) {
-  vp9_regular_quantize_b_4x4(b1, d1);
-  vp9_regular_quantize_b_4x4(b2, d2);
-}
-
-static void invert_quant(short *quant,
-                         unsigned char *shift, short d) {
-  unsigned t;
-  int l;
-  t = d;
-  for (l = 0; t > 1; l++)
-    t >>= 1;
-  t = 1 + (1 << (16 + l)) / d;
-  *quant = (short)(t - (1 << 16));
-  *shift = l;
-}
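
invert_quant turns the division by the dequant step into a multiply-and-shift: for a step d with top bit at position l, it stores quant = (1 << (16 + l)) / d + 1 - (1 << 16) and shift = l, so the quantizer's (((x * quant) >> 16) + x) >> shift computes approximately x / d. Worked for d = 8: the loop finds l = 3, t = 1 + (1 << 19) / 8 = 65537, hence quant = 1 and shift = 3, and the expression reduces to x >> 3 = x / 8.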
-
-void vp9_init_quantizer(VP9_COMP *cpi) {
-  int i;
-  int quant_val;
-  int Q;
-  static const int zbin_boost[16] = {  0,  0,  8, 10, 12, 14, 16, 20,
-                                      24, 28, 32, 36, 40, 44, 44, 44
-                                    };
-
-  static const int zbin_boost_8x8[64] = {  0,  0,  0,  8,  8,  8, 10, 12,
-                                          14, 16, 18, 20, 22, 24, 26, 28,
-                                          30, 32, 34, 36, 38, 40, 42, 44,
-                                          46, 48, 48, 48, 48, 48, 48, 48,
-                                          48, 48, 48, 48, 48, 48, 48, 48,
-                                          48, 48, 48, 48, 48, 48, 48, 48,
-                                          48, 48, 48, 48, 48, 48, 48, 48,
-                                          48, 48, 48, 48, 48, 48, 48, 48
-                                        };
-  static const int zbin_boost_16x16[256] = {
-     0,  0,  0,  8,  8,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28,
-    30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-  };
-  int qrounding_factor = 48;
-
-
-  for (Q = 0; Q < QINDEX_RANGE; Q++) {
-    int qzbin_factor = (vp9_dc_quant(Q, 0) < 148) ? 84 : 80;
-
-#if CONFIG_LOSSLESS
-    if (cpi->oxcf.lossless) {
-      if (Q == 0) {
-        qzbin_factor = 64;
-        qrounding_factor = 64;
-      }
-    }
-#endif
-
-    // dc values
-    quant_val = vp9_dc_quant(Q, cpi->common.y1dc_delta_q);
-    invert_quant(cpi->Y1quant[Q] + 0,
-                 cpi->Y1quant_shift[Q] + 0, quant_val);
-    cpi->Y1zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y1zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y1zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y1round[Q][0] = (qrounding_factor * quant_val) >> 7;
-    cpi->common.Y1dequant[Q][0] = quant_val;
-    cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
-    cpi->zrun_zbin_boost_y1_8x8[Q][0] =
-      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
-    cpi->zrun_zbin_boost_y1_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
-
-
-    quant_val = vp9_dc2quant(Q, cpi->common.y2dc_delta_q);
-    invert_quant(cpi->Y2quant[Q] + 0,
-                 cpi->Y2quant_shift[Q] + 0, quant_val);
-    cpi->Y2zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y2zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y2zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->Y2round[Q][0] = (qrounding_factor * quant_val) >> 7;
-    cpi->common.Y2dequant[Q][0] = quant_val;
-    cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
-    cpi->zrun_zbin_boost_y2_8x8[Q][0] =
-      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
-    cpi->zrun_zbin_boost_y2_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
-
-    quant_val = vp9_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
-    invert_quant(cpi->UVquant[Q] + 0,
-                 cpi->UVquant_shift[Q] + 0, quant_val);
-    cpi->UVzbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->UVzbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->UVzbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
-    cpi->UVround[Q][0] = (qrounding_factor * quant_val) >> 7;
-    cpi->common.UVdequant[Q][0] = quant_val;
-    cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
-    cpi->zrun_zbin_boost_uv_8x8[Q][0] =
-      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
-    cpi->zrun_zbin_boost_uv_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
-
-    // all the 4x4 ac values
-    for (i = 1; i < 16; i++) {
-      int rc = vp9_default_zig_zag1d[i];
-
-      quant_val = vp9_ac_yquant(Q);
-      invert_quant(cpi->Y1quant[Q] + rc,
-                   cpi->Y1quant_shift[Q] + rc, quant_val);
-      cpi->Y1zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->Y1round[Q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.Y1dequant[Q][rc] = quant_val;
-      cpi->zrun_zbin_boost_y1[Q][i] =
-        ((quant_val * zbin_boost[i]) + 64) >> 7;
-
-      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
-      invert_quant(cpi->Y2quant[Q] + rc,
-                   cpi->Y2quant_shift[Q] + rc, quant_val);
-      cpi->Y2zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->Y2round[Q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.Y2dequant[Q][rc] = quant_val;
-      cpi->zrun_zbin_boost_y2[Q][i] =
-        ((quant_val * zbin_boost[i]) + 64) >> 7;
-
-      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-      invert_quant(cpi->UVquant[Q] + rc,
-                   cpi->UVquant_shift[Q] + rc, quant_val);
-      cpi->UVzbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->UVround[Q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.UVdequant[Q][rc] = quant_val;
-      cpi->zrun_zbin_boost_uv[Q][i] =
-        ((quant_val * zbin_boost[i]) + 64) >> 7;
-    }
-
-    // 8x8 structures... only zbin separated out for now.
-    // This needs cleaning up for 8x8, especially if we are to add
-    // support for non-flat Q matrices.
-    for (i = 1; i < 64; i++) {
-      int rc = vp9_default_zig_zag1d_8x8[i];
-
-      quant_val = vp9_ac_yquant(Q);
-      cpi->Y1zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_y1_8x8[Q][i] =
-        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
-
-      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
-      cpi->Y2zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_y2_8x8[Q][i] =
-        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
-
-      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-      cpi->UVzbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_uv_8x8[Q][i] =
-        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
-    }
-
-    // 16x16 structures. Same comment above applies.
-    for (i = 1; i < 256; i++) {
-      int rc = vp9_default_zig_zag1d_16x16[i];
-
-      quant_val = vp9_ac_yquant(Q);
-      cpi->Y1zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_y1_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
-
-      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
-      cpi->Y2zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_y2_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
-
-      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-      cpi->UVzbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_uv_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
-    }
-  }
-}
-
-void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
-  int i;
-  int QIndex;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int zbin_extra;
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-
-  // Select the baseline MB Q index allowing for any segment level change.
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
-    // Abs Value
-    if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA)
-      QIndex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-
-    // Delta Value
-    else {
-      QIndex = cpi->common.base_qindex +
-               vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-
-      // Clamp to valid range
-      QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;
-    }
-  } else
-    QIndex = cpi->common.base_qindex;
-
-  // Y
-  zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
-                (cpi->zbin_over_quant +
-                 cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-
-  for (i = 0; i < 16; i++) {
-    x->block[i].quant = cpi->Y1quant[QIndex];
-    x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
-    x->block[i].zbin = cpi->Y1zbin[QIndex];
-    x->block[i].zbin_8x8 = cpi->Y1zbin_8x8[QIndex];
-    x->block[i].zbin_16x16 = cpi->Y1zbin_16x16[QIndex];
-    x->block[i].round = cpi->Y1round[QIndex];
-    x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
-    x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
-    x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y1_8x8[QIndex];
-    x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y1_16x16[QIndex];
-    x->block[i].zbin_extra = (short)zbin_extra;
-
-    // Segment max eob offset feature.
-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
-      x->block[i].eob_max_offset =
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-      x->block[i].eob_max_offset_8x8 =
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-      x->block[i].eob_max_offset_16x16 =
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-    } else {
-      x->block[i].eob_max_offset = 16;
-      x->block[i].eob_max_offset_8x8 = 64;
-      x->block[i].eob_max_offset_16x16 = 256;
-    }
-  }
-
-  // UV
-  zbin_extra = (cpi->common.UVdequant[QIndex][1] *
-                (cpi->zbin_over_quant +
-                 cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-
-  for (i = 16; i < 24; i++) {
-    x->block[i].quant = cpi->UVquant[QIndex];
-    x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
-    x->block[i].zbin = cpi->UVzbin[QIndex];
-    x->block[i].zbin_8x8 = cpi->UVzbin_8x8[QIndex];
-    x->block[i].zbin_16x16 = cpi->UVzbin_16x16[QIndex];
-    x->block[i].round = cpi->UVround[QIndex];
-    x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex];
-    x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];
-    x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_uv_8x8[QIndex];
-    x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_uv_16x16[QIndex];
-
-    x->block[i].zbin_extra = (short)zbin_extra;
-
-    // Segment max eob offset feature.
-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
-      x->block[i].eob_max_offset =
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-      x->block[i].eob_max_offset_8x8 =
-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-    } else {
-      x->block[i].eob_max_offset = 16;
-      x->block[i].eob_max_offset_8x8 = 64;
-    }
-  }
-
-  // Y2
-  zbin_extra = (cpi->common.Y2dequant[QIndex][1] *
-                ((cpi->zbin_over_quant / 2) +
-                 cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-
-  x->block[24].quant = cpi->Y2quant[QIndex];
-  x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
-  x->block[24].zbin = cpi->Y2zbin[QIndex];
-  x->block[24].zbin_8x8 = cpi->Y2zbin_8x8[QIndex];
-  x->block[24].zbin_16x16 = cpi->Y2zbin_16x16[QIndex];
-  x->block[24].round = cpi->Y2round[QIndex];
-  x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];
-  x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
-  x->block[24].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y2_8x8[QIndex];
-  x->block[24].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y2_16x16[QIndex];
-  x->block[24].zbin_extra = (short)zbin_extra;
-
-  // TBD: perhaps do not use for Y2
-  // Segment max eob offset feature.
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
-    x->block[24].eob_max_offset =
-      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-    x->block[24].eob_max_offset_8x8 =
-      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-  } else {
-    x->block[24].eob_max_offset = 16;
-    x->block[24].eob_max_offset_8x8 = 4;
-  }
-
-  /* save this macroblock QIndex for vp9_update_zbin_extra() */
-  x->e_mbd.q_index = QIndex;
-}
-
-void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
-  int i;
-  int QIndex = x->e_mbd.q_index;
-  int zbin_extra;
-
-  // Y
-  zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
-                (cpi->zbin_over_quant +
-                 cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-  for (i = 0; i < 16; i++) {
-    x->block[i].zbin_extra = (short)zbin_extra;
-  }
-
-  // UV
-  zbin_extra = (cpi->common.UVdequant[QIndex][1] *
-                (cpi->zbin_over_quant +
-                 cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-
-  for (i = 16; i < 24; i++) {
-    x->block[i].zbin_extra = (short)zbin_extra;
-  }
-
-  // Y2
-  zbin_extra = (cpi->common.Y2dequant[QIndex][1] *
-                ((cpi->zbin_over_quant / 2) +
-                 cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-
-  x->block[24].zbin_extra = (short)zbin_extra;
-}
-
-void vp9_frame_init_quantizer(VP9_COMP *cpi) {
-  // Clear Zbin mode boost for default case
-  cpi->zbin_mode_boost = 0;
-
-  // MB level quantizer setup
-  vp9_mb_init_quantizer(cpi, &cpi->mb);
-}
-
-void vp9_set_quantizer(struct VP9_COMP *cpi, int Q) {
-  VP9_COMMON *cm = &cpi->common;
-
-  cm->base_qindex = Q;
-
-  // If any of the delta_q values change, the update flag will
-  // have to be set.
-  cm->y1dc_delta_q = 0;
-  cm->y2ac_delta_q = 0;
-  cm->uvdc_delta_q = 0;
-  cm->uvac_delta_q = 0;
-  cm->y2dc_delta_q = 0;
-
-  // The quantizer has to be reinitialized if any delta_q changes.
-  // As none change here for now, this code is inactive.
-  // if(update)
-  //    vp9_init_quantizer(cpi);
-}
--- a/vp8/encoder/quantize.h
+++ /dev/null
@@ -1,97 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_QUANTIZE_H
-#define __INC_QUANTIZE_H
-
-#include "block.h"
-
-#define prototype_quantize_block(sym) \
-  void (sym)(BLOCK *b,BLOCKD *d)
-
-#define prototype_quantize_block_pair(sym) \
-  void (sym)(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
-
-#define prototype_quantize_mb(sym) \
-  void (sym)(MACROBLOCK *x)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/quantize_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/quantize_arm.h"
-#endif
-
-#define prototype_quantize_block_type(sym) \
-  void (sym)(BLOCK *b, BLOCKD *d, TX_TYPE type)
-extern prototype_quantize_block_type(vp9_ht_quantize_b_4x4);
-
-#ifndef vp9_quantize_quantb_4x4
-#define vp9_quantize_quantb_4x4 vp9_regular_quantize_b_4x4
-#endif
-extern prototype_quantize_block(vp9_quantize_quantb_4x4);
-
-#ifndef vp9_quantize_quantb_4x4_pair
-#define vp9_quantize_quantb_4x4_pair vp9_regular_quantize_b_4x4_pair
-#endif
-extern prototype_quantize_block_pair(vp9_quantize_quantb_4x4_pair);
-
-#ifndef vp9_quantize_quantb_8x8
-#define vp9_quantize_quantb_8x8 vp9_regular_quantize_b_8x8
-#endif
-extern prototype_quantize_block(vp9_quantize_quantb_8x8);
-
-#ifndef vp9_quantize_quantb_16x16
-#define vp9_quantize_quantb_16x16 vp9_regular_quantize_b_16x16
-#endif
-extern prototype_quantize_block(vp9_quantize_quantb_16x16);
-
-#ifndef vp9_quantize_quantb_2x2
-#define vp9_quantize_quantb_2x2 vp9_regular_quantize_b_2x2
-#endif
-extern prototype_quantize_block(vp9_quantize_quantb_2x2);
-
-#ifndef vp9_quantize_mb_4x4
-#define vp9_quantize_mb_4x4 vp9_quantize_mb_4x4_c
-#endif
-extern prototype_quantize_mb(vp9_quantize_mb_4x4);
-void vp9_quantize_mb_8x8(MACROBLOCK *x);
-
-#ifndef vp9_quantize_mbuv_4x4
-#define vp9_quantize_mbuv_4x4 vp9_quantize_mbuv_4x4_c
-#endif
-extern prototype_quantize_mb(vp9_quantize_mbuv_4x4);
-
-#ifndef vp9_quantize_mby_4x4
-#define vp9_quantize_mby_4x4 vp9_quantize_mby_4x4_c
-#endif
-extern prototype_quantize_mb(vp9_quantize_mby_4x4);
-
-extern prototype_quantize_mb(vp9_quantize_mby_8x8);
-extern prototype_quantize_mb(vp9_quantize_mbuv_8x8);
-
-void vp9_quantize_mb_16x16(MACROBLOCK *x);
-extern prototype_quantize_block(vp9_quantize_quantb_16x16);
-extern prototype_quantize_mb(vp9_quantize_mby_16x16);
-
-struct VP9_COMP;
-
-extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q);
-
-extern void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
-
-extern void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
-
-extern void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x);
-
-extern void vp9_init_quantizer(struct VP9_COMP *cpi);
-
-#endif
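
The prototype_* macros above expand to plain declarations; for example, the line

    extern prototype_quantize_block(vp9_quantize_quantb_4x4);

becomes

    extern void (vp9_quantize_quantb_4x4)(BLOCK *b, BLOCKD *d);

and, via the #ifndef default above, actually declares vp9_regular_quantize_b_4x4 unless an architecture header overrides the name.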
--- a/vp8/encoder/ratectrl.c
+++ /dev/null
@@ -1,698 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <limits.h>
-#include <assert.h>
-
-#include "math.h"
-#include "vp8/common/alloccommon.h"
-#include "vp8/common/common.h"
-#include "ratectrl.h"
-#include "vp8/common/entropymode.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/systemdependent.h"
-#include "encodemv.h"
-#include "vp8/common/quant_common.h"
-
-#define MIN_BPB_FACTOR          0.005
-#define MAX_BPB_FACTOR          50
-
-#ifdef MODE_STATS
-extern unsigned int y_modes[VP9_YMODES];
-extern unsigned int uv_modes[VP9_UV_MODES];
-extern unsigned int b_modes[B_MODE_COUNT];
-
-extern unsigned int inter_y_modes[MB_MODE_COUNT];
-extern unsigned int inter_uv_modes[VP9_UV_MODES];
-extern unsigned int inter_b_modes[B_MODE_COUNT];
-#endif
-
-// Bits Per MB at different Q (Multiplied by 512)
-#define BPER_MB_NORMBITS    9
-
-// % adjustment to target kf size based on separation from previous frame
-static const int kf_boost_seperation_adjustment[16] = {
-  30,   40,   50,   55,   60,   65,   70,   75,
-  80,   85,   90,   95,  100,  100,  100,  100,
-};
-
-static const int gf_adjust_table[101] = {
-  100,
-  115, 130, 145, 160, 175, 190, 200, 210, 220, 230,
-  240, 260, 270, 280, 290, 300, 310, 320, 330, 340,
-  350, 360, 370, 380, 390, 400, 400, 400, 400, 400,
-  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
-  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
-  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
-  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
-  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
-  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
-  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
-};
-
-static const int gf_intra_usage_adjustment[20] = {
-  125, 120, 115, 110, 105, 100,  95,  85,  80,  75,
-  70,  65,  60,  55,  50,  50,  50,  50,  50,  50,
-};
-
-static const int gf_interval_table[101] = {
-  7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
-};
-
-static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, 4, 5 };
-
-// These functions use formulaic calculations to make playing with the
-// quantizer tables easier. If necessary they can be replaced by lookup
-// tables if and when things settle down in the experimental bitstream.
-double vp9_convert_qindex_to_q(int qindex) {
-  // Convert the index to a real Q value (scaled down to match old Q values)
-  return (double)vp9_ac_yquant(qindex) / 4.0;
-}
-
-int vp9_gfboost_qadjust(int qindex) {
-  int retval;
-  double q;
-
-  q = vp9_convert_qindex_to_q(qindex);
-  retval = (int)((0.00000828 * q * q * q) +
-                 (-0.0055 * q * q) +
-                 (1.32 * q) + 79.3);
-  return retval;
-}
-
-static int kfboost_qadjust(int qindex) {
-  int retval;
-  double q;
-
-  q = vp9_convert_qindex_to_q(qindex);
-  retval = (int)((0.00000973 * q * q * q) +
-                 (-0.00613 * q * q) +
-                 (1.316 * q) + 121.2);
-  return retval;
-}
-
-int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex) {
-  if (frame_type == KEY_FRAME)
-    return (int)(4500000 / vp9_convert_qindex_to_q(qindex));
-  else
-    return (int)(2850000 / vp9_convert_qindex_to_q(qindex));
-}
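
A worked reading of the two constants above: at the same q, a key frame is budgeted 4500000 / 2850000 ~= 1.58 times the per-macroblock bits of an inter frame; e.g. at q = 30, 150000 versus 95000 in the function's (pre-normalization) units.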
-
-
-void vp9_save_coding_context(VP9_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
-  // Stores a snapshot of key state variables which can subsequently be
-  // restored with a call to vp9_restore_coding_context. These functions are
-  // intended for use in a re-code loop in vp9_compress_frame where the
-  // quantizer value is adjusted between loop iterations.
-
-  cc->nmvc = cm->fc.nmvc;
-  vp9_copy(cc->nmvjointcost,  cpi->mb.nmvjointcost);
-  vp9_copy(cc->nmvcosts,  cpi->mb.nmvcosts);
-  vp9_copy(cc->nmvcosts_hp,  cpi->mb.nmvcosts_hp);
-
-  vp9_copy(cc->mv_ref_ct, cm->fc.mv_ref_ct);
-  vp9_copy(cc->mode_context, cm->fc.mode_context);
-  vp9_copy(cc->mv_ref_ct_a, cm->fc.mv_ref_ct_a);
-  vp9_copy(cc->mode_context_a, cm->fc.mode_context_a);
-
-  vp9_copy(cc->ymode_prob, cm->fc.ymode_prob);
-  vp9_copy(cc->bmode_prob, cm->fc.bmode_prob);
-  vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
-  vp9_copy(cc->i8x8_mode_prob, cm->fc.i8x8_mode_prob);
-  vp9_copy(cc->sub_mv_ref_prob, cm->fc.sub_mv_ref_prob);
-  vp9_copy(cc->mbsplit_prob, cm->fc.mbsplit_prob);
-
-  // Stats
-#ifdef MODE_STATS
-  vp9_copy(cc->y_modes,       y_modes);
-  vp9_copy(cc->uv_modes,      uv_modes);
-  vp9_copy(cc->b_modes,       b_modes);
-  vp9_copy(cc->inter_y_modes,  inter_y_modes);
-  vp9_copy(cc->inter_uv_modes, inter_uv_modes);
-  vp9_copy(cc->inter_b_modes,  inter_b_modes);
-#endif
-
-  vp9_copy(cc->segment_pred_probs, cm->segment_pred_probs);
-  vp9_copy(cc->ref_pred_probs_update, cpi->ref_pred_probs_update);
-  vp9_copy(cc->ref_pred_probs, cm->ref_pred_probs);
-  vp9_copy(cc->prob_comppred, cm->prob_comppred);
-
-  vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
-             cm->last_frame_seg_map, (cm->mb_rows * cm->mb_cols));
-
-  vp9_copy(cc->last_ref_lf_deltas, xd->last_ref_lf_deltas);
-  vp9_copy(cc->last_mode_lf_deltas, xd->last_mode_lf_deltas);
-
-  vp9_copy(cc->coef_probs, cm->fc.coef_probs);
-  vp9_copy(cc->hybrid_coef_probs, cm->fc.hybrid_coef_probs);
-  vp9_copy(cc->coef_probs_8x8, cm->fc.coef_probs_8x8);
-  vp9_copy(cc->hybrid_coef_probs_8x8, cm->fc.hybrid_coef_probs_8x8);
-  vp9_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16);
-  vp9_copy(cc->hybrid_coef_probs_16x16, cm->fc.hybrid_coef_probs_16x16);
-  vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
-}
-
-void vp9_restore_coding_context(VP9_COMP *cpi) {
-  CODING_CONTEXT *const cc = &cpi->coding_context;
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
-  // Restore key state variables to the snapshot state stored in the
-  // previous call to vp9_save_coding_context.
-
-  cm->fc.nmvc = cc->nmvc;
-  vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
-  vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
-  vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
-
-  vp9_copy(cm->fc.mv_ref_ct, cc->mv_ref_ct);
-  vp9_copy(cm->fc.mode_context, cc->mode_context);
-  vp9_copy(cm->fc.mv_ref_ct_a, cc->mv_ref_ct_a);
-  vp9_copy(cm->fc.mode_context_a, cc->mode_context_a);
-
-  vp9_copy(cm->fc.ymode_prob, cc->ymode_prob);
-  vp9_copy(cm->fc.bmode_prob, cc->bmode_prob);
-  vp9_copy(cm->fc.i8x8_mode_prob, cc->i8x8_mode_prob);
-  vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
-  vp9_copy(cm->fc.sub_mv_ref_prob, cc->sub_mv_ref_prob);
-  vp9_copy(cm->fc.mbsplit_prob, cc->mbsplit_prob);
-
-  // Stats
-#ifdef MODE_STATS
-  vp9_copy(y_modes, cc->y_modes);
-  vp9_copy(uv_modes, cc->uv_modes);
-  vp9_copy(b_modes, cc->b_modes);
-  vp9_copy(inter_y_modes, cc->inter_y_modes);
-  vp9_copy(inter_uv_modes, cc->inter_uv_modes);
-  vp9_copy(inter_b_modes, cc->inter_b_modes);
-#endif
-
-  vp9_copy(cm->segment_pred_probs, cc->segment_pred_probs);
-  vp9_copy(cpi->ref_pred_probs_update, cc->ref_pred_probs_update);
-  vp9_copy(cm->ref_pred_probs, cc->ref_pred_probs);
-  vp9_copy(cm->prob_comppred, cc->prob_comppred);
-
-  vpx_memcpy(cm->last_frame_seg_map,
-             cpi->coding_context.last_frame_seg_map_copy,
-             (cm->mb_rows * cm->mb_cols));
-
-  vp9_copy(xd->last_ref_lf_deltas, cc->last_ref_lf_deltas);
-  vp9_copy(xd->last_mode_lf_deltas, cc->last_mode_lf_deltas);
-
-  vp9_copy(cm->fc.coef_probs, cc->coef_probs);
-  vp9_copy(cm->fc.hybrid_coef_probs, cc->hybrid_coef_probs);
-  vp9_copy(cm->fc.coef_probs_8x8, cc->coef_probs_8x8);
-  vp9_copy(cm->fc.hybrid_coef_probs_8x8, cc->hybrid_coef_probs_8x8);
-  vp9_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16);
-  vp9_copy(cm->fc.hybrid_coef_probs_16x16, cc->hybrid_coef_probs_16x16);
-  vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
-}
-
-
-void vp9_setup_key_frame(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  // Setup for Key frame:
-  vp9_default_coef_probs(&cpi->common);
-  vp9_kf_default_bmode_probs(cpi->common.kf_bmode_prob);
-  vp9_init_mbmode_probs(&cpi->common);
-  vp9_default_bmode_probs(cm->fc.bmode_prob);
-
-  vp9_init_mv_probs(&cpi->common);
-
-  // cpi->common.filter_level = 0;      // Reset every key frame.
-  cpi->common.filter_level = cpi->common.base_qindex * 3 / 8;
-
-  // interval before next GF
-  cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
-
-  cpi->common.refresh_golden_frame = TRUE;
-  cpi->common.refresh_alt_ref_frame = TRUE;
-
-  vp9_init_mode_contexts(&cpi->common);
-  vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
-  vpx_memcpy(&cpi->common.lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
-
-  vpx_memset(cm->prev_mip, 0,
-             (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
-  vpx_memset(cm->mip, 0,
-             (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
-
-  vp9_update_mode_info_border(cm, cm->mip);
-  vp9_update_mode_info_in_image(cm, cm->mi);
-}
-
-void vp9_setup_inter_frame(VP9_COMP *cpi) {
-  if (cpi->common.refresh_alt_ref_frame) {
-    vpx_memcpy(&cpi->common.fc,
-               &cpi->common.lfc_a,
-               sizeof(cpi->common.fc));
-    vpx_memcpy(cpi->common.fc.vp8_mode_contexts,
-               cpi->common.fc.mode_context_a,
-               sizeof(cpi->common.fc.vp8_mode_contexts));
-  } else {
-    vpx_memcpy(&cpi->common.fc,
-               &cpi->common.lfc,
-               sizeof(cpi->common.fc));
-    vpx_memcpy(cpi->common.fc.vp8_mode_contexts,
-               cpi->common.fc.mode_context,
-               sizeof(cpi->common.fc.vp8_mode_contexts));
-  }
-}
-
-
-static int estimate_bits_at_q(int frame_kind, int Q, int MBs,
-                              double correction_factor) {
-  int Bpm = (int)(.5 + correction_factor * vp9_bits_per_mb(frame_kind, Q));
-
-  /* Attempt to retain reasonable accuracy without overflow. The cutoff is
-   * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
-   * largest Bpm takes 20 bits.
-   */
-  if (MBs > (1 << 11))
-    return (Bpm >> BPER_MB_NORMBITS) * MBs;
-  else
-    return (Bpm * MBs) >> BPER_MB_NORMBITS;
-}
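
The branch above is purely an overflow guard: for frames larger than 2048 MBs, the normalization shift is applied to Bpm before the multiply, trading a little per-MB precision for a product that stays within 31 bits. A self-contained sketch of the same guard, assuming BPER_MB_NORMBITS is 9 (its real value is defined elsewhere in the encoder):

#define BPER_MB_NORMBITS 9  /* assumed value for this sketch */

/* Same shape as the guard in estimate_bits_at_q(). */
static int scaled_bits(int Bpm, int MBs) {
  if (MBs > (1 << 11))
    return (Bpm >> BPER_MB_NORMBITS) * MBs;   /* coarse but overflow-safe */
  else
    return (Bpm * MBs) >> BPER_MB_NORMBITS;   /* full precision */
}
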
-
-
-static void calc_iframe_target_size(VP9_COMP *cpi) {
-  // boost defaults to half second
-  int target;
-
-  // Clear down mmx registers to allow floating point in what follows
-  vp9_clear_system_state();  // __asm emms;
-
-  // New Two pass RC
-  target = cpi->per_frame_bandwidth;
-
-  if (cpi->oxcf.rc_max_intra_bitrate_pct) {
-    unsigned int max_rate = cpi->per_frame_bandwidth
-                            * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
-
-    if (target > max_rate)
-      target = max_rate;
-  }
-
-  cpi->this_frame_target = target;
-
-}
-
-
-//  Do the best we can to define the parameters for the next GF based
-//  on what information we have available.
-//
-//  In this experimental code only two pass is supported
-//  so we just use the interval determined in the two pass code.
-static void calc_gf_params(VP9_COMP *cpi) {
-  // Set the gf interval
-  cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
-}
-
-
-static void calc_pframe_target_size(VP9_COMP *cpi) {
-  int min_frame_target;
-
-  min_frame_target = cpi->min_frame_bandwidth;
-
-  if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5))
-    min_frame_target = cpi->av_per_frame_bandwidth >> 5;
-
-  // Special alt reference frame case
-  if (cpi->common.refresh_alt_ref_frame) {
-    // Per frame bit target for the alt ref frame
-    cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
-    cpi->this_frame_target = cpi->per_frame_bandwidth;
-  }
-
-  // Normal frames (gf and inter)
-  else {
-    cpi->this_frame_target = cpi->per_frame_bandwidth;
-  }
-
-  // Sanity check that the total sum of adjustments is not above the maximum
-  // allowed. That is, having allowed for KF and GF penalties, we have not
-  // pushed the current interframe target too low. If the adjustment we apply
-  // here cannot recover all the extra bits spent in the KF or GF, the
-  // remainder will have to be recovered over a longer time span via other
-  // buffer / rate control mechanisms.
-  if (cpi->this_frame_target < min_frame_target)
-    cpi->this_frame_target = min_frame_target;
-
-  if (!cpi->common.refresh_alt_ref_frame)
-    // Note the baseline target data rate for this inter frame.
-    cpi->inter_frame_target = cpi->this_frame_target;
-
-  // Adjust target frame size for Golden Frames:
-  if (cpi->frames_till_gf_update_due == 0) {
-    // int Boost = 0;
-    int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
-
-    cpi->common.refresh_golden_frame = TRUE;
-
-    calc_gf_params(cpi);
-
-    // If we are using an alternate ref instead of a gf then do not apply the
-    // boost here; it will instead be applied to the altref update.
-    // Jim's modified boost.
-    if (!cpi->source_alt_ref_active) {
-      if (cpi->oxcf.fixed_q < 0) {
-        // The spend on the GF is defined in the two pass code
-        // for two pass encodes
-        cpi->this_frame_target = cpi->per_frame_bandwidth;
-      } else
-        cpi->this_frame_target =
-          (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0)
-           * cpi->last_boost) / 100;
-
-    }
-    // If there is an active ARF at this location use the minimum
-    // bits on this frame even if it is a constructed arf.
-    // The active maximum quantizer ensures that an appropriate
-    // number of bits will be spent if needed for constructed ARFs.
-    else {
-      cpi->this_frame_target = 0;
-    }
-
-    cpi->current_gf_interval = cpi->frames_till_gf_update_due;
-  }
-}
-
-
-void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
-  int    Q = cpi->common.base_qindex;
-  int    correction_factor = 100;
-  double rate_correction_factor;
-  double adjustment_limit;
-
-  int    projected_size_based_on_q = 0;
-
-  // Clear down mmx registers to allow floating point in what follows
-  vp9_clear_system_state();  // __asm emms;
-
-  if (cpi->common.frame_type == KEY_FRAME) {
-    rate_correction_factor = cpi->key_frame_rate_correction_factor;
-  } else {
-    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
-      rate_correction_factor = cpi->gf_rate_correction_factor;
-    else
-      rate_correction_factor = cpi->rate_correction_factor;
-  }
-
-  // Work out how big we would have expected the frame to be at this Q given
-  // the current correction factor. Stay in double to avoid int overflow when
-  // values are large.
-  projected_size_based_on_q =
-    (int)(((.5 + rate_correction_factor *
-            vp9_bits_per_mb(cpi->common.frame_type, Q)) *
-           cpi->common.MBs) / (1 << BPER_MB_NORMBITS));
-
-  // Make some allowance for cpi->zbin_over_quant
-  if (cpi->zbin_over_quant > 0) {
-    int Z = cpi->zbin_over_quant;
-    double Factor = 0.99;
-    double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX;
-
-    while (Z > 0) {
-      Z--;
-      projected_size_based_on_q =
-        (int)(Factor * projected_size_based_on_q);
-      Factor += factor_adjustment;
-
-      if (Factor >= 0.999)
-        Factor = 0.999;
-    }
-  }
-
-  // Work out a size correction factor.
-  // if ( cpi->this_frame_target > 0 )
-  //  correction_factor = (100 * cpi->projected_frame_size) / cpi->this_frame_target;
-  if (projected_size_based_on_q > 0)
-    correction_factor = (100 * cpi->projected_frame_size) / projected_size_based_on_q;
-
-  // A more heavily damped adjustment is used if we have been oscillating
-  // either side of the target.
-  switch (damp_var) {
-    case 0:
-      adjustment_limit = 0.75;
-      break;
-    case 1:
-      adjustment_limit = 0.375;
-      break;
-    case 2:
-    default:
-      adjustment_limit = 0.25;
-      break;
-  }
-
-  // if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) )
-  if (correction_factor > 102) {
-    // We are not already at the worst allowable quality
-    correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
-    rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
-
-    // Keep rate_correction_factor within limits
-    if (rate_correction_factor > MAX_BPB_FACTOR)
-      rate_correction_factor = MAX_BPB_FACTOR;
-  }
-  // else if ( (correction_factor < 99) && (Q > cpi->active_best_quality) )
-  else if (correction_factor < 99) {
-    // We are not already at the best allowable quality
-    correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
-    rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
-
-    // Keep rate_correction_factor within limits
-    if (rate_correction_factor < MIN_BPB_FACTOR)
-      rate_correction_factor = MIN_BPB_FACTOR;
-  }
-
-  if (cpi->common.frame_type == KEY_FRAME)
-    cpi->key_frame_rate_correction_factor = rate_correction_factor;
-  else {
-    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
-      cpi->gf_rate_correction_factor = rate_correction_factor;
-    else
-      cpi->rate_correction_factor = rate_correction_factor;
-  }
-}
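
As a worked example of the damping above: a frame that lands 20% over target (correction_factor == 120) with damp_var == 0 is pulled back by the 0.75 limit, so the running factor grows by 15% rather than the full 20%. A minimal sketch with illustrative values:

static double damped_update_example(void) {
  double rate_correction_factor = 1.0;  /* illustrative running factor */
  int correction_factor = 120;          /* frame landed 20% over target */
  double adjustment_limit = 0.75;       /* damp_var == 0 */

  correction_factor =
    (int)(100.5 + (correction_factor - 100) * adjustment_limit);  /* 115 */
  return rate_correction_factor * correction_factor / 100;        /* 1.15 */
}
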
-
-
-int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
-  int Q = cpi->active_worst_quality;
-
-  int i;
-  int last_error = INT_MAX;
-  int target_bits_per_mb;
-  int bits_per_mb_at_this_q;
-  double correction_factor;
-
-  // Reset Zbin OQ value
-  cpi->zbin_over_quant = 0;
-
-  // Select the appropriate correction factor based upon type of frame.
-  if (cpi->common.frame_type == KEY_FRAME)
-    correction_factor = cpi->key_frame_rate_correction_factor;
-  else {
-    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
-      correction_factor = cpi->gf_rate_correction_factor;
-    else
-      correction_factor = cpi->rate_correction_factor;
-  }
-
-  // Calculate the required scaling factor based on the target frame size and
-  // the size of the frame produced using the previous Q.
-  if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
-    // Case where we would overflow int
-    target_bits_per_mb =
-      (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS;
-  else
-    target_bits_per_mb =
-      (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
-
-  i = cpi->active_best_quality;
-
-  do {
-    bits_per_mb_at_this_q =
-      (int)(.5 + correction_factor *
-            vp9_bits_per_mb(cpi->common.frame_type, i));
-
-    if (bits_per_mb_at_this_q <= target_bits_per_mb) {
-      if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
-        Q = i;
-      else
-        Q = i - 1;
-
-      break;
-    } else
-      last_error = bits_per_mb_at_this_q - target_bits_per_mb;
-  } while (++i <= cpi->active_worst_quality);
-
-  // If we are at MAXQ then enable Q over-run, which seeks to claw back
-  // additional bits through things like the RD multiplier and zero bin size.
-  if (Q >= MAXQ) {
-    int zbin_oqmax;
-
-    double Factor = 0.99;
-    double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX;
-
-    if (cpi->common.frame_type == KEY_FRAME)
-      zbin_oqmax = 0; // ZBIN_OQ_MAX/16
-    else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))
-      zbin_oqmax = 16;
-    else
-      zbin_oqmax = ZBIN_OQ_MAX;
-
-    // Each increment in the zbin is assumed to have a fixed effect on
-    // bitrate. This is of course not true: the effect will be highly clip
-    // dependent and may well have sudden steps. The idea here is to achieve
-    // higher effective quantizers than the normal maximum by expanding the
-    // zero bin and hence decreasing the number of low magnitude non-zero
-    // coefficients.
-    while (cpi->zbin_over_quant < zbin_oqmax) {
-      cpi->zbin_over_quant++;
-
-      if (cpi->zbin_over_quant > zbin_oqmax)
-        cpi->zbin_over_quant = zbin_oqmax;
-
-      // Adjust bits_per_mb_at_this_q estimate
-      bits_per_mb_at_this_q = (int)(Factor * bits_per_mb_at_this_q);
-      Factor += factor_adjustment;
-
-      if (Factor >= 0.999)
-        Factor = 0.999;
-
-      // Break out if we get down to the target rate.
-      if (bits_per_mb_at_this_q <= target_bits_per_mb)
-        break;
-    }
-
-  }
-
-  return Q;
-}
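
Stripped of the zbin over-run handling, the search above is a linear scan from the best (lowest) to the worst (highest) allowed quality, stopping at the first Q whose projected bits/MB drops to the target and stepping back one if the previous Q was closer. A condensed sketch, with bits_at() as a hypothetical stand-in for the corrected vp9_bits_per_mb() estimate:

#include <limits.h>

static int pick_q_example(int best_q, int worst_q, int target_bits_per_mb,
                          int (*bits_at)(int)) {
  int q = worst_q;            /* fall back to worst quality if never hit */
  int last_error = INT_MAX;
  int i;

  for (i = best_q; i <= worst_q; i++) {
    int bits = bits_at(i);
    if (bits <= target_bits_per_mb) {
      /* Pick whichever of i and i - 1 projects closer to the target. */
      q = (target_bits_per_mb - bits <= last_error) ? i : i - 1;
      break;
    }
    last_error = bits - target_bits_per_mb;
  }
  return q;
}
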
-
-
-static int estimate_keyframe_frequency(VP9_COMP *cpi) {
-  int i;
-
-  // Average key frame frequency
-  int av_key_frame_frequency = 0;
-
-  /* First key frame at start of sequence is a special case. We have no
-   * frequency data.
-   */
-  if (cpi->key_frame_count == 1) {
-    /* Assume a default of 1 kf every 2 seconds, or the max kf interval,
-     * whichever is smaller.
-     */
-    int key_freq = cpi->oxcf.key_freq > 0 ? cpi->oxcf.key_freq : 1;
-    av_key_frame_frequency = (int)cpi->output_frame_rate * 2;
-
-    if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
-      av_key_frame_frequency = key_freq;
-
-    cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1]
-      = av_key_frame_frequency;
-  } else {
-    unsigned int total_weight = 0;
-    int last_kf_interval =
-      (cpi->frames_since_key > 0) ? cpi->frames_since_key : 1;
-
-    /* reset keyframe context and calculate weighted average of last
-     * KEY_FRAME_CONTEXT keyframes
-     */
-    for (i = 0; i < KEY_FRAME_CONTEXT; i++) {
-      if (i < KEY_FRAME_CONTEXT - 1)
-        cpi->prior_key_frame_distance[i]
-          = cpi->prior_key_frame_distance[i + 1];
-      else
-        cpi->prior_key_frame_distance[i] = last_kf_interval;
-
-      av_key_frame_frequency += prior_key_frame_weight[i]
-                                * cpi->prior_key_frame_distance[i];
-      total_weight += prior_key_frame_weight[i];
-    }
-
-    av_key_frame_frequency /= total_weight;
-  }
-  return av_key_frame_frequency;
-}
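
With KEY_FRAME_CONTEXT == 5 and the weights { 1, 2, 3, 4, 5 } declared earlier, the average leans toward the most recent intervals. A worked example with illustrative interval values:

static int weighted_kf_frequency_example(void) {
  static const unsigned int weight[5] = { 1, 2, 3, 4, 5 };
  static const int distance[5] = { 90, 90, 60, 60, 30 };  /* oldest first */
  unsigned int total_weight = 0;
  int sum = 0, i;

  for (i = 0; i < 5; i++) {
    sum += (int)weight[i] * distance[i];
    total_weight += weight[i];
  }
  /* (90 + 180 + 180 + 240 + 150) / 15 == 56 frames between key frames */
  return sum / (int)total_weight;
}
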
-
-
-void vp9_adjust_key_frame_context(VP9_COMP *cpi) {
-  // Clear down mmx registers to allow floating point in what follows
-  vp9_clear_system_state();
-
-  cpi->frames_since_key = 0;
-  cpi->key_frame_count++;
-}
-
-
-void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit,
-                                   int *frame_over_shoot_limit) {
-  // Set-up bounds on acceptable frame size:
-  if (cpi->oxcf.fixed_q >= 0) {
-    // Fixed Q scenario: the frame size can never be out of range
-    // (there is no target!).
-    *frame_under_shoot_limit = 0;
-    *frame_over_shoot_limit  = INT_MAX;
-  } else {
-    if (cpi->common.frame_type == KEY_FRAME) {
-      *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;
-      *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
-    } else {
-      if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) {
-        *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;
-        *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
-      } else {
-        // Strong overshoot limit for constrained quality
-        if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-          *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
-          *frame_under_shoot_limit = cpi->this_frame_target * 2 / 8;
-        } else {
-          *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
-          *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
-        }
-      }
-    }
-
-    // For very small rate targets, where the fractional adjustment
-    // (e.g. * 7/8) may be tiny, make sure there is at least a minimum
-    // range.
-    *frame_over_shoot_limit += 200;
-    *frame_under_shoot_limit -= 200;
-    if (*frame_under_shoot_limit < 0)
-      *frame_under_shoot_limit = 0;
-  }
-}
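
For a key frame with a 10000-bit target, the bounds above work out to [7/8, 9/8] of the target widened by the 200-bit minimum range:

static void frame_bounds_example(int *under, int *over) {
  int target = 10000;              /* illustrative per-frame target, in bits */
  *over  = target * 9 / 8 + 200;   /* 11450 */
  *under = target * 7 / 8 - 200;   /* 8550 (clamped to 0 if negative) */
}
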
-
-
-// A return value of 0 means drop the frame.
-int vp9_pick_frame_size(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  if (cm->frame_type == KEY_FRAME)
-    calc_iframe_target_size(cpi);
-  else
-    calc_pframe_target_size(cpi);
-
-  return 1;
-}
--- a/vp8/encoder/ratectrl.h
+++ /dev/null
@@ -1,37 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_RATECTRL_H
-#define __INC_RATECTRL_H
-
-#include "onyx_int.h"
-
-#define FRAME_OVERHEAD_BITS 200
-
-extern void vp9_save_coding_context(VP9_COMP *cpi);
-extern void vp9_restore_coding_context(VP9_COMP *cpi);
-
-extern void vp9_setup_key_frame(VP9_COMP *cpi);
-extern void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
-extern int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame);
-extern void vp9_adjust_key_frame_context(VP9_COMP *cpi);
-extern void vp9_compute_frame_size_bounds(VP9_COMP *cpi,
-                                          int *frame_under_shoot_limit,
-                                          int *frame_over_shoot_limit);
-
-// A return value of 0 means drop the frame.
-extern int vp9_pick_frame_size(VP9_COMP *cpi);
-
-extern double vp9_convert_qindex_to_q(int qindex);
-extern int vp9_gfboost_qadjust(int qindex);
-extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex);
-void vp9_setup_inter_frame(VP9_COMP *cpi);
-
-#endif
--- a/vp8/encoder/rdopt.c
+++ /dev/null
@@ -1,4854 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdio.h>
-#include <math.h>
-#include <limits.h>
-#include <assert.h>
-#include "vp8/common/pragmas.h"
-
-#include "tokenize.h"
-#include "treewriter.h"
-#include "onyx_int.h"
-#include "modecosts.h"
-#include "encodeintra.h"
-#include "vp8/common/entropymode.h"
-#include "vp8/common/reconinter.h"
-#include "vp8/common/reconintra.h"
-#include "vp8/common/reconintra4x4.h"
-#include "vp8/common/findnearmv.h"
-#include "vp8/common/quant_common.h"
-#include "encodemb.h"
-#include "quantize.h"
-#include "vp8/common/idct.h"
-#include "variance.h"
-#include "mcomp.h"
-#include "rdopt.h"
-#include "ratectrl.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/systemdependent.h"
-#include "vp8/encoder/encodemv.h"
-
-#include "vp8/common/seg_common.h"
-#include "vp8/common/pred_common.h"
-#include "vp8/common/entropy.h"
-#include "vpx_rtcd.h"
-#if CONFIG_NEWBESTREFMV
-#include "vp8/common/mvref_common.h"
-#endif
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x)  (x)
-#else
-#define IF_RTCD(x)  NULL
-#endif
-
-extern void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x);
-extern void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x);
-
-#define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
-
-#define INVALID_MV 0x80008000
-
-/* Factor to weigh the rate for switchable interp filters */
-#define SWITCHABLE_INTERP_RATE_FACTOR 1
-
-static const int auto_speed_thresh[17] = {
-  1000,
-  200,
-  150,
-  130,
-  150,
-  125,
-  120,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  105
-};
-
-#if CONFIG_PRED_FILTER
-const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
-  {ZEROMV,    LAST_FRAME,   0,  0},
-  {ZEROMV,    LAST_FRAME,   0,  1},
-  {DC_PRED,   INTRA_FRAME,  0,  0},
-
-  {NEARESTMV, LAST_FRAME,   0,  0},
-  {NEARESTMV, LAST_FRAME,   0,  1},
-  {NEARMV,    LAST_FRAME,   0,  0},
-  {NEARMV,    LAST_FRAME,   0,  1},
-
-  {ZEROMV,    GOLDEN_FRAME, 0,  0},
-  {ZEROMV,    GOLDEN_FRAME, 0,  1},
-  {NEARESTMV, GOLDEN_FRAME, 0,  0},
-  {NEARESTMV, GOLDEN_FRAME, 0,  1},
-
-  {ZEROMV,    ALTREF_FRAME, 0,  0},
-  {ZEROMV,    ALTREF_FRAME, 0,  1},
-  {NEARESTMV, ALTREF_FRAME, 0,  0},
-  {NEARESTMV, ALTREF_FRAME, 0,  1},
-
-  {NEARMV,    GOLDEN_FRAME, 0,  0},
-  {NEARMV,    GOLDEN_FRAME, 0,  1},
-  {NEARMV,    ALTREF_FRAME, 0,  0},
-  {NEARMV,    ALTREF_FRAME, 0,  1},
-
-  {V_PRED,    INTRA_FRAME,  0,  0},
-  {H_PRED,    INTRA_FRAME,  0,  0},
-  {D45_PRED,  INTRA_FRAME,  0,  0},
-  {D135_PRED, INTRA_FRAME,  0,  0},
-  {D117_PRED, INTRA_FRAME,  0,  0},
-  {D153_PRED, INTRA_FRAME,  0,  0},
-  {D27_PRED,  INTRA_FRAME,  0,  0},
-  {D63_PRED,  INTRA_FRAME,  0,  0},
-
-  {TM_PRED,   INTRA_FRAME,  0,  0},
-
-  {NEWMV,     LAST_FRAME,   0,  0},
-  {NEWMV,     LAST_FRAME,   0,  1},
-  {NEWMV,     GOLDEN_FRAME, 0,  0},
-  {NEWMV,     GOLDEN_FRAME, 0,  1},
-  {NEWMV,     ALTREF_FRAME, 0,  0},
-  {NEWMV,     ALTREF_FRAME, 0,  1},
-
-  {SPLITMV,   LAST_FRAME,   0,  0},
-  {SPLITMV,   GOLDEN_FRAME, 0,  0},
-  {SPLITMV,   ALTREF_FRAME, 0,  0},
-
-  {B_PRED,    INTRA_FRAME,  0,  0},
-  {I8X8_PRED, INTRA_FRAME,  0,  0},
-
-  /* compound prediction modes */
-  {ZEROMV,    LAST_FRAME,   GOLDEN_FRAME, 0},
-  {NEARESTMV, LAST_FRAME,   GOLDEN_FRAME, 0},
-  {NEARMV,    LAST_FRAME,   GOLDEN_FRAME, 0},
-
-  {ZEROMV,    ALTREF_FRAME, LAST_FRAME,   0},
-  {NEARESTMV, ALTREF_FRAME, LAST_FRAME,   0},
-  {NEARMV,    ALTREF_FRAME, LAST_FRAME,   0},
-
-  {ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME, 0},
-  {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME, 0},
-  {NEARMV,    GOLDEN_FRAME, ALTREF_FRAME, 0},
-
-  {NEWMV,     LAST_FRAME,   GOLDEN_FRAME, 0},
-  {NEWMV,     ALTREF_FRAME, LAST_FRAME,   0},
-  {NEWMV,     GOLDEN_FRAME, ALTREF_FRAME, 0},
-
-  {SPLITMV,   LAST_FRAME,   GOLDEN_FRAME, 0},
-  {SPLITMV,   ALTREF_FRAME, LAST_FRAME,   0},
-  {SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME, 0}
-};
-#else
-const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
-  {ZEROMV,    LAST_FRAME,   0},
-  {DC_PRED,   INTRA_FRAME,  0},
-
-  {NEARESTMV, LAST_FRAME,   0},
-  {NEARMV,    LAST_FRAME,   0},
-
-  {ZEROMV,    GOLDEN_FRAME, 0},
-  {NEARESTMV, GOLDEN_FRAME, 0},
-
-  {ZEROMV,    ALTREF_FRAME, 0},
-  {NEARESTMV, ALTREF_FRAME, 0},
-
-  {NEARMV,    GOLDEN_FRAME, 0},
-  {NEARMV,    ALTREF_FRAME, 0},
-
-  {V_PRED,    INTRA_FRAME,  0},
-  {H_PRED,    INTRA_FRAME,  0},
-  {D45_PRED,  INTRA_FRAME,  0},
-  {D135_PRED, INTRA_FRAME,  0},
-  {D117_PRED, INTRA_FRAME,  0},
-  {D153_PRED, INTRA_FRAME,  0},
-  {D27_PRED,  INTRA_FRAME,  0},
-  {D63_PRED,  INTRA_FRAME,  0},
-
-  {TM_PRED,   INTRA_FRAME,  0},
-
-  {NEWMV,     LAST_FRAME,   0},
-  {NEWMV,     GOLDEN_FRAME, 0},
-  {NEWMV,     ALTREF_FRAME, 0},
-
-  {SPLITMV,   LAST_FRAME,   0},
-  {SPLITMV,   GOLDEN_FRAME, 0},
-  {SPLITMV,   ALTREF_FRAME, 0},
-
-  {B_PRED,    INTRA_FRAME,  0},
-  {I8X8_PRED, INTRA_FRAME,  0},
-
-  /* compound prediction modes */
-  {ZEROMV,    LAST_FRAME,   GOLDEN_FRAME},
-  {NEARESTMV, LAST_FRAME,   GOLDEN_FRAME},
-  {NEARMV,    LAST_FRAME,   GOLDEN_FRAME},
-
-  {ZEROMV,    ALTREF_FRAME, LAST_FRAME},
-  {NEARESTMV, ALTREF_FRAME, LAST_FRAME},
-  {NEARMV,    ALTREF_FRAME, LAST_FRAME},
-
-  {ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME},
-  {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
-  {NEARMV,    GOLDEN_FRAME, ALTREF_FRAME},
-
-  {NEWMV,     LAST_FRAME,   GOLDEN_FRAME},
-  {NEWMV,     ALTREF_FRAME, LAST_FRAME  },
-  {NEWMV,     GOLDEN_FRAME, ALTREF_FRAME},
-
-  {SPLITMV,   LAST_FRAME,   GOLDEN_FRAME},
-  {SPLITMV,   ALTREF_FRAME, LAST_FRAME  },
-  {SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME}
-};
-#endif
-
-static void fill_token_costs(
-  unsigned int (*c)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS],
-  const vp9_prob(*p)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES],
-  int block_type_counts) {
-  int i, j, k;
-
-  for (i = 0; i < block_type_counts; i++)
-    for (j = 0; j < COEF_BANDS; j++)
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        if (k == 0 && ((j > 0 && i > 0) || (j > 1 && i == 0)))
-          vp9_cost_tokens_skip((int *)(c[i][j][k]),
-                               p[i][j][k],
-                               vp9_coef_tree);
-        else
-          vp9_cost_tokens((int *)(c[i][j][k]),
-                          p[i][j][k],
-                          vp9_coef_tree);
-      }
-}
-
-
-static int rd_iifactor[32] =  { 4, 4, 3, 2, 1, 0, 0, 0,
-                                0, 0, 0, 0, 0, 0, 0, 0,
-                                0, 0, 0, 0, 0, 0, 0, 0,
-                                0, 0, 0, 0, 0, 0, 0, 0, };
-
-// 3* dc_qlookup[Q]*dc_qlookup[Q];
-
-/* values are now correlated to quantizer */
-static int sad_per_bit16lut[QINDEX_RANGE];
-static int sad_per_bit4lut[QINDEX_RANGE];
-
-void vp9_init_me_luts() {
-  int i;
-
-  // Initialize the sad lut tables using a formulaic calculation for now.
-  // This is to make it easier to resolve the impact of experimental changes
-  // to the quantizer tables.
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    sad_per_bit16lut[i] =
-      (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
-    sad_per_bit4lut[i] = (int)((0.063 * vp9_convert_qindex_to_q(i)) + 2.742);
-  }
-}
-
-static int compute_rd_mult(int qindex) {
-  int q;
-
-  q = vp9_dc_quant(qindex, 0);
-  return (11 * q * q) >> 6;
-}
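
As a worked example, a DC quantizer step of 40 gives RDMULT == (11 * 40 * 40) >> 6 == 275, so the Lagrangian weight on rate grows roughly with the square of the step size:

static int rd_mult_example(void) {
  int q = 40;                /* illustrative DC quantizer step size */
  return (11 * q * q) >> 6;  /* 17600 >> 6 == 275 */
}
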
-
-void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex) {
-  cpi->mb.sadperbit16 =  sad_per_bit16lut[QIndex];
-  cpi->mb.sadperbit4  =  sad_per_bit4lut[QIndex];
-}
-
-
-void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) {
-  int q, i;
-
-  vp9_clear_system_state();  // __asm emms;
-
-  // Further tests required to see if optimum is different
-  // for key frames, golden frames and arf frames.
-  // if (cpi->common.refresh_golden_frame ||
-  //     cpi->common.refresh_alt_ref_frame)
-  QIndex = (QIndex < 0) ? 0 : ((QIndex > MAXQ) ? MAXQ : QIndex);
-
-  cpi->RDMULT = compute_rd_mult(QIndex);
-
-  // Extend the rate multiplier alongside quantizer zbin increases.
-  if (cpi->zbin_over_quant  > 0) {
-    double oq_factor;
-
-    // Experimental code using the same basic equation as used for Q above.
-    // The units of cpi->zbin_over_quant are 1/128 of the Q bin size.
-    oq_factor = 1.0 + ((double)0.0015625 * cpi->zbin_over_quant);
-    cpi->RDMULT = (int)((double)cpi->RDMULT * oq_factor * oq_factor);
-  }
-
-  if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
-    if (cpi->twopass.next_iiratio > 31)
-      cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
-    else
-      cpi->RDMULT +=
-        (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
-  }
-
-  if (cpi->RDMULT < 7)
-    cpi->RDMULT = 7;
-
-  cpi->mb.errorperbit = (cpi->RDMULT / 110);
-  cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
-
-  vp9_set_speed_features(cpi);
-
-  q = (int)pow(vp9_dc_quant(QIndex, 0) >> 2, 1.25);
-  q = q << 2;
-  cpi->RDMULT = cpi->RDMULT << 4;
-
-  if (q < 8)
-    q = 8;
-
-  if (cpi->RDMULT > 1000) {
-    cpi->RDDIV = 1;
-    cpi->RDMULT /= 100;
-
-    for (i = 0; i < MAX_MODES; i++) {
-      if (cpi->sf.thresh_mult[i] < INT_MAX) {
-        cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100;
-      } else {
-        cpi->rd_threshes[i] = INT_MAX;
-      }
-
-      cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
-    }
-  } else {
-    cpi->RDDIV = 100;
-
-    for (i = 0; i < MAX_MODES; i++) {
-      if (cpi->sf.thresh_mult[i] < (INT_MAX / q)) {
-        cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q;
-      } else {
-        cpi->rd_threshes[i] = INT_MAX;
-      }
-
-      cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
-    }
-  }
-
-  fill_token_costs(
-    cpi->mb.token_costs[TX_4X4],
-    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs,
-    BLOCK_TYPES);
-  fill_token_costs(
-    cpi->mb.hybrid_token_costs[TX_4X4],
-    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11])
-    cpi->common.fc.hybrid_coef_probs,
-    BLOCK_TYPES);
-
-  fill_token_costs(
-    cpi->mb.token_costs[TX_8X8],
-    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_8x8,
-    BLOCK_TYPES_8X8);
-  fill_token_costs(
-    cpi->mb.hybrid_token_costs[TX_8X8],
-    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11])
-    cpi->common.fc.hybrid_coef_probs_8x8,
-    BLOCK_TYPES_8X8);
-
-  fill_token_costs(
-    cpi->mb.token_costs[TX_16X16],
-    (const vp9_prob(*)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_16x16,
-    BLOCK_TYPES_16X16);
-  fill_token_costs(
-    cpi->mb.hybrid_token_costs[TX_16X16],
-    (const vp9_prob(*)[8][PREV_COEF_CONTEXTS][11])
-    cpi->common.fc.hybrid_coef_probs_16x16,
-    BLOCK_TYPES_16X16);
-
-  /* rough estimate for costing */
-  cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
-  vp9_init_mode_costs(cpi);
-
-  if (cpi->common.frame_type != KEY_FRAME) {
-    vp9_build_nmv_cost_table(
-        cpi->mb.nmvjointcost,
-        cpi->mb.e_mbd.allow_high_precision_mv ?
-        cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
-        &cpi->common.fc.nmvc,
-        cpi->mb.e_mbd.allow_high_precision_mv, 1, 1);
-  }
-}
-
-void vp9_auto_select_speed(VP9_COMP *cpi) {
-  int milliseconds_for_compress = (int)(1000000 / cpi->oxcf.frame_rate);
-
-  milliseconds_for_compress =
-    milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;
-
-  /*
-  // this is done during parameter valid check
-  if( cpi->oxcf.cpu_used > 16)
-      cpi->oxcf.cpu_used = 16;
-  if( cpi->oxcf.cpu_used < -16)
-      cpi->oxcf.cpu_used = -16;
-  */
-
-  if (cpi->avg_pick_mode_time < milliseconds_for_compress &&
-      (cpi->avg_encode_time - cpi->avg_pick_mode_time) <
-      milliseconds_for_compress) {
-    if (cpi->avg_pick_mode_time == 0) {
-      cpi->Speed = 4;
-    } else {
-      if (milliseconds_for_compress * 100 < cpi->avg_encode_time * 95) {
-        cpi->Speed          += 2;
-        cpi->avg_pick_mode_time = 0;
-        cpi->avg_encode_time = 0;
-
-        if (cpi->Speed > 16) {
-          cpi->Speed = 16;
-        }
-      }
-
-      if (milliseconds_for_compress * 100 >
-          cpi->avg_encode_time * auto_speed_thresh[cpi->Speed]) {
-        cpi->Speed          -= 1;
-        cpi->avg_pick_mode_time = 0;
-        cpi->avg_encode_time = 0;
-
-        // In real-time mode, cpi->speed is in [4, 16].
-        if (cpi->Speed < 4) {      // if ( cpi->Speed < 0 )
-          cpi->Speed = 4;        // cpi->Speed = 0;
-        }
-      }
-    }
-  } else {
-    cpi->Speed += 4;
-
-    if (cpi->Speed > 16)
-      cpi->Speed = 16;
-
-
-    cpi->avg_pick_mode_time = 0;
-    cpi->avg_encode_time = 0;
-  }
-}
-
-int vp9_block_error_c(short *coeff, short *dqcoeff, int block_size) {
-  int i, error = 0;
-
-  for (i = 0; i < block_size; i++) {
-    int this_diff = coeff[i] - dqcoeff[i];
-    error += this_diff * this_diff;
-  }
-
-  return error;
-}
-
-int vp9_mbblock_error_c(MACROBLOCK *mb, int dc) {
-  BLOCK  *be;
-  BLOCKD *bd;
-  int i, j;
-  int berror, error = 0;
-
-  for (i = 0; i < 16; i++) {
-    be = &mb->block[i];
-    bd = &mb->e_mbd.block[i];
-
-    berror = 0;
-
-    for (j = dc; j < 16; j++) {
-      int this_diff = be->coeff[j] - bd->dqcoeff[j];
-      berror += this_diff * this_diff;
-    }
-
-    error += berror;
-  }
-
-  return error;
-}
-
-int vp9_mbuverror_c(MACROBLOCK *mb) {
-  BLOCK  *be;
-  BLOCKD *bd;
-
-  int i, error = 0;
-
-  for (i = 16; i < 24; i++) {
-    be = &mb->block[i];
-    bd = &mb->e_mbd.block[i];
-
-    error += vp9_block_error_c(be->coeff, bd->dqcoeff, 16);
-  }
-
-  return error;
-}
-
-int vp9_uvsse(MACROBLOCK *x) {
-  unsigned char *uptr, *vptr;
-  unsigned char *upred_ptr = (*(x->block[16].base_src) + x->block[16].src);
-  unsigned char *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src);
-  int uv_stride = x->block[16].src_stride;
-
-  unsigned int sse1 = 0;
-  unsigned int sse2 = 0;
-  int mv_row = x->e_mbd.mode_info_context->mbmi.mv[0].as_mv.row;
-  int mv_col = x->e_mbd.mode_info_context->mbmi.mv[0].as_mv.col;
-  int offset;
-  int pre_stride = x->e_mbd.block[16].pre_stride;
-
-  if (mv_row < 0)
-    mv_row -= 1;
-  else
-    mv_row += 1;
-
-  if (mv_col < 0)
-    mv_col -= 1;
-  else
-    mv_col += 1;
-
-  mv_row /= 2;
-  mv_col /= 2;
-
-  offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
-  uptr = x->e_mbd.pre.u_buffer + offset;
-  vptr = x->e_mbd.pre.v_buffer + offset;
-
-  if ((mv_row | mv_col) & 7) {
-    vp9_sub_pixel_variance8x8(uptr, pre_stride, (mv_col & 7) << 1,
-                              (mv_row & 7) << 1, upred_ptr, uv_stride, &sse2);
-    vp9_sub_pixel_variance8x8(vptr, pre_stride, (mv_col & 7) << 1,
-                              (mv_row & 7) << 1, vpred_ptr, uv_stride, &sse1);
-    sse2 += sse1;
-  } else {
-    vp9_variance8x8(uptr, pre_stride, upred_ptr, uv_stride, &sse2);
-    vp9_variance8x8(vptr, pre_stride, vpred_ptr, uv_stride, &sse1);
-    sse2 += sse1;
-  }
-  return sse2;
-
-}
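
The MV arithmetic at the top of vp9_uvsse() halves the luma motion vector for the half-resolution chroma planes, rounding away from zero: a component of -5 becomes (-5 - 1) / 2 == -3, and +5 becomes (5 + 1) / 2 == 3. Condensed into a single helper (a sketch, not the encoder's API):

static int chroma_mv_component(int luma_mv) {
  /* Round away from zero before halving, as above. */
  return (luma_mv + (luma_mv < 0 ? -1 : 1)) / 2;
}
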
-
-static int cost_coeffs_2x2(MACROBLOCK *mb,
-                           BLOCKD *b, PLANE_TYPE type,
-                           ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
-  int c = (type == PLANE_TYPE_Y_NO_DC); /* start at coef 0, unless Y with Y2 */
-  int eob = b->eob;
-  int pt;    /* surrounding block/prev coef predictor */
-  int cost = 0;
-  short *qcoeff_ptr = b->qcoeff;
-
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  assert(eob <= 4);
-
-  for (; c < eob; c++) {
-    int v = qcoeff_ptr[vp9_default_zig_zag1d[c]];
-    int t = vp9_dct_value_tokens_ptr[v].Token;
-    cost += mb->token_costs[TX_8X8][type][vp9_coef_bands[c]][pt][t];
-    cost += vp9_dct_value_cost_ptr[v];
-    pt = vp9_prev_token_class[t];
-  }
-
-  if (c < 4)
-    cost += mb->token_costs[TX_8X8][type][vp9_coef_bands[c]]
-            [pt] [DCT_EOB_TOKEN];
-
-  pt = (c != !type); // is the eob at the first coefficient?
-  *a = *l = pt;
-  return cost;
-}
-
-static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type,
-                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                       int tx_size) {
-  const int eob = b->eob;
-  int c = (type == PLANE_TYPE_Y_NO_DC); /* start at coef 0, unless Y with Y2 */
-  int cost = 0, default_eob, seg_eob;
-  int pt;                     /* surrounding block/prev coef predictor */
-  int const *scan, *band;
-  short *qcoeff_ptr = b->qcoeff;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  MB_MODE_INFO *mbmi = &mb->e_mbd.mode_info_context->mbmi;
-  TX_TYPE tx_type = DCT_DCT;
-  int segment_id = mbmi->segment_id;
-
-  switch (tx_size) {
-    case TX_4X4:
-      scan = vp9_default_zig_zag1d;
-      band = vp9_coef_bands;
-      default_eob = 16;
-      if (type == PLANE_TYPE_Y_WITH_DC) {
-        tx_type = get_tx_type_4x4(xd, b);
-        if (tx_type != DCT_DCT) {
-          switch (tx_type) {
-            case ADST_DCT:
-              scan = vp9_row_scan;
-              break;
-
-            case DCT_ADST:
-              scan = vp9_col_scan;
-              break;
-
-            default:
-              scan = vp9_default_zig_zag1d;
-              break;
-          }
-        }
-      }
-
-      break;
-    case TX_8X8:
-      scan = vp9_default_zig_zag1d_8x8;
-      band = vp9_coef_bands_8x8;
-      default_eob = 64;
-      if (type == PLANE_TYPE_Y_WITH_DC) {
-        BLOCKD *bb;
-        int ib = (b - xd->block);
-        if (ib < 16) {
-          ib = (ib & 8) + ((ib & 4) >> 1);
-          bb = xd->block + ib;
-          tx_type = get_tx_type_8x8(xd, bb);
-        }
-      }
-      break;
-    case TX_16X16:
-      scan = vp9_default_zig_zag1d_16x16;
-      band = vp9_coef_bands_16x16;
-      default_eob = 256;
-      if (type == PLANE_TYPE_Y_WITH_DC) {
-        tx_type = get_tx_type_16x16(xd, b);
-      }
-      break;
-    default:
-      break;
-  }
-  if (vp9_segfeature_active(&mb->e_mbd, segment_id, SEG_LVL_EOB))
-    seg_eob = vp9_get_segdata(&mb->e_mbd, segment_id, SEG_LVL_EOB);
-  else
-    seg_eob = default_eob;
-
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-
-  if (tx_type != DCT_DCT) {
-    for (; c < eob; c++) {
-      int v = qcoeff_ptr[scan[c]];
-      int t = vp9_dct_value_tokens_ptr[v].Token;
-      cost += mb->hybrid_token_costs[tx_size][type][band[c]][pt][t];
-      cost += vp9_dct_value_cost_ptr[v];
-      pt = vp9_prev_token_class[t];
-    }
-    if (c < seg_eob)
-      cost += mb->hybrid_token_costs[tx_size][type][band[c]]
-          [pt][DCT_EOB_TOKEN];
-  } else {
-    for (; c < eob; c++) {
-      int v = qcoeff_ptr[scan[c]];
-      int t = vp9_dct_value_tokens_ptr[v].Token;
-      cost += mb->token_costs[tx_size][type][band[c]][pt][t];
-      cost += vp9_dct_value_cost_ptr[v];
-      pt = vp9_prev_token_class[t];
-    }
-    if (c < seg_eob)
-      cost += mb->token_costs[tx_size][type][band[c]]
-          [pt][DCT_EOB_TOKEN];
-  }
-
-  pt = (c != !type); // is the eob at the first coefficient?
-  *a = *l = pt;
-  return cost;
-}
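
Whatever the transform size, the loop in cost_coeffs() has one shape: accumulate a context-conditioned token cost for each coded coefficient along the scan order, then charge one EOB token if the block ended before the segment EOB limit. A simplified skeleton, where token_of() and token_cost() are hypothetical stand-ins for the table lookups and the band is indexed directly by position:

static int cost_coeffs_skeleton(const short *qcoeff, const int *scan,
                                int eob, int seg_eob, int pt,
                                int (*token_of)(int coeff_value),
                                int (*token_cost)(int pos, int ctx, int token),
                                int eob_token) {
  int c, cost = 0;

  for (c = 0; c < eob; c++) {
    int t = token_of(qcoeff[scan[c]]);
    cost += token_cost(c, pt, t);  /* cost conditioned on prior token */
    pt = t;                        /* simplified previous-token context */
  }
  if (c < seg_eob)
    cost += token_cost(c, pt, eob_token);
  return cost;
}
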
-
-static int rdcost_mby_4x4(MACROBLOCK *mb) {
-  int cost = 0;
-  int b;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  for (b = 0; b < 16; b++)
-    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC,
-                        ta + vp9_block2above[b], tl + vp9_block2left[b],
-                        TX_4X4);
-
-  cost += cost_coeffs(mb, xd->block + 24, PLANE_TYPE_Y2,
-                      ta + vp9_block2above[24], tl + vp9_block2left[24],
-                      TX_4X4);
-
-  return cost;
-}
-
-static void macro_block_yrd_4x4(MACROBLOCK *mb,
-                                int *Rate,
-                                int *Distortion,
-                                const VP9_ENCODER_RTCD *rtcd,
-                                int *skippable) {
-  int b;
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  BLOCK   *const mb_y2 = mb->block + 24;
-  BLOCKD *const x_y2  = xd->block + 24;
-  short *Y2DCPtr = mb_y2->src_diff;
-  BLOCK *beptr;
-  int d;
-
-  vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), xd->predictor,
-                   mb->block[0].src_stride);
-
-  // Fdct and building the 2nd order block
-  for (beptr = mb->block; beptr < mb->block + 16; beptr += 2) {
-    mb->vp9_short_fdct8x4(beptr->src_diff, beptr->coeff, 32);
-    *Y2DCPtr++ = beptr->coeff[0];
-    *Y2DCPtr++ = beptr->coeff[16];
-  }
-
-  // 2nd order fdct
-  mb->short_walsh4x4(mb_y2->src_diff, mb_y2->coeff, 8);
-
-  // Quantization
-  for (b = 0; b < 16; b++) {
-    mb->quantize_b_4x4(&mb->block[b], &xd->block[b]);
-  }
-
-  // DC prediction and quantization of the 2nd order block
-  mb->quantize_b_4x4(mb_y2, x_y2);
-
-  // Distortion
-  d = vp9_mbblock_error(mb, 1);
-
-  d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);
-
-  *Distortion = (d >> 2);
-  // rate
-  *Rate = rdcost_mby_4x4(mb);
-  *skippable = vp9_mby_is_skippable_4x4(&mb->e_mbd, 1);
-}
-
-static int rdcost_mby_8x8(MACROBLOCK *mb, int backup) {
-  int cost = 0;
-  int b;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
-
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
-    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
-  }
-
-  for (b = 0; b < 16; b += 4)
-    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC,
-                        ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
-                        TX_8X8);
-
-  cost += cost_coeffs_2x2(mb, xd->block + 24, PLANE_TYPE_Y2,
-                          ta + vp9_block2above[24], tl + vp9_block2left[24]);
-  return cost;
-}
-
-static void macro_block_yrd_8x8(MACROBLOCK *mb,
-                                int *Rate,
-                                int *Distortion,
-                                const VP9_ENCODER_RTCD *rtcd,
-                                int *skippable) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  BLOCK   *const mb_y2 = mb->block + 24;
-  BLOCKD *const x_y2  = xd->block + 24;
-  int d;
-
-  vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), xd->predictor,
-                   mb->block[0].src_stride);
-
-  vp9_transform_mby_8x8(mb);
-  vp9_quantize_mby_8x8(mb);
-
-  /* remove 1st order dc to properly combine 1st/2nd order distortion */
-  mb->coeff[0] = 0;
-  mb->coeff[64] = 0;
-  mb->coeff[128] = 0;
-  mb->coeff[192] = 0;
-  xd->dqcoeff[0] = 0;
-  xd->dqcoeff[64] = 0;
-  xd->dqcoeff[128] = 0;
-  xd->dqcoeff[192] = 0;
-
-  d = vp9_mbblock_error(mb, 0);
-  d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);
-
-  *Distortion = (d >> 2);
-  // rate
-  *Rate = rdcost_mby_8x8(mb, 1);
-  *skippable = vp9_mby_is_skippable_8x8(&mb->e_mbd, 1);
-}
-
-static int rdcost_mby_16x16(MACROBLOCK *mb) {
-  int cost;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  cost = cost_coeffs(mb, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
-  return cost;
-}
-
-static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
-                                  const VP9_ENCODER_RTCD *rtcd, int *skippable) {
-  int d;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  BLOCKD *b  = &mb->e_mbd.block[0];
-  BLOCK  *be = &mb->block[0];
-  TX_TYPE tx_type;
-
-  vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), mb->e_mbd.predictor,
-                   mb->block[0].src_stride);
-
-  tx_type = get_tx_type_16x16(xd, b);
-  if (tx_type != DCT_DCT) {
-    vp9_fht(be->src_diff, 32, be->coeff, tx_type, 16);
-  } else
-    vp9_transform_mby_16x16(mb);
-
-  vp9_quantize_mby_16x16(mb);
-  // TODO(jingning) is it possible to quickly determine whether to force
-  //                trailing coefficients to be zero, instead of running trellis
-  //                optimization in the rate-distortion optimization loop?
-  if (mb->e_mbd.mode_info_context->mbmi.mode < I8X8_PRED)
-    vp9_optimize_mby_16x16(mb, rtcd);
-
-  d = vp9_mbblock_error(mb, 0);
-
-  *Distortion = (d >> 2);
-  // rate
-  *Rate = rdcost_mby_16x16(mb);
-  *skippable = vp9_mby_is_skippable_16x16(&mb->e_mbd);
-}
-
-static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                            int *distortion, int *skippable,
-                            int64_t txfm_cache[NB_TXFM_MODES]) {
-  VP9_COMMON *cm = &cpi->common;
-  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
-
-  MACROBLOCKD *xd = &x->e_mbd;
-  int can_skip = cm->mb_no_coeff_skip;
-  vp9_prob skip_prob = can_skip ? vp9_get_pred_prob(cm, xd, PRED_MBSKIP) : 128;
-  int s0, s1;
-  int r4x4, r4x4s, r8x8, r8x8s, d4x4, d8x8, s4x4, s8x8;
-  int64_t rd4x4, rd8x8, rd4x4s, rd8x8s;
-  int d16x16, r16x16, r16x16s, s16x16;
-  int64_t rd16x16, rd16x16s;
-
-  // FIXME don't do sub x3
-  if (skip_prob == 0)
-    skip_prob = 1;
-  s0 = vp9_cost_bit(skip_prob, 0);
-  s1 = vp9_cost_bit(skip_prob, 1);
-  macro_block_yrd_16x16(x, &r16x16, &d16x16, IF_RTCD(&cpi->rtcd), &s16x16);
-  if (can_skip) {
-    if (s16x16) {
-      rd16x16 = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
-    } else {
-      rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16 + s0, d16x16);
-    }
-  } else {
-    rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16, d16x16);
-  }
-  r16x16s = r16x16 + vp9_cost_one(cm->prob_tx[0]) + vp9_cost_one(cm->prob_tx[1]);
-  if (can_skip) {
-    if (s16x16) {
-      rd16x16s = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
-    } else {
-      rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s + s0, d16x16);
-    }
-  } else {
-    rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s, d16x16);
-  }
-  macro_block_yrd_8x8(x, &r8x8, &d8x8, IF_RTCD(&cpi->rtcd), &s8x8);
-  if (can_skip) {
-    if (s8x8) {
-      rd8x8 = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
-    } else {
-      rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8 + s0, d8x8);
-    }
-  } else {
-    rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8, d8x8);
-  }
-  r8x8s = r8x8 + vp9_cost_one(cm->prob_tx[0]);
-  r8x8s += vp9_cost_zero(cm->prob_tx[1]);
-  if (can_skip) {
-    if (s8x8) {
-      rd8x8s = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
-    } else {
-      rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s + s0, d8x8);
-    }
-  } else {
-    rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s, d8x8);
-  }
-  macro_block_yrd_4x4(x, &r4x4, &d4x4, IF_RTCD(&cpi->rtcd), &s4x4);
-  if (can_skip) {
-    if (s4x4) {
-      rd4x4 = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
-    } else {
-      rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4 + s0, d4x4);
-    }
-  } else {
-    rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4, d4x4);
-  }
-  r4x4s = r4x4 + vp9_cost_zero(cm->prob_tx[0]);
-  if (can_skip) {
-    if (s4x4) {
-      rd4x4s = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
-    } else {
-      rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s + s0, d4x4);
-    }
-  } else {
-    rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s, d4x4);
-  }
-
-  if (cpi->common.txfm_mode == ALLOW_16X16 ||
-      (cpi->common.txfm_mode == TX_MODE_SELECT &&
-       rd16x16s < rd8x8s && rd16x16s < rd4x4s)) {
-    mbmi->txfm_size = TX_16X16;
-    *skippable = s16x16;
-    *distortion = d16x16;
-    *rate = (cpi->common.txfm_mode == ALLOW_16X16) ? r16x16 : r16x16s;
-  } else if (cpi->common.txfm_mode == ALLOW_8X8 ||
-             (cpi->common.txfm_mode == TX_MODE_SELECT && rd8x8s < rd4x4s)) {
-    mbmi->txfm_size = TX_8X8;
-    *skippable = s8x8;
-    *distortion = d8x8;
-    *rate = (cpi->common.txfm_mode == ALLOW_8X8) ? r8x8 : r8x8s;
-  } else {
-    assert(cpi->common.txfm_mode == ONLY_4X4 ||
-           (cpi->common.txfm_mode == TX_MODE_SELECT && rd4x4s <= rd8x8s));
-    mbmi->txfm_size = TX_4X4;
-    *skippable = s4x4;
-    *distortion = d4x4;
-    *rate = (cpi->common.txfm_mode == ONLY_4X4) ? r4x4 : r4x4s;
-  }
-
-  txfm_cache[ONLY_4X4] = rd4x4;
-  txfm_cache[ALLOW_8X8] = rd8x8;
-  txfm_cache[ALLOW_16X16] = rd16x16;
-  if (rd16x16s < rd8x8s && rd16x16s < rd4x4s)
-    txfm_cache[TX_MODE_SELECT] = rd16x16s;
-  else
-    txfm_cache[TX_MODE_SELECT] = rd4x4s < rd8x8s ? rd4x4s : rd8x8s;
-
-}
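
When txfm_mode == TX_MODE_SELECT, the decision above reduces to taking the minimum of the three with-signalling costs, which is also what the txfm_cache[TX_MODE_SELECT] assignment computes. Condensed:

#include <stdint.h>

static int64_t select_txfm_rd_example(int64_t rd4x4s, int64_t rd8x8s,
                                      int64_t rd16x16s) {
  int64_t best = (rd4x4s < rd8x8s) ? rd4x4s : rd8x8s;
  return (rd16x16s < best) ? rd16x16s : best;
}
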
-
-static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
-  const unsigned int *p = (const unsigned int *)predictor;
-  unsigned int *d = (unsigned int *)dst;
-  d[0] = p[0];
-  d[4] = p[4];
-  d[8] = p[8];
-  d[12] = p[12];
-}
-
-#if CONFIG_SUPERBLOCKS
-static void super_block_yrd_8x8(MACROBLOCK *x,
-                                int *rate,
-                                int *distortion,
-                                const VP9_ENCODER_RTCD *rtcd, int *skip) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *const by2 = x->block + 24;
-  BLOCKD *const bdy2  = xd->block + 24;
-  int d = 0, r = 0, n;
-  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
-  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
-  ENTROPY_CONTEXT_PLANES t_above[2];
-  ENTROPY_CONTEXT_PLANES t_left[2];
-  int skippable = 1;
-
-  vpx_memcpy(t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(t_left, xd->left_context, sizeof(t_left));
-
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
-
-    vp9_subtract_mby_s_c(x->src_diff,
-                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
-                         src_y_stride,
-                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
-                         dst_y_stride);
-    vp9_transform_mby_8x8(x);
-    vp9_quantize_mby_8x8(x);
-
-    /* remove 1st order dc to properly combine 1st/2nd order distortion */
-    x->coeff[  0] = 0;
-    x->coeff[ 64] = 0;
-    x->coeff[128] = 0;
-    x->coeff[192] = 0;
-    xd->dqcoeff[  0] = 0;
-    xd->dqcoeff[ 64] = 0;
-    xd->dqcoeff[128] = 0;
-    xd->dqcoeff[192] = 0;
-
-    d += vp9_mbblock_error(x, 0);
-    d += vp9_block_error(by2->coeff, bdy2->dqcoeff, 16);
-    xd->above_context = ta + x_idx;
-    xd->left_context = tl + y_idx;
-    r += rdcost_mby_8x8(x, 0);
-    skippable = skippable && vp9_mby_is_skippable_8x8(xd, 1);
-  }
-
-  *distortion = (d >> 2);
-  *rate       = r;
-  if (skip) *skip = skippable;
-  xd->above_context = ta;
-  xd->left_context = tl;
-  vpx_memcpy(xd->above_context, &t_above, sizeof(t_above));
-  vpx_memcpy(xd->left_context, &t_left, sizeof(t_left));
-}
-#endif
-
-static void copy_predictor_8x8(unsigned char *dst, const unsigned char *predictor) {
-  const unsigned int *p = (const unsigned int *)predictor;
-  unsigned int *d = (unsigned int *)dst;
-  d[0] = p[0];
-  d[1] = p[1];
-  d[4] = p[4];
-  d[5] = p[5];
-  d[8] = p[8];
-  d[9] = p[9];
-  d[12] = p[12];
-  d[13] = p[13];
-  d[16] = p[16];
-  d[17] = p[17];
-  d[20] = p[20];
-  d[21] = p[21];
-  d[24] = p[24];
-  d[25] = p[25];
-  d[28] = p[28];
-  d[29] = p[29];
-}
-
-static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
-                                     BLOCKD *b, B_PREDICTION_MODE *best_mode,
-#if CONFIG_COMP_INTRA_PRED
-                                     B_PREDICTION_MODE *best_second_mode,
-                                     int allow_comp,
-#endif
-                                     int *bmode_costs,
-                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                                     int *bestrate, int *bestratey,
-                                     int *bestdistortion) {
-  B_PREDICTION_MODE mode;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-#if CONFIG_COMP_INTRA_PRED
-  B_PREDICTION_MODE mode2;
-#endif
-  int64_t best_rd = INT64_MAX;
-  int rate = 0;
-  int distortion;
-
-  ENTROPY_CONTEXT ta = *a, tempa = *a;
-  ENTROPY_CONTEXT tl = *l, templ = *l;
-  TX_TYPE tx_type = DCT_DCT;
-  TX_TYPE best_tx_type = DCT_DCT;
-  /*
-   * The predictor buffer is a 2d buffer with a stride of 16. Create
-   * a temp buffer that meets the stride requirements, but we are only
-   * interested in the left 4x4 block.
-   */
-  DECLARE_ALIGNED_ARRAY(16, unsigned char,  best_predictor, 16 * 4);
-  DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16);
-
-  for (mode = B_DC_PRED; mode <= B_HU_PRED; mode++) {
-#if CONFIG_COMP_INTRA_PRED
-    for (mode2 = (allow_comp ? 0 : (B_DC_PRED - 1));
-                   mode2 != (allow_comp ? (mode + 1) : 0); mode2++) {
-#endif
-      int64_t this_rd;
-      int ratey;
-
-      b->bmi.as_mode.first = mode;
-      rate = bmode_costs[mode];
-
-#if CONFIG_COMP_INTRA_PRED
-      if (mode2 == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
-#endif
-        vp9_intra4x4_predict(b, mode, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-      } else {
-        vp9_comp_intra4x4_predict(b, mode, mode2, b->predictor);
-        rate += bmode_costs[mode2];
-      }
-#endif
-      vp9_subtract_b(be, b, 16);
-
-      b->bmi.as_mode.first = mode;
-      tx_type = get_tx_type_4x4(xd, b);
-      if (tx_type != DCT_DCT) {
-        vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
-        vp9_ht_quantize_b_4x4(be, b, tx_type);
-      } else {
-        x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-        x->quantize_b_4x4(be, b);
-      }
-
-      tempa = ta;
-      templ = tl;
-
-      ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
-      rate += ratey;
-      distortion = vp9_block_error(be->coeff, b->dqcoeff, 16) >> 2;
-
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
-      if (this_rd < best_rd) {
-        *bestrate = rate;
-        *bestratey = ratey;
-        *bestdistortion = distortion;
-        best_rd = this_rd;
-        *best_mode = mode;
-        best_tx_type = tx_type;
-
-#if CONFIG_COMP_INTRA_PRED
-        *best_second_mode = mode2;
-#endif
-        *a = tempa;
-        *l = templ;
-        copy_predictor(best_predictor, b->predictor);
-        vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
-      }
-#if CONFIG_COMP_INTRA_PRED
-    }
-#endif
-  }
-  b->bmi.as_mode.first = (B_PREDICTION_MODE)(*best_mode);
-#if CONFIG_COMP_INTRA_PRED
-  b->bmi.as_mode.second = (B_PREDICTION_MODE)(*best_second_mode);
-#endif
-
-  // inverse transform
-  if (best_tx_type != DCT_DCT)
-    vp9_ihtllm_c(best_dqcoeff, b->diff, 32, best_tx_type, 4);
-  else
-    IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(
-        best_dqcoeff, b->diff, 32);
-
-  vp9_recon_b(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-
-  return best_rd;
-}
-
-static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, int *Rate,
-                                     int *rate_y, int *Distortion, int64_t best_rd,
-#if CONFIG_COMP_INTRA_PRED
-                                     int allow_comp,
-#endif
-                                     int update_contexts) {
-  int i;
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
-  int distortion = 0;
-  int tot_rate_y = 0;
-  int64_t total_rd = 0;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-  int *bmode_costs;
-
-  if (update_contexts) {
-    ta = (ENTROPY_CONTEXT *)xd->above_context;
-    tl = (ENTROPY_CONTEXT *)xd->left_context;
-  } else {
-    vpx_memcpy(&t_above, xd->above_context,
-               sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context,
-               sizeof(ENTROPY_CONTEXT_PLANES));
-
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  }
-
-  xd->mode_info_context->mbmi.mode = B_PRED;
-  bmode_costs = mb->inter_bmode_costs;
-
-  for (i = 0; i < 16; i++) {
-    MODE_INFO *const mic = xd->mode_info_context;
-    const int mis = xd->mode_info_stride;
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-#if CONFIG_COMP_INTRA_PRED
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_second_mode);
-#endif
-    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry),
-        UNINITIALIZED_IS_SAFE(d);
-
-    if (xd->frame_type == KEY_FRAME) {
-      const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);
-      const B_PREDICTION_MODE L = left_block_mode(mic, i);
-
-      bmode_costs  = mb->bmode_costs[A][L];
-    }
-
-    total_rd += rd_pick_intra4x4block(
-                  cpi, mb, mb->block + i, xd->block + i, &best_mode,
-#if CONFIG_COMP_INTRA_PRED
-                  & best_second_mode, allow_comp,
-#endif
-                  bmode_costs, ta + vp9_block2above[i],
-                  tl + vp9_block2left[i], &r, &ry, &d);
-
-    cost += r;
-    distortion += d;
-    tot_rate_y += ry;
-
-    mic->bmi[i].as_mode.first = best_mode;
-#if CONFIG_COMP_INTRA_PRED
-    mic->bmi[i].as_mode.second = best_second_mode;
-#endif
-
-    if (total_rd >= best_rd)
-      break;
-  }
-
-  if (total_rd >= best_rd)
-    return INT64_MAX;
-
-#if CONFIG_COMP_INTRA_PRED
-  cost += vp9_cost_bit(128, allow_comp);
-#endif
-  *Rate = cost;
-  *rate_y += tot_rate_y;
-  *Distortion = distortion;
-
-  return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
-}
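-
-/* Minimal sketch of the scratch-context idiom above, using a hypothetical
- * context type. Token costing advances the above/left entropy contexts as
- * a side effect, so the RD search either points at the real contexts
- * (updates persist) or at stack copies (updates are thrown away). */
-typedef struct {
-  unsigned char planes[8];  /* hypothetical per-plane context bytes */
-} SKETCH_CTX;
-
-static unsigned char *sketch_ctx_for_search(SKETCH_CTX *real,
-                                            SKETCH_CTX *scratch,
-                                            int update_contexts) {
-  if (update_contexts)
-    return (unsigned char *)real;  /* mutate the real contexts */
-  *scratch = *real;                /* cost against a throwaway copy */
-  return (unsigned char *)scratch;
-}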
-
-#if CONFIG_SUPERBLOCKS
-static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi,
-                                      MACROBLOCK *x,
-                                      int *rate,
-                                      int *rate_tokenonly,
-                                      int *distortion,
-                                      int *skippable) {
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  int this_rate, this_rate_tokenonly;
-  int this_distortion, s;
-  int64_t best_rd = INT64_MAX, this_rd;
-
-  /* Y Search for 32x32 intra prediction mode */
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    x->e_mbd.mode_info_context->mbmi.mode = mode;
-    vp9_build_intra_predictors_sby_s(&x->e_mbd);
-
-    super_block_yrd_8x8(x, &this_rate_tokenonly,
-                        &this_distortion, IF_RTCD(&cpi->rtcd), &s);
-    this_rate = this_rate_tokenonly +
-                x->mbmode_cost[x->e_mbd.frame_type]
-                              [x->e_mbd.mode_info_context->mbmi.mode];
-    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-
-    if (this_rd < best_rd) {
-      mode_selected   = mode;
-      best_rd         = this_rd;
-      *rate           = this_rate;
-      *rate_tokenonly = this_rate_tokenonly;
-      *distortion     = this_distortion;
-      *skippable      = s;
-    }
-  }
-
-  x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
-
-  return best_rd;
-}
-#endif
-
-static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,
-                                          MACROBLOCK *x,
-                                          int *Rate,
-                                          int *rate_y,
-                                          int *Distortion,
-                                          int *skippable,
-                                          int64_t txfm_cache[NB_TXFM_MODES]) {
-  MB_PREDICTION_MODE mode;
-  TX_SIZE txfm_size;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-#if CONFIG_COMP_INTRA_PRED
-  MB_PREDICTION_MODE mode2;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode2_selected);
-#endif
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int rate, ratey;
-  int distortion, skip;
-  int64_t best_rd = INT64_MAX;
-  int64_t this_rd;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  int i;
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    txfm_cache[i] = INT64_MAX;
-
-  // Y Search for 16x16 intra prediction mode
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int64_t local_txfm_cache[NB_TXFM_MODES];
-
-    mbmi->mode = mode;
-
-#if CONFIG_COMP_INTRA_PRED
-    for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
-      mbmi->second_mode = mode2;
-      if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
-        vp9_build_intra_predictors_mby(&x->e_mbd);
-#if CONFIG_COMP_INTRA_PRED
-      } else {
-        continue; // i.e. disable for now
-        vp9_build_comp_intra_predictors_mby(&x->e_mbd);
-      }
-#endif
-
-      macro_block_yrd(cpi, x, &ratey, &distortion, &skip, local_txfm_cache);
-
-      // FIXME add compoundmode cost
-      // FIXME add rate for mode2
-      rate = ratey + x->mbmode_cost[x->e_mbd.frame_type][mbmi->mode];
-
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
-      if (this_rd < best_rd) {
-        mode_selected = mode;
-        txfm_size = mbmi->txfm_size;
-#if CONFIG_COMP_INTRA_PRED
-        mode2_selected = mode2;
-#endif
-        best_rd = this_rd;
-        *Rate = rate;
-        *rate_y = ratey;
-        *Distortion = distortion;
-        *skippable = skip;
-      }
-
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        int64_t adj_rd = this_rd + local_txfm_cache[i] -
-                          local_txfm_cache[cpi->common.txfm_mode];
-        if (adj_rd < txfm_cache[i]) {
-          txfm_cache[i] = adj_rd;
-        }
-      }
-
-#if CONFIG_COMP_INTRA_PRED
-    }
-#endif
-  }
-
-  mbmi->txfm_size = txfm_size;
-  mbmi->mode = mode_selected;
-
-#if CONFIG_COMP_INTRA_PRED
-  mbmi->second_mode = mode2_selected;
-#endif
-  return best_rd;
-}
-
-
-static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
-                                     B_PREDICTION_MODE *best_mode,
-#if CONFIG_COMP_INTRA_PRED
-                                     B_PREDICTION_MODE *best_second_mode,
-#endif
-                                     int *mode_costs,
-                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                                     int *bestrate, int *bestratey,
-                                     int *bestdistortion) {
-  MB_PREDICTION_MODE mode;
-#if CONFIG_COMP_INTRA_PRED
-  MB_PREDICTION_MODE mode2;
-#endif
-  MACROBLOCKD *xd = &x->e_mbd;
-  int64_t best_rd = INT64_MAX;
-  int distortion, rate = 0;
-  BLOCK  *be = x->block + ib;
-  BLOCKD *b = xd->block + ib;
-  ENTROPY_CONTEXT ta0, ta1, besta0 = 0, besta1 = 0;
-  ENTROPY_CONTEXT tl0, tl1, bestl0 = 0, bestl1 = 0;
-
-  /*
-   * The predictor buffer is a 2-D buffer with a stride of 16. Create
-   * a temp buffer that meets the stride requirement, but only the
-   * left 8x8 block is of interest.
-   */
-  DECLARE_ALIGNED_ARRAY(16, unsigned char,  best_predictor, 16 * 8);
-  DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16 * 4);
-
-  // Perform an 8x8 transform; note the input and output index mapping.
-  int idx = (ib & 0x02) ? (ib + 2) : ib;
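-  // For the 8x8 block anchors ib in {0, 2, 8, 10} this yields idx in
-  // {0, 4, 8, 12}: the 8x8 coefficient block of each sub-block starts
-  // four 4x4 blocks apart.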
-
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-#if CONFIG_COMP_INTRA_PRED
-    for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
-#endif
-      int64_t this_rd;
-      int rate_t;
-
-      // FIXME rate for compound mode and second intrapred mode
-      rate = mode_costs[mode];
-      b->bmi.as_mode.first = mode;
-
-#if CONFIG_COMP_INTRA_PRED
-      if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
-        vp9_intra8x8_predict(b, mode, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-      } else {
-        continue; // i.e. disable for now
-        vp9_comp_intra8x8_predict(b, mode, mode2, b->predictor);
-      }
-#endif
-
-      vp9_subtract_4b_c(be, b, 16);
-
-      if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-        TX_TYPE tx_type = get_tx_type_8x8(xd, b);
-        if (tx_type != DCT_DCT)
-          vp9_fht(be->src_diff, 32, (x->block + idx)->coeff, tx_type, 8);
-        else
-          x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
-        x->quantize_b_8x8(x->block + idx, xd->block + idx);
-
-        // compute quantization mse of 8x8 block
-        distortion = vp9_block_error_c((x->block + idx)->coeff,
-                                       (xd->block + idx)->dqcoeff, 64);
-        ta0 = a[vp9_block2above_8x8[idx]];
-        tl0 = l[vp9_block2left_8x8[idx]];
-
-        rate_t = cost_coeffs(x, xd->block + idx, PLANE_TYPE_Y_WITH_DC,
-                             &ta0, &tl0, TX_8X8);
-
-        rate += rate_t;
-        ta1 = ta0;
-        tl1 = tl0;
-      } else {
-        x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
-        x->vp9_short_fdct8x4((be + 4)->src_diff, (be + 4)->coeff, 32);
-
-        x->quantize_b_4x4_pair(x->block + ib, x->block + ib + 1,
-                               xd->block + ib, xd->block + ib + 1);
-        x->quantize_b_4x4_pair(x->block + ib + 4, x->block + ib + 5,
-                               xd->block + ib + 4, xd->block + ib + 5);
-
-        distortion = vp9_block_error_c((x->block + ib)->coeff,
-                                       (xd->block + ib)->dqcoeff, 16);
-        distortion += vp9_block_error_c((x->block + ib + 1)->coeff,
-                                        (xd->block + ib + 1)->dqcoeff, 16);
-        distortion += vp9_block_error_c((x->block + ib + 4)->coeff,
-                                        (xd->block + ib + 4)->dqcoeff, 16);
-        distortion += vp9_block_error_c((x->block + ib + 5)->coeff,
-                                        (xd->block + ib + 5)->dqcoeff, 16);
-
-        ta0 = a[vp9_block2above[ib]];
-        ta1 = a[vp9_block2above[ib + 1]];
-        tl0 = l[vp9_block2left[ib]];
-        tl1 = l[vp9_block2left[ib + 4]];
-        rate_t = cost_coeffs(x, xd->block + ib, PLANE_TYPE_Y_WITH_DC,
-                             &ta0, &tl0, TX_4X4);
-        rate_t += cost_coeffs(x, xd->block + ib + 1, PLANE_TYPE_Y_WITH_DC,
-                              &ta1, &tl0, TX_4X4);
-        rate_t += cost_coeffs(x, xd->block + ib + 4, PLANE_TYPE_Y_WITH_DC,
-                              &ta0, &tl1, TX_4X4);
-        rate_t += cost_coeffs(x, xd->block + ib + 5, PLANE_TYPE_Y_WITH_DC,
-                              &ta1, &tl1, TX_4X4);
-        rate += rate_t;
-      }
-
-      distortion >>= 2;
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-      if (this_rd < best_rd) {
-        *bestrate = rate;
-        *bestratey = rate_t;
-        *bestdistortion = distortion;
-        besta0 = ta0;
-        besta1 = ta1;
-        bestl0 = tl0;
-        bestl1 = tl1;
-        best_rd = this_rd;
-        *best_mode = mode;
-#if CONFIG_COMP_INTRA_PRED
-        *best_second_mode = mode2;
-#endif
-        copy_predictor_8x8(best_predictor, b->predictor);
-        vpx_memcpy(best_dqcoeff, b->dqcoeff, 64);
-        vpx_memcpy(best_dqcoeff + 32, b->dqcoeff + 64, 64);
-#if CONFIG_COMP_INTRA_PRED
-      }
-#endif
-    }
-  }
-  b->bmi.as_mode.first = (*best_mode);
-#if CONFIG_COMP_INTRA_PRED
-  b->bmi.as_mode.second = (*best_second_mode);
-#endif
-  vp9_encode_intra8x8(IF_RTCD(&cpi->rtcd), x, ib);
-
-  if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-    a[vp9_block2above_8x8[idx]]     = besta0;
-    a[vp9_block2above_8x8[idx] + 1] = besta1;
-    l[vp9_block2left_8x8[idx]]      = bestl0;
-    l[vp9_block2left_8x8[idx] + 1]  = bestl1;
-  } else {
-    a[vp9_block2above[ib]]     = besta0;
-    a[vp9_block2above[ib + 1]] = besta1;
-    l[vp9_block2left[ib]]      = bestl0;
-    l[vp9_block2left[ib + 4]]  = bestl1;
-  }
-
-  return best_rd;
-}
-
-static int64_t rd_pick_intra8x8mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
-                                         int *Rate, int *rate_y,
-                                         int *Distortion, int64_t best_rd) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  int i, ib;
-  int cost = mb->mbmode_cost[xd->frame_type][I8X8_PRED];
-  int distortion = 0;
-  int tot_rate_y = 0;
-  int64_t total_rd = 0;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-  int *i8x8mode_costs;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  xd->mode_info_context->mbmi.mode = I8X8_PRED;
-  i8x8mode_costs  = mb->i8x8_mode_costs;
-
-  for (i = 0; i < 4; i++) {
-    MODE_INFO *const mic = xd->mode_info_context;
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-#if CONFIG_COMP_INTRA_PRED
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_second_mode);
-#endif
-    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry),
-        UNINITIALIZED_IS_SAFE(d);
-
-    ib = vp9_i8x8_block[i];
-    total_rd += rd_pick_intra8x8block(
-                  cpi, mb, ib, &best_mode,
-#if CONFIG_COMP_INTRA_PRED
-                  & best_second_mode,
-#endif
-                  i8x8mode_costs, ta, tl, &r, &ry, &d);
-    cost += r;
-    distortion += d;
-    tot_rate_y += ry;
-    mic->bmi[ib].as_mode.first = best_mode;
-#if CONFIG_COMP_INTRA_PRED
-    mic->bmi[ib].as_mode.second = best_second_mode;
-#endif
-  }
-  *Rate = cost;
-  *rate_y += tot_rate_y;
-  *Distortion = distortion;
-  return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
-}
-
-static int rd_cost_mbuv(MACROBLOCK *mb) {
-  int b;
-  int cost = 0;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
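-  // Blocks 16-19 carry the U plane and blocks 20-23 the V plane in the
-  // macroblock's block layout.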
-  for (b = 16; b < 24; b++)
-    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
-                        ta + vp9_block2above[b], tl + vp9_block2left[b],
-                        TX_4X4);
-
-  return cost;
-}
-
-
-static int64_t rd_inter16x16_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                int *distortion, int fullpixel, int *skip) {
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-
-  vp9_transform_mbuv_4x4(x);
-  vp9_quantize_mbuv_4x4(x);
-
-  *rate       = rd_cost_mbuv(x);
-  *distortion = vp9_mbuverror(x) / 4;
-  *skip       = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-static int rd_cost_mbuv_8x8(MACROBLOCK *mb, int backup) {
-  int b;
-  int cost = 0;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
-    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
-  }
-
-  for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
-                        ta + vp9_block2above_8x8[b],
-                        tl + vp9_block2left_8x8[b], TX_8X8);
-
-  return cost;
-}
-
-#if CONFIG_SUPERBLOCKS
-static int64_t rd_inter32x32_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                int *distortion, int fullpixel, int *skip) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  int n, r = 0, d = 0;
-  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  int skippable = 1;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
-  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
-
-  memcpy(t_above, xd->above_context, sizeof(t_above));
-  memcpy(t_left, xd->left_context, sizeof(t_left));
-
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
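-    /* n = 0..3 walks the 8x8 quadrants (0,0), (1,0), (0,1), (1,1) */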
-
-    vp9_subtract_mbuv_s_c(x->src_diff,
-                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          src_uv_stride,
-                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          dst_uv_stride);
-
-    vp9_transform_mbuv_8x8(x);
-    vp9_quantize_mbuv_8x8(x);
-
-    xd->above_context = ta + x_idx;
-    xd->left_context = tl + y_idx;
-    r += rd_cost_mbuv_8x8(x, 0);
-    d += vp9_mbuverror(x) / 4;
-    skippable = skippable && vp9_mbuv_is_skippable_8x8(xd);
-  }
-
-  *rate = r;
-  *distortion = d;
-  if (skip) *skip = skippable;
-  xd->left_context = tl;
-  xd->above_context = ta;
-  memcpy(xd->above_context, t_above, sizeof(t_above));
-  memcpy(xd->left_context, t_left, sizeof(t_left));
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-#endif
-
-static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                    int *distortion, int fullpixel, int *skip) {
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-
-  vp9_transform_mbuv_8x8(x);
-  vp9_quantize_mbuv_8x8(x);
-
-  *rate       = rd_cost_mbuv_8x8(x, 1);
-  *distortion = vp9_mbuverror(x) / 4;
-  *skip       = vp9_mbuv_is_skippable_8x8(&x->e_mbd);
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-
-static int64_t rd_inter4x4_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                              int *distortion, int *skippable, int fullpixel) {
-  vp9_build_inter4x4_predictors_mbuv(&x->e_mbd);
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-
-  vp9_transform_mbuv_4x4(x);
-  vp9_quantize_mbuv_4x4(x);
-
-  *rate       = rd_cost_mbuv(x);
-  *distortion = vp9_mbuverror(x) / 4;
-  *skippable  = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,
-                                    MACROBLOCK *x,
-                                    int *rate,
-                                    int *rate_tokenonly,
-                                    int *distortion,
-                                    int *skippable) {
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-#if CONFIG_COMP_INTRA_PRED
-  MB_PREDICTION_MODE mode2;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode2_selected);
-#endif
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int64_t best_rd = INT64_MAX;
-  int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
-  int rate_to, UNINITIALIZED_IS_SAFE(skip);
-
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-#if CONFIG_COMP_INTRA_PRED
-    for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
-#endif
-      int rate;
-      int distortion;
-      int64_t this_rd;
-
-      mbmi->uv_mode = mode;
-#if CONFIG_COMP_INTRA_PRED
-      mbmi->second_uv_mode = mode2;
-      if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
-        vp9_build_intra_predictors_mbuv(&x->e_mbd);
-#if CONFIG_COMP_INTRA_PRED
-      } else {
-        continue;
-        vp9_build_comp_intra_predictors_mbuv(&x->e_mbd);
-      }
-#endif
-
-      vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                        x->e_mbd.predictor, x->src.uv_stride);
-      vp9_transform_mbuv_4x4(x);
-      vp9_quantize_mbuv_4x4(x);
-
-      rate_to = rd_cost_mbuv(x);
-      rate = rate_to +
-             x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
-
-      distortion = vp9_mbuverror(x) / 4;
-
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
-      if (this_rd < best_rd) {
-        skip = vp9_mbuv_is_skippable_4x4(xd);
-        best_rd = this_rd;
-        d = distortion;
-        r = rate;
-        *rate_tokenonly = rate_to;
-        mode_selected = mode;
-#if CONFIG_COMP_INTRA_PRED
-        mode2_selected = mode2;
-      }
-#endif
-    }
-  }
-
-  *rate = r;
-  *distortion = d;
-  *skippable = skip;
-
-  mbmi->uv_mode = mode_selected;
-#if CONFIG_COMP_INTRA_PRED
-  mbmi->second_uv_mode = mode2_selected;
-#endif
-}
-
-static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi,
-                                        MACROBLOCK *x,
-                                        int *rate,
-                                        int *rate_tokenonly,
-                                        int *distortion,
-                                        int *skippable) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int64_t best_rd = INT64_MAX;
-  int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
-  int rate_to, UNINITIALIZED_IS_SAFE(skip);
-
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int rate;
-    int distortion;
-    int64_t this_rd;
-
-    mbmi->uv_mode = mode;
-    vp9_build_intra_predictors_mbuv(&x->e_mbd);
-    vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                      x->e_mbd.predictor, x->src.uv_stride);
-    vp9_transform_mbuv_8x8(x);
-
-    vp9_quantize_mbuv_8x8(x);
-
-    rate_to = rd_cost_mbuv_8x8(x, 1);
-    rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
-
-    distortion = vp9_mbuverror(x) / 4;
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
-    if (this_rd < best_rd) {
-      skip = vp9_mbuv_is_skippable_8x8(xd);
-      best_rd = this_rd;
-      d = distortion;
-      r = rate;
-      *rate_tokenonly = rate_to;
-      mode_selected = mode;
-    }
-  }
-  *rate = r;
-  *distortion = d;
-  *skippable = skip;
-  mbmi->uv_mode = mode_selected;
-}
-
-#if CONFIG_SUPERBLOCKS
-static void super_block_uvrd_8x8(MACROBLOCK *x,
-                                 int *rate,
-                                 int *distortion,
-                                 const VP9_ENCODER_RTCD *rtcd,
-                                 int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int d = 0, r = 0, n, s = 1;
-  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
-  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
-
-  memcpy(t_above, xd->above_context, sizeof(t_above));
-  memcpy(t_left,  xd->left_context,  sizeof(t_left));
-
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
-
-    vp9_subtract_mbuv_s_c(x->src_diff,
-                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          src_uv_stride,
-                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          dst_uv_stride);
-    vp9_transform_mbuv_8x8(x);
-    vp9_quantize_mbuv_8x8(x);
-    s &= vp9_mbuv_is_skippable_8x8(xd);
-
-    d += vp9_mbuverror(x) >> 2;
-    xd->above_context = ta + x_idx;
-    xd->left_context = tl + y_idx;
-    r += rd_cost_mbuv_8x8(x, 0);
-  }
-
-  xd->above_context = ta;
-  xd->left_context = tl;
-  memcpy(xd->above_context, t_above, sizeof(t_above));
-  memcpy(xd->left_context,  t_left,  sizeof(t_left));
-
-  *distortion = d;
-  *rate       = r;
-  *skippable  = s;
-}
-
-static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
-                                       MACROBLOCK *x,
-                                       int *rate,
-                                       int *rate_tokenonly,
-                                       int *distortion,
-                                       int *skippable) {
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  int64_t best_rd = INT64_MAX, this_rd;
-  int this_rate_tokenonly, this_rate;
-  int this_distortion, s;
-
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
-    vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
-
-    super_block_uvrd_8x8(x, &this_rate_tokenonly,
-                         &this_distortion, IF_RTCD(&cpi->rtcd), &s);
-    this_rate = this_rate_tokenonly +
-                x->intra_uv_mode_cost[x->e_mbd.frame_type]
-                                     [x->e_mbd.mode_info_context->mbmi.uv_mode];
-    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-
-    if (this_rd < best_rd) {
-      mode_selected   = mode;
-      best_rd         = this_rd;
-      *rate           = this_rate;
-      *rate_tokenonly = this_rate_tokenonly;
-      *distortion     = this_distortion;
-      *skippable      = s;
-    }
-  }
-
-  x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
-
-  return best_rd;
-}
-#endif
-
-int vp9_cost_mv_ref(VP9_COMP *cpi,
-                    MB_PREDICTION_MODE m,
-                    const int near_mv_ref_ct[4]) {
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-
-  // If the mode coding is done entirely at the segment level
-  // we should not account for it at the per mb level in rd code.
-  // Note that if the segment level coding is expanded from single mode
-  // to multiple mode masks as per reference frame coding we will need
-  // to do something different here.
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-    VP9_COMMON *pc = &cpi->common;
-
-    vp9_prob p[VP9_MVREFS - 1];
-    assert(NEARESTMV <= m && m <= SPLITMV);
-    vp9_mv_ref_probs(pc, p, near_mv_ref_ct);
-    return cost_token(vp9_mv_ref_tree, p,
-                      vp9_mv_ref_encoding_array - NEARESTMV + m);
-  } else
-    return 0;
-}
-
-void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
-  x->e_mbd.mode_info_context->mbmi.mode = mb;
-  x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int;
-}
-
-static int labels2mode(
-  MACROBLOCK *x,
-  int const *labelings, int which_label,
-  B_PREDICTION_MODE this_mode,
-  int_mv *this_mv, int_mv *this_second_mv,
-  int_mv seg_mvs[MAX_REF_FRAMES - 1],
-  int_mv *best_ref_mv,
-  int_mv *second_best_ref_mv,
-  DEC_MVCOSTS) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *const mic = xd->mode_info_context;
-  MB_MODE_INFO * mbmi = &mic->mbmi;
-  const int mis = xd->mode_info_stride;
-
-  int i, cost = 0, thismvcost = 0;
-
-  /* We have to be careful retrieving previously-encoded motion vectors.
-     Ones from this macroblock have to be pulled from the BLOCKD array
-     as they have not yet made it to the bmi array in our MB_MODE_INFO. */
-  for (i = 0; i < 16; ++i) {
-    BLOCKD *const d = xd->block + i;
-    const int row = i >> 2,  col = i & 3;
-
-    B_PREDICTION_MODE m;
-
-    if (labelings[i] != which_label)
-      continue;
-
-    if (col && labelings[i] == labelings[i - 1])
-      m = LEFT4X4;
-    else if (row && labelings[i] == labelings[i - 4])
-      m = ABOVE4X4;
-    else {
-      // The only time we should do costing for a new motion vector or
-      // mode is when we are on a new label (jbb May 08, 2007).
-      switch (m = this_mode) {
-        case NEW4X4 :
-          if (mbmi->second_ref_frame) {
-            this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
-            this_second_mv->as_int =
-              seg_mvs[mbmi->second_ref_frame - 1].as_int;
-          }
-
-          thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, MVCOSTS,
-                                        102, xd->allow_high_precision_mv);
-          if (mbmi->second_ref_frame) {
-            thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
-                                          MVCOSTS, 102,
-                                          xd->allow_high_precision_mv);
-          }
-          break;
-        case LEFT4X4:
-          this_mv->as_int = col ? d[-1].bmi.as_mv.first.as_int
-                                : left_block_mv(mic, i);
-          if (mbmi->second_ref_frame)
-            this_second_mv->as_int = col ? d[-1].bmi.as_mv.second.as_int
-                                         : left_block_second_mv(mic, i);
-          break;
-        case ABOVE4X4:
-          this_mv->as_int = row ? d[-4].bmi.as_mv.first.as_int
-                                : above_block_mv(mic, i, mis);
-          if (mbmi->second_ref_frame)
-            this_second_mv->as_int = row ? d[-4].bmi.as_mv.second.as_int
-                                         : above_block_second_mv(mic, i, mis);
-          break;
-        case ZERO4X4:
-          this_mv->as_int = 0;
-          if (mbmi->second_ref_frame)
-            this_second_mv->as_int = 0;
-          break;
-        default:
-          break;
-      }
-
-      if (m == ABOVE4X4) { // replace above with left if same
-        int_mv left_mv, left_second_mv;
-
-        left_second_mv.as_int = 0;
-        left_mv.as_int = col ? d[-1].bmi.as_mv.first.as_int :
-                         left_block_mv(mic, i);
-        if (mbmi->second_ref_frame)
-          left_second_mv.as_int = col ? d[-1].bmi.as_mv.second.as_int :
-                                  left_block_second_mv(mic, i);
-
-        if (left_mv.as_int == this_mv->as_int &&
-            (!mbmi->second_ref_frame ||
-             left_second_mv.as_int == this_second_mv->as_int))
-          m = LEFT4X4;
-      }
-
-      cost = x->inter_bmode_costs[m];
-    }
-
-    d->bmi.as_mv.first.as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame)
-      d->bmi.as_mv.second.as_int = this_second_mv->as_int;
-
-    x->partition_info->bmi[i].mode = m;
-    x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame)
-      x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
-  }
-
-  cost += thismvcost;
-  return cost;
-}
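-
-/* Illustrative sketch: a labeling for PARTITIONING_16X8, assuming the
- * vp9_mbsplits convention of one label per 4x4 block in raster order.
- * labels2mode() above walks all 16 entries and processes only those
- * whose label matches which_label. */
-static const int sketch_labels_16x8[16] = {
-  0, 0, 0, 0,
-  0, 0, 0, 0,   /* top 16x8 half:    label 0 */
-  1, 1, 1, 1,
-  1, 1, 1, 1,   /* bottom 16x8 half: label 1 */
-};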
-
-static int64_t encode_inter_mb_segment(MACROBLOCK *x,
-                                       int const *labels,
-                                       int which_label,
-                                       int *labelyrate,
-                                       int *distortion,
-                                       ENTROPY_CONTEXT *ta,
-                                       ENTROPY_CONTEXT *tl,
-                                       const VP9_ENCODER_RTCD *rtcd) {
-  int i;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  *labelyrate = 0;
-  *distortion = 0;
-  for (i = 0; i < 16; i++) {
-    if (labels[i] == which_label) {
-      BLOCKD *bd = &x->e_mbd.block[i];
-      BLOCK *be = &x->block[i];
-      int thisdistortion;
-
-      vp9_build_inter_predictors_b(bd, 16, xd->subpixel_predict);
-      if (xd->mode_info_context->mbmi.second_ref_frame)
-        vp9_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg);
-      vp9_subtract_b(be, bd, 16);
-      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-      x->quantize_b_4x4(be, bd);
-      thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16);
-      *distortion += thisdistortion;
-      *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
-                                 ta + vp9_block2above[i],
-                                 tl + vp9_block2left[i], TX_4X4);
-    }
-  }
-  *distortion >>= 2;
-  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
-}
-
-static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
-                                           int const *labels,
-                                           int which_label,
-                                           int *labelyrate,
-                                           int *distortion,
-                                           int64_t *otherrd,
-                                           ENTROPY_CONTEXT *ta,
-                                           ENTROPY_CONTEXT *tl,
-                                           const VP9_ENCODER_RTCD *rtcd) {
-  int i, j;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const int iblock[4] = { 0, 1, 4, 5 };
-  int othercost = 0, otherdist = 0;
-  ENTROPY_CONTEXT_PLANES tac, tlc;
-  ENTROPY_CONTEXT *tacp = (ENTROPY_CONTEXT *) &tac,
-                  *tlcp = (ENTROPY_CONTEXT *) &tlc;
-
-  if (otherrd) {
-    memcpy(&tac, ta, sizeof(ENTROPY_CONTEXT_PLANES));
-    memcpy(&tlc, tl, sizeof(ENTROPY_CONTEXT_PLANES));
-  }
-
-  *distortion = 0;
-  *labelyrate = 0;
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-
-    if (labels[ib] == which_label) {
-      int idx = (ib & 8) + ((ib & 2) << 1);
-      BLOCKD *bd = &xd->block[ib], *bd2 = &xd->block[idx];
-      BLOCK *be = &x->block[ib], *be2 = &x->block[idx];
-      int thisdistortion;
-
-      vp9_build_inter_predictors4b(xd, bd, 16);
-      if (xd->mode_info_context->mbmi.second_ref_frame)
-        vp9_build_2nd_inter_predictors4b(xd, bd, 16);
-      vp9_subtract_4b_c(be, bd, 16);
-
-      if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
-        if (otherrd) {
-          x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);
-          x->quantize_b_8x8(be2, bd2);
-          thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
-          otherdist += thisdistortion;
-          othercost += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
-                                     tacp + vp9_block2above_8x8[idx],
-                                     tlcp + vp9_block2left_8x8[idx], TX_8X8);
-        }
-        for (j = 0; j < 4; j += 2) {
-          bd = &xd->block[ib + iblock[j]];
-          be = &x->block[ib + iblock[j]];
-          x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
-          x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1);
-          thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
-          *distortion += thisdistortion;
-          *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
-                                     ta + vp9_block2above[ib + iblock[j]],
-                                     tl + vp9_block2left[ib + iblock[j]],
-                                     TX_4X4);
-          *labelyrate += cost_coeffs(x, bd + 1, PLANE_TYPE_Y_WITH_DC,
-                                     ta + vp9_block2above[ib + iblock[j] + 1],
-                                     tl + vp9_block2left[ib + iblock[j]],
-                                     TX_4X4);
-        }
-      } else /* 8x8 */ {
-        if (otherrd) {
-          for (j = 0; j < 4; j += 2) {
-            BLOCKD *bd3 = &xd->block[ib + iblock[j]];
-            BLOCK *be3 = &x->block[ib + iblock[j]];
-            x->vp9_short_fdct8x4(be3->src_diff, be3->coeff, 32);
-            x->quantize_b_4x4_pair(be3, be3 + 1, bd3, bd3 + 1);
-            thisdistortion = vp9_block_error_c(be3->coeff, bd3->dqcoeff, 32);
-            otherdist += thisdistortion;
-            othercost += cost_coeffs(x, bd3, PLANE_TYPE_Y_WITH_DC,
-                                     tacp + vp9_block2above[ib + iblock[j]],
-                                     tlcp + vp9_block2left[ib + iblock[j]],
-                                     TX_4X4);
-            othercost += cost_coeffs(x, bd3 + 1, PLANE_TYPE_Y_WITH_DC,
-                                     tacp + vp9_block2above[ib + iblock[j] + 1],
-                                     tlcp + vp9_block2left[ib + iblock[j]],
-                                     TX_4X4);
-          }
-        }
-        x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);
-        x->quantize_b_8x8(be2, bd2);
-        thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
-        *distortion += thisdistortion;
-        *labelyrate += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
-                                   ta + vp9_block2above_8x8[idx],
-                                   tl + vp9_block2left_8x8[idx], TX_8X8);
-      }
-    }
-  }
-  *distortion >>= 2;
-  if (otherrd) {
-    otherdist >>= 2;
-    *otherrd = RDCOST(x->rdmult, x->rddiv, othercost, otherdist);
-  }
-  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
-}
-
-static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
-
-
-typedef struct {
-  int_mv *ref_mv, *second_ref_mv;
-  int_mv mvp;
-
-  int64_t segment_rd;
-  SPLITMV_PARTITIONING_TYPE segment_num;
-  TX_SIZE txfm_size;
-  int r;
-  int d;
-  int segment_yrate;
-  B_PREDICTION_MODE modes[16];
-  int_mv mvs[16], second_mvs[16];
-  int eobs[16];
-
-  int mvthresh;
-  int *mdcounts;
-
-  int_mv sv_mvp[4];  // save 4 mvp from 8x8
-  int sv_istep[2];   // save 2 initial step_param for 16x8/8x16
-
-} BEST_SEG_INFO;
-
-static __inline
-int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
-  int r = 0;
-  r |= (mv->as_mv.row >> 3) < x->mv_row_min;
-  r |= (mv->as_mv.row >> 3) > x->mv_row_max;
-  r |= (mv->as_mv.col >> 3) < x->mv_col_min;
-  r |= (mv->as_mv.col >> 3) > x->mv_col_max;
-  return r;
-}
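-
-/* MVs here are stored in 1/8-pel units, so the >> 3 in mv_check_bounds()
- * converts to whole pixels before comparing against the search window;
- * for example, as_mv.row = -520 corresponds to pixel row -65. */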
-
-static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
-                                    BEST_SEG_INFO *bsi,
-                                    SPLITMV_PARTITIONING_TYPE segmentation,
-                                    TX_SIZE tx_size, int64_t *otherrds,
-                                    int64_t *rds, int *completed,
-                                    /* 16 = n_blocks */
-                                    int_mv seg_mvs[16 /* n_blocks */]
-                                                  [MAX_REF_FRAMES - 1]) {
-  int i, j;
-  int const *labels;
-  int br = 0, bd = 0;
-  B_PREDICTION_MODE this_mode;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-
-  int label_count;
-  int64_t this_segment_rd = 0, other_segment_rd;
-  int label_mv_thresh;
-  int rate = 0;
-  int sbr = 0, sbd = 0;
-  int segmentyrate = 0;
-  int best_eobs[16] = { 0 };
-
-  vp9_variance_fn_ptr_t *v_fn_ptr;
-
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-  ENTROPY_CONTEXT_PLANES t_above_b, t_left_b;
-  ENTROPY_CONTEXT *ta_b, *tl_b;
-
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-  ta_b = (ENTROPY_CONTEXT *)&t_above_b;
-  tl_b = (ENTROPY_CONTEXT *)&t_left_b;
-
-  v_fn_ptr = &cpi->fn_ptr[segmentation];
-  labels = vp9_mbsplits[segmentation];
-  label_count = vp9_mbsplit_count[segmentation];
-
-  // A multiplier of 64 would make this threshold so large that MVs on
-  // segments are almost never checked; the multiplier of 1 used here
-  // makes the segment MV threshold roughly equal to the macroblock one.
-  label_mv_thresh = 1 * bsi->mvthresh / label_count;
-
-  // Segmentation method overheads
-  rate = cost_token(vp9_mbsplit_tree, vp9_mbsplit_probs,
-                    vp9_mbsplit_encodings + segmentation);
-  rate += vp9_cost_mv_ref(cpi, SPLITMV, bsi->mdcounts);
-  this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
-  br += rate;
-  other_segment_rd = this_segment_rd;
-
-  mbmi->txfm_size = tx_size;
-  for (i = 0; i < label_count && this_segment_rd < bsi->segment_rd; i++) {
-    int_mv mode_mv[B_MODE_COUNT], second_mode_mv[B_MODE_COUNT];
-    int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
-    B_PREDICTION_MODE mode_selected = ZERO4X4;
-    int bestlabelyrate = 0;
-
-    // search for the best motion vector on this segment
-    for (this_mode = LEFT4X4; this_mode <= NEW4X4; this_mode++) {
-      int64_t this_rd, other_rd;
-      int distortion;
-      int labelyrate;
-      ENTROPY_CONTEXT_PLANES t_above_s, t_left_s;
-      ENTROPY_CONTEXT *ta_s;
-      ENTROPY_CONTEXT *tl_s;
-
-      vpx_memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
-      vpx_memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
-
-      ta_s = (ENTROPY_CONTEXT *)&t_above_s;
-      tl_s = (ENTROPY_CONTEXT *)&t_left_s;
-
-      // motion search for newmv (single predictor case only)
-      if (!mbmi->second_ref_frame && this_mode == NEW4X4) {
-        int sseshift, n;
-        int step_param = 0;
-        int further_steps;
-        int thissme, bestsme = INT_MAX;
-        BLOCK *c;
-        BLOCKD *e;
-
-        /* Is the best so far sufficiently good that we can't justify
-         * doing a new motion search? */
-        if (best_label_rd < label_mv_thresh)
-          break;
-
-        if (cpi->compressor_speed) {
-          if (segmentation == PARTITIONING_8X16 ||
-              segmentation == PARTITIONING_16X8) {
-            bsi->mvp.as_int = bsi->sv_mvp[i].as_int;
-            if (i == 1 && segmentation == PARTITIONING_16X8)
-              bsi->mvp.as_int = bsi->sv_mvp[2].as_int;
-
-            step_param = bsi->sv_istep[i];
-          }
-
-          // use previous block's result as next block's MV predictor.
-          if (segmentation == PARTITIONING_4X4 && i > 0) {
-            bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv.first.as_int;
-            if (i == 4 || i == 8 || i == 12)
-              bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv.first.as_int;
-            step_param = 2;
-          }
-        }
-
-        further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
-
-        {
-          int sadpb = x->sadperbit4;
-          int_mv mvp_full;
-
-          mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
-          mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
-
-          // find first label
-          n = vp9_mbsplit_offset[segmentation][i];
-
-          c = &x->block[n];
-          e = &x->e_mbd.block[n];
-
-          bestsme = vp9_full_pixel_diamond(cpi, x, c, e, &mvp_full, step_param,
-                                           sadpb, further_steps, 0, v_fn_ptr,
-                                           bsi->ref_mv, &mode_mv[NEW4X4]);
-
-          sseshift = segmentation_to_sseshift[segmentation];
-
-          // Should we do a full search (best quality only)
-          if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
-            /* Check if mvp_full is within the range. */
-            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
-                     x->mv_row_min, x->mv_row_max);
-
-            thissme = cpi->full_search_sad(x, c, e, &mvp_full,
-                                           sadpb, 16, v_fn_ptr,
-                                           XMVCOST, bsi->ref_mv);
-
-            if (thissme < bestsme) {
-              bestsme = thissme;
-              mode_mv[NEW4X4].as_int = e->bmi.as_mv.first.as_int;
-            } else {
-              /* The full search result is actually worse so re-instate the
-               * previous best vector */
-              e->bmi.as_mv.first.as_int = mode_mv[NEW4X4].as_int;
-            }
-          }
-        }
-
-        if (bestsme < INT_MAX) {
-          int distortion;
-          unsigned int sse;
-          cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
-                                       bsi->ref_mv, x->errorperbit, v_fn_ptr,
-                                       XMVCOST, &distortion, &sse);
-
-          // save the motion search result for use in compound prediction
-          seg_mvs[i][mbmi->ref_frame - 1].as_int = mode_mv[NEW4X4].as_int;
-        }
-      } /* NEW4X4 */
-      else if (mbmi->second_ref_frame && this_mode == NEW4X4) {
-        /* motion search not completed? Then skip newmv for this block with
-         * comppred */
-        if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
-            seg_mvs[i][mbmi->ref_frame        - 1].as_int == INVALID_MV) {
-          continue;
-        }
-      }
-
-      rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
-                         &second_mode_mv[this_mode], seg_mvs[i],
-                         bsi->ref_mv, bsi->second_ref_mv, XMVCOST);
-
-      // Trap vectors that reach beyond the UMV borders
-      if (mv_check_bounds(x, &mode_mv[this_mode]))
-        continue;
-      if (mbmi->second_ref_frame &&
-          mv_check_bounds(x, &second_mode_mv[this_mode]))
-        continue;
-
-      if (segmentation == PARTITIONING_4X4) {
-        this_rd = encode_inter_mb_segment(x, labels, i, &labelyrate,
-                                          &distortion,
-                                          ta_s, tl_s, IF_RTCD(&cpi->rtcd));
-        other_rd = this_rd;
-      } else {
-        this_rd = encode_inter_mb_segment_8x8(x, labels, i, &labelyrate,
-                                              &distortion, &other_rd,
-                                              ta_s, tl_s, IF_RTCD(&cpi->rtcd));
-      }
-      this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
-      rate += labelyrate;
-
-      if (this_rd < best_label_rd) {
-        sbr = rate;
-        sbd = distortion;
-        bestlabelyrate = labelyrate;
-        mode_selected = this_mode;
-        best_label_rd = this_rd;
-        if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_4X4) {
-          for (j = 0; j < 16; j++)
-            if (labels[j] == i)
-              best_eobs[j] = x->e_mbd.block[j].eob;
-        } else {
-          for (j = 0; j < 4; j++) {
-            int ib = vp9_i8x8_block[j], idx = j * 4;
-
-            if (labels[ib] == i)
-              best_eobs[idx] = x->e_mbd.block[idx].eob;
-          }
-        }
-        if (other_rd < best_other_rd)
-          best_other_rd = other_rd;
-
-        vpx_memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
-        vpx_memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
-
-      }
-    } /*for each 4x4 mode*/
-
-    vpx_memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
-
-    labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
-                &second_mode_mv[mode_selected], seg_mvs[i],
-                bsi->ref_mv, bsi->second_ref_mv, XMVCOST);
-
-    br += sbr;
-    bd += sbd;
-    segmentyrate += bestlabelyrate;
-    this_segment_rd += best_label_rd;
-    other_segment_rd += best_other_rd;
-    if (rds)
-      rds[i] = this_segment_rd;
-    if (otherrds)
-      otherrds[i] = other_segment_rd;
-  } /* for each label */
-
-  if (this_segment_rd < bsi->segment_rd) {
-    bsi->r = br;
-    bsi->d = bd;
-    bsi->segment_yrate = segmentyrate;
-    bsi->segment_rd = this_segment_rd;
-    bsi->segment_num = segmentation;
-    bsi->txfm_size = mbmi->txfm_size;
-
-    // Store everything needed to restore this state later.
-    for (i = 0; i < 16; i++) {
-      bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
-      if (mbmi->second_ref_frame)
-        bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
-      bsi->modes[i] = x->partition_info->bmi[i].mode;
-      bsi->eobs[i] = best_eobs[i];
-    }
-  }
-
-  if (completed) {
-    *completed = i;
-  }
-}
-
-static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x,
-                             BEST_SEG_INFO *bsi,
-                             unsigned int segmentation,
-                             /* 16 = n_blocks */
-                             int_mv seg_mvs[16][MAX_REF_FRAMES - 1],
-                             int64_t txfm_cache[NB_TXFM_MODES]) {
-  int i, n, c = vp9_mbsplit_count[segmentation];
-
-  if (segmentation == PARTITIONING_4X4) {
-    int64_t rd[16];
-
-    rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, NULL,
-                            rd, &n, seg_mvs);
-    if (n == c) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        if (rd[c - 1] < txfm_cache[i])
-          txfm_cache[i] = rd[c - 1];
-      }
-    }
-  } else {
-    int64_t diff, base_rd;
-    int cost4x4 = vp9_cost_bit(cpi->common.prob_tx[0], 0);
-    int cost8x8 = vp9_cost_bit(cpi->common.prob_tx[0], 1);
-
-    if (cpi->common.txfm_mode == TX_MODE_SELECT) {
-      int64_t rd4x4[4], rd8x8[4];
-      int n4x4, n8x8, nmin;
-      BEST_SEG_INFO bsi4x4, bsi8x8;
-
-      /* factor in cost of cost4x4/8x8 in decision */
-      vpx_memcpy(&bsi4x4, bsi, sizeof(*bsi));
-      vpx_memcpy(&bsi8x8, bsi, sizeof(*bsi));
-      rd_check_segment_txsize(cpi, x, &bsi4x4, segmentation,
-                              TX_4X4, NULL, rd4x4, &n4x4, seg_mvs);
-      rd_check_segment_txsize(cpi, x, &bsi8x8, segmentation,
-                              TX_8X8, NULL, rd8x8, &n8x8, seg_mvs);
-      if (bsi4x4.segment_num == segmentation) {
-        bsi4x4.segment_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
-        if (bsi4x4.segment_rd < bsi->segment_rd)
-          vpx_memcpy(bsi, &bsi4x4, sizeof(*bsi));
-      }
-      if (bsi8x8.segment_num == segmentation) {
-        bsi8x8.segment_rd += RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
-        if (bsi8x8.segment_rd < bsi->segment_rd)
-          vpx_memcpy(bsi, &bsi8x8, sizeof(*bsi));
-      }
-      n = n4x4 > n8x8 ? n4x4 : n8x8;
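-      // Reduce both runs to a common scale: base_rd estimates the full
-      // 4x4 cost, and diff is the 8x8-minus-4x4 gap measured over the
-      // labels that both runs completed.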
-      if (n == c) {
-        nmin = n4x4 < n8x8 ? n4x4 : n8x8;
-        diff = rd8x8[nmin - 1] - rd4x4[nmin - 1];
-        if (n == n4x4) {
-          base_rd = rd4x4[c - 1];
-        } else {
-          base_rd = rd8x8[c - 1] - diff;
-        }
-      }
-    } else {
-      int64_t rd[4], otherrd[4];
-
-      if (cpi->common.txfm_mode == ONLY_4X4) {
-        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, otherrd,
-                                rd, &n, seg_mvs);
-        if (n == c) {
-          base_rd = rd[c - 1];
-          diff = otherrd[c - 1] - rd[c - 1];
-        }
-      } else /* use 8x8 transform */ {
-        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_8X8, otherrd,
-                                rd, &n, seg_mvs);
-        if (n == c) {
-          diff = rd[c - 1] - otherrd[c - 1];
-          base_rd = otherrd[c - 1];
-        }
-      }
-    }
-
-    if (n == c) {
-      if (base_rd < txfm_cache[ONLY_4X4]) {
-        txfm_cache[ONLY_4X4] = base_rd;
-      }
-      if (base_rd + diff < txfm_cache[ALLOW_8X8]) {
-        txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] = base_rd + diff;
-      }
-      if (diff < 0) {
-        base_rd += diff + RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
-      } else {
-        base_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
-      }
-      if (base_rd < txfm_cache[TX_MODE_SELECT]) {
-        txfm_cache[TX_MODE_SELECT] = base_rd;
-      }
-    }
-  }
-}
-
-static __inline void cal_step_param(int sr, int *sp) {
-  int step = 0;
-
-  if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP;
-  else if (sr < 1) sr = 1;
-
-  while (sr >>= 1)
-    step++;
-
-  *sp = MAX_MVSEARCH_STEPS - 1 - step;
-}
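-
-/* Example: assuming MAX_FIRST_STEP >= 8, sr = 8 gives step = 3 in
- * cal_step_param() and *sp = MAX_MVSEARCH_STEPS - 4, so a larger search
- * range starts the diamond search at a coarser step. */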
-
-static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
-                                       int_mv *best_ref_mv,
-                                       int_mv *second_best_ref_mv,
-                                       int64_t best_rd,
-                                       int *mdcounts,
-                                       int *returntotrate,
-                                       int *returnyrate,
-                                       int *returndistortion,
-                                       int *skippable, int mvthresh,
-                                       int_mv seg_mvs[NB_PARTITIONINGS]
-                                                     [16 /* n_blocks */]
-                                                     [MAX_REF_FRAMES - 1],
-                                       int64_t txfm_cache[NB_TXFM_MODES]) {
-  int i;
-  BEST_SEG_INFO bsi;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-
-  vpx_memset(&bsi, 0, sizeof(bsi));
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    txfm_cache[i] = INT64_MAX;
-
-  bsi.segment_rd = best_rd;
-  bsi.ref_mv = best_ref_mv;
-  bsi.second_ref_mv = second_best_ref_mv;
-  bsi.mvp.as_int = best_ref_mv->as_int;
-  bsi.mvthresh = mvthresh;
-  bsi.mdcounts = mdcounts;
-  bsi.txfm_size = TX_4X4;
-
-  for (i = 0; i < 16; i++)
-    bsi.modes[i] = ZERO4X4;
-
-  if (cpi->compressor_speed == 0) {
-    /* for now, we will keep the original segmentation order
-       when in best quality mode */
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
-                     seg_mvs[PARTITIONING_16X8], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
-                     seg_mvs[PARTITIONING_8X16], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
-                     seg_mvs[PARTITIONING_8X8], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
-                     seg_mvs[PARTITIONING_4X4], txfm_cache);
-  } else {
-    int sr;
-
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
-                     seg_mvs[PARTITIONING_8X8], txfm_cache);
-
-    if (bsi.segment_rd < best_rd) {
-      int tmp_col_min = x->mv_col_min;
-      int tmp_col_max = x->mv_col_max;
-      int tmp_row_min = x->mv_row_min;
-      int tmp_row_max = x->mv_row_max;
-
-      vp9_clamp_mv_min_max(x, best_ref_mv);
-
-      /* Get 8x8 result */
-      bsi.sv_mvp[0].as_int = bsi.mvs[0].as_int;
-      bsi.sv_mvp[1].as_int = bsi.mvs[2].as_int;
-      bsi.sv_mvp[2].as_int = bsi.mvs[8].as_int;
-      bsi.sv_mvp[3].as_int = bsi.mvs[10].as_int;
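-      /* Blocks 0, 2, 8 and 10 are the top-left 4x4 units of the four
-       * 8x8 quadrants, so these four MVs summarize the 8x8 result. */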
-
-      /* Use the 8x8 result as the 16x8/8x16 predictor MV, and adjust the
-       * search range according to how close the two MVs are. */
-      /* block 8X16 */
-      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[0]);
-
-      sr = MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[1]);
-
-      rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
-                       seg_mvs[PARTITIONING_8X16], txfm_cache);
-
-      /* block 16X8 */
-      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[0]);
-
-      sr = MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[1]);
-
-      rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
-                       seg_mvs[PARTITIONING_16X8], txfm_cache);
-
-      /* If 8x8 is better than 16x8/8x16, then do the 4x4 search. Do not
-       * skip 4x4 when speed=0 (good quality). */
-      if (cpi->sf.no_skip_block4x4_search ||
-          bsi.segment_num == PARTITIONING_8X8) {
-        /* || (sv_segment_rd8x8 - bsi.segment_rd) < sv_segment_rd8x8 >> 5 */
-        bsi.mvp.as_int = bsi.sv_mvp[0].as_int;
-        rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
-                         seg_mvs[PARTITIONING_4X4], txfm_cache);
-      }
-
-      /* restore UMV window */
-      x->mv_col_min = tmp_col_min;
-      x->mv_col_max = tmp_col_max;
-      x->mv_row_min = tmp_row_min;
-      x->mv_row_max = tmp_row_max;
-    }
-  }
-
-  /* set it to the best */
-  for (i = 0; i < 16; i++) {
-    BLOCKD *bd = &x->e_mbd.block[i];
-
-    bd->bmi.as_mv.first.as_int = bsi.mvs[i].as_int;
-    if (mbmi->second_ref_frame)
-      bd->bmi.as_mv.second.as_int = bsi.second_mvs[i].as_int;
-    bd->eob = bsi.eobs[i];
-  }
-
-  *returntotrate = bsi.r;
-  *returndistortion = bsi.d;
-  *returnyrate = bsi.segment_yrate;
-  *skippable = bsi.txfm_size == TX_4X4 ?
-                    vp9_mby_is_skippable_4x4(&x->e_mbd, 0) :
-                    vp9_mby_is_skippable_8x8(&x->e_mbd, 0);
-
-  /* save partitions */
-  mbmi->txfm_size = bsi.txfm_size;
-  mbmi->partitioning = bsi.segment_num;
-  x->partition_info->count = vp9_mbsplit_count[bsi.segment_num];
-
-  for (i = 0; i < x->partition_info->count; i++) {
-    int j;
-
-    j = vp9_mbsplit_offset[bsi.segment_num][i];
-
-    x->partition_info->bmi[i].mode = bsi.modes[j];
-    x->partition_info->bmi[i].mv.as_mv = bsi.mvs[j].as_mv;
-    if (mbmi->second_ref_frame)
-      x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[j].as_mv;
-  }
-  /* used to set mbmi->mv.as_int */
-  x->partition_info->bmi[15].mv.as_int = bsi.mvs[15].as_int;
-  if (mbmi->second_ref_frame)
-    x->partition_info->bmi[15].second_mv.as_int = bsi.second_mvs[15].as_int;
-
-  return bsi.segment_rd;
-}
-
-/* Sort arr in increasing order */
-static void insertsortmv(int arr[], int len) {
-  int i, j, k;
-
-  for (i = 1; i <= len - 1; i++) {
-    for (j = 0; j < i; j++) {
-      if (arr[j] > arr[i]) {
-        int temp;
-
-        temp = arr[i];
-
-        for (k = i; k > j; k--)
-          arr[k] = arr[k - 1];
-
-        arr[j] = temp;
-      }
-    }
-  }
-}
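-
-/* Illustrative example (assumed values): insertsortmv({4, 1, 3}, 3)
- * leaves the array as {1, 3, 4}. With at most 8 candidates, the O(len^2)
- * cost of insertion sort is negligible. */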
-
-static void insertsortsad(int arr[], int idx[], int len) {
-  int i, j, k;
-
-  for (i = 1; i <= len - 1; i++) {
-    for (j = 0; j < i; j++) {
-      if (arr[j] > arr[i]) {
-        int temp, tempi;
-
-        temp = arr[i];
-        tempi = idx[i];
-
-        for (k = i; k > j; k--) {
-          arr[k] = arr[k - 1];
-          idx[k] = idx[k - 1];
-        }
-
-        arr[j] = temp;
-        idx[j] = tempi;
-      }
-    }
-  }
-}
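-
-/* Illustrative trace (assumed values): with arr = {7, 3, 5} and
- * idx = {0, 1, 2}, insertsortsad(arr, idx, 3) yields arr = {3, 5, 7} and
- * idx = {1, 2, 0}, i.e. idx[k] records which original slot held the k-th
- * smallest value. */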
-
-// The improved MV prediction
-void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here,
-                 int_mv *mvp, int refframe, int *ref_frame_sign_bias,
-                 int *sr, int near_sadidx[]) {
-  const MODE_INFO *above = here - xd->mode_info_stride;
-  const MODE_INFO *left = here - 1;
-  const MODE_INFO *aboveleft = above - 1;
-  int_mv           near_mvs[8];
-  int              near_ref[8];
-  int_mv           mv;
-  int              vcnt = 0;
-  int              find = 0;
-  int              mb_offset;
-
-  int              mvx[8];
-  int              mvy[8];
-  int              i;
-
-  mv.as_int = 0;
-
-  if (here->mbmi.ref_frame != INTRA_FRAME) {
-    near_mvs[0].as_int = near_mvs[1].as_int = near_mvs[2].as_int =
-        near_mvs[3].as_int = near_mvs[4].as_int = near_mvs[5].as_int =
-        near_mvs[6].as_int = near_mvs[7].as_int = 0;
-    near_ref[0] = near_ref[1] = near_ref[2] = near_ref[3] = near_ref[4] =
-        near_ref[5] = near_ref[6] = near_ref[7] = 0;
-
-    // Read in the MVs of 3 nearby blocks in the current frame as
-    // prediction candidates.
-    if (above->mbmi.ref_frame != INTRA_FRAME) {
-      near_mvs[vcnt].as_int = above->mbmi.mv[0].as_int;
-      mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe,
-              &near_mvs[vcnt], ref_frame_sign_bias);
-      near_ref[vcnt] = above->mbmi.ref_frame;
-    }
-    vcnt++;
-    if (left->mbmi.ref_frame != INTRA_FRAME) {
-      near_mvs[vcnt].as_int = left->mbmi.mv[0].as_int;
-      mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe,
-              &near_mvs[vcnt], ref_frame_sign_bias);
-      near_ref[vcnt] = left->mbmi.ref_frame;
-    }
-    vcnt++;
-    if (aboveleft->mbmi.ref_frame != INTRA_FRAME) {
-      near_mvs[vcnt].as_int = aboveleft->mbmi.mv[0].as_int;
-      mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe,
-              &near_mvs[vcnt], ref_frame_sign_bias);
-      near_ref[vcnt] = aboveleft->mbmi.ref_frame;
-    }
-    vcnt++;
-
-    // Read in the MVs of 5 nearby blocks in the last frame.
-    if (cpi->common.last_frame_type != KEY_FRAME) {
-      mb_offset = (-xd->mb_to_top_edge / 128 + 1) * (xd->mode_info_stride + 1)
-                  + (-xd->mb_to_left_edge / 128 + 1);
-
-      // current in last frame
-      if (cpi->lf_ref_frame[mb_offset] != INTRA_FRAME) {
-        near_mvs[vcnt].as_int = cpi->lfmv[mb_offset].as_int;
-        mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset], refframe,
-                &near_mvs[vcnt], ref_frame_sign_bias);
-        near_ref[vcnt] = cpi->lf_ref_frame[mb_offset];
-      }
-      vcnt++;
-
-      // above in last frame
-      if (cpi->lf_ref_frame[mb_offset - xd->mode_info_stride - 1] !=
-          INTRA_FRAME) {
-        near_mvs[vcnt].as_int =
-            cpi->lfmv[mb_offset - xd->mode_info_stride - 1].as_int;
-        mv_bias(
-            cpi->lf_ref_frame_sign_bias[mb_offset - xd->mode_info_stride - 1],
-            refframe, &near_mvs[vcnt], ref_frame_sign_bias);
-        near_ref[vcnt] =
-            cpi->lf_ref_frame[mb_offset - xd->mode_info_stride - 1];
-      }
-      vcnt++;
-
-      // left in last frame
-      if (cpi->lf_ref_frame[mb_offset - 1] != INTRA_FRAME) {
-        near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - 1].as_int;
-        mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - 1], refframe,
-                &near_mvs[vcnt], ref_frame_sign_bias);
-        near_ref[vcnt] = cpi->lf_ref_frame[mb_offset - 1];
-      }
-      vcnt++;
-
-      // right in last frame
-      if (cpi->lf_ref_frame[mb_offset + 1] != INTRA_FRAME) {
-        near_mvs[vcnt].as_int = cpi->lfmv[mb_offset + 1].as_int;
-        mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset + 1], refframe,
-                &near_mvs[vcnt], ref_frame_sign_bias);
-        near_ref[vcnt] = cpi->lf_ref_frame[mb_offset + 1];
-      }
-      vcnt++;
-
-      // below in last frame
-      if (cpi->lf_ref_frame[mb_offset + xd->mode_info_stride + 1] !=
-          INTRA_FRAME) {
-        near_mvs[vcnt].as_int =
-            cpi->lfmv[mb_offset + xd->mode_info_stride + 1].as_int;
-        mv_bias(
-            cpi->lf_ref_frame_sign_bias[mb_offset + xd->mode_info_stride + 1],
-            refframe, &near_mvs[vcnt], ref_frame_sign_bias);
-        near_ref[vcnt] =
-            cpi->lf_ref_frame[mb_offset + xd->mode_info_stride + 1];
-      }
-      vcnt++;
-    }
-
-    for (i = 0; i < vcnt; i++) {
-      if (near_ref[near_sadidx[i]] != INTRA_FRAME) {
-        if (here->mbmi.ref_frame == near_ref[near_sadidx[i]]) {
-          mv.as_int = near_mvs[near_sadidx[i]].as_int;
-          find = 1;
-          if (i < 3)
-            *sr = 3;
-          else
-            *sr = 2;
-          break;
-        }
-      }
-    }
-
-    if (!find) {
-      for (i = 0; i < vcnt; i++) {
-        mvx[i] = near_mvs[i].as_mv.row;
-        mvy[i] = near_mvs[i].as_mv.col;
-      }
-
-      insertsortmv(mvx, vcnt);
-      insertsortmv(mvy, vcnt);
-      mv.as_mv.row = mvx[vcnt / 2];
-      mv.as_mv.col = mvy[vcnt / 2];
-
-      find = 1;
-      // sr is set to 0 to allow the calling function to decide the
-      // search range.
-      *sr = 0;
-    }
-  }
-
-  /* Set up return values */
-  mvp->as_int = mv.as_int;
-  clamp_mv2(mvp, xd);
-}
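-
-/* Candidate slots used above (filled in the same order as cal_sad()
- * below ranks them): 0-2 are the above/left/above-left neighbors in the
- * current frame; 3-7 are the co-located/above/left/right/below blocks in
- * the last frame. The loop takes the first candidate, in increasing-SAD
- * order, whose reference frame matches; failing that, the component-wise
- * median of all candidates is used and sr is left to the caller. */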
-
-static void cal_sad(VP9_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x,
-                    int recon_yoffset, int near_sadidx[],
-                    enum BlockSize block_size) {
-  /* Neighbor SAD slots: 0-2 = above/left/above-left in the current frame
-   * (cf); 3-7 = co-located/above/left/right/below in the last frame
-   * (lf). */
-  int near_sad[8] = {0};
-  BLOCK *b = &x->block[0];
-  unsigned char *src_y_ptr = *(b->base_src);
-  const unsigned char *dst_y_ptr = xd->dst.y_buffer;
-  const int bs = (block_size == BLOCK_16X16) ? 16 : 32;
-  const int dst_y_str = xd->dst.y_stride;
-
-  // calculate sad for current frame 3 nearby MBs.
-  if (xd->mb_to_top_edge == 0 && xd->mb_to_left_edge == 0) {
-    near_sad[0] = near_sad[1] = near_sad[2] = INT_MAX;
-  } else if (xd->mb_to_top_edge == 0) {
-    // only has left MB for sad calculation.
-    near_sad[0] = near_sad[2] = INT_MAX;
-    near_sad[1] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                              dst_y_ptr - bs,
-                                              dst_y_str, 0x7fffffff);
-  } else if (xd->mb_to_left_edge == 0) {
-    // only has above MB for sad calculation.
-    near_sad[1] = near_sad[2] = INT_MAX;
-    near_sad[0] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                              dst_y_ptr - dst_y_str * bs,
-                                              dst_y_str, 0x7fffffff);
-  } else {
-    near_sad[0] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                              dst_y_ptr - dst_y_str * bs,
-                                              dst_y_str, 0x7fffffff);
-    near_sad[1] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                              dst_y_ptr - bs,
-                                              dst_y_str, 0x7fffffff);
-    near_sad[2] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                              dst_y_ptr - dst_y_str * bs - bs,
-                                              dst_y_str, 0x7fffffff);
-  }
-
-  if (cpi->common.last_frame_type != KEY_FRAME) {
-    // calculate sad for last frame 5 nearby MBs.
-    unsigned char *pre_y_buffer =
-        cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_buffer + recon_yoffset;
-    const int pre_y_str = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_stride;
-
-    if (xd->mb_to_top_edge == 0) near_sad[4] = INT_MAX;
-    if (xd->mb_to_left_edge == 0) near_sad[5] = INT_MAX;
-    if (xd->mb_to_right_edge == 0) near_sad[6] = INT_MAX;
-    if (xd->mb_to_bottom_edge == 0) near_sad[7] = INT_MAX;
-
-    near_sad[3] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                              pre_y_buffer,
-                                              pre_y_str, 0x7fffffff);
-    if (near_sad[4] != INT_MAX)
-      near_sad[4] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                                pre_y_buffer - pre_y_str * bs,
-                                                pre_y_str, 0x7fffffff);
-    if (near_sad[5] != INT_MAX)
-      near_sad[5] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                                pre_y_buffer - bs,
-                                                pre_y_str, 0x7fffffff);
-    if (near_sad[6] != INT_MAX)
-      near_sad[6] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                                pre_y_buffer + bs,
-                                                pre_y_str, 0x7fffffff);
-    if (near_sad[7] != INT_MAX)
-      near_sad[7] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
-                                                pre_y_buffer + pre_y_str * bs,
-                                                pre_y_str, 0x7fffffff);
-  }
-
-  if (cpi->common.last_frame_type != KEY_FRAME) {
-    insertsortsad(near_sad, near_sadidx, 8);
-  } else {
-    insertsortsad(near_sad, near_sadidx, 3);
-  }
-}
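-
-/* After the sort, near_sadidx[0] names the neighbor whose reconstruction
- * is closest to the current source block; vp9_mv_pred() above walks the
- * candidates in that order. When the last frame is a key frame, only the
- * three current-frame neighbors are ranked. */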
-
-static void set_i8x8_block_modes(MACROBLOCK *x, int modes[2][4]) {
-  int i;
-  MACROBLOCKD *xd = &x->e_mbd;
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-    xd->mode_info_context->bmi[ib + 0].as_mode.first = modes[0][i];
-    xd->mode_info_context->bmi[ib + 1].as_mode.first = modes[0][i];
-    xd->mode_info_context->bmi[ib + 4].as_mode.first = modes[0][i];
-    xd->mode_info_context->bmi[ib + 5].as_mode.first = modes[0][i];
-#if CONFIG_COMP_INTRA_PRED
-    xd->mode_info_context->bmi[ib + 0].as_mode.second = modes[1][i];
-    xd->mode_info_context->bmi[ib + 1].as_mode.second = modes[1][i];
-    xd->mode_info_context->bmi[ib + 4].as_mode.second = modes[1][i];
-    xd->mode_info_context->bmi[ib + 5].as_mode.second = modes[1][i];
-#endif
-    // printf("%d,%d,%d,%d %d,%d,%d,%d\n",
-    //       modes[0][0], modes[0][1], modes[0][2], modes[0][3],
-    //       modes[1][0], modes[1][1], modes[1][2], modes[1][3]);
-  }
-
-  for (i = 0; i < 16; i++) {
-    xd->block[i].bmi = xd->mode_info_context->bmi[i];
-  }
-}
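-
-/* The 16 4x4 blocks are numbered in raster order, four per row, so the
- * 8x8 block anchored at ib covers ib, ib + 1, ib + 4 and ib + 5; the
- * anchors in vp9_i8x8_block[] match the bmi[0]/bmi[2]/bmi[8]/bmi[10]
- * indices used by the I8X8_PRED code in vp9_rd_pick_inter_mode(). */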
-
-extern void vp9_calc_ref_probs(int *count, vp9_prob *probs);
-static void estimate_curframe_refprobs(VP9_COMP *cpi,
-                                       vp9_prob mod_refprobs[3],
-                                       int pred_ref) {
-  int norm_cnt[MAX_REF_FRAMES];
-  const int *const rfct = cpi->count_mb_ref_frame_usage;
-  int intra_count = rfct[INTRA_FRAME];
-  int last_count  = rfct[LAST_FRAME];
-  int gf_count    = rfct[GOLDEN_FRAME];
-  int arf_count   = rfct[ALTREF_FRAME];
-
-  // Work out modified reference frame probabilities to use where prediction
-  // of the reference frame fails
-  if (pred_ref == INTRA_FRAME) {
-    norm_cnt[0] = 0;
-    norm_cnt[1] = last_count;
-    norm_cnt[2] = gf_count;
-    norm_cnt[3] = arf_count;
-    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
-    mod_refprobs[0] = 0;    // This branch implicit
-  } else if (pred_ref == LAST_FRAME) {
-    norm_cnt[0] = intra_count;
-    norm_cnt[1] = 0;
-    norm_cnt[2] = gf_count;
-    norm_cnt[3] = arf_count;
-    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
-    mod_refprobs[1] = 0;    // This branch implicit
-  } else if (pred_ref == GOLDEN_FRAME) {
-    norm_cnt[0] = intra_count;
-    norm_cnt[1] = last_count;
-    norm_cnt[2] = 0;
-    norm_cnt[3] = arf_count;
-    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
-    mod_refprobs[2] = 0;  // This branch implicit
-  } else {
-    norm_cnt[0] = intra_count;
-    norm_cnt[1] = last_count;
-    norm_cnt[2] = gf_count;
-    norm_cnt[3] = 0;
-    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
-    mod_refprobs[2] = 0;  // This branch implicit
-  }
-}
-
-static __inline unsigned weighted_cost(vp9_prob *tab0, vp9_prob *tab1,
-                                       int idx, int val, int weight) {
-  unsigned cost0 = tab0[idx] ? vp9_cost_bit(tab0[idx], val) : 0;
-  unsigned cost1 = tab1[idx] ? vp9_cost_bit(tab1[idx], val) : 0;
-  // weight is 16-bit fixed point (0x10000 == 1.0), so this calculates
-  // weight * cost1 + (1.0 - weight) * cost0, rounded to the nearest
-  // integer by the 0.5 (0x8000) offset.
-  return (0x8000 + weight * cost1 + (0x10000 - weight) * cost0) >> 16;
-}
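-
-/* Worked example (assumed values): with weight = 0x4000 (0.25), cost0 =
- * 400 and cost1 = 800:
- *   (0x8000 + 0x4000 * 800 + 0xC000 * 400) >> 16
- *     = (32768 + 13107200 + 19660800) >> 16
- *     = 500
- * which is 0.25 * 800 + 0.75 * 400 rounded to the nearest integer. */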
-
-static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
-                                     unsigned int *ref_costs) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  vp9_prob *mod_refprobs;
-
-  unsigned int cost;
-  int pred_ref;
-  int pred_flag;
-  int pred_ctx;
-  int i;
-  int tot_count;
-
-  vp9_prob pred_prob, new_pred_prob;
-  int seg_ref_active;
-  int seg_ref_count = 0;
-  seg_ref_active = vp9_segfeature_active(xd,
-                                         segment_id,
-                                         SEG_LVL_REF_FRAME);
-
-  if (seg_ref_active) {
-    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME)  +
-                    vp9_check_segref(xd, segment_id, LAST_FRAME)   +
-                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
-                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
-  }
-
-  // Get the predicted reference for this mb
-  pred_ref = vp9_get_pred_ref(cm, xd);
-
-  // Get the context probability for the prediction flag (based on last frame)
-  pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
-
-  // Predict probability for current frame based on stats so far
-  pred_ctx = vp9_get_pred_context(cm, xd, PRED_REF);
-  tot_count = cpi->ref_pred_count[pred_ctx][0] +
-              cpi->ref_pred_count[pred_ctx][1];
-  if (tot_count) {
-    new_pred_prob =
-      (cpi->ref_pred_count[pred_ctx][0] * 255 + (tot_count >> 1)) / tot_count;
-    new_pred_prob += !new_pred_prob;
-  } else
-    new_pred_prob = 128;
-
-  // Get the set of probabilities to use if prediction fails
-  mod_refprobs = cm->mod_refprobs[pred_ref];
-
-  // For each possible selected reference frame work out a cost.
-  for (i = 0; i < MAX_REF_FRAMES; i++) {
-    if (seg_ref_active && seg_ref_count == 1) {
-      cost = 0;
-    } else {
-      pred_flag = (i == pred_ref);
-
-      // Get the prediction for the current mb
-      cost = weighted_cost(&pred_prob, &new_pred_prob, 0,
-                           pred_flag, cpi->seg0_progress);
-      if (cost > 1024) cost = 768; // i.e. account for 4 bits max.
-
-      // for incorrectly predicted cases
-      if (!pred_flag) {
-        vp9_prob curframe_mod_refprobs[3];
-
-        if (cpi->seg0_progress) {
-          estimate_curframe_refprobs(cpi, curframe_mod_refprobs, pred_ref);
-        } else {
-          vpx_memset(curframe_mod_refprobs, 0, sizeof(curframe_mod_refprobs));
-        }
-
-        cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 0,
-                              (i != INTRA_FRAME), cpi->seg0_progress);
-        if (i != INTRA_FRAME) {
-          cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 1,
-                                (i != LAST_FRAME), cpi->seg0_progress);
-          if (i != LAST_FRAME) {
-            cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 2,
-                                  (i != GOLDEN_FRAME), cpi->seg0_progress);
-          }
-        }
-      }
-    }
-
-    ref_costs[i] = cost;
-  }
-}
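-
-/* The weight passed to weighted_cost() above, cpi->seg0_progress, blends
- * the last-frame probabilities (pred_prob, mod_refprobs) with the
- * probabilities implied by the counts gathered so far in the current
- * frame, so early macroblocks lean mostly on last-frame statistics. */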
-
-static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
-                                 int mode_index,
-                                 PARTITION_INFO *partition,
-                                 int_mv *ref_mv,
-                                 int_mv *second_ref_mv,
-                                 int single_pred_diff,
-                                 int comp_pred_diff,
-                                 int hybrid_pred_diff,
-                                 int64_t txfm_size_diff[NB_TXFM_MODES]) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-
-  // Take a snapshot of the coding context so it can be
-  // restored if we decide to encode this way
-  ctx->best_mode_index = mode_index;
-  vpx_memcpy(&ctx->mic, xd->mode_info_context,
-             sizeof(MODE_INFO));
-  if (partition)
-    vpx_memcpy(&ctx->partition_info, partition,
-               sizeof(PARTITION_INFO));
-  ctx->best_ref_mv.as_int = ref_mv->as_int;
-  ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
-
-  // ctx[mb_index].rddiv = x->rddiv;
-  // ctx[mb_index].rdmult = x->rdmult;
-
-  ctx->single_pred_diff = single_pred_diff;
-  ctx->comp_pred_diff   = comp_pred_diff;
-  ctx->hybrid_pred_diff = hybrid_pred_diff;
-
-  if (txfm_size_diff) {
-    memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
-  } else {
-    memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
-  }
-}
-
-static void inter_mode_cost(VP9_COMP *cpi, MACROBLOCK *x, int this_mode,
-                            int *rate2, int *distortion2, int *rate_y,
-                            int *distortion, int* rate_uv, int *distortion_uv,
-                            int *skippable, int64_t txfm_cache[NB_TXFM_MODES]) {
-  int y_skippable, uv_skippable;
-
-  // Y cost and distortion
-  macro_block_yrd(cpi, x, rate_y, distortion, &y_skippable, txfm_cache);
-
-  *rate2 += *rate_y;
-  *distortion2 += *distortion;
-
-  // UV cost and distortion
-  if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4)
-    rd_inter16x16_uv_8x8(cpi, x, rate_uv, distortion_uv,
-                         cpi->common.full_pixel, &uv_skippable);
-  else
-    rd_inter16x16_uv(cpi, x, rate_uv, distortion_uv, cpi->common.full_pixel,
-                     &uv_skippable);
-  *rate2 += *rate_uv;
-  *distortion2 += *distortion_uv;
-  *skippable = y_skippable && uv_skippable;
-}
-
-#define MIN(x,y) (((x)<(y))?(x):(y))
-#define MAX(x,y) (((x)>(y))?(x):(y))
-static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
-                               int idx, int frame_type,
-                               int recon_yoffset, int recon_uvoffset,
-                               int_mv frame_nearest_mv[4],
-                               int_mv frame_near_mv[4],
-                               int_mv frame_best_ref_mv[4],
-                               int frame_mdcounts[4][4],
-                               unsigned char *y_buffer[4],
-                               unsigned char *u_buffer[4],
-                               unsigned char *v_buffer[4]) {
-  YV12_BUFFER_CONFIG *yv12 = &cpi->common.yv12_fb[idx];
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-
-  vp9_find_near_mvs(xd, xd->mode_info_context,
-                    xd->prev_mode_info_context,
-                    &frame_nearest_mv[frame_type], &frame_near_mv[frame_type],
-                    &frame_best_ref_mv[frame_type], frame_mdcounts[frame_type],
-                    frame_type, cpi->common.ref_frame_sign_bias);
-
-  y_buffer[frame_type] = yv12->y_buffer + recon_yoffset;
-  u_buffer[frame_type] = yv12->u_buffer + recon_uvoffset;
-  v_buffer[frame_type] = yv12->v_buffer + recon_uvoffset;
-
-#if CONFIG_NEWBESTREFMV
-  vp9_find_mv_refs(xd, xd->mode_info_context,
-                   xd->prev_mode_info_context,
-                   frame_type,
-                   mbmi->ref_mvs[frame_type],
-                   cpi->common.ref_frame_sign_bias);
-
-  vp9_find_best_ref_mvs(xd, y_buffer[frame_type],
-                        yv12->y_stride,
-                        mbmi->ref_mvs[frame_type],
-                        &frame_best_ref_mv[frame_type],
-                        &frame_nearest_mv[frame_type],
-                        &frame_near_mv[frame_type]);
-#endif
-}
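-
-/* Called once per enabled reference frame (see the VP9_LAST_FLAG /
- * VP9_GOLD_FLAG / VP9_ALT_FLAG checks in vp9_rd_pick_inter_mode()),
- * caching MV candidates and reconstruction pointers per frame type so
- * the mode loop can switch references cheaply. */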
-
-static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                                 enum BlockSize block_size,
-                                 int *saddone, int near_sadidx[],
-                                 int mdcounts[4], int64_t txfm_cache[],
-                                 int *rate2, int *distortion, int *skippable,
-                                 int *compmode_cost,
-                                 int *rate_y, int *distortion_y,
-                                 int *rate_uv, int *distortion_uv,
-                                 int *mode_excluded, int *disable_skip,
-                                 int recon_yoffset, int mode_index,
-                                 int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
-                                 int_mv frame_best_ref_mv[4]) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  BLOCK *b = &x->block[0];
-  BLOCKD *d = &xd->block[0];
-  const int is_comp_pred = (mbmi->second_ref_frame != 0);
-  const int num_refs = is_comp_pred ? 2 : 1;
-  const int this_mode = mbmi->mode;
-  int i;
-  int refs[2] = { mbmi->ref_frame, mbmi->second_ref_frame };
-  int_mv cur_mv[2];
-  int_mv mvp;
-  int64_t this_rd = 0;
-
-  switch (this_mode) {
-    case NEWMV:
-      if (is_comp_pred) {
-        if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV ||
-            frame_mv[NEWMV][refs[1]].as_int == INVALID_MV)
-          return INT64_MAX;
-        *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[0]],
-                                  &frame_best_ref_mv[refs[0]],
-                                  XMVCOST, 96,
-                                  x->e_mbd.allow_high_precision_mv);
-        *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[1]],
-                                  &frame_best_ref_mv[refs[1]],
-                                  XMVCOST, 96,
-                                  x->e_mbd.allow_high_precision_mv);
-      } else {
-        int bestsme = INT_MAX;
-        int further_steps, step_param = cpi->sf.first_step;
-        int sadpb = x->sadperbit16;
-        int_mv mvp_full, tmp_mv;
-        // Search range obtained from mv_pred(), expressed in step_param
-        // levels (0-7).
-        int sr = 0;
-
-        int tmp_col_min = x->mv_col_min;
-        int tmp_col_max = x->mv_col_max;
-        int tmp_row_min = x->mv_row_min;
-        int tmp_row_max = x->mv_row_max;
-
-        vp9_clamp_mv_min_max(x, &frame_best_ref_mv[refs[0]]);
-
-        if (!*saddone) {
-          cal_sad(cpi, xd, x, recon_yoffset, &near_sadidx[0], block_size);
-          *saddone = 1;
-        }
-
-        vp9_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
-                    mbmi->ref_frame, cpi->common.ref_frame_sign_bias,
-                    &sr, &near_sadidx[0]);
-
-        mvp_full.as_mv.col = mvp.as_mv.col >> 3;
-        mvp_full.as_mv.row = mvp.as_mv.row >> 3;
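-        // The >> 3 above converts the 1/8-pel MV prediction to full-pel
-        // units for the full-pixel diamond search below.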
-
-        // adjust search range according to sr from mv prediction
-        step_param = MAX(step_param, sr);
-
-        // Further step/diamond searches as necessary
-        further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
-
-        bestsme = vp9_full_pixel_diamond(cpi, x, b, d, &mvp_full, step_param,
-                                         sadpb, further_steps, 1,
-                                         &cpi->fn_ptr[block_size],
-                                         &frame_best_ref_mv[refs[0]], &tmp_mv);
-
-        x->mv_col_min = tmp_col_min;
-        x->mv_col_max = tmp_col_max;
-        x->mv_row_min = tmp_row_min;
-        x->mv_row_max = tmp_row_max;
-
-        if (bestsme < INT_MAX) {
-          int dis; /* TODO: use dis in distortion calculation later. */
-          unsigned int sse;
-          cpi->find_fractional_mv_step(x, b, d, &tmp_mv,
-                                       &frame_best_ref_mv[refs[0]],
-                                       x->errorperbit,
-                                       &cpi->fn_ptr[block_size],
-                                       XMVCOST, &dis, &sse);
-        }
-        d->bmi.as_mv.first.as_int = tmp_mv.as_int;
-        frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv.first.as_int;
-
-        // Add the new motion vector cost to our rolling cost variable
-        *rate2 += vp9_mv_bit_cost(&tmp_mv, &frame_best_ref_mv[refs[0]],
-                                  XMVCOST, 96, xd->allow_high_precision_mv);
-      }
-      break;
-    case NEARESTMV:
-    case NEARMV:
-      // Do not bother proceeding if the vector (nearest or near) is
-      // (0,0), as this should then be coded using the zeromv mode.
-      for (i = 0; i < num_refs; ++i)
-        if (frame_mv[this_mode][refs[i]].as_int == 0)
-          return INT64_MAX;
-      /* fall through */
-    case ZEROMV:
-    default:
-      break;
-  }
-  for (i = 0; i < num_refs; ++i) {
-    cur_mv[i] = frame_mv[this_mode][refs[i]];
-    // Clip "next_nearest" so that it does not extend too far out of the
-    // image.
-    clamp_mv2(&cur_mv[i], xd);
-    if (mv_check_bounds(x, &cur_mv[i]))
-      return INT64_MAX;
-    mbmi->mv[i].as_int = cur_mv[i].as_int;
-  }
-
-#if CONFIG_PRED_FILTER
-  // Filtered prediction:
-  mbmi->pred_filter_enabled = vp9_mode_order[mode_index].pred_filter_flag;
-  *rate2 += vp9_cost_bit(cpi->common.prob_pred_filter_off,
-                         mbmi->pred_filter_enabled);
-#endif
-  if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-    const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
-    const int m = vp9_switchable_interp_map[mbmi->interp_filter];
-    *rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
-  }
-
-  /* We don't include the cost of the second reference here, because there
-   * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
-   * words if you present them in that order, the second one is always known
-   * if the first is known */
-  *compmode_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP),
-                                is_comp_pred);
-  *rate2 += vp9_cost_mv_ref(cpi, this_mode, mdcounts);
-
-  if (block_size == BLOCK_16X16) {
-    vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
-    if (is_comp_pred)
-      vp9_build_2nd_inter16x16_predictors_mby(xd, xd->predictor, 16);
-  } else {
-#if CONFIG_SUPERBLOCKS
-    vp9_build_inter32x32_predictors_sb(xd,
-                                       xd->dst.y_buffer,
-                                       xd->dst.u_buffer,
-                                       xd->dst.v_buffer,
-                                       xd->dst.y_stride,
-                                       xd->dst.uv_stride);
-#endif
-  }
-
-  if (cpi->active_map_enabled && x->active_ptr[0] == 0)
-    x->skip = 1;
-  else if (x->encode_breakout) {
-    unsigned int sse, var;
-    int threshold = (xd->block[0].dequant[1]
-                     * xd->block[0].dequant[1] >> 4);
-
-    if (threshold < x->encode_breakout)
-      threshold = x->encode_breakout;
-
-    if (block_size == BLOCK_16X16) {
-      var = vp9_variance16x16(*(b->base_src), b->src_stride,
-                              xd->predictor, 16, &sse);
-    } else {
-#if CONFIG_SUPERBLOCKS
-      var = vp9_variance32x32(*(b->base_src), b->src_stride,
-                              xd->dst.y_buffer, xd->dst.y_stride, &sse);
-#endif
-    }
-
-    if (sse < threshold) {
-      unsigned int q2dc = xd->block[24].dequant[0];
-      /* If there is no codeable 2nd order dc
-         or a very small uniform pixel change */
-      if ((sse - var < q2dc * q2dc >> 4) ||
-          (sse / 2 > var && sse - var < 64)) {
-        // Check u and v to make sure skip is ok
-        int sse2;
-
-        if (block_size == BLOCK_16X16) {
-          sse2 = vp9_uvsse(x);
-        } else {
-          unsigned int sse2u, sse2v;
-          var = vp9_variance16x16(x->src.u_buffer, x->src.uv_stride,
-                                  xd->dst.u_buffer, xd->dst.uv_stride, &sse2u);
-          var = vp9_variance16x16(x->src.v_buffer, x->src.uv_stride,
-                                  xd->dst.v_buffer, xd->dst.uv_stride, &sse2v);
-          sse2 = sse2u + sse2v;
-        }
-
-        if (sse2 * 2 < threshold) {
-          x->skip = 1;
-          *distortion = sse + sse2;
-          *rate2 = 500;
-
-          /* for best_yrd calculation */
-          *rate_uv = 0;
-          *distortion_uv = sse2;
-
-          *disable_skip = 1;
-          this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
-        }
-      }
-    }
-  }
-
-  if (!x->skip) {
-    if (block_size == BLOCK_16X16) {
-      vp9_build_1st_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
-                                               &xd->predictor[320], 8);
-      if (is_comp_pred)
-        vp9_build_2nd_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
-                                                 &xd->predictor[320], 8);
-      inter_mode_cost(cpi, x, this_mode, rate2, distortion,
-                      rate_y, distortion_y, rate_uv, distortion_uv,
-                      skippable, txfm_cache);
-    } else {
-#if CONFIG_SUPERBLOCKS
-      int skippable_y, skippable_uv;
-
-      // Y cost and distortion - FIXME support other transform sizes
-      super_block_yrd_8x8(x, rate_y, distortion_y,
-                          IF_RTCD(&cpi->rtcd), &skippable_y);
-      *rate2 += *rate_y;
-      *distortion += *distortion_y;
-
-      rd_inter32x32_uv_8x8(cpi, x, rate_uv, distortion_uv,
-                           cm->full_pixel, &skippable_uv);
-
-      *rate2 += *rate_uv;
-      *distortion += *distortion_uv;
-      *skippable = skippable_y && skippable_uv;
-#endif
-    }
-  }
-  if (is_comp_pred) {
-    *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
-  } else {
-    *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
-  }
-
-  return this_rd;  // if 0, this will be re-calculated by caller
-}
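-
-/* Return convention: INT64_MAX rejects the mode outright (invalid or
- * out-of-bounds MVs); 0 defers the final RDCOST computation to the
- * caller; a nonzero value is the breakout RD estimate computed when the
- * encode_breakout test above decided to skip. */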
-
-void vp9_rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            int recon_yoffset, int recon_uvoffset,
-                            int *returnrate, int *returndistortion,
-                            int64_t *returnintra) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  union b_mode_info best_bmodes[16];
-  MB_MODE_INFO best_mbmode;
-  PARTITION_INFO best_partition;
-  int_mv best_ref_mv, second_best_ref_mv;
-  MB_PREDICTION_MODE this_mode;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  int i, best_mode_index = 0;
-  int mode8x8[2][4];
-  unsigned char segment_id = mbmi->segment_id;
-
-  int mode_index;
-  int mdcounts[4];
-  int rate, distortion;
-  int rate2, distortion2;
-  int64_t best_txfm_rd[NB_TXFM_MODES];
-  int64_t best_txfm_diff[NB_TXFM_MODES];
-  int64_t best_pred_diff[NB_PREDICTION_TYPES];
-  int64_t best_pred_rd[NB_PREDICTION_TYPES];
-  int64_t best_rd = INT64_MAX, best_intra_rd = INT64_MAX;
-#if CONFIG_PRED_FILTER
-  int64_t best_overall_rd = INT64_MAX;
-#endif
-  int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
-  int uv_intra_skippable = 0;
-  int uv_intra_rate_8x8 = 0, uv_intra_distortion_8x8 = 0;
-  int uv_intra_rate_tokenonly_8x8 = 0;
-  int uv_intra_skippable_8x8 = 0;
-  int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
-  int distortion_uv = INT_MAX;
-  int64_t best_yrd = INT64_MAX;
-#if CONFIG_PRED_FILTER
-  int best_filter_state = 0;  // avoid a possible uninitialized read below
-#endif
-  int switchable_filter_index = 0;
-
-  MB_PREDICTION_MODE uv_intra_mode;
-  MB_PREDICTION_MODE uv_intra_mode_8x8 = 0;
-
-  int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
-  int saddone = 0;
-
-  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  int_mv frame_best_ref_mv[4];
-  int frame_mdcounts[4][4];
-  unsigned char *y_buffer[4], *u_buffer[4], *v_buffer[4];
-
-  unsigned int ref_costs[MAX_REF_FRAMES];
-  int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1];
-
-  vpx_memset(mode8x8, 0, sizeof(mode8x8));
-  vpx_memset(&frame_mv, 0, sizeof(frame_mv));
-  vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
-  vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));
-  vpx_memset(&x->mb_context[xd->mb_index], 0, sizeof(PICK_MODE_CONTEXT));
-
-  for (i = 0; i < MAX_REF_FRAMES; i++)
-    frame_mv[NEWMV][i].as_int = INVALID_MV;
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
-    best_pred_rd[i] = INT64_MAX;
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    best_txfm_rd[i] = INT64_MAX;
-
-  for (i = 0; i < NB_PARTITIONINGS; i++) {
-    int j, k;
-
-    for (j = 0; j < 16; j++)
-      for (k = 0; k < MAX_REF_FRAMES - 1; k++)
-        seg_mvs[i][j][k].as_int = INVALID_MV;
-  }
-
-  if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->common.lst_fb_idx, LAST_FRAME,
-                       recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
-                       frame_mv[NEARMV], frame_best_ref_mv,
-                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
-  }
-
-  if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->common.gld_fb_idx, GOLDEN_FRAME,
-                       recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
-                       frame_mv[NEARMV], frame_best_ref_mv,
-                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
-  }
-
-  if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->common.alt_fb_idx, ALTREF_FRAME,
-                       recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
-                       frame_mv[NEARMV], frame_best_ref_mv,
-                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
-  }
-
-  *returnintra = INT64_MAX;
-
-  x->skip = 0;
-
-  mbmi->ref_frame = INTRA_FRAME;
-
-  /* Initialize zbin mode boost for uv costing */
-  cpi->zbin_mode_boost = 0;
-  vp9_update_zbin_extra(cpi, x);
-
-  rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate,
-                          &uv_intra_rate_tokenonly, &uv_intra_distortion,
-                          &uv_intra_skippable);
-  uv_intra_mode = mbmi->uv_mode;
-
-  /* rough estimate for now */
-  if (cpi->common.txfm_mode != ONLY_4X4) {
-    rd_pick_intra_mbuv_mode_8x8(cpi, x, &uv_intra_rate_8x8,
-                                &uv_intra_rate_tokenonly_8x8,
-                                &uv_intra_distortion_8x8,
-                                &uv_intra_skippable_8x8);
-    uv_intra_mode_8x8 = mbmi->uv_mode;
-  }
-
-  // Get estimates of reference frame costs for each reference frame
-  // that depend on the current prediction etc.
-  estimate_ref_frame_costs(cpi, segment_id, ref_costs);
-
-  for (mode_index = 0; mode_index < MAX_MODES;
-       mode_index += (!switchable_filter_index)) {
-    int64_t this_rd = INT64_MAX;
-    int disable_skip = 0, skippable = 0;
-    int other_cost = 0;
-    int compmode_cost = 0;
-    int mode_excluded = 0;
-    int64_t txfm_cache[NB_TXFM_MODES] = { 0 };
-
-    // These variables hold the rolling total cost and distortion for
-    // this mode.
-    rate2 = 0;
-    distortion2 = 0;
-    rate_y = 0;
-    rate_uv = 0;
-
-    this_mode = vp9_mode_order[mode_index].mode;
-    mbmi->mode = this_mode;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->ref_frame = vp9_mode_order[mode_index].ref_frame;
-    mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
-#if CONFIG_PRED_FILTER
-    mbmi->pred_filter_enabled = 0;
-#endif
-    if (cpi->common.mcomp_filter_type == SWITCHABLE &&
-        this_mode >= NEARESTMV && this_mode <= SPLITMV) {
-      mbmi->interp_filter =
-          vp9_switchable_interp[switchable_filter_index++];
-      if (switchable_filter_index == VP9_SWITCHABLE_FILTERS)
-        switchable_filter_index = 0;
-    } else {
-      mbmi->interp_filter = cpi->common.mcomp_filter_type;
-    }
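-    // With SWITCHABLE filtering, switchable_filter_index cycles through
-    // the candidate filters and mode_index (see the loop header) only
-    // advances once the cycle wraps back to zero, so each inter mode is
-    // evaluated once per switchable filter.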
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-
-    // Test best rd so far against threshold for trying this mode.
-    if (best_rd <= cpi->rd_threshes[mode_index])
-      continue;
-
-    // current coding mode under rate-distortion optimization test loop
-#if CONFIG_COMP_INTRA_PRED
-    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-    mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
-    // If the segment reference frame feature is enabled,
-    // then do nothing if the current ref frame is not allowed.
-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-        !vp9_check_segref(xd, segment_id, mbmi->ref_frame)) {
-      continue;
-    // If the segment mode feature is enabled,
-    // then do nothing if the current mode is not allowed.
-    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
-               (this_mode !=
-                vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) {
-      continue;
-    // Disable this drop-out case if either the mode or ref frame
-    // segment-level feature is enabled for this segment. This is to
-    // prevent the possibility that we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-               !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
-      // unless ARNR filtering is enabled in which case we want
-      // an unfiltered alternative
-      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-        if (this_mode != ZEROMV ||
-            mbmi->ref_frame != ALTREF_FRAME) {
-          continue;
-        }
-      }
-    }
-
-    /* everything but intra */
-    if (mbmi->ref_frame) {
-      int ref = mbmi->ref_frame;
-
-      xd->pre.y_buffer = y_buffer[ref];
-      xd->pre.u_buffer = u_buffer[ref];
-      xd->pre.v_buffer = v_buffer[ref];
-      best_ref_mv = frame_best_ref_mv[ref];
-      vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts));
-    }
-
-    if (mbmi->second_ref_frame) {
-      int ref = mbmi->second_ref_frame;
-
-      xd->second_pre.y_buffer = y_buffer[ref];
-      xd->second_pre.u_buffer = u_buffer[ref];
-      xd->second_pre.v_buffer = v_buffer[ref];
-      second_best_ref_mv  = frame_best_ref_mv[ref];
-    }
-
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    if (cpi->zbin_mode_boost_enabled) {
-      if (vp9_mode_order[mode_index].ref_frame == INTRA_FRAME)
-        cpi->zbin_mode_boost = 0;
-      else {
-        if (vp9_mode_order[mode_index].mode == ZEROMV) {
-          if (vp9_mode_order[mode_index].ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (vp9_mode_order[mode_index].mode == SPLITMV)
-          cpi->zbin_mode_boost = 0;
-        else
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-      }
-
-      vp9_update_zbin_extra(cpi, x);
-    }
-
-    // Intra
-    if (!mbmi->ref_frame) {
-      switch (this_mode) {
-        default:
-        case DC_PRED:
-        case V_PRED:
-        case H_PRED:
-        case TM_PRED:
-        case D45_PRED:
-        case D135_PRED:
-        case D117_PRED:
-        case D153_PRED:
-        case D27_PRED:
-        case D63_PRED:
-          mbmi->ref_frame = INTRA_FRAME;
-          // FIXME compound intra prediction
-          vp9_build_intra_predictors_mby(&x->e_mbd);
-          macro_block_yrd(cpi, x, &rate_y, &distortion, &skippable, txfm_cache);
-          rate2 += rate_y;
-          distortion2 += distortion;
-          rate2 += x->mbmode_cost[xd->frame_type][mbmi->mode];
-          if (mbmi->txfm_size != TX_4X4) {
-            rate2 += uv_intra_rate_8x8;
-            rate_uv = uv_intra_rate_tokenonly_8x8;
-            distortion2 += uv_intra_distortion_8x8;
-            distortion_uv = uv_intra_distortion_8x8;
-            skippable = skippable && uv_intra_skippable_8x8;
-          } else {
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
-            skippable = skippable && uv_intra_skippable;
-          }
-          break;
-        case B_PRED: {
-          int64_t tmp_rd;
-
-          // Note the rate value returned here includes the cost of coding
-          // the BPRED mode : x->mbmode_cost[xd->frame_type][BPRED];
-          mbmi->txfm_size = TX_4X4;
-          tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
-                                             &distortion, best_yrd,
-#if CONFIG_COMP_INTRA_PRED
-                                             0,
-#endif
-                                             0);
-          rate2 += rate;
-          distortion2 += distortion;
-
-          if (tmp_rd < best_yrd) {
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
-          } else {
-            this_rd = INT64_MAX;
-            disable_skip = 1;
-          }
-        }
-        break;
-        case I8X8_PRED: {
-          int cost0 = vp9_cost_bit(cm->prob_tx[0], 0);
-          int cost1 = vp9_cost_bit(cm->prob_tx[0], 1);
-          int64_t tmp_rd_4x4s, tmp_rd_8x8s;
-          int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd;
-          int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8;
-          mbmi->txfm_size = TX_4X4;
-          tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4,
-                                                 &d4x4, best_yrd);
-          mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
-          mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
-          mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
-          mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-          mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
-          mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
-          mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
-          mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
-#endif
-          mbmi->txfm_size = TX_8X8;
-          tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8,
-                                                 &d8x8, best_yrd);
-          txfm_cache[ONLY_4X4]  = tmp_rd_4x4;
-          txfm_cache[ALLOW_8X8] = tmp_rd_8x8;
-          txfm_cache[ALLOW_16X16] = tmp_rd_8x8;
-          tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0);
-          tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0);
-          txfm_cache[TX_MODE_SELECT] =
-              tmp_rd_4x4s < tmp_rd_8x8s ? tmp_rd_4x4s : tmp_rd_8x8s;
-          if (cm->txfm_mode == TX_MODE_SELECT) {
-            if (tmp_rd_4x4s < tmp_rd_8x8s) {
-              rate = r4x4 + cost0;
-              rate_y = tok4x4 + cost0;
-              distortion = d4x4;
-              mbmi->txfm_size = TX_4X4;
-              tmp_rd = tmp_rd_4x4s;
-            } else {
-              rate = r8x8 + cost1;
-              rate_y = tok8x8 + cost1;
-              distortion = d8x8;
-              mbmi->txfm_size = TX_8X8;
-              tmp_rd = tmp_rd_8x8s;
-
-              mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
-              mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
-              mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
-              mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-              mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
-              mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
-              mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
-              mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
-#endif
-            }
-          } else if (cm->txfm_mode == ONLY_4X4) {
-            rate = r4x4;
-            rate_y = tok4x4;
-            distortion = d4x4;
-            mbmi->txfm_size = TX_4X4;
-            tmp_rd = tmp_rd_4x4;
-          } else {
-            rate = r8x8;
-            rate_y = tok8x8;
-            distortion = d8x8;
-            mbmi->txfm_size = TX_8X8;
-            tmp_rd = tmp_rd_8x8;
-
-            mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
-            mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
-            mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
-            mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-            mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
-            mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
-            mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
-            mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
-#endif
-          }
-
-          rate2 += rate;
-          distortion2 += distortion;
-
-          /* TODO: uv rate may be over-estimated here since a UV intra
-                   mode is also coded in I8X8_PRED prediction */
-          if (tmp_rd < best_yrd) {
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
-          } else {
-            this_rd = INT64_MAX;
-            disable_skip = 1;
-          }
-        }
-        break;
-      }
-    }
-    // Split MV. The code is very different from the other inter modes so
-    // special case it.
-    else if (this_mode == SPLITMV) {
-      const int is_comp_pred = mbmi->second_ref_frame != 0;
-      int64_t tmp_rd, this_rd_thresh;
-      int_mv *second_ref = is_comp_pred ? &second_best_ref_mv : NULL;
-
-      this_rd_thresh = (mbmi->ref_frame == LAST_FRAME) ?
-          cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA];
-      this_rd_thresh = (mbmi->ref_frame == GOLDEN_FRAME) ?
-          cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
-
-      tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
-                                           second_ref, best_yrd, mdcounts,
-                                           &rate, &rate_y, &distortion,
-                                           &skippable,
-                                           this_rd_thresh, seg_mvs,
-                                           txfm_cache);
-      rate2 += rate;
-      distortion2 += distortion;
-
-      if (cpi->common.mcomp_filter_type == SWITCHABLE)
-        rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-            [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
-                [vp9_switchable_interp_map[mbmi->interp_filter]];
-      // If even the 'Y' rd value of the split is higher than the best so
-      // far, then don't bother looking at UV.
-      if (tmp_rd < best_yrd) {
-        int uv_skippable;
-
-        rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
-                       cpi->common.full_pixel);
-        rate2 += rate_uv;
-        distortion2 += distortion_uv;
-        skippable = skippable && uv_skippable;
-      } else {
-        this_rd = INT64_MAX;
-        disable_skip = 1;
-      }
-
-      if (is_comp_pred)
-        mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
-      else
-        mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
-
-      compmode_cost =
-        vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
-      mbmi->mode = this_mode;
-    }
-    else {
-      this_rd = handle_inter_mode(cpi, x, BLOCK_16X16,
-                                  &saddone, near_sadidx, mdcounts, txfm_cache,
-                                  &rate2, &distortion2, &skippable,
-                                  &compmode_cost, &rate_y, &distortion,
-                                  &rate_uv, &distortion_uv,
-                                  &mode_excluded, &disable_skip, recon_yoffset,
-                                  mode_index, frame_mv, frame_best_ref_mv);
-      if (this_rd == INT64_MAX)
-        continue;
-    }
-
-    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION)
-      rate2 += compmode_cost;
-
-    // Estimate the reference frame signaling cost and add it
-    // to the rolling cost variable.
-    rate2 += ref_costs[mbmi->ref_frame];
-
-    if (!disable_skip) {
-      // Test for the condition where skip block will be activated
-      // because there are no non-zero coefficients, and make any
-      // necessary adjustment for rate. Ignore if skip is coded at
-      // the segment level as the cost won't have been added in.
-      if (cpi->common.mb_no_coeff_skip) {
-        int mb_skip_allowed;
-
-        // Is MB-level skip allowed for this MB?
-        mb_skip_allowed =
-          !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-          vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-
-        if (skippable) {
-          mbmi->mb_skip_coeff = 1;
-
-          // Back out the coefficient coding costs
-          rate2 -= (rate_y + rate_uv);
-          // for best_yrd calculation
-          rate_uv = 0;
-
-          if (mb_skip_allowed) {
-            int prob_skip_cost;
-
-            // Cost the skip mb case
-            vp9_prob skip_prob =
-              vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP);
-
-            if (skip_prob) {
-              prob_skip_cost = vp9_cost_bit(skip_prob, 1);
-              rate2 += prob_skip_cost;
-              other_cost += prob_skip_cost;
-            }
-          }
-        }
-        // Add in the cost of the no skip flag.
-        else {
-          mbmi->mb_skip_coeff = 0;
-          if (mb_skip_allowed) {
-            int prob_skip_cost = vp9_cost_bit(
-                   vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP), 0);
-            rate2 += prob_skip_cost;
-            other_cost += prob_skip_cost;
-          }
-        }
-      }
-
-      // Calculate the final RD estimate for this mode.
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-    }
-
-    // Keep record of best intra distortion
-    if ((mbmi->ref_frame == INTRA_FRAME) &&
-        (this_rd < best_intra_rd)) {
-      best_intra_rd = this_rd;
-      *returnintra = distortion2;
-    }
-
-    if (!disable_skip && mbmi->ref_frame == INTRA_FRAME)
-      for (i = 0; i < NB_PREDICTION_TYPES; ++i)
-        best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
-
-#if CONFIG_PRED_FILTER
-    // Keep track of the best mode irrespective of prediction filter state
-    if (this_rd < best_overall_rd) {
-      best_overall_rd = this_rd;
-      best_filter_state = mbmi->pred_filter_enabled;
-    }
-
-    // Ignore modes where the prediction filter state doesn't
-    // match the state signaled at the frame level
-    if ((cm->pred_filter_mode == 2) ||
-        (cm->pred_filter_mode ==
-         mbmi->pred_filter_enabled)) {
-#endif
-      // Did this mode help, i.e. is it the new best mode?
-      if (this_rd < best_rd || x->skip) {
-        if (!mode_excluded) {
-          // Note index of best mode so far
-          best_mode_index = mode_index;
-
-          if (this_mode <= B_PRED) {
-            if (mbmi->txfm_size != TX_4X4
-                && this_mode != B_PRED
-                && this_mode != I8X8_PRED)
-              mbmi->uv_mode = uv_intra_mode_8x8;
-            else
-              mbmi->uv_mode = uv_intra_mode;
-            /* required for left and above block mv */
-            mbmi->mv[0].as_int = 0;
-          }
-
-          other_cost += ref_costs[mbmi->ref_frame];
-
-          /* Calculate the final y RD estimate for this mode */
-          best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
-                            (distortion2 - distortion_uv));
-
-          *returnrate = rate2;
-          *returndistortion = distortion2;
-          best_rd = this_rd;
-          vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
-          vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
-
-          if ((this_mode == B_PRED)
-              || (this_mode == I8X8_PRED)
-              || (this_mode == SPLITMV))
-            for (i = 0; i < 16; i++) {
-              best_bmodes[i] = xd->block[i].bmi;
-            }
-        }
-
-        // Testing this mode gave rise to an improvement in best error score.
-        // Lower threshold a bit for next time
-        cpi->rd_thresh_mult[mode_index] =
-            (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
-            cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
-        cpi->rd_threshes[mode_index] =
-            (cpi->rd_baseline_thresh[mode_index] >> 7) *
-            cpi->rd_thresh_mult[mode_index];
-      }
-      // If the mode did not help improve the best error case then raise the
-      // threshold for testing that mode next time around.
-      else {
-        cpi->rd_thresh_mult[mode_index] += 4;
-
-        if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-          cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
-
-        cpi->rd_threshes[mode_index] =
-            (cpi->rd_baseline_thresh[mode_index] >> 7) *
-            cpi->rd_thresh_mult[mode_index];
-      }
-
-      /* keep record of best compound/single-only prediction */
-      if (!disable_skip &&
-          mbmi->ref_frame != INTRA_FRAME) {
-        int64_t single_rd, hybrid_rd;
-        int single_rate, hybrid_rate;
-
-        if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-          single_rate = rate2 - compmode_cost;
-          hybrid_rate = rate2;
-        } else {
-          single_rate = rate2;
-          hybrid_rate = rate2 + compmode_cost;
-        }
-
-        single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
-        hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
-
-        if (mbmi->second_ref_frame == INTRA_FRAME &&
-            single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
-          best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
-        } else if (mbmi->second_ref_frame != INTRA_FRAME &&
-                   single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
-          best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
-        }
-        if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
-          best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
-      }
-
-      /* keep record of best txfm size */
-      if (!mode_excluded && this_rd != INT64_MAX) {
-        for (i = 0; i < NB_TXFM_MODES; i++) {
-          int64_t adj_rd;
-          if (this_mode != B_PRED) {
-            adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->txfm_mode];
-          } else {
-            adj_rd = this_rd;
-          }
-          if (adj_rd < best_txfm_rd[i])
-            best_txfm_rd[i] = adj_rd;
-        }
-      }
-#if CONFIG_PRED_FILTER
-    }
-#endif
-
-    if (x->skip && !mode_excluded)
-      break;
-  }
-
-#if CONFIG_PRED_FILTER
-  // Update counts for prediction filter usage
-  if (best_filter_state != 0)
-    ++cpi->pred_filter_on_count;
-  else
-    ++cpi->pred_filter_off_count;
-#endif
-  if (cpi->common.mcomp_filter_type == SWITCHABLE &&
-      best_mbmode.mode >= NEARESTMV &&
-      best_mbmode.mode <= SPLITMV) {
-    ++cpi->switchable_interp_count
-        [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
-        [vp9_switchable_interp_map[best_mbmode.interp_filter]];
-  }
-
-  // Reduce the activation RD thresholds for the best choice mode
-  if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
-      (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
-    int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
-
-    cpi->rd_thresh_mult[best_mode_index] =
-        (cpi->rd_thresh_mult[best_mode_index] >=
-         (MIN_THRESHMULT + best_adjustment)) ?
-        cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
-    cpi->rd_threshes[best_mode_index] =
-        (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
-        cpi->rd_thresh_mult[best_mode_index];
-  }
-
-  // This code forces Altref,0,0 and skip for the frame that overlays an
-  // altref frame, unless the altref is filtered. However, this is unsafe
-  // if segment-level coding of ref frame or mode is enabled for this
-  // segment.
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-      !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
-      cpi->is_src_frame_alt_ref &&
-      (cpi->oxcf.arnr_max_frames == 0) &&
-      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
-    mbmi->mode = ZEROMV;
-    if (cm->txfm_mode != TX_MODE_SELECT)
-      mbmi->txfm_size = cm->txfm_mode;
-    else
-      mbmi->txfm_size = TX_16X16;
-    mbmi->ref_frame = ALTREF_FRAME;
-    mbmi->mv[0].as_int = 0;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->mb_skip_coeff =
-      (cpi->common.mb_no_coeff_skip) ? 1 : 0;
-    mbmi->partitioning = 0;
-
-    vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
-    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
-    goto end;
-  }
-
-  // macroblock modes
-  vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
-  if (best_mbmode.mode == B_PRED) {
-    for (i = 0; i < 16; i++) {
-      xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
-      xd->block[i].bmi.as_mode = xd->mode_info_context->bmi[i].as_mode;
-    }
-  }
-
-  if (best_mbmode.mode == I8X8_PRED)
-    set_i8x8_block_modes(x, mode8x8);
-
-  if (best_mbmode.mode == SPLITMV) {
-    for (i = 0; i < 16; i++)
-      xd->mode_info_context->bmi[i].as_mv.first.as_int =
-          best_bmodes[i].as_mv.first.as_int;
-    if (mbmi->second_ref_frame)
-      for (i = 0; i < 16; i++)
-        xd->mode_info_context->bmi[i].as_mv.second.as_int =
-            best_bmodes[i].as_mv.second.as_int;
-
-    vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
-
-    mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
-    mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
-  }
-
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
-    if (best_pred_rd[i] == INT64_MAX)
-      best_pred_diff[i] = INT_MIN;
-    else
-      best_pred_diff[i] = best_rd - best_pred_rd[i];
-  }
-
-  if (!x->skip) {
-    for (i = 0; i < NB_TXFM_MODES; i++) {
-      if (best_txfm_rd[i] == INT64_MAX)
-        best_txfm_diff[i] = INT_MIN;
-      else
-        best_txfm_diff[i] = best_rd - best_txfm_rd[i];
-    }
-  } else {
-    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
-  }
-
-end:
-  store_coding_context(x, &x->mb_context[xd->mb_index], best_mode_index,
-                       &best_partition,
-                       &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
-                       &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
-                       best_pred_diff[0], best_pred_diff[1], best_pred_diff[2],
-                       best_txfm_diff);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                               int *returnrate,
-                               int *returndist) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int rate_y, rate_uv;
-  int rate_y_tokenonly, rate_uv_tokenonly;
-  int error_y, error_uv;
-  int dist_y, dist_uv;
-  int y_skip, uv_skip;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-
-  error_uv = rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
-                                     &dist_uv, &uv_skip);
-  error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
-                                   &dist_y, &y_skip);
-
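-  // When both planes are skippable, back out the token costs so only
-  // the mode bits plus the skip flag are counted.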
-  if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
-    *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
-                  vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
-    *returndist = dist_y + (dist_uv >> 2);
-  } else {
-    *returnrate = rate_y + rate_uv;
-    if (cpi->common.mb_no_coeff_skip)
-      *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-    *returndist = dist_y + (dist_uv >> 2);
-  }
-}
-#endif
-
-void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            int *returnrate, int *returndist) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int64_t error4x4, error16x16;
-#if CONFIG_COMP_INTRA_PRED
-  int64_t error4x4d;
-  int rate4x4d, dist4x4d;
-#endif
-  int rate4x4, rate16x16 = 0, rateuv, rateuv8x8;
-  int dist4x4, dist16x16, distuv, distuv8x8;
-  int rate;
-  int rate4x4_tokenonly = 0;
-  int rate16x16_tokenonly = 0;
-  int rateuv_tokenonly = 0, rateuv8x8_tokenonly = 0;
-  int64_t error8x8;
-  int rate8x8_tokenonly = 0;
-  int rate8x8, dist8x8;
-  int mode16x16;
-  int mode8x8[2][4];
-  int dist;
-  int modeuv, modeuv8x8, uv_intra_skippable, uv_intra_skippable_8x8;
-  int y_intra16x16_skippable;
-  int64_t txfm_cache[NB_TXFM_MODES];
-  TX_SIZE txfm_size_16x16;
-  int i;
-
-  mbmi->ref_frame = INTRA_FRAME;
-  rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv,
-                          &uv_intra_skippable);
-  modeuv = mbmi->uv_mode;
-  if (cpi->common.txfm_mode != ONLY_4X4) {
-    rd_pick_intra_mbuv_mode_8x8(cpi, x, &rateuv8x8, &rateuv8x8_tokenonly,
-                                &distuv8x8, &uv_intra_skippable_8x8);
-    modeuv8x8 = mbmi->uv_mode;
-  } else {
-    uv_intra_skippable_8x8 = uv_intra_skippable;
-    rateuv8x8 = rateuv;
-    distuv8x8 = distuv;
-    rateuv8x8_tokenonly = rateuv_tokenonly;
-    modeuv8x8 = modeuv;
-  }
-
-  // Rate-distortion test of the 16x16 intra modes for the current macroblock
-  error16x16 = rd_pick_intra16x16mby_mode(cpi, x, &rate16x16,
-                                          &rate16x16_tokenonly, &dist16x16,
-                                          &y_intra16x16_skippable, txfm_cache);
-  mode16x16 = mbmi->mode;
-  txfm_size_16x16 = mbmi->txfm_size;
-
-  // FIXME(rbultje) support transform-size selection
-  mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? TX_4X4 : TX_8X8;
-  error8x8 = rd_pick_intra8x8mby_modes(cpi, x, &rate8x8, &rate8x8_tokenonly,
-                                       &dist8x8, error16x16);
-  mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
-  mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
-  mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
-  mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-  mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
-  mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
-  mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
-  mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
-#endif
-
-  error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
-                                       &rate4x4, &rate4x4_tokenonly,
-                                       &dist4x4, error16x16,
-#if CONFIG_COMP_INTRA_PRED
-                                       0,
-#endif
-                                       0);
-#if CONFIG_COMP_INTRA_PRED
-  error4x4d = rd_pick_intra4x4mby_modes(cpi, x,
-                                        &rate4x4d, &rate4x4_tokenonly,
-                                        &dist4x4d, error16x16, 1, 0);
-#endif
-
-  mbmi->mb_skip_coeff = 0;
-  if (cpi->common.mb_no_coeff_skip &&
-      y_intra16x16_skippable && uv_intra_skippable_8x8) {
-    mbmi->mb_skip_coeff = 1;
-    mbmi->mode = mode16x16;
-    mbmi->uv_mode = modeuv;
-    rate = rateuv8x8 + rate16x16 - rateuv8x8_tokenonly - rate16x16_tokenonly +
-           vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
-    dist = dist16x16 + (distuv8x8 >> 2);
-    mbmi->txfm_size = txfm_size_16x16;
-    memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
-           sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
-  } else if (error8x8 > error16x16) {
-    if (error4x4 < error16x16) {
-      rate = rateuv;
-#if CONFIG_COMP_INTRA_PRED
-      rate += (error4x4d < error4x4) ? rate4x4d : rate4x4;
-      if (error4x4d >= error4x4) // FIXME save original modes etc.
-        error4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4,
-                                             &rate4x4_tokenonly,
-                                             &dist4x4, error16x16, 0,
-                                             cpi->update_context);
-#else
-      rate += rate4x4;
-#endif
-      mbmi->mode = B_PRED;
-      mbmi->txfm_size = TX_4X4;
-      dist = dist4x4 + (distuv >> 2);
-      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
-             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
-    } else {
-      mbmi->txfm_size = txfm_size_16x16;
-      mbmi->mode = mode16x16;
-      rate = rate16x16 + rateuv8x8;
-      dist = dist16x16 + (distuv8x8 >> 2);
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        x->mb_context[xd->mb_index].txfm_rd_diff[i] = error16x16 - txfm_cache[i];
-      }
-    }
-    if (cpi->common.mb_no_coeff_skip)
-      rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-  } else {
-    if (error4x4 < error8x8) {
-      rate = rateuv;
-#if CONFIG_COMP_INTRA_PRED
-      rate += (error4x4d < error4x4) ? rate4x4d : rate4x4;
-      if (error4x4d >= error4x4) // FIXME save original modes etc.
-        error4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4,
-                                             &rate4x4_tokenonly,
-                                             &dist4x4, error16x16, 0,
-                                             cpi->update_context);
-#else
-      rate += rate4x4;
-#endif
-      mbmi->mode = B_PRED;
-      mbmi->txfm_size = TX_4X4;
-      dist = dist4x4 + (distuv >> 2);
-      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
-             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
-    } else {
-      // FIXME(rbultje) support transform-size selection
-      mbmi->mode = I8X8_PRED;
-      mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? TX_4X4 : TX_8X8;
-      set_i8x8_block_modes(x, mode8x8);
-      rate = rate8x8 + rateuv;
-      dist = dist8x8 + (distuv >> 2);
-      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
-             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
-    }
-    if (cpi->common.mb_no_coeff_skip)
-      rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-  }
-
-  *returnrate = rate;
-  *returndist = dist;
-}
-
-#if CONFIG_SUPERBLOCKS
-int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                  int recon_yoffset, int recon_uvoffset,
-                                  int *returnrate, int *returndistortion) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  MB_PREDICTION_MODE this_mode;
-  MV_REFERENCE_FRAME ref_frame;
-  unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
-  int comp_pred;
-  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  int_mv frame_best_ref_mv[4];
-  int frame_mdcounts[4][4];
-  unsigned char *y_buffer[4];
-  unsigned char *u_buffer[4];
-  unsigned char *v_buffer[4];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
-  int idx_list[4] = { 0, cpi->common.lst_fb_idx, cpi->common.gld_fb_idx,
-                      cpi->common.alt_fb_idx };
-  int mdcounts[4];
-  int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
-  int saddone = 0;
-  int64_t best_rd = INT64_MAX;
-  int64_t best_comp_rd = INT64_MAX;
-  int64_t best_single_rd = INT64_MAX;
-  int64_t best_hybrid_rd = INT64_MAX;
-  int64_t best_yrd = INT64_MAX;
-  MB_MODE_INFO best_mbmode;
-  int mode_index, best_mode_index;
-  unsigned int ref_costs[MAX_REF_FRAMES];
-
-  x->skip = 0;
-  xd->mode_info_context->mbmi.segment_id = segment_id;
-  estimate_ref_frame_costs(cpi, segment_id, ref_costs);
-  vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
-
-  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
-    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame,
-                         recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
-                         frame_mv[NEARMV], frame_best_ref_mv,
-                         frame_mdcounts, y_buffer, u_buffer, v_buffer);
-    }
-    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
-    frame_mv[ZEROMV][ref_frame].as_int = 0;
-  }
-
-  for (mode_index = 0; mode_index < MAX_MODES; mode_index++) {
-    int mode_excluded;
-    int64_t this_rd = INT64_MAX;
-    int disable_skip = 0;
-    int other_cost = 0;
-    int compmode_cost = 0;
-    int rate2 = 0, rate_y = 0, rate_uv = 0;
-    int distortion2 = 0, distortion_y = 0, distortion_uv = 0;
-    int skippable;
-    int64_t txfm_cache[NB_TXFM_MODES];
-
-    // Test best rd so far against threshold for trying this mode.
-    if (best_rd <= cpi->rd_threshes[mode_index]) {
-      continue;
-    }
-
-    this_mode = vp9_mode_order[mode_index].mode;
-    ref_frame = vp9_mode_order[mode_index].ref_frame;
-    mbmi->ref_frame = ref_frame;
-    comp_pred = vp9_mode_order[mode_index].second_ref_frame != INTRA_FRAME;
-    mbmi->mode = this_mode;
-    mbmi->uv_mode = DC_PRED;
-#if CONFIG_COMP_INTRA_PRED
-    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-    mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
-    if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
-      continue;
-
-    // Intra coding and SPLITMV are not yet supported for superblocks
-    // TODO(rbultje): support intra coding
-    if (ref_frame == INTRA_FRAME || this_mode == SPLITMV)
-      continue;
-
-    if (comp_pred) {
-      int second_ref;
-
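-      // Pair the primary reference with the next one in the reference
-      // list, wrapping ALTREF back around to LAST.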
-      if (ref_frame == ALTREF_FRAME) {
-        second_ref = LAST_FRAME;
-      } else {
-        second_ref = ref_frame + 1;
-      }
-      if (!(cpi->ref_frame_flags & flag_list[second_ref]))
-        continue;
-      mbmi->second_ref_frame = second_ref;
-
-      xd->second_pre.y_buffer = y_buffer[second_ref];
-      xd->second_pre.u_buffer = u_buffer[second_ref];
-      xd->second_pre.v_buffer = v_buffer[second_ref];
-      mode_excluded = cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
-    } else {
-      mbmi->second_ref_frame = INTRA_FRAME;
-      mode_excluded = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
-    }
-
-    xd->pre.y_buffer = y_buffer[ref_frame];
-    xd->pre.u_buffer = u_buffer[ref_frame];
-    xd->pre.v_buffer = v_buffer[ref_frame];
-    vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
-
-    // If the segment reference frame feature is enabled,
-    // then do nothing if the current ref frame is not allowed.
-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-        !vp9_check_segref(xd, segment_id, ref_frame)) {
-      continue;
-    // If the segment mode feature is enabled,
-    // then do nothing if the current mode is not allowed.
-    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
-               (this_mode != vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) {
-      continue;
-    // Disable this drop-out case if either the mode or ref frame
-    // segment level feature is enabled for this segment. This is to
-    // prevent the possibility that we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-               !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
-      // unless ARNR filtering is enabled, in which case we want
-      // an unfiltered alternative.
-      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-        if (this_mode != ZEROMV || ref_frame != ALTREF_FRAME) {
-          continue;
-        }
-      }
-    }
-
-    this_rd = handle_inter_mode(cpi, x, BLOCK_32X32,
-                                &saddone, near_sadidx, mdcounts, txfm_cache,
-                                &rate2, &distortion2, &skippable,
-                                &compmode_cost, &rate_y, &distortion_y,
-                                &rate_uv, &distortion_uv,
-                                &mode_excluded, &disable_skip, recon_yoffset,
-                                mode_index, frame_mv, frame_best_ref_mv);
-    if (this_rd == INT64_MAX)
-      continue;
-
-    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-      rate2 += compmode_cost;
-    }
-
-    // Estimate the reference frame signaling cost and add it
-    // to the rolling cost variable.
-    rate2 += ref_costs[xd->mode_info_context->mbmi.ref_frame];
-
-    if (!disable_skip) {
-      // Test for the condition where the skip block will be activated
-      // because there are no non-zero coefficients and make any
-      // necessary adjustment for rate. Ignore if skip is coded at
-      // the segment level as the cost won't have been added in.
-      if (cpi->common.mb_no_coeff_skip) {
-        int mb_skip_allowed;
-
-        // Is MB-level skip allowed for this MB?
-        mb_skip_allowed =
-          !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-          vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-
-        if (skippable) {
-          // Back out the coefficient coding costs
-          rate2 -= (rate_y + rate_uv);
-          // for best_yrd calculation
-          rate_uv = 0;
-
-          if (mb_skip_allowed) {
-            int prob_skip_cost;
-
-            // Cost the skip mb case
-            vp9_prob skip_prob =
-              vp9_get_pred_prob(cm, xd, PRED_MBSKIP);
-
-            if (skip_prob) {
-              prob_skip_cost = vp9_cost_bit(skip_prob, 1);
-              rate2 += prob_skip_cost;
-              other_cost += prob_skip_cost;
-            }
-          }
-        }
-        // Add in the cost of the no skip flag.
-        else if (mb_skip_allowed) {
-          int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
-                                                          PRED_MBSKIP), 0);
-          rate2 += prob_skip_cost;
-          other_cost += prob_skip_cost;
-        }
-      }
-
-      // Calculate the final RD estimate for this mode.
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-    }
-
-#if 0
-    // Keep record of best intra distortion
-    if ((xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
-        (this_rd < best_intra_rd)) {
-      best_intra_rd = this_rd;
-      *returnintra = distortion2;
-    }
-#endif
-
-    if (!disable_skip && xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-      if (this_rd < best_comp_rd)
-        best_comp_rd = this_rd;
-      if (this_rd < best_single_rd)
-        best_single_rd = this_rd;
-      if (this_rd < best_hybrid_rd)
-        best_hybrid_rd = this_rd;
-    }
-
-    // Did this mode help, i.e. is it the new best mode?
-    if (this_rd < best_rd || x->skip) {
-      if (!mode_excluded) {
-        // Note index of best mode so far
-        best_mode_index = mode_index;
-
-#if 0
-        if (this_mode <= B_PRED) {
-          xd->mode_info_context->mbmi.uv_mode = uv_intra_mode_8x8;
-          /* required for left and above block mv */
-          xd->mode_info_context->mbmi.mv.as_int = 0;
-        }
-#endif
-
-        other_cost += ref_costs[xd->mode_info_context->mbmi.ref_frame];
-
-        /* Calculate the final y RD estimate for this mode */
-        best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
-                          (distortion2 - distortion_uv));
-
-        *returnrate = rate2;
-        *returndistortion = distortion2;
-        best_rd = this_rd;
-        vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
-      }
-#if 0
-      // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time
-      cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
-      cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
-#endif
-    }
-    // If the mode did not help improve the best error case then raise
-    // the threshold for testing that mode next time around.
-    else {
-#if 0
-      cpi->rd_thresh_mult[mode_index] += 4;
-
-      if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-        cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
-
-      cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
-#endif
-    }
-
-    /* keep record of best compound/single-only prediction */
-    if (!disable_skip && mbmi->ref_frame != INTRA_FRAME) {
-      int single_rd, hybrid_rd, single_rate, hybrid_rate;
-
-      if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-        single_rate = rate2 - compmode_cost;
-        hybrid_rate = rate2;
-      } else {
-        single_rate = rate2;
-        hybrid_rate = rate2 + compmode_cost;
-      }
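-      // single_rate/hybrid_rate estimate what rate2 would be under
-      // single-only vs. hybrid prediction signalling, so the per-frame
-      // prediction-mode choice can be evaluated afterwards.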
-
-      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
-      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
-
-      if (mbmi->second_ref_frame == INTRA_FRAME && single_rd < best_single_rd) {
-        best_single_rd = single_rd;
-      } else if (mbmi->second_ref_frame != INTRA_FRAME &&
-                 single_rd < best_comp_rd) {
-        best_comp_rd = single_rd;
-      }
-      if (hybrid_rd < best_hybrid_rd) {
-        best_hybrid_rd = hybrid_rd;
-      }
-    }
-
-    if (x->skip && !mode_excluded)
-      break;
-  }
-
-  // TODO(rbultje) integrate with RD thresholding
-#if 0
-  // Reduce the activation RD thresholds for the best choice mode
-  if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
-      (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
-    int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
-
-    cpi->rd_thresh_mult[best_mode_index] =
-      (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ?
-      cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
-    cpi->rd_threshes[best_mode_index] =
-      (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
-  }
-#endif
-
-  // This code forces Altref,0,0 and skip for the frame that overlays
-  // an altref unless Altref is filtered. However, this is unsafe if
-  // segment level coding of ref frame or mode is enabled for this
-  // segment.
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-      !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
-      cpi->is_src_frame_alt_ref &&
-      (cpi->oxcf.arnr_max_frames == 0) &&
-      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
-    mbmi->mode = ZEROMV;
-    mbmi->ref_frame = ALTREF_FRAME;
-    mbmi->second_ref_frame = 0;
-    mbmi->mv[0].as_int = 0;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
-    mbmi->partitioning = 0;
-    mbmi->txfm_size = TX_8X8;
-
-    if (best_rd != INT64_MAX)
-      store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
-                           &frame_best_ref_mv[mbmi->ref_frame],
-                           &frame_best_ref_mv[mbmi->second_ref_frame],
-                           0, 0, 0, NULL);
-    return best_rd;
-  }
-
-  // macroblock modes
-  vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
-  mbmi->txfm_size = TX_8X8;
-
-  if (best_rd != INT64_MAX)
-    store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
-                         &frame_best_ref_mv[mbmi->ref_frame],
-                         &frame_best_ref_mv[mbmi->second_ref_frame],
-                         (best_single_rd == INT64_MAX) ? INT_MIN :
-                                        (best_rd - best_single_rd),
-                         (best_comp_rd   == INT64_MAX) ? INT_MIN :
-                                        (best_rd - best_comp_rd),
-                         (best_hybrid_rd == INT64_MAX) ? INT_MIN :
-                                        (best_rd - best_hybrid_rd),
-                         NULL);
-
-  return best_rd;
-}
-#endif
-
-void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int recon_yoffset,
-                                    int recon_uvoffset,
-                                    int *totalrate, int *totaldist) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int rate, distortion;
-  int64_t intra_error = 0;
-  unsigned char *segment_id = &mbmi->segment_id;
-
-  if (xd->segmentation_enabled)
-    x->encode_breakout = cpi->segment_encode_breakout[*segment_id];
-  else
-    x->encode_breakout = cpi->oxcf.encode_breakout;
-
-  // if (cpi->sf.RD)
-  // For now this codebase is limited to a single rd encode path
-  {
-    int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
-
-    vp9_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
-                           &distortion, &intra_error);
-
-    /* restore cpi->zbin_mode_boost_enabled */
-    cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
-  }
-  // else
-  // The non rd encode path has been deleted from this code base
-  // to simplify development
-  //    vp9_pick_inter_mode
-
-  // Store metrics so they can be added into the totals if this mode is picked
-  x->mb_context[xd->mb_index].distortion  = distortion;
-  x->mb_context[xd->mb_index].intra_error = intra_error;
-
-  *totalrate = rate;
-  *totaldist = distortion;
-}
--- a/vp8/encoder/rdopt.h
+++ /dev/null
@@ -1,41 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_RDOPT_H
-#define __INC_RDOPT_H
-
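-// RDCOST folds rate (bits) and distortion into one metric:
-// cost = ((R * RM + 128) >> 8) + DM * D, a Q8 rate multiplier with
-// rounding. For example, RM = 128, DM = 1, R = 100, D = 50 gives
-// ((128 + 100 * 128) >> 8) + 1 * 50 = 50 + 50 = 100.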
-#define RDCOST(RM, DM, R, D) (((128 + ((int64_t)(R)) * (RM)) >> 8) + ((int64_t)(DM)) * (D))
-#define RDCOST_8x8(RM, DM, R, D) (((128 + ((int64_t)(R)) * (RM)) >> 8) + ((int64_t)(DM)) * (D))
-
-extern void vp9_initialize_rd_consts(VP9_COMP *cpi, int Qvalue);
-
-extern void vp9_rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                                   int recon_yoffset, int recon_uvoffset,
-                                   int *returnrate, int *returndistortion,
-                                   int64_t *returnintra);
-
-extern void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                                   int *r, int *d);
-
-extern void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                      int *r, int *d);
-
-extern void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCKD *xd,
-                        const MODE_INFO *here, int_mv *mvp,
-                        int refframe, int *ref_frame_sign_bias,
-                        int *sr, int near_sadidx[]);
-
-extern void vp9_init_me_luts();
-
-extern void vp9_set_mbmode_and_mvs(MACROBLOCK *x,
-                                   MB_PREDICTION_MODE mb, int_mv *mv);
-
-#endif
--- a/vp8/encoder/sad_c.c
+++ /dev/null
@@ -1,480 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdlib.h>
-#include "vp8/common/sadmxn.h"
-#include "vpx_ports/config.h"
-#include "vpx/vpx_integer.h"
-
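-/* Note: the max_sad argument is unused in these C reference versions;
-   it matches the prototype of optimized variants, which may stop early
-   once the running SAD exceeds max_sad. */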
-unsigned int vp9_sad32x32_c(const unsigned char *src_ptr,
-                            int  src_stride,
-                            const unsigned char *ref_ptr,
-                            int  ref_stride,
-                            int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
-}
-
-unsigned int vp9_sad16x16_c(const unsigned char *src_ptr,
-                            int  src_stride,
-                            const unsigned char *ref_ptr,
-                            int  ref_stride,
-                            int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
-}
-
-unsigned int vp9_sad8x8_c(const unsigned char *src_ptr,
-                          int  src_stride,
-                          const unsigned char *ref_ptr,
-                          int  ref_stride,
-                          int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
-}
-
-
-unsigned int vp9_sad16x8_c(const unsigned char *src_ptr,
-                           int  src_stride,
-                           const unsigned char *ref_ptr,
-                           int  ref_stride,
-                           int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
-}
-
-unsigned int vp9_sad8x16_c(const unsigned char *src_ptr,
-                           int  src_stride,
-                           const unsigned char *ref_ptr,
-                           int  ref_stride,
-                           int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
-}
-
-
-unsigned int vp9_sad4x4_c(const unsigned char *src_ptr,
-                          int  src_stride,
-                          const unsigned char *ref_ptr,
-                          int  ref_stride,
-                          int max_sad) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
-}
-
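-/* The x3/x8 variants return the SAD at 3 or 8 consecutive horizontal
-   offsets of the reference pointer, letting the motion search score
-   neighbouring candidates in a single call. */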
-void vp9_sad32x32x3_c(const unsigned char *src_ptr,
-                      int  src_stride,
-                      const unsigned char *ref_ptr,
-                      int  ref_stride,
-                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad32x32x8_c(const unsigned char *src_ptr,
-                      int  src_stride,
-                      const unsigned char *ref_ptr,
-                      int  ref_stride,
-                      unsigned short *sad_array) {
-  sad_array[0] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr, ref_stride,
-                                                0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 1, ref_stride,
-                                                0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 2, ref_stride,
-                                                0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 3, ref_stride,
-                                                0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 4, ref_stride,
-                                                0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 5, ref_stride,
-                                                0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 6, ref_stride,
-                                                0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 7, ref_stride,
-                                                0x7fffffff);
-}
-
-void vp9_sad16x16x3_c(const unsigned char *src_ptr,
-                      int  src_stride,
-                      const unsigned char *ref_ptr,
-                      int  ref_stride,
-                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad16x16x8_c(const unsigned char *src_ptr,
-                      int  src_stride,
-                      const unsigned char *ref_ptr,
-                      int  ref_stride,
-                      unsigned short *sad_array) {
-  sad_array[0] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr, ref_stride,
-                                                0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 1, ref_stride,
-                                                0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 2, ref_stride,
-                                                0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 3, ref_stride,
-                                                0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 4, ref_stride,
-                                                0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 5, ref_stride,
-                                                0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 6, ref_stride,
-                                                0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 7, ref_stride,
-                                                0x7fffffff);
-}
-
-void vp9_sad16x8x3_c(const unsigned char *src_ptr,
-                     int  src_stride,
-                     const unsigned char *ref_ptr,
-                     int  ref_stride,
-                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad16x8x8_c(const unsigned char *src_ptr,
-                     int  src_stride,
-                     const unsigned char *ref_ptr,
-                     int  ref_stride,
-                     unsigned short *sad_array) {
-  sad_array[0] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr, ref_stride,
-                                               0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 1, ref_stride,
-                                               0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 2, ref_stride,
-                                               0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 3, ref_stride,
-                                               0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 4, ref_stride,
-                                               0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 5, ref_stride,
-                                               0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 6, ref_stride,
-                                               0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 7, ref_stride,
-                                               0x7fffffff);
-}
-
-void vp9_sad8x8x3_c(const unsigned char *src_ptr,
-                    int  src_stride,
-                    const unsigned char *ref_ptr,
-                    int  ref_stride,
-                    unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x8x8_c(const unsigned char *src_ptr,
-                    int  src_stride,
-                    const unsigned char *ref_ptr,
-                    int  ref_stride,
-                    unsigned short *sad_array) {
-  sad_array[0] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr, ref_stride,
-                                              0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 1, ref_stride,
-                                              0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 2, ref_stride,
-                                              0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 3, ref_stride,
-                                              0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 4, ref_stride,
-                                              0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 5, ref_stride,
-                                              0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 6, ref_stride,
-                                              0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 7, ref_stride,
-                                              0x7fffffff);
-}
-
-void vp9_sad8x16x3_c(const unsigned char *src_ptr,
-                     int  src_stride,
-                     const unsigned char *ref_ptr,
-                     int  ref_stride,
-                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x16x8_c(const unsigned char *src_ptr,
-                     int  src_stride,
-                     const unsigned char *ref_ptr,
-                     int  ref_stride,
-                     unsigned short *sad_array) {
-  sad_array[0] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr, ref_stride,
-                                               0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 1, ref_stride,
-                                               0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 2, ref_stride,
-                                               0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 3, ref_stride,
-                                               0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 4, ref_stride,
-                                               0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 5, ref_stride,
-                                               0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 6, ref_stride,
-                                               0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 7, ref_stride,
-                                               0x7fffffff);
-}
-
-void vp9_sad4x4x3_c(const unsigned char *src_ptr,
-                    int  src_stride,
-                    const unsigned char *ref_ptr,
-                    int  ref_stride,
-                    unsigned int *sad_array) {
-  sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr + 2, ref_stride, 0x7fffffff);
-}
-
-void vp9_sad4x4x8_c(const unsigned char *src_ptr,
-                    int  src_stride,
-                    const unsigned char *ref_ptr,
-                    int  ref_stride,
-                    unsigned short *sad_array) {
-  sad_array[0] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr, ref_stride,
-                                              0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 1, ref_stride,
-                                              0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 2, ref_stride,
-                                              0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 3, ref_stride,
-                                              0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 4, ref_stride,
-                                              0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 5, ref_stride,
-                                              0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 6, ref_stride,
-                                              0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 7, ref_stride,
-                                              0x7fffffff);
-}
-
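-/* The x4d variants take four independent reference pointers instead of
-   consecutive offsets, so four arbitrary candidates can be scored in
-   one call. */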
-void vp9_sad32x32x4d_c(const unsigned char *src_ptr,
-                       int  src_stride,
-                       unsigned char *ref_ptr[],
-                       int  ref_stride,
-                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad32x32_c(src_ptr, src_stride,
-                                ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad16x16x4d_c(const unsigned char *src_ptr,
-                       int  src_stride,
-                       unsigned char *ref_ptr[],
-                       int  ref_stride,
-                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad16x16_c(src_ptr, src_stride,
-                                ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad16x8x4d_c(const unsigned char *src_ptr,
-                      int  src_stride,
-                      unsigned char *ref_ptr[],
-                      int  ref_stride,
-                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad16x8_c(src_ptr, src_stride,
-                               ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x8x4d_c(const unsigned char *src_ptr,
-                     int  src_stride,
-                     unsigned char *ref_ptr[],
-                     int  ref_stride,
-                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad8x8_c(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad8x16x4d_c(const unsigned char *src_ptr,
-                      int  src_stride,
-                      unsigned char *ref_ptr[],
-                      int  ref_stride,
-                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad8x16_c(src_ptr, src_stride,
-                               ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-void vp9_sad4x4x4d_c(const unsigned char *src_ptr,
-                     int  src_stride,
-                     unsigned char *ref_ptr[],
-                     int  ref_stride,
-                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad4x4_c(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
-}
-
-/* Copy a 32-wide, n-high block (two macroblocks side by side) to a buffer */
-void vp9_copy32xn_c(unsigned char *src_ptr,
-                    int  src_stride,
-                    unsigned char *dst_ptr,
-                    int  dst_stride,
-                    int height) {
-  int r;
-
-  for (r = 0; r < height; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
-    dst_ptr[0] = src_ptr[0];
-    dst_ptr[1] = src_ptr[1];
-    dst_ptr[2] = src_ptr[2];
-    dst_ptr[3] = src_ptr[3];
-    dst_ptr[4] = src_ptr[4];
-    dst_ptr[5] = src_ptr[5];
-    dst_ptr[6] = src_ptr[6];
-    dst_ptr[7] = src_ptr[7];
-    dst_ptr[8] = src_ptr[8];
-    dst_ptr[9] = src_ptr[9];
-    dst_ptr[10] = src_ptr[10];
-    dst_ptr[11] = src_ptr[11];
-    dst_ptr[12] = src_ptr[12];
-    dst_ptr[13] = src_ptr[13];
-    dst_ptr[14] = src_ptr[14];
-    dst_ptr[15] = src_ptr[15];
-    dst_ptr[16] = src_ptr[16];
-    dst_ptr[17] = src_ptr[17];
-    dst_ptr[18] = src_ptr[18];
-    dst_ptr[19] = src_ptr[19];
-    dst_ptr[20] = src_ptr[20];
-    dst_ptr[21] = src_ptr[21];
-    dst_ptr[22] = src_ptr[22];
-    dst_ptr[23] = src_ptr[23];
-    dst_ptr[24] = src_ptr[24];
-    dst_ptr[25] = src_ptr[25];
-    dst_ptr[26] = src_ptr[26];
-    dst_ptr[27] = src_ptr[27];
-    dst_ptr[28] = src_ptr[28];
-    dst_ptr[29] = src_ptr[29];
-    dst_ptr[30] = src_ptr[30];
-    dst_ptr[31] = src_ptr[31];
-#else
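-    /* With fast unaligned access, copy the 32-byte row as eight
-       32-bit words instead of byte by byte. */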
-    ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0];
-    ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1];
-    ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2];
-    ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3];
-    ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4];
-    ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5];
-    ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6];
-    ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7];
-#endif
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  }
-}
--- a/vp8/encoder/satd_c.c
+++ /dev/null
@@ -1,47 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include "vpx_ports/mem.h"
-#include "./vpx_rtcd.h"
-unsigned int vp9_satd16x16_c(const unsigned char *src_ptr,
-                             int  src_stride,
-                             const unsigned char *ref_ptr,
-                             int  ref_stride,
-                             unsigned int *psatd) {
-  int r, c, i;
-  unsigned int satd = 0;
-  DECLARE_ALIGNED(16, short, diff_in[256]);
-  DECLARE_ALIGNED(16, short, diff_out[16]);
-  short *in;
-
-  for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++) {
-      diff_in[r * 16 + c] = src_ptr[c] - ref_ptr[c];
-    }
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  }
-
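-  // Walsh-Hadamard transform each 4x4 block of the difference and
-  // accumulate the absolute transform coefficients; each pass of four
-  // rows advances 4 * 16 = 64 entries through diff_in.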
-  in = diff_in;
-  for (r = 0; r < 16; r += 4) {
-    for (c = 0; c < 16; c += 4) {
-      vp9_short_walsh4x4_c(in + c, diff_out, 32);
-      for (i = 0; i < 16; i++)
-        satd += abs(diff_out[i]);
-    }
-    in += 64;
-  }
-
-  if (psatd)
-    *psatd = satd;
-
-  return satd;
-}
--- a/vp8/encoder/segmentation.c
+++ /dev/null
@@ -1,327 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "limits.h"
-#include "vpx_mem/vpx_mem.h"
-#include "segmentation.h"
-#include "vp8/common/pred_common.h"
-
-void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x) {
-  int mb_row, mb_col;
-
-  MODE_INFO *this_mb_mode_info = cm->mi;
-
-  x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
-
-  if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) {
-    // Reset GF usage monitors
-    vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
-    cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-  } else {
-    // for each macroblock row in image
-    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-      // for each macroblock col in image
-      for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-
-        // If using golden or altref then set the GF active flag if it
-        // is not already set. If using last-frame 0,0 mode then leave
-        // the flag as it is; otherwise (non-0,0 motion or intra modes)
-        // clear the flag if it is currently set.
-        if ((this_mb_mode_info->mbmi.ref_frame == GOLDEN_FRAME) ||
-            (this_mb_mode_info->mbmi.ref_frame == ALTREF_FRAME)) {
-          if (*(x->gf_active_ptr) == 0) {
-            *(x->gf_active_ptr) = 1;
-            cpi->gf_active_count++;
-          }
-        } else if ((this_mb_mode_info->mbmi.mode != ZEROMV) &&
-                   *(x->gf_active_ptr)) {
-          *(x->gf_active_ptr) = 0;
-          cpi->gf_active_count--;
-        }
-
-        x->gf_active_ptr++;          // Step onto next entry
-        this_mb_mode_info++;         // skip to next mb
-
-      }
-
-      // this is to account for the border
-      this_mb_mode_info++;
-    }
-  }
-}
-
-void vp9_enable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  // Set the appropriate feature bit
-  cpi->mb.e_mbd.segmentation_enabled = 1;
-  cpi->mb.e_mbd.update_mb_segmentation_map = 1;
-  cpi->mb.e_mbd.update_mb_segmentation_data = 1;
-}
-
-void vp9_disable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  // Clear the appropriate feature bit
-  cpi->mb.e_mbd.segmentation_enabled = 0;
-}
-
-void vp9_set_segmentation_map(VP9_PTR ptr,
-                              unsigned char *segmentation_map) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  // Copy in the new segmentation map
-  vpx_memcpy(cpi->segmentation_map, segmentation_map,
-             (cpi->common.mb_rows * cpi->common.mb_cols));
-
-  // Signal that the map should be updated.
-  cpi->mb.e_mbd.update_mb_segmentation_map = 1;
-  cpi->mb.e_mbd.update_mb_segmentation_data = 1;
-}
-
-void vp9_set_segment_data(VP9_PTR ptr,
-                          signed char *feature_data,
-                          unsigned char abs_delta) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  cpi->mb.e_mbd.mb_segment_abs_delta = abs_delta;
-
-  vpx_memcpy(cpi->mb.e_mbd.segment_feature_data, feature_data,
-             sizeof(cpi->mb.e_mbd.segment_feature_data));
-
-  // TBD ?? Set the feature mask
-  // vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0,
-  //            sizeof(cpi->mb.e_mbd.segment_feature_mask));
-}
-
-// Based on set of segment counts calculate a probability tree
-static void calc_segtree_probs(MACROBLOCKD *xd,
-                               int *segcounts,
-                               vp9_prob *segment_tree_probs) {
-  int count1, count2;
-  int tot_count;
-  int i;
-
-  // Blank the structure to start with
-  vpx_memset(segment_tree_probs, 0,
-             MB_FEATURE_TREE_PROBS * sizeof(*segment_tree_probs));
-
-  // Total count for all segments
-  count1 = segcounts[0] + segcounts[1];
-  count2 = segcounts[2] + segcounts[3];
-  tot_count = count1 + count2;
-
-  // Work out probabilities of each segment
-  if (tot_count)
-    segment_tree_probs[0] = (count1 * 255) / tot_count;
-  if (count1 > 0)
-    segment_tree_probs[1] = (segcounts[0] * 255) / count1;
-  if (count2 > 0)
-    segment_tree_probs[2] = (segcounts[2] * 255) / count2;
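-  // Illustrative example: counts {10, 30, 40, 20} give count1 = 40 and
-  // count2 = 60, so probs[0] = 40 * 255 / 100 = 102,
-  // probs[1] = 10 * 255 / 40 = 63 and probs[2] = 40 * 255 / 60 = 170.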
-
-  // Clamp probabilities to minimum allowed value
-  for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
-    if (segment_tree_probs[i] == 0)
-      segment_tree_probs[i] = 1;
-  }
-}
-
-// Based on set of segment counts and probabilities calculate a cost estimate
-static int cost_segmap(MACROBLOCKD *xd,
-                       int *segcounts,
-                       vp9_prob *probs) {
-  int cost;
-  int count1, count2;
-
-  // Cost the top node of the tree
-  count1 = segcounts[0] + segcounts[1];
-  count2 = segcounts[2] + segcounts[3];
-  cost = count1 * vp9_cost_zero(probs[0]) +
-         count2 * vp9_cost_one(probs[0]);
-
-  // Now add the cost of each individual segment branch
-  if (count1 > 0)
-    cost += segcounts[0] * vp9_cost_zero(probs[1]) +
-            segcounts[1] * vp9_cost_one(probs[1]);
-
-  if (count2 > 0)
-    cost += segcounts[2] * vp9_cost_zero(probs[2]) +
-            segcounts[3] * vp9_cost_one(probs[2]);
-
-  return cost;
-}
-
-void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-
-  const int mis = cm->mode_info_stride;
-  int i;
-  int tot_count;
-  int no_pred_cost;
-  int t_pred_cost = INT_MAX;
-  int pred_context;
-
-  int mb_row, mb_col;
-  int segmap_index = 0;
-  unsigned char segment_id;
-
-  int temporal_predictor_count[PREDICTION_PROBS][2];
-  int no_pred_segcounts[MAX_MB_SEGMENTS];
-  int t_unpred_seg_counts[MAX_MB_SEGMENTS];
-
-  vp9_prob no_pred_tree[MB_FEATURE_TREE_PROBS];
-  vp9_prob t_pred_tree[MB_FEATURE_TREE_PROBS];
-  vp9_prob t_nopred_prob[PREDICTION_PROBS];
-
-  // Set default state for the segment tree probabilities and the
-  // temporal coding probabilities
-  vpx_memset(xd->mb_segment_tree_probs, 255,
-             sizeof(xd->mb_segment_tree_probs));
-  vpx_memset(cm->segment_pred_probs, 255,
-             sizeof(cm->segment_pred_probs));
-
-  vpx_memset(no_pred_segcounts, 0, sizeof(no_pred_segcounts));
-  vpx_memset(t_unpred_seg_counts, 0, sizeof(t_unpred_seg_counts));
-  vpx_memset(temporal_predictor_count, 0, sizeof(temporal_predictor_count));
-
-  // First of all generate stats regarding how well the last segment map
-  // predicts this one
-
-  // Initialize macroblock decoder mode info context for the first mb
-  // in the frame
-  xd->mode_info_context = cm->mi;
-
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 2) {
-      for (i = 0; i < 4; i++) {
-        static const int dx[4] = { +1, -1, +1, +1 };
-        static const int dy[4] = {  0, +1,  0, -1 };
-        int x_idx = i & 1, y_idx = i >> 1;
-
-        if (mb_col + x_idx >= cm->mb_cols ||
-            mb_row + y_idx >= cm->mb_rows) {
-          goto end;
-        }
-
-        xd->mb_to_top_edge = -((mb_row * 16) << 3);
-        xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
-        xd->mb_to_left_edge = -((mb_col * 16) << 3);
-        xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-
-        segmap_index = (mb_row + y_idx) * cm->mb_cols + mb_col + x_idx;
-        segment_id = xd->mode_info_context->mbmi.segment_id;
-#if CONFIG_SUPERBLOCKS
-        if (xd->mode_info_context->mbmi.encoded_as_sb) {
-          if (mb_col + 1 < cm->mb_cols)
-            segment_id = segment_id &&
-                         xd->mode_info_context[1].mbmi.segment_id;
-          if (mb_row + 1 < cm->mb_rows) {
-            segment_id = segment_id &&
-                         xd->mode_info_context[mis].mbmi.segment_id;
-            if (mb_col + 1 < cm->mb_cols)
-              segment_id = segment_id &&
-                           xd->mode_info_context[mis + 1].mbmi.segment_id;
-          }
-        }
-#endif
-
-        // Count the number of hits on each segment with no prediction
-        no_pred_segcounts[segment_id]++;
-
-        // Temporal prediction not allowed on key frames
-        if (cm->frame_type != KEY_FRAME) {
-          // Test to see if the segment id matches the predicted value.
-          int seg_predicted =
-            (segment_id == vp9_get_pred_mb_segid(cm, xd, segmap_index));
-
-          // Get the segment id prediction context
-          pred_context =
-            vp9_get_pred_context(cm, xd, PRED_SEG_ID);
-
-          // Store the prediction status for this mb and update counts
-          // as appropriate
-          vp9_set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
-          temporal_predictor_count[pred_context][seg_predicted]++;
-
-          if (!seg_predicted)
-            // Update the "unpredicted" segment count
-            t_unpred_seg_counts[segment_id]++;
-        }
-
-#if CONFIG_SUPERBLOCKS
-        if (xd->mode_info_context->mbmi.encoded_as_sb) {
-          assert(!i);
-          xd->mode_info_context += 2;
-          break;
-        }
-#endif
-      end:
-        xd->mode_info_context += dx[i] + dy[i] * cm->mode_info_stride;
-      }
-    }
-
-    // this is to account for the border in mode_info_context
-    xd->mode_info_context -= mb_col;
-    xd->mode_info_context += cm->mode_info_stride * 2;
-  }
-
-  // Work out probability tree for coding segments without prediction
-  // and the cost.
-  calc_segtree_probs(xd, no_pred_segcounts, no_pred_tree);
-  no_pred_cost = cost_segmap(xd, no_pred_segcounts, no_pred_tree);
-
-  // Key frames cannot use temporal prediction
-  if (cm->frame_type != KEY_FRAME) {
-    // Work out probability tree for coding those segments not
-    // predicted using the temporal method and the cost.
-    calc_segtree_probs(xd, t_unpred_seg_counts, t_pred_tree);
-    t_pred_cost = cost_segmap(xd, t_unpred_seg_counts, t_pred_tree);
-
-    // Add in the cost of the signalling for each prediction context
-    for (i = 0; i < PREDICTION_PROBS; i++) {
-      tot_count = temporal_predictor_count[i][0] +
-                  temporal_predictor_count[i][1];
-
-      // Work out the context probabilities for the segment
-      // prediction flag
-      if (tot_count) {
-        t_nopred_prob[i] = (temporal_predictor_count[i][0] * 255) /
-                           tot_count;
-
-        // Clamp to minimum allowed value
-        if (t_nopred_prob[i] < 1)
-          t_nopred_prob[i] = 1;
-      } else
-        t_nopred_prob[i] = 1;
-
-      // Add in the predictor signaling cost
-      t_pred_cost += (temporal_predictor_count[i][0] *
-                      vp9_cost_zero(t_nopred_prob[i])) +
-                     (temporal_predictor_count[i][1] *
-                      vp9_cost_one(t_nopred_prob[i]));
-    }
-  }
-
-  // Now choose which coding method to use.
-  if (t_pred_cost < no_pred_cost) {
-    cm->temporal_update = 1;
-    vpx_memcpy(xd->mb_segment_tree_probs,
-               t_pred_tree, sizeof(t_pred_tree));
-    vpx_memcpy(&cm->segment_pred_probs,
-               t_nopred_prob, sizeof(t_nopred_prob));
-  } else {
-    cm->temporal_update = 0;
-    vpx_memcpy(xd->mb_segment_tree_probs,
-               no_pred_tree, sizeof(no_pred_tree));
-  }
-}
--- a/vp8/encoder/segmentation.h
+++ /dev/null
@@ -1,46 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "string.h"
-#include "vp8/common/blockd.h"
-#include "onyx_int.h"
-
-#ifndef __INC_SEGMENTATION_H__
-#define __INC_SEGMENTATION_H__ 1
-
-extern void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm,
-                                      MACROBLOCK *x);
-
-extern void vp9_enable_segmentation(VP9_PTR ptr);
-extern void vp9_disable_segmentation(VP9_PTR ptr);
-
-// Valid values for a segment are 0 to 3
-// Segmentation map is arranged as [Rows][Columns]
-extern void vp9_set_segmentation_map(VP9_PTR ptr,
-                                     unsigned char *segmentation_map);
-
-// The values given for each segment can be either deltas (from the default
-// value chosen for the frame) or absolute values.
-//
-// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for
-// SEGMENT_ALT_LF)
-// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for
-// SEGMENT_ALT_LF)
-//
-// Set abs_delta to SEGMENT_DELTADATA to use deltas, or to SEGMENT_ABSDATA
-// to use the absolute values given.
-//
-extern void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data,
-                                 unsigned char abs_delta);
-
-extern void vp9_choose_segmap_coding_method(VP9_COMP *cpi);
-
-#endif /* __INC_SEGMENTATION_H__ */
--- a/vp8/encoder/ssim.c
+++ /dev/null
@@ -1,147 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "onyx_int.h"
-
-void vp9_ssim_parms_16x16_c(unsigned char *s, int sp, unsigned char *r,
-                            int rp, unsigned long *sum_s, unsigned long *sum_r,
-                            unsigned long *sum_sq_s, unsigned long *sum_sq_r,
-                            unsigned long *sum_sxr) {
-  int i, j;
-  for (i = 0; i < 16; i++, s += sp, r += rp) {
-    for (j = 0; j < 16; j++) {
-      *sum_s += s[j];
-      *sum_r += r[j];
-      *sum_sq_s += s[j] * s[j];
-      *sum_sq_r += r[j] * r[j];
-      *sum_sxr += s[j] * r[j];
-    }
-  }
-}
-void vp9_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp,
-                          unsigned long *sum_s, unsigned long *sum_r,
-                          unsigned long *sum_sq_s, unsigned long *sum_sq_r,
-                          unsigned long *sum_sxr) {
-  int i, j;
-  for (i = 0; i < 8; i++, s += sp, r += rp) {
-    for (j = 0; j < 8; j++) {
-      *sum_s += s[j];
-      *sum_r += r[j];
-      *sum_sq_s += s[j] * s[j];
-      *sum_sq_r += r[j] * r[j];
-      *sum_sxr += s[j] * r[j];
-    }
-  }
-}
-
-static const int64_t cc1 =  26634; // 64^2*(.01*255)^2
-static const int64_t cc2 = 239708; // 64^2*(.03*255)^2
-
-static double similarity(unsigned long sum_s, unsigned long sum_r,
-                         unsigned long sum_sq_s, unsigned long sum_sq_r,
-                         unsigned long sum_sxr, int count) {
-  int64_t ssim_n, ssim_d;
-  int64_t c1, c2;
-
-  // scale the constants by number of pixels
-  c1 = (cc1 * count * count) >> 12;
-  c2 = (cc2 * count * count) >> 12;
-
-  ssim_n = (2 * sum_s * sum_r + c1) * ((int64_t) 2 * count * sum_sxr -
-                                       (int64_t) 2 * sum_s * sum_r + c2);
-
-  ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
-           ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
-            (int64_t)count * sum_sq_r - (int64_t) sum_r * sum_r + c2);
-
-  return ssim_n * 1.0 / ssim_d;
-}
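The fixed-point expression above is the standard SSIM formula rewritten in raw sums, with the 1/n normalizations folded into the c1/c2 scaling. A floating-point sketch of the assumed-equivalent form over n pixels, for comparison:

#include <stdio.h>

static double ssim_float(double sum_s, double sum_r, double sum_sq_s,
                         double sum_sq_r, double sum_sxr, int n) {
  const double c1 = 6.5025, c2 = 58.5225;  /* (.01*255)^2, (.03*255)^2 */
  double mu_s = sum_s / n, mu_r = sum_r / n;
  double var_s = sum_sq_s / n - mu_s * mu_s;
  double var_r = sum_sq_r / n - mu_r * mu_r;
  double cov   = sum_sxr / n - mu_s * mu_r;
  return ((2 * mu_s * mu_r + c1) * (2 * cov + c2)) /
         ((mu_s * mu_s + mu_r * mu_r + c1) * (var_s + var_r + c2));
}

int main(void) {
  /* identical 4-pixel blocks (90, 100, 110, 100) -> SSIM == 1.0 */
  printf("%f\n", ssim_float(400, 400, 40200, 40200, 40200, 4));
  return 0;
}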
-
-static double ssim_16x16(unsigned char *s, int sp, unsigned char *r, int rp) {
-  unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
-  vp9_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
-                       &sum_sxr);
-  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
-}
-static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) {
-  unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
-  vp9_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
-                     &sum_sxr);
-  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
-}
-
-// We use an 8x8 moving window, with each 8x8 window anchored on the 4x4
-// pixel grid. Such an arrangement allows the windows to overlap block
-// boundaries to penalize blocking artifacts.
-double vp9_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1,
-                 int stride_img2, int width, int height) {
-  int i, j;
-  int samples = 0;
-  double ssim_total = 0;
-
-  // sample points start at each 4x4 location
-  for (i = 0; i < height - 8;
-       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
-    for (j = 0; j < width - 8; j += 4) {
-      double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
-      ssim_total += v;
-      samples++;
-    }
-  }
-  ssim_total /= samples;
-  return ssim_total;
-}
-double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
-                     int lumamask, double *weight) {
-  double a, b, c;
-  double ssimv;
-
-  a = vp9_ssim2(source->y_buffer, dest->y_buffer,
-                source->y_stride, dest->y_stride, source->y_width,
-                source->y_height);
-
-  b = vp9_ssim2(source->u_buffer, dest->u_buffer,
-                source->uv_stride, dest->uv_stride, source->uv_width,
-                source->uv_height);
-
-  c = vp9_ssim2(source->v_buffer, dest->v_buffer,
-                source->uv_stride, dest->uv_stride, source->uv_width,
-                source->uv_height);
-
-  ssimv = a * .8 + .1 * (b + c);
-
-  *weight = 1;
-
-  return ssimv;
-}
-
-double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
-                      double *ssim_y, double *ssim_u, double *ssim_v) {
-  double ssim_all = 0;
-  double a, b, c;
-
-  a = vp9_ssim2(source->y_buffer, dest->y_buffer,
-                source->y_stride, dest->y_stride, source->y_width,
-                source->y_height);
-
-  b = vp9_ssim2(source->u_buffer, dest->u_buffer,
-                source->uv_stride, dest->uv_stride, source->uv_width,
-                source->uv_height);
-
-  c = vp9_ssim2(source->v_buffer, dest->v_buffer,
-                source->uv_stride, dest->uv_stride, source->uv_width,
-                source->uv_height);
-  *ssim_y = a;
-  *ssim_u = b;
-  *ssim_v = c;
-  ssim_all = (a * 4 + b + c) / 6;
-
-  return ssim_all;
-}
--- a/vp8/encoder/temporal_filter.c
+++ /dev/null
@@ -1,516 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/common/onyxc_int.h"
-#include "onyx_int.h"
-#include "vp8/common/systemdependent.h"
-#include "quantize.h"
-#include "vp8/common/alloccommon.h"
-#include "mcomp.h"
-#include "firstpass.h"
-#include "psnr.h"
-#include "vpx_scale/vpxscale.h"
-#include "vp8/common/extend.h"
-#include "ratectrl.h"
-#include "vp8/common/quant_common.h"
-#include "segmentation.h"
-#include "vpx_scale/yv12extend.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/common/swapyv12buffer.h"
-#include "vpx_ports/vpx_timer.h"
-
-#include <math.h>
-#include <limits.h>
-
-#define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering
-#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
-
-#if VP9_TEMPORAL_ALT_REF
-
-
-static void temporal_filter_predictors_mb_c
-(
-  MACROBLOCKD *xd,
-  unsigned char *y_mb_ptr,
-  unsigned char *u_mb_ptr,
-  unsigned char *v_mb_ptr,
-  int stride,
-  int mv_row,
-  int mv_col,
-  unsigned char *pred
-) {
-  int offset;
-  unsigned char *yptr, *uptr, *vptr;
-  int omv_row, omv_col;
-
-  // Y
-  yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);
-
-  if ((mv_row | mv_col) & 7) {
-    xd->subpixel_predict16x16(yptr, stride,
-                             (mv_col & 7) << 1, (mv_row & 7) << 1, &pred[0], 16);
-  } else {
-    vp9_copy_mem16x16(yptr, stride, &pred[0], 16);
-  }
-
-  // U & V
-  omv_row = mv_row;
-  omv_col = mv_col;
-  mv_row >>= 1;
-  mv_col >>= 1;
-  stride = (stride + 1) >> 1;
-  offset = (mv_row >> 3) * stride + (mv_col >> 3);
-  uptr = u_mb_ptr + offset;
-  vptr = v_mb_ptr + offset;
-
-  if ((omv_row | omv_col) & 15) {
-    xd->subpixel_predict8x8(uptr, stride,
-                           (omv_col & 15), (omv_row & 15), &pred[256], 8);
-    xd->subpixel_predict8x8(vptr, stride,
-                           (omv_col & 15), (omv_row & 15), &pred[320], 8);
-  }
-  else {
-    vp9_copy_mem8x8(uptr, stride, &pred[256], 8);
-    vp9_copy_mem8x8(vptr, stride, &pred[320], 8);
-  }
-}
-void vp9_temporal_filter_apply_c
-(
-  unsigned char *frame1,
-  unsigned int stride,
-  unsigned char *frame2,
-  unsigned int block_size,
-  int strength,
-  int filter_weight,
-  unsigned int *accumulator,
-  unsigned short *count
-) {
-  unsigned int i, j, k;
-  int modifier;
-  int byte = 0;
-
-  for (i = 0, k = 0; i < block_size; i++) {
-    for (j = 0; j < block_size; j++, k++) {
-
-      int src_byte = frame1[byte];
-      int pixel_value = *frame2++;
-
-      modifier   = src_byte - pixel_value;
-      // This is an integer approximation of:
-      // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
-      // modifier = (int)roundf(coeff > 16 ? 0 : 16 - coeff);
-      modifier  *= modifier;
-      modifier  *= 3;
-      modifier  += 1 << (strength - 1);
-      modifier >>= strength;
-
-      if (modifier > 16)
-        modifier = 16;
-
-      modifier = 16 - modifier;
-      modifier *= filter_weight;
-
-      count[k] += modifier;
-      accumulator[k] += modifier * pixel_value;
-
-      byte++;
-    }
-
-    byte += stride - block_size;
-  }
-}
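The integer weight curve above can be checked in isolation; a small standalone sketch (the strength and diff values are illustrative, and strength must be >= 1 for the rounding shift to be defined):

#include <stdio.h>

/* Per-pixel filter weight (0..16) from the integer approximation above:
 * small differences keep full weight, large ones are zeroed out. */
static int filter_modifier(int diff, int strength) {
  int m = diff * diff * 3;
  m += 1 << (strength - 1);    /* rounding */
  m >>= strength;
  if (m > 16) m = 16;
  return 16 - m;               /* large diff -> small weight */
}

int main(void) {
  int d;
  for (d = 0; d <= 12; d += 2)
    printf("diff=%2d -> weight=%2d\n", d, filter_modifier(d, 6));
  return 0;
}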
-
-#if ALT_REF_MC_ENABLED
-
-static int temporal_filter_find_matching_mb_c
-(
-  VP9_COMP *cpi,
-  YV12_BUFFER_CONFIG *arf_frame,
-  YV12_BUFFER_CONFIG *frame_ptr,
-  int mb_offset,
-  int error_thresh
-) {
-  MACROBLOCK *x = &cpi->mb;
-  int step_param;
-  int further_steps;
-  int sadpb = x->sadperbit16;
-  int bestsme = INT_MAX;
-
-  BLOCK *b = &x->block[0];
-  BLOCKD *d = &x->e_mbd.block[0];
-  int_mv best_ref_mv1;
-  int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
-
-  // Save input state
-  unsigned char **base_src = b->base_src;
-  int src = b->src;
-  int src_stride = b->src_stride;
-  unsigned char **base_pre = d->base_pre;
-  int pre = d->pre;
-  int pre_stride = d->pre_stride;
-
-  best_ref_mv1.as_int = 0;
-  best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >> 3;
-  best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >> 3;
-
-  // Set up frame pointers
-  b->base_src = &arf_frame->y_buffer;
-  b->src_stride = arf_frame->y_stride;
-  b->src = mb_offset;
-
-  d->base_pre = &frame_ptr->y_buffer;
-  d->pre_stride = frame_ptr->y_stride;
-  d->pre = mb_offset;
-
-  // Further step/diamond searches as necessary
-  if (cpi->Speed < 8) {
-    step_param = cpi->sf.first_step +
-                 ((cpi->Speed > 5) ? 1 : 0);
-    further_steps =
-      (cpi->sf.max_step_search_steps - 1) - step_param;
-  } else {
-    step_param = cpi->sf.first_step + 2;
-    further_steps = 0;
-  }
-
-  /*cpi->sf.search_method == HEX*/
-  // TODO Check that the 16x16 vf & sdf are selected here
-  // Ignore mv costing by sending NULL pointer instead of cost arrays
-  bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv.first,
-                           step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16],
-                           NULLMVCOST, NULLMVCOST,
-                           &best_ref_mv1);
-
-#if ALT_REF_SUBPEL_ENABLED
-  // Try sub-pixel MC?
-  // if (bestsme > error_thresh && bestsme < INT_MAX)
-  {
-    int distortion;
-    unsigned int sse;
-    // Ignore mv costing by sending NULL pointer instead of cost array
-    bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv.first,
-                                           &best_ref_mv1,
-                                           x->errorperbit,
-                                           &cpi->fn_ptr[BLOCK_16X16],
-                                           NULLMVCOST,
-                                           &distortion, &sse);
-  }
-#endif
-
-  // Restore input state
-  b->base_src = base_src;
-  b->src = src;
-  b->src_stride = src_stride;
-  d->base_pre = base_pre;
-  d->pre = pre;
-  d->pre_stride = pre_stride;
-
-  return bestsme;
-}
-#endif
-
-static void temporal_filter_iterate_c
-(
-  VP9_COMP *cpi,
-  int frame_count,
-  int alt_ref_index,
-  int strength
-) {
-  int byte;
-  int frame;
-  int mb_col, mb_row;
-  unsigned int filter_weight;
-  int mb_cols = cpi->common.mb_cols;
-  int mb_rows = cpi->common.mb_rows;
-  int mb_y_offset = 0;
-  int mb_uv_offset = 0;
-  DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 + 8 * 8 + 8 * 8);
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16 * 16 + 8 * 8 + 8 * 8);
-  MACROBLOCKD *mbd = &cpi->mb.e_mbd;
-  YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
-  unsigned char *dst1, *dst2;
-  DECLARE_ALIGNED_ARRAY(16, unsigned char,  predictor, 16 * 16 + 8 * 8 + 8 * 8);
-
-  // Save input state
-  unsigned char *y_buffer = mbd->pre.y_buffer;
-  unsigned char *u_buffer = mbd->pre.u_buffer;
-  unsigned char *v_buffer = mbd->pre.v_buffer;
-
-  for (mb_row = 0; mb_row < mb_rows; mb_row++) {
-#if ALT_REF_MC_ENABLED
-    // Source frames are extended to 16 pixels. This is different from
-    //  L/A/G reference frames that have a border of 32 (VP8BORDERINPIXELS).
-    // A 6/8-tap filter is used for motion search, which requires 2 pixels
-    //  before and 3 pixels after, so the largest Y mv on a border would
-    //  be 16 - INTERP_EXTEND. The UV blocks are half the size of the Y
-    //  blocks and therefore only extended by 8, so the largest mv a UV
-    //  block can support is 8 - INTERP_EXTEND. A UV mv is half of a Y mv,
-    //  so the largest Y mv implies a UV mv of (16 - INTERP_EXTEND) >> 1,
-    //  which is greater than 8 - INTERP_EXTEND. To keep the mv usable for
-    //  both Y and UV planes, the max on a border is 16 - (2*INTERP_EXTEND+1).
-    cpi->mb.mv_row_min = -((mb_row * 16) + (17 - 2 * INTERP_EXTEND));
-    cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16)
-                         + (17 - 2 * INTERP_EXTEND);
-#endif
-
-    for (mb_col = 0; mb_col < mb_cols; mb_col++) {
-      int i, j, k;
-      int stride;
-
-      vpx_memset(accumulator, 0, 384 * sizeof(unsigned int));
-      vpx_memset(count, 0, 384 * sizeof(unsigned short));
-
-#if ALT_REF_MC_ENABLED
-      cpi->mb.mv_col_min = -((mb_col * 16) + (17 - 2 * INTERP_EXTEND));
-      cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16)
-                           + (17 - 2 * INTERP_EXTEND);
-#endif
-
-      for (frame = 0; frame < frame_count; frame++) {
-        if (cpi->frames[frame] == NULL)
-          continue;
-
-        mbd->block[0].bmi.as_mv.first.as_mv.row = 0;
-        mbd->block[0].bmi.as_mv.first.as_mv.col = 0;
-
-        if (frame == alt_ref_index) {
-          filter_weight = 2;
-        } else {
-          int err = 0;
-#if ALT_REF_MC_ENABLED
-#define THRESH_LOW   10000
-#define THRESH_HIGH  20000
-
-          // Find best match in this frame by MC
-          err = temporal_filter_find_matching_mb_c
-                (cpi,
-                 cpi->frames[alt_ref_index],
-                 cpi->frames[frame],
-                 mb_y_offset,
-                 THRESH_LOW);
-#endif
-          // Assign a higher weight to the matching MB if its error
-          // score is lower. When MC is not applied, the default is
-          // to weight all MBs equally.
-          filter_weight = err < THRESH_LOW
-                          ? 2 : err < THRESH_HIGH ? 1 : 0;
-        }
-
-        if (filter_weight != 0) {
-          // Construct the predictors
-          temporal_filter_predictors_mb_c
-          (mbd,
-           cpi->frames[frame]->y_buffer + mb_y_offset,
-           cpi->frames[frame]->u_buffer + mb_uv_offset,
-           cpi->frames[frame]->v_buffer + mb_uv_offset,
-           cpi->frames[frame]->y_stride,
-           mbd->block[0].bmi.as_mv.first.as_mv.row,
-           mbd->block[0].bmi.as_mv.first.as_mv.col,
-           predictor);
-
-          // Apply the filter (YUV)
-          TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
-          (f->y_buffer + mb_y_offset,
-           f->y_stride,
-           predictor,
-           16,
-           strength,
-           filter_weight,
-           accumulator,
-           count);
-
-          TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
-          (f->u_buffer + mb_uv_offset,
-           f->uv_stride,
-           predictor + 256,
-           8,
-           strength,
-           filter_weight,
-           accumulator + 256,
-           count + 256);
-
-          TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
-          (f->v_buffer + mb_uv_offset,
-           f->uv_stride,
-           predictor + 320,
-           8,
-           strength,
-           filter_weight,
-           accumulator + 320,
-           count + 320);
-        }
-      }
-
-      // Normalize filter output to produce AltRef frame
-      dst1 = cpi->alt_ref_buffer.y_buffer;
-      stride = cpi->alt_ref_buffer.y_stride;
-      byte = mb_y_offset;
-      for (i = 0, k = 0; i < 16; i++) {
-        for (j = 0; j < 16; j++, k++) {
-          unsigned int pval = accumulator[k] + (count[k] >> 1);
-          pval *= cpi->fixed_divide[count[k]];
-          pval >>= 19;
-
-          dst1[byte] = (unsigned char)pval;
-
-          // move to next pixel
-          byte++;
-        }
-
-        byte += stride - 16;
-      }
-
-      dst1 = cpi->alt_ref_buffer.u_buffer;
-      dst2 = cpi->alt_ref_buffer.v_buffer;
-      stride = cpi->alt_ref_buffer.uv_stride;
-      byte = mb_uv_offset;
-      for (i = 0, k = 256; i < 8; i++) {
-        for (j = 0; j < 8; j++, k++) {
-          int m = k + 64;
-
-          // U
-          unsigned int pval = accumulator[k] + (count[k] >> 1);
-          pval *= cpi->fixed_divide[count[k]];
-          pval >>= 19;
-          dst1[byte] = (unsigned char)pval;
-
-          // V
-          pval = accumulator[m] + (count[m] >> 1);
-          pval *= cpi->fixed_divide[count[m]];
-          pval >>= 19;
-          dst2[byte] = (unsigned char)pval;
-
-          // move to next pixel
-          byte++;
-        }
-
-        byte += stride - 8;
-      }
-
-      mb_y_offset += 16;
-      mb_uv_offset += 8;
-    }
-
-    mb_y_offset += 16 * (f->y_stride - mb_cols);
-    mb_uv_offset += 8 * (f->uv_stride - mb_cols);
-  }
-
-  // Restore input state
-  mbd->pre.y_buffer = y_buffer;
-  mbd->pre.u_buffer = u_buffer;
-  mbd->pre.v_buffer = v_buffer;
-}
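The normalization loops above avoid a per-pixel division by multiplying with a reciprocal table; a sketch of the idea, assuming fixed_divide[k] holds (1 << 19) / k (only the table's use, not its construction, is visible in this hunk):

#include <stdio.h>

#define Q 19

int main(void) {
  /* Rounded division acc/count via a Q19 reciprocal table, mirroring
   * pval = (acc + count/2) * fixed_divide[count] >> 19 above. */
  unsigned int fixed_divide[65];
  unsigned int k, acc = 1234, count = 7;
  unsigned int pval;

  for (k = 1; k <= 64; k++)
    fixed_divide[k] = (1u << Q) / k;   /* assumed table contents */

  pval = (acc + (count >> 1)) * fixed_divide[count] >> Q;
  printf("%u / %u ~= %u (exact rounded: %u)\n",
         acc, count, pval, (acc + count / 2) / count);
  return 0;
}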
-
-void vp9_temporal_filter_prepare_c
-(
-  VP9_COMP *cpi,
-  int distance
-) {
-  int frame = 0;
-
-  int num_frames_backward = 0;
-  int num_frames_forward = 0;
-  int frames_to_blur_backward = 0;
-  int frames_to_blur_forward = 0;
-  int frames_to_blur = 0;
-  int start_frame = 0;
-
-  int strength = cpi->oxcf.arnr_strength;
-
-  int blur_type = cpi->oxcf.arnr_type;
-
-  int max_frames = cpi->active_arnr_frames;
-
-  num_frames_backward = distance;
-  num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
-                       - (num_frames_backward + 1);
-
-  switch (blur_type) {
-    case 1:
-      /////////////////////////////////////////
-      // Backward Blur
-
-      frames_to_blur_backward = num_frames_backward;
-
-      if (frames_to_blur_backward >= max_frames)
-        frames_to_blur_backward = max_frames - 1;
-
-      frames_to_blur = frames_to_blur_backward + 1;
-      break;
-
-    case 2:
-      /////////////////////////////////////////
-      // Forward Blur
-
-      frames_to_blur_forward = num_frames_forward;
-
-      if (frames_to_blur_forward >= max_frames)
-        frames_to_blur_forward = max_frames - 1;
-
-      frames_to_blur = frames_to_blur_forward + 1;
-      break;
-
-    case 3:
-    default:
-      /////////////////////////////////////////
-      // Center Blur
-      frames_to_blur_forward = num_frames_forward;
-      frames_to_blur_backward = num_frames_backward;
-
-      if (frames_to_blur_forward > frames_to_blur_backward)
-        frames_to_blur_forward = frames_to_blur_backward;
-
-      if (frames_to_blur_backward > frames_to_blur_forward)
-        frames_to_blur_backward = frames_to_blur_forward;
-
-      // When max_frames is even we have 1 more frame backward than forward
-      if (frames_to_blur_forward > (max_frames - 1) / 2)
-        frames_to_blur_forward = ((max_frames - 1) / 2);
-
-      if (frames_to_blur_backward > (max_frames / 2))
-        frames_to_blur_backward = (max_frames / 2);
-
-      frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
-      break;
-  }
-
-  start_frame = distance + frames_to_blur_forward;
-
-#ifdef DEBUGFWG
-  // DEBUG FWG
-  printf("max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d start:%d"
-, max_frames
-, num_frames_backward
-, num_frames_forward
-, frames_to_blur
-, frames_to_blur_backward
-, frames_to_blur_forward
-, cpi->source_encode_index
-, cpi->last_alt_ref_sei
-, start_frame);
-#endif
-
-  // Set up frame pointers; NULL indicates a frame not included in the filter
-  vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *));
-  for (frame = 0; frame < frames_to_blur; frame++) {
-    int which_buffer =  start_frame - frame;
-    struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
-                                                     which_buffer);
-    cpi->frames[frames_to_blur - 1 - frame] = &buf->img;
-  }
-
-  temporal_filter_iterate_c(
-    cpi,
-    frames_to_blur,
-    frames_to_blur_backward,
-    strength);
-}
-#endif
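The center-blur clamping in the default case above reduces to a few min operations on the forward and backward frame counts; a standalone sketch with hypothetical lookahead numbers:

#include <stdio.h>

int main(void) {
  /* Center-blur frame selection, mirroring case 3 above; the distance,
   * lookahead depth, and max_frames values are illustrative only. */
  int distance = 5, lookahead_depth = 12, max_frames = 7;
  int back = distance;
  int fwd  = lookahead_depth - (back + 1);

  if (fwd > back) fwd = back;
  if (back > fwd) back = fwd;
  if (fwd > (max_frames - 1) / 2) fwd = (max_frames - 1) / 2;
  if (back > max_frames / 2) back = max_frames / 2;

  printf("blur %d frames: %d back, %d forward, starting at offset %d\n",
         back + fwd + 1, back, fwd, distance + fwd);
  return 0;
}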
--- a/vp8/encoder/temporal_filter.h
+++ /dev/null
@@ -1,47 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_TEMPORAL_FILTER_H
-#define __INC_TEMPORAL_FILTER_H
-
-#define prototype_apply(sym)\
-  void (sym) \
-  ( \
-    unsigned char *frame1, \
-    unsigned int stride, \
-    unsigned char *frame2, \
-    unsigned int block_size, \
-    int strength, \
-    int filter_weight, \
-    unsigned int *accumulator, \
-    unsigned short *count \
-  )
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/temporal_filter_x86.h"
-#endif
-
-#ifndef vp9_temporal_filter_apply
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-#endif
-extern prototype_apply(vp9_temporal_filter_apply);
-
-typedef struct {
-  prototype_apply(*apply);
-} vp9_temporal_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define TEMPORAL_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define TEMPORAL_INVOKE(ctx,fn) vp9_temporal_filter_##fn
-#endif
-
-#endif // __INC_TEMPORAL_FILTER_H
--- a/vp8/encoder/tokenize.c
+++ /dev/null
@@ -1,868 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <math.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include "onyx_int.h"
-#include "tokenize.h"
-#include "vpx_mem/vpx_mem.h"
-
-#include "vp8/common/pred_common.h"
-#include "vp8/common/seg_common.h"
-#include "vp8/common/entropy.h"
-
-/* Global event counters used for accumulating statistics across several
-   compressions, then generating context.c = initial stats. */
-
-#ifdef ENTROPY_STATS
-INT64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-INT64 hybrid_context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-INT64 context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-INT64 hybrid_context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-INT64 context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-INT64 hybrid_context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-extern unsigned int tree_update_hist[BLOCK_TYPES][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES][2];
-extern unsigned int hybrid_tree_update_hist[BLOCK_TYPES][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES][2];
-extern unsigned int tree_update_hist_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
-extern unsigned int hybrid_tree_update_hist_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
-extern unsigned int tree_update_hist_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
-extern unsigned int hybrid_tree_update_hist_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
-#endif  /* ENTROPY_STATS */
-
-void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run);
-void vp9_fix_contexts(MACROBLOCKD *xd);
-
-static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
-const TOKENVALUE *vp9_dct_value_tokens_ptr;
-static int dct_value_cost[DCT_MAX_VALUE * 2];
-const int *vp9_dct_value_cost_ptr;
-
-static void fill_value_tokens() {
-
-  TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
-  vp9_extra_bit_struct *const e = vp9_extra_bits;
-
-  int i = -DCT_MAX_VALUE;
-  int sign = 1;
-
-  do {
-    if (!i)
-      sign = 0;
-
-    {
-      const int a = sign ? -i : i;
-      int eb = sign;
-
-      if (a > 4) {
-        int j = 4;
-
-        while (++j < 11  &&  e[j].base_val <= a) {}
-
-        t[i].Token = --j;
-        eb |= (a - e[j].base_val) << 1;
-      } else
-        t[i].Token = a;
-
-      t[i].Extra = eb;
-    }
-
-    // initialize the cost for extra bits for all possible coefficient values.
-    {
-      int cost = 0;
-      vp9_extra_bit_struct *p = vp9_extra_bits + t[i].Token;
-
-      if (p->base_val) {
-        const int extra = t[i].Extra;
-        const int Length = p->Len;
-
-        if (Length)
-          cost += treed_cost(p->tree, p->prob, extra >> 1, Length);
-
-        cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */
-        dct_value_cost[i + DCT_MAX_VALUE] = cost;
-      }
-
-    }
-
-  } while (++i < DCT_MAX_VALUE);
-
-  vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
-  vp9_dct_value_cost_ptr   = dct_value_cost + DCT_MAX_VALUE;
-}
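fill_value_tokens() above builds, for every possible coefficient value, a token plus sign-and-remainder extra bits. A sketch of the same decomposition for a single value, using the usual VP8/VP9-style category base values (the base-value list here is an assumption recalled from the entropy tables, not shown in this hunk):

#include <stdio.h>
#include <stdlib.h>

/* Tokens 0..4 encode the magnitude literally; tokens 5..10 are categories
 * covering [base_val[j], base_val[j+1]), with the remainder in extra bits. */
static const int base_val[11] = { 0, 1, 2, 3, 4, 5, 7, 11, 19, 35, 67 };

int main(void) {
  int v = -23;                       /* example coefficient */
  int sign = v < 0, a = abs(v);
  int token, extra;

  if (a > 4) {
    int j = 4;
    while (j + 1 < 11 && base_val[j + 1] <= a)
      j++;                           /* last category whose base <= a */
    token = j;
    extra = sign | ((a - base_val[j]) << 1);   /* sign in bit 0 */
  } else {
    token = a;
    extra = sign;
  }
  printf("value %d -> token %d, extra 0x%x\n", v, token, extra);
  return 0;
}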
-
-static void tokenize_b(VP9_COMP *cpi,
-                       MACROBLOCKD *xd,
-                       const BLOCKD * const b,
-                       TOKENEXTRA **tp,
-                       PLANE_TYPE type,
-                       ENTROPY_CONTEXT *a,
-                       ENTROPY_CONTEXT *l,
-                       TX_SIZE tx_size,
-                       int dry_run) {
-  int pt; /* near block/prev token context index */
-  int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0;
-  const int eob = b->eob;     /* one beyond last nonzero coeff */
-  TOKENEXTRA *t = *tp;        /* store tokens starting here */
-  const short *qcoeff_ptr = b->qcoeff;
-  int seg_eob;
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int *bands, *scan;
-  unsigned int (*counts)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-  vp9_prob (*probs)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                          get_tx_type(xd, b) : DCT_DCT;
-
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  switch (tx_size) {
-    default:
-    case TX_4X4:
-      seg_eob = 16;
-      bands = vp9_coef_bands;
-      scan = vp9_default_zig_zag1d;
-      if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts;
-        probs = cpi->common.fc.hybrid_coef_probs;
-        if (tx_type == ADST_DCT) {
-          scan = vp9_row_scan;
-        } else if (tx_type == DCT_ADST) {
-          scan = vp9_col_scan;
-        }
-      } else {
-        counts = cpi->coef_counts;
-        probs = cpi->common.fc.coef_probs;
-      }
-      break;
-    case TX_8X8:
-      if (type == PLANE_TYPE_Y2) {
-        seg_eob = 4;
-        bands = vp9_coef_bands;
-        scan = vp9_default_zig_zag1d;
-      } else {
-        seg_eob = 64;
-        bands = vp9_coef_bands_8x8;
-        scan = vp9_default_zig_zag1d_8x8;
-      }
-      if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts_8x8;
-        probs = cpi->common.fc.hybrid_coef_probs_8x8;
-      } else {
-        counts = cpi->coef_counts_8x8;
-        probs = cpi->common.fc.coef_probs_8x8;
-      }
-      break;
-    case TX_16X16:
-      seg_eob = 256;
-      bands = vp9_coef_bands_16x16;
-      scan = vp9_default_zig_zag1d_16x16;
-      if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts_16x16;
-        probs = cpi->common.fc.hybrid_coef_probs_16x16;
-      } else {
-        counts = cpi->coef_counts_16x16;
-        probs = cpi->common.fc.coef_probs_16x16;
-      }
-      break;
-  }
-
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
-    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
-
-  do {
-    const int band = bands[c];
-    int token;
-
-    if (c < eob) {
-      const int rc = scan[c];
-      const int v = qcoeff_ptr[rc];
-
-      assert(-DCT_MAX_VALUE <= v  &&  v < DCT_MAX_VALUE);
-
-      t->Extra = vp9_dct_value_tokens_ptr[v].Extra;
-      token    = vp9_dct_value_tokens_ptr[v].Token;
-    } else {
-      token = DCT_EOB_TOKEN;
-    }
-
-    t->Token = token;
-    t->context_tree = probs[type][band][pt];
-    t->skip_eob_node = (pt == 0) && ((band > 0 && type != PLANE_TYPE_Y_NO_DC) ||
-                                     (band > 1 && type == PLANE_TYPE_Y_NO_DC));
-    assert(vp9_coef_encodings[t->Token].Len - t->skip_eob_node > 0);
-    if (!dry_run) {
-      ++counts[type][band][pt][token];
-    }
-    pt = vp9_prev_token_class[token];
-    ++t;
-  } while (c < eob && ++c < seg_eob);
-
-  *tp = t;
-  *a = *l = (c != !type); /* 0 <-> all coeff data is zero */
-}
-
-int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block) {
-  int skip = 1;
-  int i = 0;
-
-  if (has_y2_block) {
-    for (i = 0; i < 16; i++)
-      skip &= (xd->block[i].eob < 2);
-    skip &= (!xd->block[24].eob);
-  } else {
-    for (i = 0; i < 16; i++)
-      skip &= (!xd->block[i].eob);
-  }
-  return skip;
-}
-
-int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i;
-
-  for (i = 16; i < 24; i++)
-    skip &= (!xd->block[i].eob);
-  return skip;
-}
-
-static int mb_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block) {
-  return (vp9_mby_is_skippable_4x4(xd, has_y2_block) &
-          vp9_mbuv_is_skippable_4x4(xd));
-}
-
-int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block) {
-  int skip = 1;
-  int i = 0;
-
-  if (has_y2_block) {
-    for (i = 0; i < 16; i += 4)
-      skip &= (xd->block[i].eob < 2);
-    skip &= (!xd->block[24].eob);
-  } else {
-    for (i = 0; i < 16; i += 4)
-      skip &= (!xd->block[i].eob);
-  }
-  return skip;
-}
-
-int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd) {
-  return (!xd->block[16].eob) & (!xd->block[20].eob);
-}
-
-static int mb_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block) {
-  return (vp9_mby_is_skippable_8x8(xd, has_y2_block) &
-          vp9_mbuv_is_skippable_8x8(xd));
-}
-
-static int mb_is_skippable_8x8_4x4uv(MACROBLOCKD *xd, int has_y2_block) {
-  return (vp9_mby_is_skippable_8x8(xd, has_y2_block) &
-          vp9_mbuv_is_skippable_4x4(xd));
-}
-
-int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd) {
-  int skip = 1;
-  skip &= !xd->block[0].eob;
-  return skip;
-}
-
-static int mb_is_skippable_16x16(MACROBLOCKD *xd) {
-  return (vp9_mby_is_skippable_16x16(xd) & vp9_mbuv_is_skippable_8x8(xd));
-}
-
-void vp9_tokenize_mb(VP9_COMP *cpi,
-                     MACROBLOCKD *xd,
-                     TOKENEXTRA **t,
-                     int dry_run) {
-  PLANE_TYPE plane_type;
-  int has_y2_block;
-  int b;
-  int tx_size = xd->mode_info_context->mbmi.txfm_size;
-  int mb_skip_context = vp9_get_pred_context(&cpi->common, xd, PRED_MBSKIP);
-  TOKENEXTRA *t_backup = *t;
-  ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *) xd->above_context;
-  ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *) xd->left_context;
-
-  // If the MB is going to be skipped because of a segment level flag,
-  // exclude this from the skip count stats used to calculate the
-  // transmitted skip probability.
-  int skip_inc;
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-      (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0)) {
-    skip_inc = 1;
-  } else
-    skip_inc = 0;
-
-  has_y2_block = (tx_size != TX_16X16
-                  && xd->mode_info_context->mbmi.mode != B_PRED
-                  && xd->mode_info_context->mbmi.mode != I8X8_PRED
-                  && xd->mode_info_context->mbmi.mode != SPLITMV);
-
-  switch (tx_size) {
-    case TX_16X16:
-      xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_16x16(xd);
-      break;
-    case TX_8X8:
-      if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-          xd->mode_info_context->mbmi.mode == SPLITMV)
-        xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_8x8_4x4uv(xd, 0);
-      else
-        xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_8x8(xd, has_y2_block);
-      break;
-
-    default:
-      xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_4x4(xd, has_y2_block);
-      break;
-  }
-
-  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
-    if (!dry_run)
-      cpi->skip_true_count[mb_skip_context] += skip_inc;
-    if (!cpi->common.mb_no_coeff_skip) {
-      vp9_stuff_mb(cpi, xd, t, dry_run);
-    } else {
-      vp9_fix_contexts(xd);
-    }
-    if (dry_run)
-      *t = t_backup;
-    return;
-  }
-
-  if (!dry_run)
-    cpi->skip_false_count[mb_skip_context] += skip_inc;
-
-  if (has_y2_block) {
-    if (tx_size == TX_8X8) {
-      tokenize_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
-                 A + vp9_block2above_8x8[24], L + vp9_block2left_8x8[24],
-                 TX_8X8, dry_run);
-    } else {
-      tokenize_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
-                 A + vp9_block2above[24], L + vp9_block2left[24],
-                 TX_4X4, dry_run);
-    }
-
-    plane_type = PLANE_TYPE_Y_NO_DC;
-  } else
-    plane_type = PLANE_TYPE_Y_WITH_DC;
-
-  if (tx_size == TX_16X16) {
-    tokenize_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC,
-               A, L, TX_16X16, dry_run);
-    A[1] = A[2] = A[3] = A[0];
-    L[1] = L[2] = L[3] = L[0];
-
-    for (b = 16; b < 24; b += 4) {
-      tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
-                 A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
-                 TX_8X8, dry_run);
-      A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-      L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
-    }
-    vpx_memset(&A[8], 0, sizeof(A[8]));
-    vpx_memset(&L[8], 0, sizeof(L[8]));
-  } else if (tx_size == TX_8X8) {
-    for (b = 0; b < 16; b += 4) {
-      tokenize_b(cpi, xd, xd->block + b, t, plane_type,
-                 A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
-                 TX_8X8, dry_run);
-      A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-      L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
-    }
-    if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-        xd->mode_info_context->mbmi.mode == SPLITMV) {
-      for (b = 16; b < 24; b++) {
-        tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
-                   A + vp9_block2above[b], L + vp9_block2left[b],
-                   TX_4X4, dry_run);
-      }
-    } else {
-      for (b = 16; b < 24; b += 4) {
-        tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
-                   A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
-                   TX_8X8, dry_run);
-        A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-        L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
-      }
-    }
-  } else {
-    for (b = 0; b < 16; b++) {
-      tokenize_b(cpi, xd, xd->block + b, t, plane_type,
-                 A + vp9_block2above[b], L + vp9_block2left[b],
-                 TX_4X4, dry_run);
-    }
-
-    for (b = 16; b < 24; b++) {
-      tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
-                 A + vp9_block2above[b], L + vp9_block2left[b],
-                 TX_4X4, dry_run);
-    }
-  }
-  if (dry_run)
-    *t = t_backup;
-}
-
-
-#ifdef ENTROPY_STATS
-void init_context_counters(void) {
-  FILE *f = fopen("context.bin", "rb");
-  if (!f) {
-    vpx_memset(context_counters, 0, sizeof(context_counters));
-    vpx_memset(context_counters_8x8, 0, sizeof(context_counters_8x8));
-    vpx_memset(context_counters_16x16, 0, sizeof(context_counters_16x16));
-  } else {
-    fread(context_counters, sizeof(context_counters), 1, f);
-    fread(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
-    fread(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
-    fclose(f);
-  }
-
-  f = fopen("treeupdate.bin", "rb");
-  if (!f) {
-    vpx_memset(tree_update_hist, 0, sizeof(tree_update_hist));
-    vpx_memset(tree_update_hist_8x8, 0, sizeof(tree_update_hist_8x8));
-    vpx_memset(tree_update_hist_16x16, 0, sizeof(tree_update_hist_16x16));
-  } else {
-    fread(tree_update_hist, sizeof(tree_update_hist), 1, f);
-    fread(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
-    fread(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
-    fclose(f);
-  }
-}
-
-void print_context_counters() {
-  int type, band, pt, t;
-  FILE *f = fopen("context.c", "w");
-
-  fprintf(f, "#include \"entropy.h\"\n");
-  fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
-  fprintf(f, "static const unsigned int\n"
-          "vp9_default_coef_counts[BLOCK_TYPES]\n"
-          "                      [COEF_BANDS]\n"
-          "                      [PREV_COEF_CONTEXTS]\n"
-          "                      [MAX_ENTROPY_TOKENS]={\n");
-
-#define Comma(X) ((X) ? "," : "")
-  type = 0;
-  do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    band = 0;
-    do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-      pt = 0;
-      do {
-        fprintf(f, "%s\n      {", Comma(pt));
-
-        t = 0;
-        do {
-          const INT64 x = context_counters [type] [band] [pt] [t];
-          const int y = (int) x;
-          assert(x == (INT64) y);  /* no overflow handling yet */
-          fprintf(f, "%s %d", Comma(t), y);
-        } while (++t < MAX_ENTROPY_TOKENS);
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
-      fprintf(f, "\n    }");
-    } while (++band < COEF_BANDS);
-    fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES);
-  fprintf(f, "\n};\n");
-
-  fprintf(f, "static const unsigned int\nvp9_default_coef_counts_8x8"
-          "[BLOCK_TYPES_8X8] [COEF_BANDS]"
-          "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
-  type = 0;
-  do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    band = 0;
-    do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-      pt = 0;
-      do {
-        fprintf(f, "%s\n      {", Comma(pt));
-        t = 0;
-        do {
-          const INT64 x = context_counters_8x8 [type] [band] [pt] [t];
-          const int y = (int) x;
-
-          assert(x == (INT64) y);  /* no overflow handling yet */
-          fprintf(f, "%s %d", Comma(t), y);
-
-        } while (++t < MAX_ENTROPY_TOKENS);
-
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
-
-      fprintf(f, "\n    }");
-
-    } while (++band < COEF_BANDS);
-
-    fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES_8X8);
-  fprintf(f, "\n};\n");
-
-  fprintf(f, "static const unsigned int\nvp9_default_coef_counts_16x16"
-          "[BLOCK_TYPES_16X16] [COEF_BANDS]"
-          "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
-  type = 0;
-  do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    band = 0;
-    do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-      pt = 0;
-      do {
-        fprintf(f, "%s\n      {", Comma(pt));
-        t = 0;
-        do {
-          const INT64 x = context_counters_16x16 [type] [band] [pt] [t];
-          const int y = (int) x;
-
-          assert(x == (INT64) y);  /* no overflow handling yet */
-          fprintf(f, "%s %d", Comma(t), y);
-
-        } while (++t < MAX_ENTROPY_TOKENS);
-
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
-
-      fprintf(f, "\n    }");
-
-    } while (++band < COEF_BANDS);
-
-    fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES_16X16);
-  fprintf(f, "\n};\n");
-
-  fprintf(f, "static const vp9_prob\n"
-          "vp9_default_coef_probs[BLOCK_TYPES] [COEF_BANDS] \n"
-          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
-  type = 0;
-  do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    band = 0;
-    do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-      pt = 0;
-      do {
-        unsigned int branch_ct [ENTROPY_NODES] [2];
-        unsigned int coef_counts[MAX_ENTROPY_TOKENS];
-        vp9_prob coef_probs[ENTROPY_NODES];
-        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-          coef_counts[t] = context_counters [type] [band] [pt] [t];
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, coef_counts, 256, 1);
-        fprintf(f, "%s\n      {", Comma(pt));
-
-        t = 0;
-        do {
-          fprintf(f, "%s %d", Comma(t), coef_probs[t]);
-
-        } while (++t < ENTROPY_NODES);
-
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
-      fprintf(f, "\n    }");
-    } while (++band < COEF_BANDS);
-    fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES);
-  fprintf(f, "\n};\n");
-
-  fprintf(f, "static const vp9_prob\n"
-          "vp9_default_coef_probs_8x8[BLOCK_TYPES_8X8] [COEF_BANDS]\n"
-          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
-  type = 0;
-  do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    band = 0;
-    do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-      pt = 0;
-      do {
-        unsigned int branch_ct [ENTROPY_NODES] [2];
-        unsigned int coef_counts[MAX_ENTROPY_TOKENS];
-        vp9_prob coef_probs[ENTROPY_NODES];
-        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-          coef_counts[t] = context_counters_8x8[type] [band] [pt] [t];
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, coef_counts, 256, 1);
-        fprintf(f, "%s\n      {", Comma(pt));
-
-        t = 0;
-        do {
-          fprintf(f, "%s %d", Comma(t), coef_probs[t]);
-        } while (++t < ENTROPY_NODES);
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
-      fprintf(f, "\n    }");
-    } while (++band < COEF_BANDS);
-    fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES_8X8);
-  fprintf(f, "\n};\n");
-
-  fprintf(f, "static const vp9_prob\n"
-          "vp9_default_coef_probs_16x16[BLOCK_TYPES_16X16] [COEF_BANDS]\n"
-          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
-  type = 0;
-  do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    band = 0;
-    do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-      pt = 0;
-      do {
-        unsigned int branch_ct [ENTROPY_NODES] [2];
-        unsigned int coef_counts[MAX_ENTROPY_TOKENS];
-        vp9_prob coef_probs[ENTROPY_NODES];
-        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-          coef_counts[t] = context_counters_16x16[type] [band] [pt] [t];
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, coef_counts, 256, 1);
-        fprintf(f, "%s\n      {", Comma(pt));
-
-        t = 0;
-        do {
-          fprintf(f, "%s %d", Comma(t), coef_probs[t]);
-        } while (++t < ENTROPY_NODES);
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
-      fprintf(f, "\n    }");
-    } while (++band < COEF_BANDS);
-    fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES_16X16);
-  fprintf(f, "\n};\n");
-
-  fclose(f);
-
-  f = fopen("context.bin", "wb");
-  fwrite(context_counters, sizeof(context_counters), 1, f);
-  fwrite(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
-  fwrite(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
-  fclose(f);
-}
-#endif
-
-void vp9_tokenize_initialize() {
-  fill_value_tokens();
-}
-
-static __inline void stuff_b(VP9_COMP *cpi,
-                             MACROBLOCKD *xd,
-                             const BLOCKD * const b,
-                             TOKENEXTRA **tp,
-                             PLANE_TYPE type,
-                             ENTROPY_CONTEXT *a,
-                             ENTROPY_CONTEXT *l,
-                             TX_SIZE tx_size,
-                             int dry_run) {
-  const int *bands;
-  unsigned int (*counts)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-  vp9_prob (*probs)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-  int pt, band;
-  TOKENEXTRA *t = *tp;
-  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                          get_tx_type(xd, b) : DCT_DCT;
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-
-  switch (tx_size) {
-    default:
-    case TX_4X4:
-      bands = vp9_coef_bands;
-      if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts;
-        probs = cpi->common.fc.hybrid_coef_probs;
-      } else {
-        counts = cpi->coef_counts;
-        probs = cpi->common.fc.coef_probs;
-      }
-      break;
-    case TX_8X8:
-      bands = vp9_coef_bands_8x8;
-      if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts_8x8;
-        probs = cpi->common.fc.hybrid_coef_probs_8x8;
-      } else {
-        counts = cpi->coef_counts_8x8;
-        probs = cpi->common.fc.coef_probs_8x8;
-      }
-      break;
-    case TX_16X16:
-      bands = vp9_coef_bands_16x16;
-      if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts_16x16;
-        probs = cpi->common.fc.hybrid_coef_probs_16x16;
-      } else {
-        counts = cpi->coef_counts_16x16;
-        probs = cpi->common.fc.coef_probs_16x16;
-      }
-      break;
-  }
-  band = bands[(type == PLANE_TYPE_Y_NO_DC) ? 1 : 0];
-  t->Token = DCT_EOB_TOKEN;
-  t->context_tree = probs[type][band][pt];
-  t->skip_eob_node = 0;
-  ++t;
-  *tp = t;
-  *a = *l = 0;
-  if (!dry_run) {
-    ++counts[type][band][pt][DCT_EOB_TOKEN];
-  }
-}
-
-static void stuff_mb_8x8(VP9_COMP *cpi, MACROBLOCKD *xd,
-                         TOKENEXTRA **t, int dry_run) {
-  ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
-  PLANE_TYPE plane_type;
-  int b;
-  const int has_y2_block = (xd->mode_info_context->mbmi.mode != B_PRED &&
-                            xd->mode_info_context->mbmi.mode != I8X8_PRED &&
-                            xd->mode_info_context->mbmi.mode != SPLITMV);
-
-  if (has_y2_block) {
-    stuff_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
-            A + vp9_block2above_8x8[24], L + vp9_block2left_8x8[24],
-            TX_8X8, dry_run);
-    plane_type = PLANE_TYPE_Y_NO_DC;
-  } else {
-    plane_type = PLANE_TYPE_Y_WITH_DC;
-  }
-
-  for (b = 0; b < 16; b += 4) {
-    stuff_b(cpi, xd, xd->block + b, t, plane_type, A + vp9_block2above_8x8[b],
-            L + vp9_block2left_8x8[b], TX_8X8, dry_run);
-    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
-  }
-
-  for (b = 16; b < 24; b += 4) {
-    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
-            A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
-            TX_8X8, dry_run);
-    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
-  }
-}
-
-static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd,
-                           TOKENEXTRA **t, int dry_run) {
-  ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)xd->left_context;
-  int b;
-
-  stuff_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC, A, L, TX_16X16, dry_run);
-  A[1] = A[2] = A[3] = A[0];
-  L[1] = L[2] = L[3] = L[0];
-  for (b = 16; b < 24; b += 4) {
-    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
-            A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
-            TX_8X8, dry_run);
-    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
-  }
-  vpx_memset(&A[8], 0, sizeof(A[8]));
-  vpx_memset(&L[8], 0, sizeof(L[8]));
-}
-
-static void stuff_mb_4x4(VP9_COMP *cpi, MACROBLOCKD *xd,
-                         TOKENEXTRA **t, int dry_run) {
-  ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
-  int b;
-  PLANE_TYPE plane_type;
-  const int has_y2_block = (xd->mode_info_context->mbmi.mode != B_PRED &&
-                            xd->mode_info_context->mbmi.mode != I8X8_PRED &&
-                            xd->mode_info_context->mbmi.mode != SPLITMV);
-
-  if (has_y2_block) {
-    stuff_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2, A + vp9_block2above[24],
-            L + vp9_block2left[24], TX_4X4, dry_run);
-    plane_type = PLANE_TYPE_Y_NO_DC;
-  } else {
-    plane_type = PLANE_TYPE_Y_WITH_DC;
-  }
-
-  for (b = 0; b < 16; b++)
-    stuff_b(cpi, xd, xd->block + b, t, plane_type, A + vp9_block2above[b],
-            L + vp9_block2left[b], TX_4X4, dry_run);
-
-  for (b = 16; b < 24; b++)
-    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
-            L + vp9_block2left[b], TX_4X4, dry_run);
-}
-
-static void stuff_mb_8x8_4x4uv(VP9_COMP *cpi, MACROBLOCKD *xd,
-                               TOKENEXTRA **t, int dry_run) {
-  ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
-  int b;
-
-  for (b = 0; b < 16; b += 4) {
-    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_Y_WITH_DC,
-            A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
-            TX_8X8, dry_run);
-    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
-  }
-
-  for (b = 16; b < 24; b++)
-    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
-            L + vp9_block2left[b], TX_4X4, dry_run);
-}
-
-void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-  TOKENEXTRA * const t_backup = *t;
-
-  if (tx_size == TX_16X16) {
-    stuff_mb_16x16(cpi, xd, t, dry_run);
-  } else if (tx_size == TX_8X8) {
-    if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-        xd->mode_info_context->mbmi.mode == SPLITMV) {
-      stuff_mb_8x8_4x4uv(cpi, xd, t, dry_run);
-    } else {
-      stuff_mb_8x8(cpi, xd, t, dry_run);
-    }
-  } else {
-    stuff_mb_4x4(cpi, xd, t, dry_run);
-  }
-
-  if (dry_run) {
-    *t = t_backup;
-  }
-}
-
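The dry_run path above is worth making explicit: vp9_stuff_mb() writes EOB tokens through *t and then rewinds the pointer to t_backup, and stuff_b() skips the count update, so a dry run has no lasting side effects. A minimal calling sketch in C (tok_buffer is a hypothetical TOKENEXTRA array, not a name from this patch):

    TOKENEXTRA *t = tok_buffer;    /* hypothetical token buffer */
    vp9_stuff_mb(cpi, xd, &t, 1);  /* dry run: t is rewound on return,
                                      coefficient counts untouched     */
    vp9_stuff_mb(cpi, xd, &t, 0);  /* real run: t advances past the
                                      tokens and counts are updated    */
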
-void vp9_fix_contexts(MACROBLOCKD *xd) {
-  /* Clear entropy contexts for Y2 blocks */
-  if ((xd->mode_info_context->mbmi.mode != B_PRED
-      && xd->mode_info_context->mbmi.mode != I8X8_PRED
-      && xd->mode_info_context->mbmi.mode != SPLITMV)
-      || xd->mode_info_context->mbmi.txfm_size == TX_16X16
-      ) {
-    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
-  } else {
-    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
-    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
-  }
-}
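The sizeof(ENTROPY_CONTEXT_PLANES) - 1 in the else-branch makes sense given the layout this code assumes: the Y2 context is the final byte of the plane struct, so subtracting one clears the Y1/U/V contexts while leaving the Y2 context intact for the next macroblock that actually has a second-order block. A sketch of that assumed layout (field names as in the VP8-era entropy.h; treat as illustrative):

    typedef char ENTROPY_CONTEXT;
    typedef struct {
      ENTROPY_CONTEXT y1[4];    /* four 4x4 luma columns            */
      ENTROPY_CONTEXT u[2];     /* two columns per chroma plane     */
      ENTROPY_CONTEXT v[2];
      ENTROPY_CONTEXT y2[1];    /* second-order block: the last byte */
    } ENTROPY_CONTEXT_PLANES;   /* sizeof == 9; sizeof - 1 spares y2 */
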
--- a/vp8/encoder/tokenize.h
+++ /dev/null
@@ -1,59 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef tokenize_h
-#define tokenize_h
-
-#include "vp8/common/entropy.h"
-#include "block.h"
-
-void vp9_tokenize_initialize();
-
-typedef struct {
-  short Token;
-  short Extra;
-} TOKENVALUE;
-
-typedef struct {
-  const vp9_prob *context_tree;
-  short           Extra;
-  unsigned char   Token;
-  unsigned char   skip_eob_node;
-} TOKENEXTRA;
-
-int rd_cost_mby(MACROBLOCKD *);
-
-extern int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block);
-extern int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd);
-extern int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block);
-extern int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd);
-extern int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);
-
-#ifdef ENTROPY_STATS
-void init_context_counters();
-void print_context_counters();
-
-extern INT64 context_counters[BLOCK_TYPES][COEF_BANDS]
-                             [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-extern INT64 context_counters_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
-                                 [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-extern INT64 context_counters_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
-                                   [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-#endif
-
-extern const int *vp9_dct_value_cost_ptr;
-/* TODO: The Token field should be broken out into a separate char array to
- *  improve cache locality, since it's needed for costing when the rest of the
- *  fields are not.
- */
-extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
-
-#endif  /* tokenize_h */
--- a/vp8/encoder/treewriter.c
+++ /dev/null
@@ -1,39 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "treewriter.h"
-
-static void cost(
-  int *const C,
-  vp9_tree T,
-  const vp9_prob *const P,
-  int i,
-  int c
-) {
-  const vp9_prob p = P [i >> 1];
-
-  do {
-    const vp9_tree_index j = T[i];
-    const int d = c + vp9_cost_bit(p, i & 1);
-
-    if (j <= 0)
-      C[-j] = d;
-    else
-      cost(C, T, P, j, d);
-  } while (++i & 1);
-}
-void vp9_cost_tokens(int *c, const vp9_prob *p, vp9_tree t) {
-  cost(c, t, p, 0, 0);
-}
-
-void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t) {
-  cost(c, t, p, 2, 0);
-}
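cost() leans on the vp9_tree convention (from treecoder.h, not shown in this hunk): non-positive entries are negated token indices, i.e. leaves, and positive entries give the index of a child pair, with P[i >> 1] supplying the probability for each pair. A self-contained toy under that assumption, using vp9_cost_bit as defined in treewriter.h below:

    /* Three symbols: the root bit picks symbol 0 (leaf -0 == 0) or
       descends to the inner node at indices 2..3, whose bit picks
       symbol 1 or symbol 2. */
    static const vp9_tree_index toy[4] = { 0, 2, -1, -2 };
    vp9_prob probs[2] = { 200, 128 };   /* P[0]: root, P[1]: inner node */
    int costs[3];

    vp9_cost_tokens(costs, probs, toy);
    /* costs[0] == vp9_cost_bit(200, 0)
       costs[1] == vp9_cost_bit(200, 1) + vp9_cost_bit(128, 0)
       costs[2] == vp9_cost_bit(200, 1) + vp9_cost_bit(128, 1) */
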
--- a/vp8/encoder/treewriter.h
+++ /dev/null
@@ -1,108 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_TREEWRITER_H
-#define __INC_TREEWRITER_H
-
-/* Trees map alphabets into Huffman-like codes suitable for an arithmetic
-   bit coder.  Timothy S Murphy  11 October 2004 */
-
-#include "vp8/common/treecoder.h"
-
-#include "boolhuff.h"       /* for now */
-
-typedef BOOL_CODER vp9_writer;
-
-#define vp9_write encode_bool
-#define vp9_write_literal vp9_encode_value
-#define vp9_write_bit(W, V) vp9_write(W, V, vp9_prob_half)
-
-/* Approximate length of an encoded bool in 256ths of a bit at given prob */
-
-#define vp9_cost_zero(x) (vp9_prob_cost[x])
-#define vp9_cost_one(x) vp9_cost_zero(vp9_complement(x))
-
-#define vp9_cost_bit(x, b) vp9_cost_zero((b) ? vp9_complement(x) : (x))
-
-/* VP8BC version is scaled by 2^20 rather than 2^8; see bool_coder.h */
-
-
-/* Both of these return bits, not scaled bits. */
-
-static __inline unsigned int cost_branch(const unsigned int ct[2],
-                                         vp9_prob p) {
-  /* Imitate existing calculation */
-  return ((ct[0] * vp9_cost_zero(p))
-          + (ct[1] * vp9_cost_one(p))) >> 8;
-}
-
-static __inline unsigned int cost_branch256(const unsigned int ct[2],
-                                            vp9_prob p) {
-  /* Imitate existing calculation */
-  return ((ct[0] * vp9_cost_zero(p))
-          + (ct[1] * vp9_cost_one(p)));
-}
-
-/* Small functions to write explicit values and tokens, as well as
-   estimate their lengths. */
-
-static __inline void treed_write(vp9_writer *const w,
-                                 vp9_tree t,
-                                 const vp9_prob *const p,
-                                 int v,
-                                 /* number of bits in v, assumed nonzero */
-                                 int n) {
-  vp9_tree_index i = 0;
-
-  do {
-    const int b = (v >> --n) & 1;
-    vp9_write(w, b, p[i >> 1]);
-    i = t[i + b];
-  } while (n);
-}
-
-static __inline void write_token(vp9_writer *const w,
-                                 vp9_tree t,
-                                 const vp9_prob *const p,
-                                 vp9_token *const x) {
-  treed_write(w, t, p, x->value, x->Len);
-}
-
-static __inline int treed_cost(vp9_tree t,
-                               const vp9_prob *const p,
-                               int v,
-                               /* number of bits in v, assumed nonzero */
-                               int n) {
-  int c = 0;
-  vp9_tree_index i = 0;
-
-  do {
-    const int b = (v >> --n) & 1;
-    c += vp9_cost_bit(p[i >> 1], b);
-    i = t[i + b];
-  } while (n);
-
-  return c;
-}
-
-static __inline int cost_token(vp9_tree t,
-                               const vp9_prob *const p,
-                               vp9_token *const x) {
-  return treed_cost(t, p, x->value, x->Len);
-}
-
-/* Fill array of costs for all possible token values. */
-
-void vp9_cost_tokens(int *Costs, const vp9_prob *, vp9_tree);
-
-void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t);
-
-#endif
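Since vp9_cost_zero() is expressed in 256ths of a bit, cost_branch() scales a branch-count pair back down to whole bits while cost_branch256() keeps the fine-grained units. Assuming vp9_prob_cost[128] == 256 (probability one-half costs exactly one bit), a quick sanity check:

    unsigned int ct[2] = { 100, 100 };  /* 200 bools observed at p == 128 */
    /* cost_branch(ct, 128)    == (100 * 256 + 100 * 256) >> 8 == 200 bits
       cost_branch256(ct, 128) == 51200, the same total in 1/256-bit units */
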
--- a/vp8/encoder/variance.h
+++ /dev/null
@@ -1,84 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VARIANCE_H
-#define VARIANCE_H
-
-typedef unsigned int (*vp9_sad_fn_t)(const unsigned char *src_ptr,
-                                    int source_stride,
-                                    const unsigned char *ref_ptr,
-                                    int ref_stride,
-                                    unsigned int max_sad);
-
-typedef void (*vp9_copy32xn_fn_t)(const unsigned char *src_ptr,
-                                  int source_stride,
-                                  const unsigned char *ref_ptr,
-                                  int ref_stride,
-                                  int n);
-
-typedef void (*vp9_sad_multi_fn_t)(const unsigned char *src_ptr,
-                                   int source_stride,
-                                   const unsigned char *ref_ptr,
-                                   int  ref_stride,
-                                   unsigned int *sad_array);
-
-typedef void (*vp9_sad_multi1_fn_t)(const unsigned char *src_ptr,
-                                    int source_stride,
-                                    const unsigned char *ref_ptr,
-                                    int  ref_stride,
-                                    unsigned short *sad_array);
-
-typedef void (*vp9_sad_multi_d_fn_t)(const unsigned char *src_ptr,
-                                     int source_stride,
-                                     const unsigned char * const ref_ptr[],
-                                     int  ref_stride, unsigned int *sad_array);
-
-typedef unsigned int (*vp9_variance_fn_t)(const unsigned char *src_ptr,
-                                          int source_stride,
-                                          const unsigned char *ref_ptr,
-                                          int ref_stride,
-                                          unsigned int *sse);
-
-typedef unsigned int (*vp9_subpixvariance_fn_t)(const unsigned char  *src_ptr,
-                                                int source_stride,
-                                                int xoffset,
-                                                int yoffset,
-                                                const unsigned char *ref_ptr,
-                                                int ref_stride,
-                                                unsigned int *sse);
-
-typedef void (*vp9_ssimpf_fn_t)(unsigned char *s, int sp, unsigned char *r,
-                                int rp, unsigned long *sum_s,
-                                unsigned long *sum_r, unsigned long *sum_sq_s,
-                                unsigned long *sum_sq_r,
-                                unsigned long *sum_sxr);
-
-typedef unsigned int (*vp9_getmbss_fn_t)(const short *);
-
-typedef unsigned int (*vp9_get16x16prederror_fn_t)(const unsigned char *src_ptr,
-                                                   int source_stride,
-                                                   const unsigned char *ref_ptr,
-                                                   int  ref_stride);
-
-typedef struct variance_vtable {
-    vp9_sad_fn_t            sdf;
-    vp9_variance_fn_t       vf;
-    vp9_subpixvariance_fn_t svf;
-    vp9_variance_fn_t       svf_halfpix_h;
-    vp9_variance_fn_t       svf_halfpix_v;
-    vp9_variance_fn_t       svf_halfpix_hv;
-    vp9_sad_multi_fn_t      sdx3f;
-    vp9_sad_multi1_fn_t     sdx8f;
-    vp9_sad_multi_d_fn_t    sdx4df;
-    vp9_copy32xn_fn_t       copymem;
-} vp9_variance_fn_ptr_t;
-
-#endif
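For orientation, a sketch of wiring this vtable to the plain-C 16x16 routines that appear in variance_c.c later in this patch. The SAD and multi-SAD slots are left null here because no such functions are visible in this hunk:

    vp9_variance_fn_ptr_t fn16x16 = { 0 };  /* sdf/sdx3f/... stay NULL */
    fn16x16.vf             = vp9_variance16x16_c;
    fn16x16.svf            = vp9_sub_pixel_variance16x16_c;
    fn16x16.svf_halfpix_h  = vp9_variance_halfpixvar16x16_h_c;
    fn16x16.svf_halfpix_v  = vp9_variance_halfpixvar16x16_v_c;
    fn16x16.svf_halfpix_hv = vp9_variance_halfpixvar16x16_hv_c;
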
--- a/vp8/encoder/variance_c.c
+++ /dev/null
@@ -1,540 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "variance.h"
-#include "vp8/common/filter.h"
-
-
-unsigned int vp9_get_mb_ss_c(const short *src_ptr) {
-  unsigned int i, sum = 0;
-
-  for (i = 0; i < 256; i++) {
-    sum += (src_ptr[i] * src_ptr[i]);
-  }
-
-  return sum;
-}
-
-
-static void variance(const unsigned char *src_ptr,
-                     int  source_stride,
-                     const unsigned char *ref_ptr,
-                     int  recon_stride,
-                     int  w,
-                     int  h,
-                     unsigned int *sse,
-                     int *sum) {
-  int i, j;
-  int diff;
-
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      diff = src_ptr[j] - ref_ptr[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    src_ptr += source_stride;
-    ref_ptr += recon_stride;
-  }
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_variance32x32_c(const unsigned char *src_ptr,
-                                 int  source_stride,
-                                 const unsigned char *ref_ptr,
-                                 int  recon_stride,
-                                 unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 10));
-}
-#endif
-
-unsigned int vp9_variance16x16_c(const unsigned char *src_ptr,
-                                 int  source_stride,
-                                 const unsigned char *ref_ptr,
-                                 int  recon_stride,
-                                 unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 8));
-}
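A note on the pattern these wrappers share: despite its name, avg receives the raw sum of pixel differences from variance(), so each function returns sse - sum^2 / N with the shift encoding the pixel count N (>> 8 for 16x16's 256 pixels, >> 7 for the 128-pixel 8x16 and 16x8 cases, >> 10 for 32x32's 1024, and so on below). For the 32x32 case the sum can reach 255 * 1024 = 261120, whose square exceeds a 32-bit int, so a wider intermediate would be needed there for strict correctness.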
-
-unsigned int vp9_variance8x16_c(const unsigned char *src_ptr,
-                                int  source_stride,
-                                const unsigned char *ref_ptr,
-                                int  recon_stride,
-                                unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 7));
-}
-
-unsigned int vp9_variance16x8_c(const unsigned char *src_ptr,
-                                int  source_stride,
-                                const unsigned char *ref_ptr,
-                                int  recon_stride,
-                                unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 7));
-}
-
-
-unsigned int vp9_variance8x8_c(const unsigned char *src_ptr,
-                               int  source_stride,
-                               const unsigned char *ref_ptr,
-                               int  recon_stride,
-                               unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 6));
-}
-
-unsigned int vp9_variance4x4_c(const unsigned char *src_ptr,
-                               int  source_stride,
-                               const unsigned char *ref_ptr,
-                               int  recon_stride,
-                               unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 4));
-}
-
-
-unsigned int vp9_mse16x16_c(const unsigned char *src_ptr,
-                            int  source_stride,
-                            const unsigned char *ref_ptr,
-                            int  recon_stride,
-                            unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
-  *sse = var;
-  return var;
-}
-
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_first_pass
- *
- *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT16  *vp9_filter          : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement first-pass
- *                  of 2-D separable filter.
- *
- *  SPECIAL NOTES : Produces UINT16 output to retain precision for next pass.
- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_first_pass(const unsigned char *src_ptr,
-                                              unsigned short *output_ptr,
-                                              unsigned int src_pixels_per_line,
-                                              int pixel_step,
-                                              unsigned int output_height,
-                                              unsigned int output_width,
-                                              const short *vp9_filter) {
-  unsigned int i, j;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      // Apply bilinear filter
-      output_ptr[j] = (((int)src_ptr[0]          * vp9_filter[0]) +
-                       ((int)src_ptr[pixel_step] * vp9_filter[1]) +
-                       (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
-      src_ptr++;
-    }
-
-    // Next row...
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
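A worked sample of the tap arithmetic above: the two taps must sum to VP9_FILTER_WEIGHT (128 in this codebase, with VP9_FILTER_SHIFT == 7), and {96, 32} is one plausible pair, used here purely for illustration:

    int s0 = 100, s1 = 108;                        /* adjacent input pixels */
    int out = (s0 * 96 + s1 * 32 + 128 / 2) >> 7;  /* 13120 >> 7 == 102,
                                                      i.e. round(102.5) down */
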
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_second_pass
- *
- *  INPUTS        : UINT16 *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT16  *vp9_filter          : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : UINT8  *output_ptr       : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement second-pass
- *                  of 2-D separable filter.
- *
- *  SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass.
- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_second_pass(const unsigned short *src_ptr,
-                                               unsigned char *output_ptr,
-                                               unsigned int src_pixels_per_line,
-                                               unsigned int pixel_step,
-                                               unsigned int output_height,
-                                               unsigned int output_width,
-                                               const short *vp9_filter) {
-  unsigned int  i, j;
-  int  Temp;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      // Apply filter
-      Temp = ((int)src_ptr[0]          * vp9_filter[0]) +
-             ((int)src_ptr[pixel_step] * vp9_filter[1]) +
-             (VP9_FILTER_WEIGHT / 2);
-      output_ptr[j] = (unsigned char)(Temp >> VP9_FILTER_SHIFT);
-      src_ptr++;
-    }
-
-    // Next row...
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-
-unsigned int vp9_sub_pixel_variance4x4_c(const unsigned char  *src_ptr,
-                                         int  src_pixels_per_line,
-                                         int  xoffset,
-                                         int  yoffset,
-                                         const unsigned char *dst_ptr,
-                                         int dst_pixels_per_line,
-                                         unsigned int *sse) {
-  unsigned char  temp2[20 * 16];
-  const short *HFilter, *VFilter;
-  unsigned short FData3[5 * 4]; // Temp data buffer used in filtering
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  // First filter 1-D horizontally
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
-
-  // Now filter vertically
-  var_filter_block2d_bil_second_pass(FData3, temp2, 4,  4,  4,  4, VFilter);
-
-  return vp9_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
-}
-
-
-unsigned int vp9_sub_pixel_variance8x8_c(const unsigned char  *src_ptr,
-                                         int  src_pixels_per_line,
-                                         int  xoffset,
-                                         int  yoffset,
-                                         const unsigned char *dst_ptr,
-                                         int dst_pixels_per_line,
-                                         unsigned int *sse) {
-  unsigned short FData3[9 * 8]; // Temp data buffer used in filtering
-  unsigned char  temp2[20 * 16];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
-
-  return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance16x16_c(const unsigned char  *src_ptr,
-                                           int  src_pixels_per_line,
-                                           int  xoffset,
-                                           int  yoffset,
-                                           const unsigned char *dst_ptr,
-                                           int dst_pixels_per_line,
-                                           unsigned int *sse) {
-  unsigned short FData3[17 * 16]; // Temp data buffer used in filtering
-  unsigned char  temp2[20 * 16];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
-
-  return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_sub_pixel_variance32x32_c(const unsigned char  *src_ptr,
-                                           int  src_pixels_per_line,
-                                           int  xoffset,
-                                           int  yoffset,
-                                           const unsigned char *dst_ptr,
-                                           int dst_pixels_per_line,
-                                           unsigned int *sse) {
-  unsigned short FData3[33 * 32]; // Temp data buffer used in filtering
-  unsigned char  temp2[36 * 32];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);
-
-  return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
-}
-#endif
-
-unsigned int vp9_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr,
-                                              int  source_stride,
-                                              const unsigned char *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 0,
-                                       ref_ptr, recon_stride, sse);
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_variance_halfpixvar32x32_h_c(const unsigned char *src_ptr,
-                                              int  source_stride,
-                                              const unsigned char *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0,
-                                       ref_ptr, recon_stride, sse);
-}
-#endif
-
-
-unsigned int vp9_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr,
-                                              int  source_stride,
-                                              const unsigned char *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_variance_halfpixvar32x32_v_c(const unsigned char *src_ptr,
-                                              int  source_stride,
-                                              const unsigned char *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-#endif
-
-unsigned int vp9_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr,
-                                               int  source_stride,
-                                               const unsigned char *ref_ptr,
-                                               int  recon_stride,
-                                               unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_variance_halfpixvar32x32_hv_c(const unsigned char *src_ptr,
-                                               int  source_stride,
-                                               const unsigned char *ref_ptr,
-                                               int  recon_stride,
-                                               unsigned int *sse) {
-  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-#endif
-
-unsigned int vp9_sub_pixel_mse16x16_c(const unsigned char  *src_ptr,
-                                      int  src_pixels_per_line,
-                                      int  xoffset,
-                                      int  yoffset,
-                                      const unsigned char *dst_ptr,
-                                      int dst_pixels_per_line,
-                                      unsigned int *sse) {
-  vp9_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line,
-                                xoffset, yoffset, dst_ptr,
-                                dst_pixels_per_line, sse);
-  return *sse;
-}
-
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_sub_pixel_mse32x32_c(const unsigned char  *src_ptr,
-                                      int  src_pixels_per_line,
-                                      int  xoffset,
-                                      int  yoffset,
-                                      const unsigned char *dst_ptr,
-                                      int dst_pixels_per_line,
-                                      unsigned int *sse) {
-  vp9_sub_pixel_variance32x32_c(src_ptr, src_pixels_per_line,
-                                xoffset, yoffset, dst_ptr,
-                                dst_pixels_per_line, sse);
-  return *sse;
-}
-#endif
-
-unsigned int vp9_sub_pixel_variance16x8_c(const unsigned char  *src_ptr,
-                                          int  src_pixels_per_line,
-                                          int  xoffset,
-                                          int  yoffset,
-                                          const unsigned char *dst_ptr,
-                                          int dst_pixels_per_line,
-                                          unsigned int *sse) {
-  unsigned short FData3[16 * 9];  // Temp data buffer used in filtering
-  unsigned char  temp2[20 * 16];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
-
-  return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance8x16_c(const unsigned char  *src_ptr,
-                                          int  src_pixels_per_line,
-                                          int  xoffset,
-                                          int  yoffset,
-                                          const unsigned char *dst_ptr,
-                                          int dst_pixels_per_line,
-                                          unsigned int *sse) {
-  unsigned short FData3[9 * 16];  // Temp data buffer used in filtering
-  unsigned char  temp2[20 * 16];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
-                                    1, 17, 8, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
-
-  return vp9_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
-}
-
-#if CONFIG_NEWBESTREFMV
-unsigned int vp9_variance2x16_c(const unsigned char *src_ptr,
-                                const int  source_stride,
-                                const unsigned char *ref_ptr,
-                                const int  recon_stride,
-                                unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 2, 16, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 5));
-}
-
-unsigned int vp9_variance16x2_c(const unsigned char *src_ptr,
-                                const int  source_stride,
-                                const unsigned char *ref_ptr,
-                                const int  recon_stride,
-                                unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 2, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 5));
-}
-
-unsigned int vp9_sub_pixel_variance16x2_c(const unsigned char  *src_ptr,
-                                          const int  src_pixels_per_line,
-                                          const int  xoffset,
-                                          const int  yoffset,
-                                          const unsigned char *dst_ptr,
-                                          const int dst_pixels_per_line,
-                                          unsigned int *sse) {
-  unsigned short FData3[16 * 3];  // Temp data buffer used in filtering
-  unsigned char  temp2[20 * 16];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3,
-                                    src_pixels_per_line, 1, 3, 16, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 2, 16, VFilter);
-
-  return vp9_variance16x2_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance2x16_c(const unsigned char  *src_ptr,
-                                          const int  src_pixels_per_line,
-                                          const int  xoffset,
-                                          const int  yoffset,
-                                          const unsigned char *dst_ptr,
-                                          const int dst_pixels_per_line,
-                                          unsigned int *sse) {
-  unsigned short FData3[2 * 17];  // Temp data buffer used in filtering
-  unsigned char  temp2[2 * 16];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3,
-                                    src_pixels_per_line, 1, 17, 2, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 2, 2, 16, 2, VFilter);
-
-  return vp9_variance2x16_c(temp2, 2, dst_ptr, dst_pixels_per_line, sse);
-}
-#endif
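Throughout the sub-pixel functions above, FData3 is sized (h + 1) rows by w columns: the horizontal first pass emits one extra row so that the vertical second pass, whose src_ptr[pixel_step] tap reads a full row ahead, stays in bounds on its last output row (17 x 16 for 16x16, 9 x 8 for 8x8, 3 x 16 for 16x2, 17 x 2 for 2x16).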
--- a/vp8/encoder/x86/dct_mmx.asm
+++ /dev/null
@@ -1,241 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch)
-global sym(vp9_short_fdct4x4_mmx)
-sym(vp9_short_fdct4x4_mmx):
-    push        rbp
-    mov         rbp,        rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0)      ; input
-        mov         rdi,        arg(1)      ; output
-
-        movsxd      rax,        dword ptr arg(2) ;pitch
-
-        lea         rcx,        [rsi + rax*2]
-        ; read the input data
-        movq        mm0,        [rsi]
-        movq        mm1,        [rsi + rax]
-
-        movq        mm2,        [rcx]
-        movq        mm4,        [rcx + rax]
-
-        ; transpose for the first stage
-        movq        mm3,        mm0         ; 00 01 02 03
-        movq        mm5,        mm2         ; 20 21 22 23
-
-        punpcklwd   mm0,        mm1         ; 00 10 01 11
-        punpckhwd   mm3,        mm1         ; 02 12 03 13
-
-        punpcklwd   mm2,        mm4         ; 20 30 21 31
-        punpckhwd   mm5,        mm4         ; 22 32 23 33
-
-        movq        mm1,        mm0         ; 00 10 01 11
-        punpckldq   mm0,        mm2         ; 00 10 20 30
-
-        punpckhdq   mm1,        mm2         ; 01 11 21 31
-
-        movq        mm2,        mm3         ; 02 12 03 13
-        punpckldq   mm2,        mm5         ; 02 12 22 32
-
-        punpckhdq   mm3,        mm5         ; 03 13 23 33
-
-        ; mm0 0
-        ; mm1 1
-        ; mm2 2
-        ; mm3 3
-
-        ; first stage
-        movq        mm5,        mm0
-        movq        mm4,        mm1
-
-        paddw       mm0,        mm3         ; a1 = 0 + 3
-        paddw       mm1,        mm2         ; b1 = 1 + 2
-
-        psubw       mm4,        mm2         ; c1 = 1 - 2
-        psubw       mm5,        mm3         ; d1 = 0 - 3
-
-        psllw       mm5,        3
-        psllw       mm4,        3
-
-        psllw       mm0,        3
-        psllw       mm1,        3
-
-        ; output 0 and 2
-        movq        mm2,        mm0         ; a1
-
-        paddw       mm0,        mm1         ; op[0] = a1 + b1
-        psubw       mm2,        mm1         ; op[2] = a1 - b1
-
-        ; output 1 and 3
-        ; interleave c1, d1
-        movq        mm1,        mm5         ; d1
-        punpcklwd   mm1,        mm4         ; c1 d1
-        punpckhwd   mm5,        mm4         ; c1 d1
-
-        movq        mm3,        mm1
-        movq        mm4,        mm5
-
-        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-
-        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-
-        paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]
-        paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]
-        paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]
-        paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]
-
-        psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
-        psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
-        psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
-        psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
-
-        packssdw    mm1,        mm4         ; op[1]
-        packssdw    mm3,        mm5         ; op[3]
-
-        ; done with vertical
-        ; transpose for the second stage
-        movq        mm4,        mm0         ; 00 10 20 30
-        movq        mm5,        mm2         ; 02 12 22 32
-
-        punpcklwd   mm0,        mm1         ; 00 01 10 11
-        punpckhwd   mm4,        mm1         ; 20 21 30 31
-
-        punpcklwd   mm2,        mm3         ; 02 03 12 13
-        punpckhwd   mm5,        mm3         ; 22 23 32 33
-
-        movq        mm1,        mm0         ; 00 01 10 11
-        punpckldq   mm0,        mm2         ; 00 01 02 03
-
-        punpckhdq   mm1,        mm2         ; 10 11 12 13
-
-        movq        mm2,        mm4         ; 20 21 30 31
-        punpckldq   mm2,        mm5         ; 20 21 22 23
-
-        punpckhdq   mm4,        mm5         ; 30 31 32 33
-
-        ; mm0 0
-        ; mm1 1
-        ; mm2 2
-        ; mm4 3
-
-        movq        mm5,        mm0
-        movq        mm3,        mm1
-
-        paddw       mm0,        mm4         ; a1 = 0 + 3
-        paddw       mm1,        mm2         ; b1 = 1 + 2
-
-        psubw       mm3,        mm2         ; c1 = 1 - 2
-        psubw       mm5,        mm4         ; d1 = 0 - 3
-
-        pxor        mm6,        mm6         ; zero out for compare
-
-        pcmpeqw     mm6,        mm5         ; d1 != 0
-
-        pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; clear upper,
-                                                                ; and keep bit 0 of lower
-
-        ; output 0 and 2
-        movq        mm2,        mm0         ; a1
-
-        paddw       mm0,        mm1         ; a1 + b1
-        psubw       mm2,        mm1         ; a1 - b1
-
-        paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]
-        paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]
-
-        psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4
-        psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4
-
-        movq        MMWORD PTR[rdi + 0 ],  mm0
-        movq        MMWORD PTR[rdi + 16],  mm2
-
-        ; output 1 and 3
-        ; interleave c1, d1
-        movq        mm1,        mm5         ; d1
-        punpcklwd   mm1,        mm3         ; c1 d1
-        punpckhwd   mm5,        mm3         ; c1 d1
-
-        movq        mm3,        mm1
-        movq        mm4,        mm5
-
-        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-
-        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-
-        paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]
-        paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]
-        paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]
-        paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]
-
-        psrad       mm1,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
-        psrad       mm4,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
-        psrad       mm3,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
-        psrad       mm5,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
-
-        packssdw    mm1,        mm4         ; op[4]
-        packssdw    mm3,        mm5         ; op[12]
-
-        paddw       mm1,        mm6         ; op[4] += (d1!=0)
-
-        movq        MMWORD PTR[rdi + 8 ],  mm1
-        movq        MMWORD PTR[rdi + 24],  mm3
-
-     ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 8
-_5352_2217:
-    dw 5352
-    dw 2217
-    dw 5352
-    dw 2217
-align 8
-_2217_neg5352:
-    dw 2217
-    dw -5352
-    dw 2217
-    dw -5352
-align 8
-_cmp_mask:
-    times 4 dw 1
-align 8
-_7w:
-    times 4 dw 7
-align 8
-_14500:
-    times 2 dd 14500
-align 8
-_7500:
-    times 2 dd 7500
-align 8
-_12000:
-    times 2 dd 12000
-align 8
-_51000:
-    times 2 dd 51000
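For readers cross-checking the annotated arithmetic, here is a scalar C sketch of the same 4x4 forward DCT: a row pass using the 2217/5352 rotation rounded at 14500/7500 and shifted by 12, then a column pass rounded at 12000/51000 and shifted by 16, with the (d1 != 0) correction on op[4] that the pcmpeqw/pandn/_cmp_mask sequence implements. It follows the classic VP8 C reference; treat it as an illustration of the math rather than part of this patch:

    /* pitch is in bytes, hence the pitch / 2 step over 16-bit samples. */
    static void short_fdct4x4_ref(short *input, short *output, int pitch) {
      int i;
      short *ip = input, *op = output;

      for (i = 0; i < 4; i++) {            /* row (horizontal) pass */
        int a1 = (ip[0] + ip[3]) * 8;      /* the psllw ..., 3 scaling */
        int b1 = (ip[1] + ip[2]) * 8;
        int c1 = (ip[1] - ip[2]) * 8;
        int d1 = (ip[0] - ip[3]) * 8;

        op[0] = a1 + b1;
        op[2] = a1 - b1;
        op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
        op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;

        ip += pitch / 2;
        op += 4;
      }

      ip = op = output;
      for (i = 0; i < 4; i++) {            /* column (vertical) pass */
        int a1 = ip[0] + ip[12];
        int b1 = ip[4] + ip[8];
        int c1 = ip[4] - ip[8];
        int d1 = ip[0] - ip[12];

        op[0] = (a1 + b1 + 7) >> 4;
        op[8] = (a1 - b1 + 7) >> 4;
        op[4] = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
        op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;

        ip++;
        op++;
      }
    }
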
--- a/vp8/encoder/x86/dct_sse2.asm
+++ /dev/null
@@ -1,432 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE 0
-%if ABI_IS_32BIT
-  %define       input       rsi
-  %define       output      rdi
-  %define       pitch       rax
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov         rsi, arg(0)
-    mov         rdi, arg(1)
-
-    movsxd      rax, dword ptr arg(2)
-    lea         rcx, [rsi + rax*2]
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    %define     input       rcx
-    %define     output      rdx
-    %define     pitch       r8
-    SAVE_XMM 7, u
-  %else
-    %define     input       rdi
-    %define     output      rsi
-    %define     pitch       rdx
-  %endif
-%endif
-%endmacro
-
-%macro STACK_FRAME_DESTROY 0
-  %define     input
-  %define     output
-  %define     pitch
-
-%if ABI_IS_32BIT
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    pop         rbp
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    RESTORE_XMM
-  %endif
-%endif
-    ret
-%endmacro
-
-;void vp9_short_fdct4x4_sse2(short *input, short *output, int pitch)
-global sym(vp9_short_fdct4x4_sse2)
-sym(vp9_short_fdct4x4_sse2):
-
-    STACK_FRAME_CREATE
-
-    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
-    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
-    lea         input,          [input+2*pitch]
-    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
-    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30
-
-    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
-    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20
-
-    movdqa      xmm2, xmm0
-    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
-    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
-    movdqa      xmm1, xmm0
-    punpckldq   xmm0, xmm2                      ;31 21 30 20 11 10 01 00
-    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
-    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx
-
-    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
-    movdqa      xmm3, xmm0
-    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
-    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
-    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
-    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
-
-    movdqa      xmm1, xmm0
-    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
-    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
-    movdqa      xmm4, xmm3
-    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
-    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
-
-    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
-    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
-    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
-    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12
-
-    packssdw    xmm0, xmm1                      ;op[2] op[0]
-    packssdw    xmm3, xmm4                      ;op[3] op[1]
-    ; 23 22 21 20 03 02 01 00
-    ;
-    ; 33 32 31 30 13 12 11 10
-    ;
-    movdqa      xmm2, xmm0
-    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
-    punpckhqdq  xmm2, xmm3                      ;23 22 21 20 33 32 31 30
-
-    movdqa      xmm3, xmm0
-    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
-    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
-    movdqa      xmm2, xmm0
-    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
-    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20
-
-    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
-    pshufd      xmm2, xmm2, 04eh
-    movdqa      xmm3, xmm0
-    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
-    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1
-
-    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
-    movdqa      xmm2, xmm3                      ;save d1 for compare
-    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
-    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
-    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
-    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
-    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
-    movdqa      xmm1, xmm0
-    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
-    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
-
-    pxor        xmm4, xmm4                      ;zero out for compare
-    paddd       xmm0, xmm5
-    paddd       xmm1, xmm5
-    pcmpeqw     xmm2, xmm4
-    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
-    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
-    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
-                                                     ;and keep bit 0 of lower
-
-    movdqa      xmm4, xmm3
-    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
-    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
-    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
-    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
-    packssdw    xmm0, xmm1                      ;op[8] op[0]
-    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
-    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16
-
-    packssdw    xmm3, xmm4                      ;op[12] op[4]
-    movdqa      xmm1, xmm0
-    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
-    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
-    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]
-
-    movdqa      XMMWORD PTR[output +  0], xmm0
-    movdqa      XMMWORD PTR[output + 16], xmm1
-
-    STACK_FRAME_DESTROY
-
-;void vp9_short_fdct8x4_sse2(short *input, short *output, int pitch)
-global sym(vp9_short_fdct8x4_sse2)
-sym(vp9_short_fdct8x4_sse2):
-
-    STACK_FRAME_CREATE
-
-        ; read the input data
-        movdqa      xmm0,       [input        ]
-        movdqa      xmm2,       [input+  pitch]
-        lea         input,      [input+2*pitch]
-        movdqa      xmm4,       [input        ]
-        movdqa      xmm3,       [input+  pitch]
-
-        ; transpose for the first stage
-        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
-        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27
-
-        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
-        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17
-
-        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
-        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37
-
-        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
-        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31
-
-        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33
-
-        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
-        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35
-
-        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
-        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33
-
-        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
-        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36
-
-        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
-        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34
-
-        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35
-
-        ; xmm0 0
-        ; xmm1 1
-        ; xmm2 2
-        ; xmm3 3
-
-        ; first stage
-        movdqa      xmm5,       xmm0
-        movdqa      xmm4,       xmm1
-
-        paddw       xmm0,       xmm3        ; a1 = 0 + 3
-        paddw       xmm1,       xmm2        ; b1 = 1 + 2
-
-        psubw       xmm4,       xmm2        ; c1 = 1 - 2
-        psubw       xmm5,       xmm3        ; d1 = 0 - 3
-
-        psllw       xmm5,        3
-        psllw       xmm4,        3
-
-        psllw       xmm0,        3
-        psllw       xmm1,        3
-
-        ; output 0 and 2
-        movdqa      xmm2,       xmm0        ; a1
-
-        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
-        psubw       xmm2,       xmm1        ; op[2] = a1 - b1
-
-        ; output 1 and 3
-        ; interleave c1, d1
-        movdqa      xmm1,       xmm5        ; d1
-        punpcklwd   xmm1,       xmm4        ; c1 d1
-        punpckhwd   xmm5,       xmm4        ; c1 d1
-
-        movdqa      xmm3,       xmm1
-        movdqa      xmm4,       xmm5
-
-        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-
-        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-
-        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
-        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
-        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
-        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]
-
-        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
-        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
-        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
-        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
-
-        packssdw    xmm1,       xmm4        ; op[1]
-        packssdw    xmm3,       xmm5        ; op[3]
-
-        ; done with vertical
-        ; transpose for the second stage
-        movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
-        movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36
-
-        punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
-        punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
-
-        punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
-        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
-
-        movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
-        punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13
-
-        punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33
-
-        movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
-        punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17
-
-        punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
-        movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33
-
-        punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
-        punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27
-
-        movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
-        punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07
-
-        punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17
-
-        ; xmm0 0
-        ; xmm4 1
-        ; xmm1 2
-        ; xmm3 3
-
-        movdqa      xmm5,       xmm0
-        movdqa      xmm2,       xmm1
-
-        paddw       xmm0,       xmm3        ; a1 = 0 + 3
-        paddw       xmm1,       xmm4        ; b1 = 1 + 2
-
-        psubw       xmm4,       xmm2        ; c1 = 1 - 2
-        psubw       xmm5,       xmm3        ; d1 = 0 - 3
-
-        pxor        xmm6,       xmm6        ; zero out for compare
-
-        pcmpeqw     xmm6,       xmm5        ; d1 != 0
-
-        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; clear upper,
-                                                                    ; and keep bit 0 of lower
-
-        ; output 0 and 2
-        movdqa      xmm2,       xmm0        ; a1
-
-        paddw       xmm0,       xmm1        ; a1 + b1
-        psubw       xmm2,       xmm1        ; a1 - b1
-
-        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
-        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]
-
-        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
-        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4
-
-        ; output 1 and 3
-        ; interleave c1, d1
-        movdqa      xmm1,       xmm5        ; d1
-        punpcklwd   xmm1,       xmm4        ; c1 d1
-        punpckhwd   xmm5,       xmm4        ; c1 d1
-
-        movdqa      xmm3,       xmm1
-        movdqa      xmm4,       xmm5
-
-        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
-        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
-
-        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-
-        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
-        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
-        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
-        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]
-
-        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
-        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
-        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
-        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
-
-        packssdw    xmm1,       xmm4        ; op[4]
-        packssdw    xmm3,       xmm5        ; op[12]
-
-        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)
-
-        movdqa      xmm4,       xmm0
-        movdqa      xmm5,       xmm2
-
-        punpcklqdq  xmm0,       xmm1
-        punpckhqdq  xmm4,       xmm1
-
-        punpcklqdq  xmm2,       xmm3
-        punpckhqdq  xmm5,       xmm3
-
-        movdqa      XMMWORD PTR[output + 0 ],  xmm0
-        movdqa      XMMWORD PTR[output + 16],  xmm2
-        movdqa      XMMWORD PTR[output + 32],  xmm4
-        movdqa      XMMWORD PTR[output + 48],  xmm5
-
-    STACK_FRAME_DESTROY
-
-SECTION_RODATA
-align 16
-_5352_2217:
-    dw 5352
-    dw 2217
-    dw 5352
-    dw 2217
-    dw 5352
-    dw 2217
-    dw 5352
-    dw 2217
-align 16
-_2217_neg5352:
-    dw 2217
-    dw -5352
-    dw 2217
-    dw -5352
-    dw 2217
-    dw -5352
-    dw 2217
-    dw -5352
-align 16
-_mult_add:
-    times 8 dw 1
-align 16
-_cmp_mask:
-    times 4 dw 1
-    times 4 dw 0
-align 16
-_cmp_mask8x4:
-    times 8 dw 1
-align 16
-_mult_sub:
-    dw 1
-    dw -1
-    dw 1
-    dw -1
-    dw 1
-    dw -1
-    dw 1
-    dw -1
-align 16
-_7:
-    times 4 dd 7
-align 16
-_7w:
-    times 8 dw 7
-align 16
-_14500:
-    times 4 dd 14500
-align 16
-_7500:
-    times 4 dd 7500
-align 16
-_12000:
-    times 4 dd 12000
-align 16
-_51000:
-    times 4 dd 51000
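
The constants above pin down the scalar transform these kernels vectorize: 5352/2217 are the fixed-point cosine factors, 14500/7500 the first-pass rounding terms (shifted by 12), and 12000/51000 the second-pass rounding terms (shifted by 16). A scalar sketch reconstructed from those constants and the shift comments follows; the function name and the pitch-in-bytes convention are illustrative, not the literal C source.

    /* Scalar sketch of the two-pass 4x4 fDCT the SSE2 code above
     * vectorizes (the 8x4 variant runs two blocks side by side). */
    static void short_fdct4x4_sketch(short *input, short *output, int pitch)
    {
        int i;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++) {                  /* vertical pass */
            int a1 = (ip[0] + ip[3]) << 3;
            int b1 = (ip[1] + ip[2]) << 3;
            int c1 = (ip[1] - ip[2]) << 3;
            int d1 = (ip[0] - ip[3]) << 3;

            op[0] = (short)(a1 + b1);
            op[2] = (short)(a1 - b1);
            op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
            op[3] = (short)((d1 * 2217 - c1 * 5352 +  7500) >> 12);

            ip += pitch / 2;
            op += 4;
        }

        ip = output;
        op = output;
        for (i = 0; i < 4; i++) {                  /* horizontal pass */
            int a1 = ip[0] + ip[12];
            int b1 = ip[4] + ip[8];
            int c1 = ip[4] - ip[8];
            int d1 = ip[0] - ip[12];

            op[0]  = (short)((a1 + b1 + 7) >> 4);
            op[8]  = (short)((a1 - b1 + 7) >> 4);
            op[4]  = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
            op[12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);

            ip++;
            op++;
        }
    }

The +(d1 != 0) term is what the pcmpeqw/pandn mask and the final paddw against xmm6 implement in the vector code.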
--- a/vp8/encoder/x86/encodeopt.asm
+++ /dev/null
@@ -1,386 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;int vp9_block_error_xmm(short *coeff_ptr,  short *dcoef_ptr)
-global sym(vp9_block_error_xmm)
-sym(vp9_block_error_xmm):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;coeff_ptr
-        mov         rdi,        arg(1) ;dcoef_ptr
-
-        movdqa      xmm0,       [rsi]
-        movdqa      xmm1,       [rdi]
-
-        movdqa      xmm2,       [rsi+16]
-        movdqa      xmm3,       [rdi+16]
-
-        psubw       xmm0,       xmm1
-        psubw       xmm2,       xmm3
-
-        pmaddwd     xmm0,       xmm0
-        pmaddwd     xmm2,       xmm2
-
-        paddd       xmm0,       xmm2
-
-        pxor        xmm5,       xmm5
-        movdqa      xmm1,       xmm0
-
-        punpckldq   xmm0,       xmm5
-        punpckhdq   xmm1,       xmm5
-
-        paddd       xmm0,       xmm1
-        movdqa      xmm1,       xmm0
-
-        psrldq      xmm0,       8
-        paddd       xmm0,       xmm1
-
-        movq        rax,        xmm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
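
In scalar terms this kernel reduces to a sum of squared differences over one block's 16 coefficients; the MMX variant below computes the same thing eight coefficients at a time. A minimal sketch, with a hypothetical name:

    /* Sum of squared coefficient error for one 4x4 block; what the
     * block-error kernels compute via SIMD widening and folding. */
    static int block_error_sketch(const short *coeff, const short *dcoef)
    {
        int i, err = 0;
        for (i = 0; i < 16; i++) {
            int d = coeff[i] - dcoef[i];
            err += d * d;
        }
        return err;
    }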
-
-;int vp9_block_error_mmx(short *coeff_ptr,  short *dcoef_ptr)
-global sym(vp9_block_error_mmx)
-sym(vp9_block_error_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov         rsi,        arg(0) ;coeff_ptr
-        pxor        mm7,        mm7
-
-        mov         rdi,        arg(1) ;dcoef_ptr
-        movq        mm3,        [rsi]
-
-        movq        mm4,        [rdi]
-        movq        mm5,        [rsi+8]
-
-        movq        mm6,        [rdi+8]
-        pxor        mm1,        mm1         ; replaces "movd mm1, dc"; dc is 0 here
-
-        movq        mm2,        mm7
-        psubw       mm5,        mm6
-
-        por         mm1,        mm2
-        pmaddwd     mm5,        mm5
-
-        pcmpeqw     mm1,        mm7
-        psubw       mm3,        mm4
-
-        pand        mm1,        mm3
-        pmaddwd     mm1,        mm1
-
-        paddd       mm1,        mm5
-        movq        mm3,        [rsi+16]
-
-        movq        mm4,        [rdi+16]
-        movq        mm5,        [rsi+24]
-
-        movq        mm6,        [rdi+24]
-        psubw       mm5,        mm6
-
-        pmaddwd     mm5,        mm5
-        psubw       mm3,        mm4
-
-        pmaddwd     mm3,        mm3
-        paddd       mm3,        mm5
-
-        paddd       mm1,        mm3
-        movq        mm0,        mm1
-
-        psrlq       mm1,        32
-        paddd       mm0,        mm1
-
-        movq        rax,        mm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-global sym(vp9_mbblock_error_mmx_impl)
-sym(vp9_mbblock_error_mmx_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov         rsi,        arg(0) ;coeff_ptr
-        pxor        mm7,        mm7
-
-        mov         rdi,        arg(1) ;dcoef_ptr
-        pxor        mm2,        mm2
-
-        movd        mm1,        dword ptr arg(2) ;dc
-        por         mm1,        mm2
-
-        pcmpeqw     mm1,        mm7
-        mov         rcx,        16
-
-.mberror_loop_mmx:
-        movq        mm3,       [rsi]
-        movq        mm4,       [rdi]
-
-        movq        mm5,       [rsi+8]
-        movq        mm6,       [rdi+8]
-
-
-        psubw       mm5,        mm6
-        pmaddwd     mm5,        mm5
-
-        psubw       mm3,        mm4
-        pand        mm3,        mm1
-
-        pmaddwd     mm3,        mm3
-        paddd       mm2,        mm5
-
-        paddd       mm2,        mm3
-        movq        mm3,       [rsi+16]
-
-        movq        mm4,       [rdi+16]
-        movq        mm5,       [rsi+24]
-
-        movq        mm6,       [rdi+24]
-        psubw       mm5,        mm6
-
-        pmaddwd     mm5,        mm5
-        psubw       mm3,        mm4
-
-        pmaddwd     mm3,        mm3
-        paddd       mm2,        mm5
-
-        paddd       mm2,        mm3
-        add         rsi,        32
-
-        add         rdi,        32
-        sub         rcx,        1
-
-        jnz         .mberror_loop_mmx
-
-        movq        mm0,        mm2
-        psrlq       mm2,        32
-
-        paddd       mm0,        mm2
-        movq        rax,        mm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-global sym(vp9_mbblock_error_xmm_impl)
-sym(vp9_mbblock_error_xmm_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 6
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov         rsi,        arg(0) ;coeff_ptr
-        pxor        xmm6,       xmm6
-
-        mov         rdi,        arg(1) ;dcoef_ptr
-        pxor        xmm4,       xmm4
-
-        movd        xmm5,       dword ptr arg(2) ;dc
-        por         xmm5,       xmm4
-
-        pcmpeqw     xmm5,       xmm6
-        mov         rcx,        16
-
-.mberror_loop:
-        movdqa      xmm0,       [rsi]
-        movdqa      xmm1,       [rdi]
-
-        movdqa      xmm2,       [rsi+16]
-        movdqa      xmm3,       [rdi+16]
-
-
-        psubw       xmm2,       xmm3
-        pmaddwd     xmm2,       xmm2
-
-        psubw       xmm0,       xmm1
-        pand        xmm0,       xmm5
-
-        pmaddwd     xmm0,       xmm0
-        add         rsi,        32
-
-        add         rdi,        32
-
-        sub         rcx,        1
-        paddd       xmm4,       xmm2
-
-        paddd       xmm4,       xmm0
-        jnz         .mberror_loop
-
-        movdqa      xmm0,       xmm4
-        punpckldq   xmm0,       xmm6
-
-        punpckhdq   xmm4,       xmm6
-        paddd       xmm0,       xmm4
-
-        movdqa      xmm1,       xmm0
-        psrldq      xmm0,       8
-
-        paddd       xmm0,       xmm1
-        movq        rax,        xmm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
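
Both mbblock kernels extend the per-block error to a whole macroblock: 16 luma blocks, with the pcmpeqw/pand mask on the first word dropping each block's DC term when the dc argument is nonzero. A scalar sketch, assuming callers pass dc as 0 or 1 (names illustrative):

    /* Macroblock coefficient error; dc == 1 skips each block's DC
     * coefficient (the masked first word in the assembly above). */
    static int mbblock_error_sketch(const short *coeff, const short *dcoef, int dc)
    {
        int i, j, err = 0;
        for (i = 0; i < 16; i++) {              /* 16 4x4 luma blocks */
            for (j = dc ? 1 : 0; j < 16; j++) {
                int d = coeff[i * 16 + j] - dcoef[i * 16 + j];
                err += d * d;
            }
        }
        return err;
    }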
-
-
-;int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
-global sym(vp9_mbuverror_mmx_impl)
-sym(vp9_mbuverror_mmx_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;s_ptr
-        mov             rdi,        arg(1) ;d_ptr
-
-        mov             rcx,        16
-        pxor            mm7,        mm7
-
-.mbuverror_loop_mmx:
-
-        movq            mm1,        [rsi]
-        movq            mm2,        [rdi]
-
-        psubw           mm1,        mm2
-        pmaddwd         mm1,        mm1
-
-
-        movq            mm3,        [rsi+8]
-        movq            mm4,        [rdi+8]
-
-        psubw           mm3,        mm4
-        pmaddwd         mm3,        mm3
-
-
-        paddd           mm7,        mm1
-        paddd           mm7,        mm3
-
-
-        add             rsi,        16
-        add             rdi,        16
-
-        dec             rcx
-        jnz             .mbuverror_loop_mmx
-
-        movq            mm0,        mm7
-        psrlq           mm7,        32
-
-        paddd           mm0,        mm7
-        movq            rax,        mm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
-global sym(vp9_mbuverror_xmm_impl)
-sym(vp9_mbuverror_xmm_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;s_ptr
-        mov             rdi,        arg(1) ;d_ptr
-
-        mov             rcx,        16
-        pxor            xmm3,       xmm3
-
-.mbuverror_loop:
-
-        movdqa          xmm1,       [rsi]
-        movdqa          xmm2,       [rdi]
-
-        psubw           xmm1,       xmm2
-        pmaddwd         xmm1,       xmm1
-
-        paddd           xmm3,       xmm1
-
-        add             rsi,        16
-        add             rdi,        16
-
-        dec             rcx
-        jnz             .mbuverror_loop
-
-        pxor        xmm0,           xmm0
-        movdqa      xmm1,           xmm3
-
-        movdqa      xmm2,           xmm1
-        punpckldq   xmm1,           xmm0
-
-        punpckhdq   xmm2,           xmm0
-        paddd       xmm1,           xmm2
-
-        movdqa      xmm2,           xmm1
-
-        psrldq      xmm1,           8
-        paddd       xmm1,           xmm2
-
-        movq            rax,            xmm1
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
--- a/vp8/encoder/x86/fwalsh_sse2.asm
+++ /dev/null
@@ -1,164 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_walsh4x4_sse2(short *input, short *output, int pitch)
-global sym(vp9_short_walsh4x4_sse2)
-sym(vp9_short_walsh4x4_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov     rsi, arg(0)           ; input
-    mov     rdi, arg(1)           ; output
-    movsxd  rdx, dword ptr arg(2) ; pitch
-
-    ; first for loop
-    movq    xmm0, MMWORD PTR [rsi]           ; load input
-    movq    xmm1, MMWORD PTR [rsi + rdx]
-    lea     rsi,  [rsi + rdx*2]
-    movq    xmm2, MMWORD PTR [rsi]
-    movq    xmm3, MMWORD PTR [rsi + rdx]
-
-    punpcklwd xmm0,  xmm1
-    punpcklwd xmm2,  xmm3
-
-    movdqa    xmm1, xmm0
-    punpckldq xmm0, xmm2           ; ip[1] ip[0]
-    punpckhdq xmm1, xmm2           ; ip[3] ip[2]
-
-    movdqa    xmm2, xmm0
-    paddw     xmm0, xmm1
-    psubw     xmm2, xmm1
-
-    psllw     xmm0, 2              ; d1  a1
-    psllw     xmm2, 2              ; c1  b1
-
-    movdqa    xmm1, xmm0
-    punpcklqdq xmm0, xmm2          ; b1  a1
-    punpckhqdq xmm1, xmm2          ; c1  d1
-
-    pxor      xmm6, xmm6
-    movq      xmm6, xmm0
-    pxor      xmm7, xmm7
-    pcmpeqw   xmm7, xmm6
-    paddw     xmm7, [GLOBAL(c1)]
-
-    movdqa    xmm2, xmm0
-    paddw     xmm0, xmm1           ; b1+c1  a1+d1
-    psubw     xmm2, xmm1           ; b1-c1  a1-d1
-    paddw     xmm0, xmm7           ; b1+c1  a1+d1+(a1!=0)
-
-    ; second for loop
-    ; input: 13  9  5  1 12  8  4  0 (xmm0)
-    ;        14 10  6  2 15 11  7  3 (xmm2)
-    ; after shuffle:
-    ;        13  5  9  1 12  4  8  0 (xmm0)
-    ;        14  6 10  2 15  7 11  3 (xmm1)
-    pshuflw   xmm3, xmm0, 0xd8
-    pshufhw   xmm0, xmm3, 0xd8
-    pshuflw   xmm3, xmm2, 0xd8
-    pshufhw   xmm1, xmm3, 0xd8
-
-    movdqa    xmm2, xmm0
-    pmaddwd   xmm0, [GLOBAL(c1)]    ; d11 a11 d10 a10
-    pmaddwd   xmm2, [GLOBAL(cn1)]   ; c11 b11 c10 b10
-    movdqa    xmm3, xmm1
-    pmaddwd   xmm1, [GLOBAL(c1)]    ; d12 a12 d13 a13
-    pmaddwd   xmm3, [GLOBAL(cn1)]   ; c12 b12 c13 b13
-
-    pshufd    xmm4, xmm0, 0xd8      ; d11 d10 a11 a10
-    pshufd    xmm5, xmm2, 0xd8      ; c11 c10 b11 b10
-    pshufd    xmm6, xmm1, 0x72      ; d13 d12 a13 a12
-    pshufd    xmm7, xmm3, 0x72      ; c13 c12 b13 b12
-
-    movdqa    xmm0, xmm4
-    punpcklqdq xmm0, xmm5           ; b11 b10 a11 a10
-    punpckhqdq xmm4, xmm5           ; c11 c10 d11 d10
-    movdqa    xmm1, xmm6
-    punpcklqdq xmm1, xmm7           ; b13 b12 a13 a12
-    punpckhqdq xmm6, xmm7           ; c13 c12 d13 d12
-
-    movdqa    xmm2, xmm0
-    paddd     xmm0, xmm4            ; b21 b20 a21 a20
-    psubd     xmm2, xmm4            ; c21 c20 d21 d20
-    movdqa    xmm3, xmm1
-    paddd     xmm1, xmm6            ; b23 b22 a23 a22
-    psubd     xmm3, xmm6            ; c23 c22 d23 d22
-
-    pxor      xmm4, xmm4
-    movdqa    xmm5, xmm4
-    pcmpgtd   xmm4, xmm0
-    pcmpgtd   xmm5, xmm2
-    pand      xmm4, [GLOBAL(cd1)]
-    pand      xmm5, [GLOBAL(cd1)]
-
-    pxor      xmm6, xmm6
-    movdqa    xmm7, xmm6
-    pcmpgtd   xmm6, xmm1
-    pcmpgtd   xmm7, xmm3
-    pand      xmm6, [GLOBAL(cd1)]
-    pand      xmm7, [GLOBAL(cd1)]
-
-    paddd     xmm0, xmm4
-    paddd     xmm2, xmm5
-    paddd     xmm0, [GLOBAL(cd3)]
-    paddd     xmm2, [GLOBAL(cd3)]
-    paddd     xmm1, xmm6
-    paddd     xmm3, xmm7
-    paddd     xmm1, [GLOBAL(cd3)]
-    paddd     xmm3, [GLOBAL(cd3)]
-
-    psrad     xmm0, 3
-    psrad     xmm1, 3
-    psrad     xmm2, 3
-    psrad     xmm3, 3
-    movdqa    xmm4, xmm0
-    punpcklqdq xmm0, xmm1           ; a23 a22 a21 a20
-    punpckhqdq xmm4, xmm1           ; b23 b22 b21 b20
-    movdqa    xmm5, xmm2
-    punpckhqdq xmm2, xmm3           ; c23 c22 c21 c20
-    punpcklqdq xmm5, xmm3           ; d23 d22 d21 d20
-
-    packssdw  xmm0, xmm4            ; b23 b22 b21 b20 a23 a22 a21 a20
-    packssdw  xmm2, xmm5            ; d23 d22 d21 d20 c23 c22 c21 c20
-
-    movdqa  XMMWORD PTR [rdi], xmm0
-    movdqa  XMMWORD PTR [rdi + 16], xmm2
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-c1:
-    dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
-align 16
-cn1:
-    dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
-align 16
-cd1:
-    dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
-align 16
-cd3:
-    dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
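
As with the fDCT above, the constant tables fix the scalar Walsh-Hadamard transform being vectorized: the <<2 pre-scale, the +(a1 != 0) correction built from c1, the negative-value correction from cd1, and the (x + 3) >> 3 rounding from cd3. A scalar sketch reconstructed from those constants; the name and pitch convention are illustrative.

    static void short_walsh4x4_sketch(short *input, short *output, int pitch)
    {
        int i;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++) {                /* vertical pass */
            int a1 = (ip[0] + ip[2]) << 2;
            int d1 = (ip[1] + ip[3]) << 2;
            int c1 = (ip[1] - ip[3]) << 2;
            int b1 = (ip[0] - ip[2]) << 2;

            op[0] = (short)(a1 + d1 + (a1 != 0));
            op[1] = (short)(b1 + c1);
            op[2] = (short)(b1 - c1);
            op[3] = (short)(a1 - d1);

            ip += pitch / 2;
            op += 4;
        }

        ip = output;
        op = output;
        for (i = 0; i < 4; i++) {                /* horizontal pass */
            int a1 = ip[0] + ip[8];
            int d1 = ip[4] + ip[12];
            int c1 = ip[4] - ip[12];
            int b1 = ip[0] - ip[8];

            int a2 = a1 + d1;
            int b2 = b1 + c1;
            int c2 = b1 - c1;
            int d2 = a1 - d1;

            a2 += (a2 < 0);                      /* pcmpgtd + pand cd1 */
            b2 += (b2 < 0);
            c2 += (c2 < 0);
            d2 += (d2 < 0);

            op[0]  = (short)((a2 + 3) >> 3);     /* paddd cd3, psrad 3 */
            op[4]  = (short)((b2 + 3) >> 3);
            op[8]  = (short)((c2 + 3) >> 3);
            op[12] = (short)((d2 + 3) >> 3);

            ip++;
            op++;
        }
    }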
--- a/vp8/encoder/x86/mcomp_x86.h
+++ /dev/null
@@ -1,40 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef MCOMP_X86_H
-#define MCOMP_X86_H
-
-#if HAVE_SSE3
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef  vp9_search_full_search
-#define vp9_search_full_search vp9_full_search_sadx3
-
-#undef  vp9_search_refining_search
-#define vp9_search_refining_search vp9_refining_search_sadx4
-
-#undef  vp9_search_diamond_search
-#define vp9_search_diamond_search vp9_diamond_search_sadx4
-
-#endif
-#endif
-
-#if HAVE_SSE4_1
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef  vp9_search_full_search
-#define vp9_search_full_search vp9_full_search_sadx8
-
-#endif
-#endif
-
-#endif
-
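
These remaps are purely textual: with runtime CPU detection compiled out, a call site written against the generic search name resolves straight to the SIMD implementation at preprocessing time, so no function-pointer indirection remains. Schematically, with the argument lists elided since they are not shown here:

    /* built with HAVE_SSE3 and without CONFIG_RUNTIME_CPU_DETECT: */
    n = vp9_search_full_search(/* ... */);
    /* expands to */
    n = vp9_full_search_sadx3(/* ... */);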
--- a/vp8/encoder/x86/quantize_mmx.asm
+++ /dev/null
@@ -1,286 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;int vp9_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
-;                           short *qcoeff_ptr,short *dequant_ptr,
-;                           short *scan_mask, short *round_ptr,
-;                           short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp9_fast_quantize_b_impl_mmx)
-sym(vp9_fast_quantize_b_impl_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;coeff_ptr
-        movq            mm0,        [rsi]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm1,        [rax]
-
-        movq            mm3,        mm0
-        psraw           mm0,        15
-
-        pxor            mm3,        mm0
-        psubw           mm3,        mm0         ; abs
-
-        movq            mm2,        mm3
-        pcmpgtw         mm1,        mm2
-
-        pandn           mm1,        mm2
-        movq            mm3,        mm1
-
-        mov             rdx,        arg(6) ;quant_ptr
-        movq            mm1,        [rdx]
-
-        mov             rcx,        arg(5) ;round_ptr
-        movq            mm2,        [rcx]
-
-        paddw           mm3,        mm2
-        pmulhuw         mm3,        mm1
-
-        pxor            mm3,        mm0
-        psubw           mm3,        mm0         ; restore the sign
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-        movq            mm0,        mm3
-
-        movq            [rdi],      mm3
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm2,        [rax]
-
-        pmullw          mm3,        mm2
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax],      mm3
-
-        ; next 8
-        movq            mm4,        [rsi+8]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+8]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+8]
-        movq            mm6,        [rcx+8]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; restore the sign
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+8],    mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+8]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+8],    mm7
-
-
-                ; next 8
-        movq            mm4,        [rsi+16]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+16]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+16]
-        movq            mm6,        [rcx+16]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; restore the sign
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+16],   mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+16]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+16],   mm7
-
-
-                ; next 8
-        movq            mm4,        [rsi+24]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+24]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+24]
-        movq            mm6,        [rcx+24]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; restore the sign
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+24],   mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+24]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+24],   mm7
-
-
-
-        mov             rdi,        arg(4) ;scan_mask
-        mov             rsi,        arg(2) ;qcoeff_ptr
-
-        pxor            mm5,        mm5
-        pxor            mm7,        mm7
-
-        movq            mm0,        [rsi]
-        movq            mm1,        [rsi+8]
-
-        movq            mm2,        [rdi]
-        movq            mm3,        [rdi+8]
-
-        pcmpeqw         mm0,        mm7
-        pcmpeqw         mm1,        mm7
-
-        pcmpeqw         mm6,        mm6
-        pxor            mm0,        mm6
-
-        pxor            mm1,        mm6
-        psrlw           mm0,        15
-
-        psrlw           mm1,        15
-        pmaddwd         mm0,        mm2
-
-        pmaddwd         mm1,        mm3
-        movq            mm5,        mm0
-
-        paddd           mm5,        mm1
-
-        movq            mm0,        [rsi+16]
-        movq            mm1,        [rsi+24]
-
-        movq            mm2,        [rdi+16]
-        movq            mm3,        [rdi+24]
-
-        pcmpeqw         mm0,        mm7
-        pcmpeqw         mm1,        mm7
-
-        pcmpeqw         mm6,        mm6
-        pxor            mm0,        mm6
-
-        pxor            mm1,        mm6
-        psrlw           mm0,        15
-
-        psrlw           mm1,        15
-        pmaddwd         mm0,        mm2
-
-        pmaddwd         mm1,        mm3
-        paddd           mm5,        mm0
-
-        paddd           mm5,        mm1
-        movq            mm0,        mm5
-
-        psrlq           mm5,        32
-        paddd           mm0,        mm5
-
-        ; eob adjustment begins here
-        movq            rcx,        mm0
-        and             rcx,        0xffff
-
-        xor             rdx,        rdx
-        sub             rdx,        rcx ; rdx=-rcx
-
-        bsr             rax,        rcx
-        inc             rax
-
-        sar             rdx,        31
-        and             rax,        rdx
-        ; The branchless sse-style sequence above replaces the old mixed
-        ; mmx assembly/C version; the original is kept below for reference.
-        ;    movq            rcx,        mm0
-        ;    bsr             rax,        rcx
-        ;
-        ;    mov             eob,        rax
-        ;    mov             eee,        rcx
-        ;
-        ;if(eee==0)
-        ;{
-        ;    eob=-1;
-        ;}
-        ;else if(eee<0)
-        ;{
-        ;    eob=15;
-        ;}
-        ;d->eob = eob+1;
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
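
The fast quantizer above is the SIMD form of a simple per-coefficient recipe: take |z|, zero it inside the dead zone, scale by the fixed-point quantizer with pmulhuw (an unsigned high multiply), then restore the sign and dequantize. A scalar sketch with illustrative names; the EOB computation, done above via the scan mask and bsr, is omitted here.

    static void fast_quantize_b_sketch(const short *coeff, const short *zbin,
                                       short *qcoeff, const short *dequant,
                                       const short *round, const short *quant,
                                       short *dqcoeff)
    {
        int i;
        for (i = 0; i < 16; i++) {
            int z  = coeff[i];
            int sz = z >> 15;                   /* 0 or -1 (psraw 15) */
            int x  = (z ^ sz) - sz;             /* abs(z) */
            if (x < zbin[i])                    /* dead zone: pcmpgtw/pandn */
                x = 0;
            x = (int)(((unsigned)(x + round[i]) *
                       (unsigned short)quant[i]) >> 16);    /* pmulhuw */
            x = (x ^ sz) - sz;                  /* restore the sign */
            qcoeff[i]  = (short)x;
            dqcoeff[i] = (short)(x * dequant[i]);
        }
    }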
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ /dev/null
@@ -1,380 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "asm_enc_offsets.asm"
-
-
-; void vp9_regular_quantize_b_sse2 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp9_regular_quantize_b_sse2)
-sym(vp9_regular_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SAVE_XMM 7
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-
-    ALIGN_STACK 16, rax
-    %define zrun_zbin_boost   0  ;  8
-    %define abs_minus_zbin    8  ; 32
-    %define temp_qcoeff       40 ; 32
-    %define qcoeff            72 ; 32
-    %define stack_size        104
-    sub         rsp, stack_size
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rdx, [rdi + vp9_block_coeff] ; coeff_ptr
-    mov         rcx, [rdi + vp9_block_zbin] ; zbin_ptr
-    movd        xmm7, [rdi + vp9_block_zbin_extra] ; zbin_oq_value
-
-    ; z
-    movdqa      xmm0, [rdx]
-    movdqa      xmm4, [rdx + 16]
-    mov         rdx, [rdi + vp9_block_round] ; round_ptr
-
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; (z ^ sz)
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-
-    ; x = abs(z)
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-    mov         rcx, [rdi + vp9_block_quant] ; quant_ptr
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm2, xmm7
-    paddw       xmm3, xmm7
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm1, xmm2
-    psubw       xmm5, xmm3
-    movdqa      [rsp + abs_minus_zbin], xmm1
-    movdqa      [rsp + abs_minus_zbin + 16], xmm5
-
-    ; add (zbin_ptr + zbin_oq_value) back
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    movdqa      xmm2, [rdx]
-    movdqa      xmm6, [rdx + 16]
-
-    movdqa      xmm3, [rcx]
-    movdqa      xmm7, [rcx + 16]
-
-    ; x + round
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm6
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm3, xmm1
-    pmulhw      xmm7, xmm5
-
-    ; y += x
-    paddw       xmm1, xmm3
-    paddw       xmm5, xmm7
-
-    movdqa      [rsp + temp_qcoeff], xmm1
-    movdqa      [rsp + temp_qcoeff + 16], xmm5
-
-    pxor        xmm6, xmm6
-    ; zero qcoeff
-    movdqa      [rsp + qcoeff], xmm6
-    movdqa      [rsp + qcoeff + 16], xmm6
-
-    mov         rdx, [rdi + vp9_block_zrun_zbin_boost] ; zbin_boost_ptr
-    mov         rax, [rdi + vp9_block_quant_shift] ; quant_shift_ptr
-    mov         [rsp + zrun_zbin_boost], rdx
-
-%macro ZIGZAG_LOOP 1
-    ; x
-    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1           ; x < zbin
-
-    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
-
-    ; downshift by quant_shift[rc]
-    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1           ; !y
-    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-    mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp9_default_zig_zag1d order: see vp8/common/entropy.c
-ZIGZAG_LOOP  0
-ZIGZAG_LOOP  1
-ZIGZAG_LOOP  4
-ZIGZAG_LOOP  8
-ZIGZAG_LOOP  5
-ZIGZAG_LOOP  2
-ZIGZAG_LOOP  3
-ZIGZAG_LOOP  6
-ZIGZAG_LOOP  9
-ZIGZAG_LOOP 12
-ZIGZAG_LOOP 13
-ZIGZAG_LOOP 10
-ZIGZAG_LOOP  7
-ZIGZAG_LOOP 11
-ZIGZAG_LOOP 14
-ZIGZAG_LOOP 15
-
-    movdqa      xmm2, [rsp + qcoeff]
-    movdqa      xmm3, [rsp + qcoeff + 16]
-
-    mov         rcx, [rsi + vp9_blockd_dequant] ; dequant_ptr
-    mov         rdi, [rsi + vp9_blockd_dqcoeff] ; dqcoeff_ptr
-
-    ; y ^ sz
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm4
-    ; x = (y ^ sz) - sz
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm4
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp9_blockd_qcoeff] ; qcoeff_ptr
-
-    pmullw      xmm0, xmm2
-    pmullw      xmm1, xmm3
-
-    movdqa      [rcx], xmm2        ; store qcoeff
-    movdqa      [rcx + 16], xmm3
-    movdqa      [rdi], xmm0        ; store dqcoeff
-    movdqa      [rdi + 16], xmm1
-
-    ; select the last value (in zig_zag order) for EOB
-    pcmpeqw     xmm2, xmm6
-    pcmpeqw     xmm3, xmm6
-    ; !
-    pcmpeqw     xmm6, xmm6
-    pxor        xmm2, xmm6
-    pxor        xmm3, xmm6
-    ; mask inv_zig_zag
-    pand        xmm2, [GLOBAL(inv_zig_zag)]
-    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
-    ; select the max value
-    pmaxsw      xmm2, xmm3
-    pshufd      xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00000001b
-    pmaxsw      xmm2, xmm3
-    movd        eax, xmm2
-    and         eax, 0xff
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-    add         rsp, stack_size
-    pop         rsp
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-    RESTORE_GOT
-    RESTORE_XMM
-    pop         rbp
-    ret
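
The regular quantizer adds a zero-run-adaptive dead zone on top of the fast path: the threshold grows with the current run of zeros via zbin_boost, resets whenever a coefficient survives, and the quantized magnitude gets an extra per-coefficient downshift. A scalar sketch of what the setup code plus ZIGZAG_LOOP computes; pointer parameters stand in for the BLOCK/BLOCKD fields the assembly loads through the asm_enc_offsets values, so treat the names as illustrative.

    static void regular_quantize_b_sketch(
        const short *coeff, const short *zbin, const short *round,
        const short *quant, const unsigned char *quant_shift,
        const short *zbin_boost, short zbin_oq, const short *dequant,
        short *qcoeff, short *dqcoeff, int *eob_out)
    {
        static const int zig_zag[16] = { 0, 1,  4,  8,  5, 2,  3,  6,
                                         9, 12, 13, 10, 7, 11, 14, 15 };
        const short *boost = zbin_boost;
        int i, eob = -1;

        for (i = 0; i < 16; i++) {
            const int rc = zig_zag[i];
            int z  = coeff[rc];
            int sz = z >> 15;
            int x  = (z ^ sz) - sz;              /* abs(z) */

            qcoeff[rc] = dqcoeff[rc] = 0;
            if (x >= zbin[rc] + *boost++ + zbin_oq) {
                int y;
                x += round[rc];
                y  = (((x * quant[rc]) >> 16) + x) >> quant_shift[rc];
                x  = (y ^ sz) - sz;              /* re-apply the sign */
                qcoeff[rc]  = (short)x;
                dqcoeff[rc] = (short)(x * dequant[rc]);
                if (y) {
                    eob = i;
                    boost = zbin_boost;          /* reset zero-run boost */
                }
            }
        }
        *eob_out = eob + 1;
    }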
-
-; void vp9_fast_quantize_b_sse2 | arg
-;  (BLOCK  *b,                  |  0
-;   BLOCKD *d)                  |  1
-
-global sym(vp9_fast_quantize_b_sse2)
-sym(vp9_fast_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    push        rdi
-    push        rsi
-  %else
-    ; these registers are used for passing arguments
-  %endif
-%endif
-
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_round]
-    mov         rdx, [rdi + vp9_block_quant_fast]
-
-    ; z = coeff
-    movdqa      xmm0, [rax]
-    movdqa      xmm4, [rax + 16]
-
-    ; dup z so we can save sz
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz = z >> 15
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; x = abs(z) = (z ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; x += round
-    paddw       xmm1, [rcx]
-    paddw       xmm5, [rcx + 16]
-
-    mov         rax, [rsi + vp9_blockd_qcoeff]
-    mov         rcx, [rsi + vp9_blockd_dequant]
-    mov         rdi, [rsi + vp9_blockd_dqcoeff]
-
-    ; y = x * quant >> 16
-    pmulhw      xmm1, [rdx]
-    pmulhw      xmm5, [rdx + 16]
-
-    ; x = (y ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; qcoeff = x
-    movdqa      [rax], xmm1
-    movdqa      [rax + 16], xmm5
-
-    ; x * dequant
-    movdqa      xmm2, xmm1
-    movdqa      xmm3, xmm5
-    pmullw      xmm2, [rcx]
-    pmullw      xmm3, [rcx + 16]
-
-    ; dqcoeff = x * dequant
-    movdqa      [rdi], xmm2
-    movdqa      [rdi + 16], xmm3
-
-    pxor        xmm4, xmm4                  ;clear all bits
-    pcmpeqw     xmm1, xmm4
-    pcmpeqw     xmm5, xmm4
-
-    pcmpeqw     xmm4, xmm4                  ;set all bits
-    pxor        xmm1, xmm4
-    pxor        xmm5, xmm4
-
-    pand        xmm1, [GLOBAL(inv_zig_zag)]
-    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]
-
-    pmaxsw      xmm1, xmm5
-
-    ; now down to 8
-    pshufd      xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; only 4 left
-    pshuflw     xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; okay, just 2!
-    pshuflw     xmm5, xmm1, 00000001b
-
-    pmaxsw      xmm1, xmm5
-
-    movd        eax, xmm1
-    and         eax, 0xff
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-inv_zig_zag:
-  dw 0x0001, 0x0002, 0x0006, 0x0007
-  dw 0x0003, 0x0005, 0x0008, 0x000d
-  dw 0x0004, 0x0009, 0x000c, 0x000e
-  dw 0x000a, 0x000b, 0x000f, 0x0010
--- a/vp8/encoder/x86/quantize_sse4.asm
+++ /dev/null
@@ -1,254 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "asm_enc_offsets.asm"
-
-
-; void vp9_regular_quantize_b_sse4 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp9_regular_quantize_b_sse4)
-sym(vp9_regular_quantize_b_sse4):
-
-%if ABI_IS_32BIT
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-    push        rdi
-    push        rsi
-
-    ALIGN_STACK 16, rax
-    %define qcoeff      0 ; 32
-    %define stack_size 32
-    sub         rsp, stack_size
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    SAVE_XMM 8, u
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_zbin]
-    mov         rdx, [rdi + vp9_block_round]
-    movd        xmm7, [rdi + vp9_block_zbin_extra]
-
-    ; z
-    movdqa      xmm0, [rax]
-    movdqa      xmm1, [rax + 16]
-
-    ; duplicate zbin_oq_value
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7
-
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm1, 15
-
-    ; (z ^ sz)
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm1
-
-    ; x = abs(z)
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm1
-
-    ; zbin
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm4, xmm7
-    paddw       xmm5, xmm7
-
-    movdqa      xmm6, xmm2
-    movdqa      xmm7, xmm3
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm6, xmm4
-    psubw       xmm7, xmm5
-
-    ; round
-    movdqa      xmm4, [rdx]
-    movdqa      xmm5, [rdx + 16]
-
-    mov         rax, [rdi + vp9_block_quant_shift]
-    mov         rcx, [rdi + vp9_block_quant]
-    mov         rdx, [rdi + vp9_block_zrun_zbin_boost]
-
-    ; x + round
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    ; quant
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm4, xmm2
-    pmulhw      xmm5, xmm3
-
-    ; y += x
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    pxor        xmm4, xmm4
-%if ABI_IS_32BIT
-    movdqa      [rsp + qcoeff], xmm4
-    movdqa      [rsp + qcoeff + 16], xmm4
-%else
-    pxor        xmm8, xmm8
-%endif
-
-    ; quant_shift
-    movdqa      xmm5, [rax]
-
-    ; zrun_zbin_boost
-    mov         rax, rdx
-
-%macro ZIGZAG_LOOP 5
-    ; x
-    pextrw      ecx, %4, %2
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1          ; x < zbin
-
-    pextrw      edi, %3, %2                 ; y
-
-    ; downshift by quant_shift[rc]
-    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1          ; !y
-%if ABI_IS_32BIT
-    mov         WORD PTR[rsp + qcoeff + %1 *2], di
-%else
-    pinsrw      %5, edi, %2                 ; qcoeff[rc]
-%endif
-    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp9_default_zig_zag1d order: see vp8/common/entropy.c
-ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
-
-    mov         rcx, [rsi + vp9_blockd_dequant]
-    mov         rdi, [rsi + vp9_blockd_dqcoeff]
-
-%if ABI_IS_32BIT
-    movdqa      xmm4, [rsp + qcoeff]
-    movdqa      xmm5, [rsp + qcoeff + 16]
-%else
-    %define     xmm5 xmm8
-%endif
-
-    ; y ^ sz
-    pxor        xmm4, xmm0
-    pxor        xmm5, xmm1
-    ; x = (y ^ sz) - sz
-    psubw       xmm4, xmm0
-    psubw       xmm5, xmm1
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp9_blockd_qcoeff]
-
-    pmullw      xmm0, xmm4
-    pmullw      xmm1, xmm5
-
-    ; store qcoeff
-    movdqa      [rcx], xmm4
-    movdqa      [rcx + 16], xmm5
-
-    ; store dqcoeff
-    movdqa      [rdi], xmm0
-    movdqa      [rdi + 16], xmm1
-
-    ; select the last value (in zig_zag order) for EOB
-    pxor        xmm6, xmm6
-    pcmpeqw     xmm4, xmm6
-    pcmpeqw     xmm5, xmm6
-
-    packsswb    xmm4, xmm5
-    pshufb      xmm4, [GLOBAL(zig_zag1d)]
-    pmovmskb    edx, xmm4
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax
-    bsr         eax, edx
-    sub         edi, edx
-    sar         edi, 31
-    add         eax, 1
-    and         eax, edi
-
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    add         rsp, stack_size
-    pop         rsp
-
-    pop         rsi
-    pop         rdi
-    RESTORE_GOT
-    pop         rbp
-%else
-  %undef xmm5
-  %ifidn __OUTPUT_FORMAT__,x64
-    pop         rsi
-    pop         rdi
-    RESTORE_XMM
-  %endif
-%endif
-
-    ret
-
-SECTION_RODATA
-align 16
-; vp8/common/entropy.c: vp9_default_zig_zag1d
-zig_zag1d:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
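
The tail of this quantizer computes EOB without branches: pcmpeqw builds a zero mask per coefficient, packsswb narrows it to bytes, pshufb reorders it into zig-zag scan order, pmovmskb collapses it to a 16-bit mask, the xor inverts it so set bits mark nonzero coefficients, and bsr finds the highest one; the sub/sar/and pair forces eob to 0 for an all-zero mask, where bsr's result is undefined. A scalar statement of the same result, with an illustrative name:

    /* eob = 1 + index of the last nonzero coefficient in scan order,
     * or 0 if none; bit i of the mask marks zig-zag coefficient i. */
    static int eob_from_mask_sketch(unsigned int nonzero_mask)
    {
        int i, eob = 0;
        for (i = 0; i < 16; i++)
            if (nonzero_mask & (1u << i))
                eob = i + 1;
        return eob;
    }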
--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ /dev/null
@@ -1,138 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "asm_enc_offsets.asm"
-
-
-; void vp9_fast_quantize_b_ssse3 | arg
-;  (BLOCK  *b,                   |  0
-;   BLOCKD *d)                   |  1
-;
-
-global sym(vp9_fast_quantize_b_ssse3)
-sym(vp9_fast_quantize_b_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_round]
-    mov         rdx, [rdi + vp9_block_quant_fast]
-
-    ; coeff
-    movdqa      xmm0, [rax]
-    movdqa      xmm4, [rax + 16]
-
-    ; round
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz = z >> 15
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    pabsw       xmm1, xmm1
-    pabsw       xmm5, xmm5
-
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    ; quant_fast
-    pmulhw      xmm1, [rdx]
-    pmulhw      xmm5, [rdx + 16]
-
-    mov         rax, [rsi + vp9_blockd_qcoeff]
-    mov         rdi, [rsi + vp9_blockd_dequant]
-    mov         rcx, [rsi + vp9_blockd_dqcoeff]
-
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      [rax], xmm1
-    movdqa      [rax + 16], xmm5
-
-    movdqa      xmm2, [rdi]
-    movdqa      xmm3, [rdi + 16]
-
-    pxor        xmm4, xmm4
-    pmullw      xmm2, xmm1
-    pmullw      xmm3, xmm5
-
-    pcmpeqw     xmm1, xmm4                  ;non zero mask
-    pcmpeqw     xmm5, xmm4                  ;non zero mask
-    packsswb    xmm1, xmm5
-    pshufb      xmm1, [GLOBAL(zz_shuf)]
-
-    pmovmskb    edx, xmm1
-
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax                      ;flip the bits for bsr
-    bsr         eax, edx
-
-    movdqa      [rcx], xmm2                 ;store dqcoeff
-    movdqa      [rcx + 16], xmm3            ;store dqcoeff
-
-    sub         edi, edx                    ;check for all zeros in bit mask
-    sar         edi, 31                     ;0 or -1
-    add         eax, 1
-    and         eax, edi                    ;if the bit mask was all zero,
-                                            ;then eob = 0
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-zz_shuf:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
--- a/vp8/encoder/x86/quantize_x86.h
+++ /dev/null
@@ -1,48 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
- */
-
-#ifndef QUANTIZE_X86_H
-#define QUANTIZE_X86_H
-
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-#if HAVE_MMX
-
-#endif /* HAVE_MMX */
-
-
-#if HAVE_SSE2
-extern prototype_quantize_block(vp9_regular_quantize_b_sse2);
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_quantize_quantb
-#define vp9_quantize_quantb vp9_regular_quantize_b_sse2
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_SSE2 */
-
-
-#if HAVE_SSE4_1
-extern prototype_quantize_block(vp9_regular_quantize_b_sse4);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_quantize_quantb
-#define vp9_quantize_quantb vp9_regular_quantize_b_sse4
-
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_SSE4_1 */
-
-#endif /* QUANTIZE_X86_H */
--- a/vp8/encoder/x86/sad_mmx.asm
+++ /dev/null
@@ -1,427 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-global sym(vp9_sad16x16_mmx)
-global sym(vp9_sad8x16_mmx)
-global sym(vp9_sad8x8_mmx)
-global sym(vp9_sad4x4_mmx)
-global sym(vp9_sad16x8_mmx)
-
-;unsigned int vp9_sad16x16_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp9_sad16x16_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x16x16sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm2,        QWORD PTR [rsi+8]
-
-        movq            mm1,        QWORD PTR [rdi]
-        movq            mm3,        QWORD PTR [rdi+8]
-
-        movq            mm4,        mm0
-        movq            mm5,        mm2
-
-        psubusb         mm0,        mm1
-        psubusb         mm1,        mm4
-
-        psubusb         mm2,        mm3
-        psubusb         mm3,        mm5
-
-        por             mm0,        mm1
-        por             mm2,        mm3
-
-        movq            mm1,        mm0
-        movq            mm3,        mm2
-
-        punpcklbw       mm0,        mm6
-        punpcklbw       mm2,        mm6
-
-        punpckhbw       mm1,        mm6
-        punpckhbw       mm3,        mm6
-
-        paddw           mm0,        mm2
-        paddw           mm1,        mm3
-
-
-        lea             rsi,        [rsi+rax]
-        add             rdi,        rdx
-
-        paddw           mm7,        mm0
-        paddw           mm7,        mm1
-
-        cmp             rsi,        rcx
-        jne             .x16x16sad_mmx_loop
-
-
-        movq            mm0,        mm7
-
-        punpcklwd       mm0,        mm6
-        punpckhwd       mm7,        mm6
-
-        paddw           mm0,        mm7
-        movq            mm7,        mm0
-
-
-        psrlq           mm0,        32
-        paddw           mm7,        mm0
-
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
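
Each kernel in this file is the SIMD form of a plain sum of absolute differences; psubusb in both directions followed by por yields |a - b| for unsigned bytes without a widening subtract. A scalar reference covering all the block sizes here, with an illustrative name:

    static unsigned int sad_wxh_sketch(const unsigned char *src, int src_stride,
                                       const unsigned char *ref, int ref_stride,
                                       int w, int h)
    {
        unsigned int sad = 0;
        int r, c;
        for (r = 0; r < h; r++) {
            for (c = 0; c < w; c++) {
                int d = src[c] - ref[c];
                sad += (d < 0) ? -d : d;        /* psubusb both ways + por */
            }
            src += src_stride;
            ref += ref_stride;
        }
        return sad;
    }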
-
-
-;unsigned int vp9_sad8x16_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp9_sad8x16_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x8x16sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        punpcklbw       mm0,        mm6
-
-        punpckhbw       mm2,        mm6
-        lea             rsi,        [rsi+rax]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        paddw           mm7,        mm2
-        cmp             rsi,        rcx
-
-        jne             .x8x16sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_sad8x8_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp9_sad8x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x8x8sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        punpcklbw       mm0,        mm6
-
-        punpckhbw       mm2,        mm6
-        paddw           mm0,        mm2
-
-        lea             rsi,       [rsi+rax]
-        add             rdi,        rdx
-
-        paddw           mm7,       mm0
-        cmp             rsi,        rcx
-
-        jne             .x8x8sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_sad4x4_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp9_sad4x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        movd            mm0,        DWORD PTR [rsi]
-        movd            mm1,        DWORD PTR [rdi]
-
-        movd            mm2,        DWORD PTR [rsi+rax]
-        movd            mm3,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        pxor            mm3,        mm3
-
-        punpcklbw       mm0,        mm3
-        punpckhbw       mm2,        mm3
-
-        paddw           mm0,        mm2
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movd            mm4,        DWORD PTR [rsi]
-        movd            mm5,        DWORD PTR [rdi]
-
-        movd            mm6,        DWORD PTR [rsi+rax]
-        movd            mm7,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm4,        mm6
-        punpcklbw       mm5,        mm7
-
-        movq            mm6,        mm4
-        psubusb         mm4,        mm5
-
-        psubusb         mm5,        mm6
-        por             mm4,        mm5
-
-        movq            mm5,        mm4
-        punpcklbw       mm4,        mm3
-
-        punpckhbw       mm5,        mm3
-        paddw           mm4,        mm5
-
-        paddw           mm0,        mm4
-        movq            mm1,        mm0
-
-        punpcklwd       mm0,        mm3
-        punpckhwd       mm1,        mm3
-
-        paddw           mm0,        mm1
-        movq            mm1,        mm0
-
-        psrlq           mm0,        32
-        paddw           mm0,        mm1
-
-        movq            rax,        mm0
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_sad16x8_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vp9_sad16x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x16x8sad_mmx_loop:
-
-        movq            mm0,       [rsi]
-        movq            mm1,       [rdi]
-
-        movq            mm2,        [rsi+8]
-        movq            mm3,        [rdi+8]
-
-        movq            mm4,        mm0
-        movq            mm5,        mm2
-
-        psubusb         mm0,        mm1
-        psubusb         mm1,        mm4
-
-        psubusb         mm2,        mm3
-        psubusb         mm3,        mm5
-
-        por             mm0,        mm1
-        por             mm2,        mm3
-
-        movq            mm1,        mm0
-        movq            mm3,        mm2
-
-        punpcklbw       mm0,        mm6
-        punpckhbw       mm1,        mm6
-
-        punpcklbw       mm2,        mm6
-        punpckhbw       mm3,        mm6
-
-
-        paddw           mm0,        mm2
-        paddw           mm1,        mm3
-
-        paddw           mm0,        mm1
-        lea             rsi,        [rsi+rax]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        cmp             rsi,        rcx
-        jne             .x16x8sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
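Each MMX loop above accumulates four 16-bit partial sums in mm7 before folding them into the return value; a sketch of that tail reduction, assuming the lane layout described in the comments (illustration only):

static unsigned int fold_word_lanes(const unsigned short acc[4]) {
    /* punpcklwd/punpckhwd widen the four words to dwords, paddw pairs them,
       then psrlq 32 + paddw folds the two dwords into one total */
    return ((unsigned int)acc[0] + acc[2]) + ((unsigned int)acc[1] + acc[3]);
}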
--- a/vp8/encoder/x86/sad_sse2.asm
+++ /dev/null
@@ -1,410 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp9_sad16x16_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-global sym(vp9_sad16x16_wmt)
-sym(vp9_sad16x16_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            xmm6,       xmm6
-
-.x16x16sad_wmt_loop:
-
-        movq            xmm0,       QWORD PTR [rsi]
-        movq            xmm2,       QWORD PTR [rsi+8]
-
-        movq            xmm1,       QWORD PTR [rdi]
-        movq            xmm3,       QWORD PTR [rdi+8]
-
-        movq            xmm4,       QWORD PTR [rsi+rax]
-        movq            xmm5,       QWORD PTR [rdi+rdx]
-
-
-        punpcklbw       xmm0,       xmm2
-        punpcklbw       xmm1,       xmm3
-
-        psadbw          xmm0,       xmm1
-        movq            xmm2,       QWORD PTR [rsi+rax+8]
-
-        movq            xmm3,       QWORD PTR [rdi+rdx+8]
-        lea             rsi,        [rsi+rax*2]
-
-        lea             rdi,        [rdi+rdx*2]
-        punpcklbw       xmm4,       xmm2
-
-        punpcklbw       xmm5,       xmm3
-        psadbw          xmm4,       xmm5
-
-        paddw           xmm6,       xmm0
-        paddw           xmm6,       xmm4
-
-        cmp             rsi,        rcx
-        jne             .x16x16sad_wmt_loop
-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movq            rax,        xmm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
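The _wmt routines replace the MMX unpack-and-add sequence with psadbw, which sums eight absolute byte differences into a single word per 8-byte lane; a byte-level model of one lane (illustrative, not the deleted file's API):

#include <stdlib.h>

static unsigned int psadbw_lane(const unsigned char a[8], const unsigned char b[8]) {
    unsigned int s = 0;
    for (int i = 0; i < 8; ++i)
        s += (unsigned int)abs(a[i] - b[i]);
    return s;  /* one psadbw produces this per 64-bit lane */
}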
-;unsigned int vp9_sad8x16_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  max_err)
-global sym(vp9_sad8x16_wmt)
-sym(vp9_sad8x16_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rbx,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rbx*8]
-
-        lea             rcx,        [rcx+rbx*8]
-        pxor            mm7,        mm7
-
-.x8x16sad_wmt_loop:
-
-        movq            rax,        mm7
-        cmp             eax,        arg(4)
-        jg              .x8x16sad_wmt_early_exit
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        QWORD PTR [rsi+rbx]
-        movq            mm3,        QWORD PTR [rdi+rdx]
-
-        psadbw          mm0,        mm1
-        psadbw          mm2,        mm3
-
-        lea             rsi,        [rsi+rbx*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        paddw           mm7,        mm0
-        paddw           mm7,        mm2
-
-        cmp             rsi,        rcx
-        jne             .x8x16sad_wmt_loop
-
-        movq            rax,        mm7
-
-.x8x16sad_wmt_early_exit:
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    pop         rbx
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
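vp9_sad8x16_wmt, vp9_sad8x8_wmt and vp9_sad16x8_wmt compare the running total against arg(4) at the top of each loop iteration and bail out once it exceeds max_err; a sketch of that early-exit pattern (names illustrative):

#include <stdlib.h>

static unsigned int sad8xh_max_err(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   int height, int max_err) {
    unsigned int sad = 0;
    for (int r = 0; r < height; ++r) {
        if ((int)sad > max_err)   /* movq rax, mm7; cmp eax, arg(4); jg */
            return sad;           /* partial SAD; caller only needs "worse than best" */
        for (int c = 0; c < 8; ++c)
            sad += (unsigned int)abs(src[c] - ref[c]);
        src += src_stride;
        ref += ref_stride;
    }
    return sad;
}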
-
-;unsigned int vp9_sad8x8_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  max_err)
-global sym(vp9_sad8x8_wmt)
-sym(vp9_sad8x8_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rbx,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rbx*8]
-        pxor            mm7,        mm7
-
-.x8x8sad_wmt_loop:
-
-        movq            rax,        mm7
-        cmp             eax,        arg(4)
-        jg              .x8x8sad_wmt_early_exit
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        psadbw          mm0,        mm1
-        lea             rsi,        [rsi+rbx]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        cmp             rsi,        rcx
-        jne             .x8x8sad_wmt_loop
-
-        movq            rax,        mm7
-.x8x8sad_wmt_early_exit:
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    pop         rbx
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;unsigned int vp9_sad4x4_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-global sym(vp9_sad4x4_wmt)
-sym(vp9_sad4x4_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        movd            mm0,        DWORD PTR [rsi]
-        movd            mm1,        DWORD PTR [rdi]
-
-        movd            mm2,        DWORD PTR [rsi+rax]
-        movd            mm3,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        psadbw          mm0,        mm1
-        lea             rsi,        [rsi+rax*2]
-
-        lea             rdi,        [rdi+rdx*2]
-        movd            mm4,        DWORD PTR [rsi]
-
-        movd            mm5,        DWORD PTR [rdi]
-        movd            mm6,        DWORD PTR [rsi+rax]
-
-        movd            mm7,        DWORD PTR [rdi+rdx]
-        punpcklbw       mm4,        mm6
-
-        punpcklbw       mm5,        mm7
-        psadbw          mm4,        mm5
-
-        paddw           mm0,        mm4
-        movq            rax,        mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_sad16x8_wmt(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  max_err)
-global sym(vp9_sad16x8_wmt)
-sym(vp9_sad16x8_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rbx,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rbx*8]
-        pxor            mm7,        mm7
-
-.x16x8sad_wmt_loop:
-
-        movq            rax,        mm7
-        cmp             eax,        arg(4)
-        jg              .x16x8sad_wmt_early_exit
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm2,        QWORD PTR [rsi+8]
-
-        movq            mm1,        QWORD PTR [rdi]
-        movq            mm3,        QWORD PTR [rdi+8]
-
-        movq            mm4,        QWORD PTR [rsi+rbx]
-        movq            mm5,        QWORD PTR [rdi+rdx]
-
-        psadbw          mm0,        mm1
-        psadbw          mm2,        mm3
-
-        movq            mm1,        QWORD PTR [rsi+rbx+8]
-        movq            mm3,        QWORD PTR [rdi+rdx+8]
-
-        psadbw          mm4,        mm5
-        psadbw          mm1,        mm3
-
-        lea             rsi,        [rsi+rbx*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        paddw           mm0,        mm2
-        paddw           mm4,        mm1
-
-        paddw           mm7,        mm0
-        paddw           mm7,        mm4
-
-        cmp             rsi,        rcx
-        jne             .x16x8sad_wmt_loop
-
-        movq            rax,        mm7
-
-.x16x8sad_wmt_early_exit:
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    pop         rbx
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_copy32xn_sse2(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *dst_ptr,
-;    int  dst_stride,
-;    int height);
-global sym(vp9_copy32xn_sse2)
-sym(vp9_copy32xn_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;dst_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;dst_stride
-        movsxd          rcx,        dword ptr arg(4) ;height
-
-.block_copy_sse2_loopx4:
-        movdqu          xmm0,       XMMWORD PTR [rsi]
-        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
-        movdqu          xmm2,       XMMWORD PTR [rsi + rax]
-        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]
-
-        lea             rsi,        [rsi+rax*2]
-
-        movdqu          xmm4,       XMMWORD PTR [rsi]
-        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
-        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
-        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]
-
-        lea             rsi,    [rsi+rax*2]
-
-        movdqa          XMMWORD PTR [rdi], xmm0
-        movdqa          XMMWORD PTR [rdi + 16], xmm1
-        movdqa          XMMWORD PTR [rdi + rdx], xmm2
-        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3
-
-        lea             rdi,    [rdi+rdx*2]
-
-        movdqa          XMMWORD PTR [rdi], xmm4
-        movdqa          XMMWORD PTR [rdi + 16], xmm5
-        movdqa          XMMWORD PTR [rdi + rdx], xmm6
-        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7
-
-        lea             rdi,    [rdi+rdx*2]
-
-        sub             rcx,     4
-        cmp             rcx,     4
-        jge             .block_copy_sse2_loopx4
-
-        cmp             rcx, 0
-        je              .copy_is_done
-
-.block_copy_sse2_loop:
-        movdqu          xmm0,       XMMWORD PTR [rsi]
-        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
-        lea             rsi,    [rsi+rax]
-
-        movdqa          XMMWORD PTR [rdi], xmm0
-        movdqa          XMMWORD PTR [rdi + 16], xmm1
-        lea             rdi,    [rdi+rdx]
-
-        sub             rcx,     1
-        jne             .block_copy_sse2_loop
-
-.copy_is_done:
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
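vp9_copy32xn_sse2 streams four 32-byte rows per iteration while at least four rows remain, then finishes one row at a time; a memcpy-based sketch of the same control flow (the asm assumes at least four rows on entry):

#include <string.h>

static void copy32xn(const unsigned char *src, int src_stride,
                     unsigned char *dst, int dst_stride, int height) {
    while (height >= 4) {                 /* .block_copy_sse2_loopx4 */
        for (int r = 0; r < 4; ++r) {
            memcpy(dst, src, 32);         /* two movdqu loads, two movdqa stores */
            src += src_stride;
            dst += dst_stride;
        }
        height -= 4;
    }
    while (height-- > 0) {                /* .block_copy_sse2_loop tail */
        memcpy(dst, src, 32);
        src += src_stride;
        dst += dst_stride;
    }
}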
--- a/vp8/encoder/x86/sad_sse3.asm
+++ /dev/null
@@ -1,960 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
-  %define     src_ptr       rsi
-  %define     src_stride    rax
-  %define     ref_ptr       rdi
-  %define     ref_stride    rdx
-  %define     end_ptr       rcx
-  %define     ret_var       rbx
-  %define     result_ptr    arg(4)
-  %define     max_err       arg(4)
-  %define     height        dword ptr arg(4)
-    push        rbp
-    mov         rbp,        rsp
-    push        rsi
-    push        rdi
-    push        rbx
-
-    mov         rsi,        arg(0)              ; src_ptr
-    mov         rdi,        arg(2)              ; ref_ptr
-
-    movsxd      rax,        dword ptr arg(1)    ; src_stride
-    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    SAVE_XMM 7, u
-    %define     src_ptr     rcx
-    %define     src_stride  rdx
-    %define     ref_ptr     r8
-    %define     ref_stride  r9
-    %define     end_ptr     r10
-    %define     ret_var     r11
-    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
-    %define     max_err     [rsp+xmm_stack_space+8+4*8]
-    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
-  %else
-    %define     src_ptr     rdi
-    %define     src_stride  rsi
-    %define     ref_ptr     rdx
-    %define     ref_stride  rcx
-    %define     end_ptr     r9
-    %define     ret_var     r10
-    %define     result_ptr  r8
-    %define     max_err     r8
-    %define     height      r8
-  %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
-  %define     src_ptr
-  %define     src_stride
-  %define     ref_ptr
-  %define     ref_stride
-  %define     end_ptr
-  %define     ret_var
-  %define     result_ptr
-  %define     max_err
-  %define     height
-
-%if ABI_IS_32BIT
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    pop         rbp
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    RESTORE_XMM
-  %endif
-%endif
-    ret
-%endmacro
-
-%macro STACK_FRAME_CREATE_X4 0
-%if ABI_IS_32BIT
-  %define     src_ptr       rsi
-  %define     src_stride    rax
-  %define     r0_ptr        rcx
-  %define     r1_ptr        rdx
-  %define     r2_ptr        rbx
-  %define     r3_ptr        rdi
-  %define     ref_stride    rbp
-  %define     result_ptr    arg(4)
-    push        rbp
-    mov         rbp,        rsp
-    push        rsi
-    push        rdi
-    push        rbx
-
-    push        rbp
-    mov         rdi,        arg(2)              ; ref_ptr_base
-
-    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
-
-    mov         rsi,        arg(0)              ; src_ptr
-
-    movsxd      rbx,        dword ptr arg(1)    ; src_stride
-    movsxd      rbp,        dword ptr arg(3)    ; ref_stride
-
-    xchg        rbx,        rax
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    SAVE_XMM 7, u
-    %define     src_ptr     rcx
-    %define     src_stride  rdx
-    %define     r0_ptr      rsi
-    %define     r1_ptr      r10
-    %define     r2_ptr      r11
-    %define     r3_ptr      r8
-    %define     ref_stride  r9
-    %define     result_ptr  [rsp+xmm_stack_space+16+4*8]
-    push        rsi
-
-    LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
-  %else
-    %define     src_ptr     rdi
-    %define     src_stride  rsi
-    %define     r0_ptr      r9
-    %define     r1_ptr      r10
-    %define     r2_ptr      r11
-    %define     r3_ptr      rdx
-    %define     ref_stride  rcx
-    %define     result_ptr  r8
-
-    LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
-
-  %endif
-%endif
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X4 0
-  %define     src_ptr
-  %define     src_stride
-  %define     r0_ptr
-  %define     r1_ptr
-  %define     r2_ptr
-  %define     r3_ptr
-  %define     ref_stride
-  %define     result_ptr
-
-%if ABI_IS_32BIT
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    pop         rbp
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    pop         rsi
-    RESTORE_XMM
-  %endif
-%endif
-    ret
-%endmacro
-
-%macro PROCESS_16X2X3 5
-%if %1==0
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm5,       XMMWORD PTR [%3]
-        lddqu           xmm6,       XMMWORD PTR [%3+1]
-        lddqu           xmm7,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm1,       XMMWORD PTR [%3]
-        lddqu           xmm2,       XMMWORD PTR [%3+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [%2+%4]
-        lddqu           xmm1,       XMMWORD PTR [%3+%5]
-        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,         [%2+%4*2]
-        lea             %3,         [%3+%5*2]
-%endif
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_8X2X3 5
-%if %1==0
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm5,       QWORD PTR [%3]
-        movq            mm6,       QWORD PTR [%3+1]
-        movq            mm7,       QWORD PTR [%3+2]
-
-        psadbw          mm5,       mm0
-        psadbw          mm6,       mm0
-        psadbw          mm7,       mm0
-%else
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm1,       QWORD PTR [%3]
-        movq            mm2,       QWORD PTR [%3+1]
-        movq            mm3,       QWORD PTR [%3+2]
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endif
-        movq            mm0,       QWORD PTR [%2+%4]
-        movq            mm1,       QWORD PTR [%3+%5]
-        movq            mm2,       QWORD PTR [%3+%5+1]
-        movq            mm3,       QWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,        [%2+%4*2]
-        lea             %3,        [%3+%5*2]
-%endif
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endmacro
-
-%macro LOAD_X4_ADDRESSES 5
-        mov             %2,         [%1+REG_SZ_BYTES*0]
-        mov             %3,         [%1+REG_SZ_BYTES*1]
-
-        mov             %4,         [%1+REG_SZ_BYTES*2]
-        mov             %5,         [%1+REG_SZ_BYTES*3]
-%endmacro
-
-%macro PROCESS_16X2X4 8
-%if %1==0
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm4,       XMMWORD PTR [%3]
-        lddqu           xmm5,       XMMWORD PTR [%4]
-        lddqu           xmm6,       XMMWORD PTR [%5]
-        lddqu           xmm7,       XMMWORD PTR [%6]
-
-        psadbw          xmm4,       xmm0
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm1,       XMMWORD PTR [%3]
-        lddqu           xmm2,       XMMWORD PTR [%4]
-        lddqu           xmm3,       XMMWORD PTR [%5]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm4,       xmm1
-        lddqu           xmm1,       XMMWORD PTR [%6]
-        paddw           xmm5,       xmm2
-        paddw           xmm6,       xmm3
-
-        psadbw          xmm1,       xmm0
-        paddw           xmm7,       xmm1
-%endif
-        movdqa          xmm0,       XMMWORD PTR [%2+%7]
-        lddqu           xmm1,       XMMWORD PTR [%3+%8]
-        lddqu           xmm2,       XMMWORD PTR [%4+%8]
-        lddqu           xmm3,       XMMWORD PTR [%5+%8]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm4,       xmm1
-        lddqu           xmm1,       XMMWORD PTR [%6+%8]
-        paddw           xmm5,       xmm2
-        paddw           xmm6,       xmm3
-
-%if %1==0 || %1==1
-        lea             %2,         [%2+%7*2]
-        lea             %3,         [%3+%8*2]
-
-        lea             %4,         [%4+%8*2]
-        lea             %5,         [%5+%8*2]
-
-        lea             %6,         [%6+%8*2]
-%endif
-        psadbw          xmm1,       xmm0
-        paddw           xmm7,       xmm1
-
-%endmacro
-
-%macro PROCESS_8X2X4 8
-%if %1==0
-        movq            mm0,        QWORD PTR [%2]
-        movq            mm4,        QWORD PTR [%3]
-        movq            mm5,        QWORD PTR [%4]
-        movq            mm6,        QWORD PTR [%5]
-        movq            mm7,        QWORD PTR [%6]
-
-        psadbw          mm4,        mm0
-        psadbw          mm5,        mm0
-        psadbw          mm6,        mm0
-        psadbw          mm7,        mm0
-%else
-        movq            mm0,        QWORD PTR [%2]
-        movq            mm1,        QWORD PTR [%3]
-        movq            mm2,        QWORD PTR [%4]
-        movq            mm3,        QWORD PTR [%5]
-
-        psadbw          mm1,        mm0
-        psadbw          mm2,        mm0
-        psadbw          mm3,        mm0
-
-        paddw           mm4,        mm1
-        movq            mm1,        QWORD PTR [%6]
-        paddw           mm5,        mm2
-        paddw           mm6,        mm3
-
-        psadbw          mm1,        mm0
-        paddw           mm7,        mm1
-%endif
-        movq            mm0,        QWORD PTR [%2+%7]
-        movq            mm1,        QWORD PTR [%3+%8]
-        movq            mm2,        QWORD PTR [%4+%8]
-        movq            mm3,        QWORD PTR [%5+%8]
-
-        psadbw          mm1,        mm0
-        psadbw          mm2,        mm0
-        psadbw          mm3,        mm0
-
-        paddw           mm4,        mm1
-        movq            mm1,        QWORD PTR [%6+%8]
-        paddw           mm5,        mm2
-        paddw           mm6,        mm3
-
-%if %1==0 || %1==1
-        lea             %2,         [%2+%7*2]
-        lea             %3,         [%3+%8*2]
-
-        lea             %4,         [%4+%8*2]
-        lea             %5,         [%5+%8*2]
-
-        lea             %6,         [%6+%8*2]
-%endif
-        psadbw          mm1,        mm0
-        paddw           mm7,        mm1
-
-%endmacro
-
-;void vp9_sad16x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad16x16x3_sse3)
-sym(vp9_sad16x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
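The x3 entry points score the source block against the reference at horizontal offsets +0, +1 and +2 in a single call, which is why PROCESS_16X2X3 issues lddqu at [ref], [ref+1] and [ref+2]; a reference loop under that reading (hypothetical helper):

#include <stdlib.h>

static void sad16xhx3(const unsigned char *src, int src_stride,
                      const unsigned char *ref, int ref_stride,
                      int height, int results[3]) {
    for (int off = 0; off < 3; ++off) {
        unsigned int s = 0;
        for (int r = 0; r < height; ++r)
            for (int c = 0; c < 16; ++c)
                s += (unsigned int)abs(src[r * src_stride + c] -
                                       ref[r * ref_stride + c + off]);
        results[off] = (int)s;
    }
}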
-;void vp9_sad16x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad16x8x3_sse3)
-sym(vp9_sad16x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void vp9_sad8x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad8x16x3_sse3)
-sym(vp9_sad8x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void vp9_sad8x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad8x8x3_sse3)
-sym(vp9_sad8x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void vp9_sad4x4x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad4x4x3_sse3)
-sym(vp9_sad4x4x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm1,        DWORD PTR [ref_ptr]
-
-        movd            mm2,        DWORD PTR [src_ptr+src_stride]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movd            mm4,        DWORD PTR [ref_ptr+1]
-        movd            mm5,        DWORD PTR [ref_ptr+2]
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        psadbw          mm1,        mm0
-
-        punpcklbw       mm4,        mm2
-        punpcklbw       mm5,        mm3
-
-        psadbw          mm4,        mm0
-        psadbw          mm5,        mm0
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm2,        DWORD PTR [ref_ptr]
-
-        movd            mm3,        DWORD PTR [src_ptr+src_stride]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm3
-        punpcklbw       mm2,        mm6
-
-        movd            mm3,        DWORD PTR [ref_ptr+1]
-        movd            mm7,        DWORD PTR [ref_ptr+2]
-
-        psadbw          mm2,        mm0
-
-        paddw           mm1,        mm2
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        punpcklbw       mm3,        mm2
-        punpcklbw       mm7,        mm6
-
-        psadbw          mm3,        mm0
-        psadbw          mm7,        mm0
-
-        paddw           mm3,        mm4
-        paddw           mm7,        mm5
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm1,        mm3
-
-        movq            [rcx],      mm1
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;unsigned int vp9_sad16x16_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  max_err)
-;%define lddqu movdqu
-global sym(vp9_sad16x16_sse3)
-sym(vp9_sad16x16_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        mov             end_ptr,    4
-        pxor            xmm7,        xmm7
-
-.vp9_sad16x16_sse3_loop:
-        movdqa          xmm0,       XMMWORD PTR [src_ptr]
-        movdqu          xmm1,       XMMWORD PTR [ref_ptr]
-        movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
-        movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
-
-        movdqa          xmm4,       XMMWORD PTR [src_ptr]
-        movdqu          xmm5,       XMMWORD PTR [ref_ptr]
-        movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]
-
-        psadbw          xmm0,       xmm1
-
-        movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]
-
-        psadbw          xmm2,       xmm3
-        psadbw          xmm4,       xmm5
-        psadbw          xmm6,       xmm1
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
-
-        paddw           xmm7,        xmm0
-        paddw           xmm7,        xmm2
-        paddw           xmm7,        xmm4
-        paddw           xmm7,        xmm6
-
-        sub             end_ptr,     1
-        jne             .vp9_sad16x16_sse3_loop
-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-        paddw           xmm0,       xmm7
-        movq            rax,        xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void vp9_copy32xn_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *dst_ptr,
-;    int  dst_stride,
-;    int height);
-global sym(vp9_copy32xn_sse3)
-sym(vp9_copy32xn_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-.block_copy_sse3_loopx4:
-        lea             end_ptr,    [src_ptr+src_stride*2]
-
-        movdqu          xmm0,       XMMWORD PTR [src_ptr]
-        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
-        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
-        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
-        movdqu          xmm4,       XMMWORD PTR [end_ptr]
-        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
-        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
-        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]
-
-        lea             src_ptr,    [src_ptr+src_stride*4]
-
-        lea             end_ptr,    [ref_ptr+ref_stride*2]
-
-        movdqa          XMMWORD PTR [ref_ptr], xmm0
-        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
-        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
-        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
-        movdqa          XMMWORD PTR [end_ptr], xmm4
-        movdqa          XMMWORD PTR [end_ptr + 16], xmm5
-        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
-        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
-
-        lea             ref_ptr,    [ref_ptr+ref_stride*4]
-
-        sub             height,     4
-        cmp             height,     4
-        jge             .block_copy_sse3_loopx4
-
-        ;Check to see if there are more rows that need to be copied.
-        cmp             height, 0
-        je              .copy_is_done
-
-.block_copy_sse3_loop:
-        movdqu          xmm0,       XMMWORD PTR [src_ptr]
-        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
-        lea             src_ptr,    [src_ptr+src_stride]
-
-        movdqa          XMMWORD PTR [ref_ptr], xmm0
-        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
-        lea             ref_ptr,    [ref_ptr+ref_stride]
-
-        sub             height,     1
-        jne             .block_copy_sse3_loop
-
-.copy_is_done:
-    STACK_FRAME_DESTROY_X3
-
-;void vp9_sad16x16x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr_base,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad16x16x4d_sse3)
-sym(vp9_sad16x16x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm4
-        psrldq          xmm4,       8
-
-        paddw           xmm0,       xmm4
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+8],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+12],   xmm0
-
-    STACK_FRAME_DESTROY_X4
-
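The x4d variants differ from x3: the third argument is a base pointer to an array of four independent reference pointers (see LOAD_X4_ADDRESSES), and one SAD is returned per pointer; a reference loop under that reading (hypothetical helper):

#include <stdlib.h>

static void sad16xhx4d(const unsigned char *src, int src_stride,
                       const unsigned char *const ref[4], int ref_stride,
                       int height, unsigned int results[4]) {
    for (int i = 0; i < 4; ++i) {
        unsigned int s = 0;
        for (int r = 0; r < height; ++r)
            for (int c = 0; c < 16; ++c)
                s += (unsigned int)abs(src[r * src_stride + c] -
                                       ref[i][r * ref_stride + c]);
        results[i] = s;
    }
}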
-;void vp9_sad16x8x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr_base,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad16x8x4d_sse3)
-sym(vp9_sad16x8x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm4
-        psrldq          xmm4,       8
-
-        paddw           xmm0,       xmm4
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+8],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+12],   xmm0
-
-    STACK_FRAME_DESTROY_X4
-
-;void vp9_sad8x16x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad8x16x4d_sse3)
-sym(vp9_sad8x16x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
-
-        punpckldq       mm4,        mm5
-        punpckldq       mm6,        mm7
-
-        movq            [rcx],      mm4
-        movq            [rcx+8],    mm6
-
-    STACK_FRAME_DESTROY_X4
-
-;void vp9_sad8x8x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad8x8x4d_sse3)
-sym(vp9_sad8x8x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
-        pop             rbp
-%endif
-        mov             rcx,        result_ptr
-
-        punpckldq       mm4,        mm5
-        punpckldq       mm6,        mm7
-
-        movq            [rcx],      mm4
-        movq            [rcx+8],    mm6
-
-    STACK_FRAME_DESTROY_X4
-
-;void vp9_sad4x4x4d_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vp9_sad4x4x4d_sse3)
-sym(vp9_sad4x4x4d_sse3):
-
-    STACK_FRAME_CREATE_X4
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm1,        DWORD PTR [r0_ptr]
-
-        movd            mm2,        DWORD PTR [src_ptr+src_stride]
-        movd            mm3,        DWORD PTR [r0_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movd            mm4,        DWORD PTR [r1_ptr]
-        movd            mm5,        DWORD PTR [r2_ptr]
-
-        movd            mm6,        DWORD PTR [r3_ptr]
-        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
-
-        movd            mm3,        DWORD PTR [r2_ptr+ref_stride]
-        movd            mm7,        DWORD PTR [r3_ptr+ref_stride]
-
-        psadbw          mm1,        mm0
-
-        punpcklbw       mm4,        mm2
-        punpcklbw       mm5,        mm3
-
-        punpcklbw       mm6,        mm7
-        psadbw          mm4,        mm0
-
-        psadbw          mm5,        mm0
-        psadbw          mm6,        mm0
-
-
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             r0_ptr,     [r0_ptr+ref_stride*2]
-
-        lea             r1_ptr,     [r1_ptr+ref_stride*2]
-        lea             r2_ptr,     [r2_ptr+ref_stride*2]
-
-        lea             r3_ptr,     [r3_ptr+ref_stride*2]
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm2,        DWORD PTR [r0_ptr]
-
-        movd            mm3,        DWORD PTR [src_ptr+src_stride]
-        movd            mm7,        DWORD PTR [r0_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm3
-        punpcklbw       mm2,        mm7
-
-        movd            mm3,        DWORD PTR [r1_ptr]
-        movd            mm7,        DWORD PTR [r2_ptr]
-
-        psadbw          mm2,        mm0
-%if ABI_IS_32BIT
-        mov             rax,        rbp
-
-        pop             rbp
-%define     ref_stride    rax
-%endif
-        mov             rsi,        result_ptr
-
-        paddw           mm1,        mm2
-        movd            [rsi],      mm1
-
-        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
-        movd            mm1,        DWORD PTR [r2_ptr+ref_stride]
-
-        punpcklbw       mm3,        mm2
-        punpcklbw       mm7,        mm1
-
-        psadbw          mm3,        mm0
-        psadbw          mm7,        mm0
-
-        movd            mm2,        DWORD PTR [r3_ptr]
-        movd            mm1,        DWORD PTR [r3_ptr+ref_stride]
-
-        paddw           mm3,        mm4
-        paddw           mm7,        mm5
-
-        movd            [rsi+4],    mm3
-        punpcklbw       mm2,        mm1
-
-        movd            [rsi+8],    mm7
-        psadbw          mm2,        mm0
-
-        paddw           mm2,        mm6
-        movd            [rsi+12],   mm2
-
-
-    STACK_FRAME_DESTROY_X4
-
--- a/vp8/encoder/x86/sad_sse4.asm
+++ /dev/null
@@ -1,353 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X8 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm1,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm1,       xmm2
-        paddw           xmm1,       xmm3
-        paddw           xmm1,       xmm4
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endmacro
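PROCESS_16X2X8 leans on SSE4.1 mpsadbw, which scores a fixed 4-byte block against eight overlapping 4-byte windows in a single instruction; the macro issues it with immediates 0x0 and 0x5 so the paired results cover a full 16-byte row at eight offsets. A byte-level model of one mpsadbw (illustration only):

#include <stdlib.h>

static void mpsadbw_model(const unsigned char win[11], const unsigned char blk[4],
                          unsigned short out[8]) {
    for (int i = 0; i < 8; ++i) {     /* eight sliding windows */
        unsigned short s = 0;
        for (int j = 0; j < 4; ++j)
            s += (unsigned short)abs(win[i + j] - blk[j]);
        out[i] = s;
    }
}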
-
-%macro PROCESS_8X2X8 1
-%if %1
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm1,       xmm2
-%else
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endif
-        movq            xmm0,       MMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_4X2X8 1
-%if %1
-        movd            xmm0,       [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        mpsadbw         xmm1,       xmm0,  0x0
-%else
-        movd            xmm0,       [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endif
-        movd            xmm0,       [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-
-;void vp9_sad16x16x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array);
-global sym(vp9_sad16x16x8_sse4)
-sym(vp9_sad16x16x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_16X2X8 1
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
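At the API level the x8 functions return eight SADs, one per consecutive horizontal offset, packed as 16-bit values in sad_array; a reference loop under that reading (hypothetical helper):

#include <stdlib.h>

static void sad16xhx8(const unsigned char *src, int src_stride,
                      const unsigned char *ref, int ref_stride,
                      int height, unsigned short sad_array[8]) {
    for (int off = 0; off < 8; ++off) {
        unsigned int s = 0;
        for (int r = 0; r < height; ++r)
            for (int c = 0; c < 16; ++c)
                s += (unsigned int)abs(src[r * src_stride + c] -
                                       ref[r * ref_stride + c + off]);
        sad_array[off] = (unsigned short)s;
    }
}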
-
-;void vp9_sad16x8x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vp9_sad16x8x8_sse4)
-sym(vp9_sad16x8x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_16X2X8 1
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-        PROCESS_16X2X8 0
-
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_sad8x8x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vp9_sad8x8x8_sse4)
-sym(vp9_sad8x8x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_8X2X8 1
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_sad8x16x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vp9_sad8x16x8_sse4)
-sym(vp9_sad8x16x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_8X2X8 1
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        PROCESS_8X2X8 0
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_sad4x4x8_sse4(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vp9_sad4x4x8_sse4)
-sym(vp9_sad4x4x8_sse4):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov             rsi,        arg(0)           ;src_ptr
-        mov             rdi,        arg(2)           ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        PROCESS_4X2X8 1
-        PROCESS_4X2X8 0
-
-        mov             rdi,        arg(4)           ;Results
-        movdqa          XMMWORD PTR [rdi],    xmm1
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ /dev/null
@@ -1,370 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X3 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm5,       XMMWORD PTR [rdi]
-        lddqu           xmm6,       XMMWORD PTR [rdi+1]
-        lddqu           xmm7,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm1,       XMMWORD PTR [rdi]
-        lddqu           xmm2,       XMMWORD PTR [rdi+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
-        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
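-; PROCESS_16X2X3_OFFSET: same as PROCESS_16X2X3, but for a reference pointer
-; rounded down to 16-byte alignment. Two aligned loads plus palignr by %2,
-; %2+1 and %2+2 reconstruct the three unaligned reference candidates.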
-%macro PROCESS_16X2X3_OFFSET 2
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm7,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm5,       xmm7
-        palignr         xmm5,       xmm4,       %2
-
-        movdqa          xmm6,       xmm7
-        palignr         xmm6,       xmm4,       (%2+1)
-
-        palignr         xmm7,       xmm4,       (%2+2)
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm3,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
-        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_16X16X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-%macro PROCESS_16X8X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-;void vp9_sad16x16x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results);
-global sym(vp9_sad16x16x3_ssse3)
-sym(vp9_sad16x16x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
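-        ; rdx = ref_ptr & 15. The position-independent jump table below
-        ; dispatches to a code path specialized for that alignment: offsets
-        ; 0-14 use aligned loads plus palignr (PROCESS_16X2X3_OFFSET), while
-        ; alignment 15 falls back to the lddqu variant (PROCESS_16X2X3).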
-        jmp .vp9_sad16x16x3_ssse3_skiptable
-.vp9_sad16x16x3_ssse3_jumptable:
-        dd .vp9_sad16x16x3_ssse3_aligned_by_0  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_1  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_2  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_3  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_4  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_5  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_6  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_7  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_8  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_9  - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump
-        dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump
-.vp9_sad16x16x3_ssse3_skiptable:
-
-        call .vp9_sad16x16x3_ssse3_do_jump
-.vp9_sad16x16x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X16X3_OFFSET 0,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 1,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 2,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 3,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 4,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 5,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 6,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 7,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 8,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 9,  .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3
-
-.vp9_sad16x16x3_ssse3_aligned_by_15:
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.vp9_sad16x16x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_sad16x8x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results);
-global sym(vp9_sad16x8x3_ssse3)
-sym(vp9_sad16x8x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
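-        ; same alignment-based dispatch as vp9_sad16x16x3_ssse3 above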
-        jmp .vp9_sad16x8x3_ssse3_skiptable
-.vp9_sad16x8x3_ssse3_jumptable:
-        dd .vp9_sad16x8x3_ssse3_aligned_by_0  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_1  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_2  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_3  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_4  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_5  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_6  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_7  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_8  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_9  - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump
-        dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump
-.vp9_sad16x8x3_ssse3_skiptable:
-
-        call .vp9_sad16x8x3_ssse3_do_jump
-.vp9_sad16x8x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X8X3_OFFSET 0,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 1,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 2,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 3,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 4,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 5,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 6,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 7,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 8,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 9,  .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3
-
-.vp9_sad16x8x3_ssse3_aligned_by_15:
-
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.vp9_sad16x8x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
--- a/vp8/encoder/x86/ssim_opt.asm
+++ /dev/null
@@ -1,216 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; TABULATE_SSIM - sums sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
-%macro TABULATE_SSIM 0
-        paddusw         xmm15, xmm3  ; sum_s
-        paddusw         xmm14, xmm4  ; sum_r
-        movdqa          xmm1, xmm3
-        pmaddwd         xmm1, xmm1
-        paddd           xmm13, xmm1 ; sum_sq_s
-        movdqa          xmm2, xmm4
-        pmaddwd         xmm2, xmm2
-        paddd           xmm12, xmm2 ; sum_sq_r
-        pmaddwd         xmm3, xmm4
-        paddd           xmm11, xmm3  ; sum_sxr
-%endmacro
-
-; Sum across the register %1, reducing four double words to one q word (xmm0 must be zero)
-%macro SUM_ACROSS_Q 1
-        movdqa          xmm2,%1
-        punpckldq       %1,xmm0
-        punpckhdq       xmm2,xmm0
-        paddq           %1,xmm2
-        movdqa          xmm2,%1
-        punpcklqdq      %1,xmm0
-        punpckhqdq      xmm2,xmm0
-        paddq           %1,xmm2
-%endmacro
-
-; Sum across the register %1 starting with words (widen to double words, then SUM_ACROSS_Q)
-%macro SUM_ACROSS_W 1
-        movdqa          xmm1, %1
-        punpcklwd       %1,xmm0
-        punpckhwd       xmm1,xmm0
-        paddd           %1, xmm1
-        SUM_ACROSS_Q    %1
-%endmacro
-;void vp9_ssim_parms_16x16_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp,
-;    unsigned long *sum_s,
-;    unsigned long *sum_r,
-;    unsigned long *sum_sq_s,
-;    unsigned long *sum_sq_r,
-;    unsigned long *sum_sxr);
-;
-; TODO: Use parm passing through structure; probably don't need the pxors
-; (calling app will initialize to 0). Could easily fit everything in sse2
-; without too much hassle, and can probably do better estimates with psadbw
-; or pavgb. At this point this is just meant to be a first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-global sym(vp9_ssim_parms_16x16_sse2)
-sym(vp9_ssim_parms_16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 16      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movdqu          xmm5, [rsi]
-    movdqu          xmm6, [rdi]
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpckhbw       xmm3, xmm0 ; high_s
-    punpckhbw       xmm4, xmm0 ; high_r
-
-    TABULATE_SSIM
-
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_ssim_parms_8x8_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp,
-;    unsigned long *sum_s,
-;    unsigned long *sum_r,
-;    unsigned long *sum_sq_s,
-;    unsigned long *sum_sq_r,
-;    unsigned long *sum_sxr);
-;
-; TODO: Use parm passing through structure; probably don't need the pxors
-; (calling app will initialize to 0). Could easily fit everything in sse2
-; without too much hassle, and can probably do better estimates with psadbw
-; or pavgb. At this point this is just meant to be a first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-global sym(vp9_ssim_parms_8x8_sse2)
-sym(vp9_ssim_parms_8x8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 8      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movq            xmm3, [rsi]
-    movq            xmm4, [rdi]
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ /dev/null
@@ -1,432 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
-;                            short *diff, unsigned char *Predictor,
-;                            int pitch);
-global sym(vp9_subtract_b_mmx_impl)
-sym(vp9_subtract_b_mmx_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov     rdi,        arg(2) ;diff
-        mov     rax,        arg(3) ;Predictor
-        mov     rsi,        arg(0) ;z
-        movsxd  rdx,        dword ptr arg(1);src_stride;
-        movsxd  rcx,        dword ptr arg(4);pitch
-        pxor    mm7,        mm7
-
-        movd    mm0,        [rsi]
-        movd    mm1,        [rax]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi],      mm0
-
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*2],mm0
-
-
-        movd    mm0,        [rsi+rdx*2]
-        movd    mm1,        [rax+rcx*2]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*4],        mm0
-
-        lea     rsi,        [rsi+rdx*2]
-        lea     rcx,        [rcx+rcx*2]
-
-
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*2],        mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
-global sym(vp9_subtract_mby_mmx)
-sym(vp9_subtract_mby_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-
-            mov         rsi,            arg(1) ;src
-            mov         rdi,            arg(0) ;diff
-
-            mov         rax,            arg(2) ;pred
-            movsxd      rdx,            dword ptr arg(3) ;stride
-
-            mov         rcx,            16
-            pxor        mm0,            mm0
-
-.submby_loop:
-
-            movq        mm1,            [rsi]
-            movq        mm3,            [rax]
-
-            movq        mm2,            mm1
-            movq        mm4,            mm3
-
-            punpcklbw   mm1,            mm0
-            punpcklbw   mm3,            mm0
-
-            punpckhbw   mm2,            mm0
-            punpckhbw   mm4,            mm0
-
-            psubw       mm1,            mm3
-            psubw       mm2,            mm4
-
-            movq        [rdi],          mm1
-            movq        [rdi+8],        mm2
-
-
-            movq        mm1,            [rsi+8]
-            movq        mm3,            [rax+8]
-
-            movq        mm2,            mm1
-            movq        mm4,            mm3
-
-            punpcklbw   mm1,            mm0
-            punpcklbw   mm3,            mm0
-
-            punpckhbw   mm2,            mm0
-            punpckhbw   mm4,            mm0
-
-            psubw       mm1,            mm3
-            psubw       mm2,            mm4
-
-            movq        [rdi+16],       mm1
-            movq        [rdi+24],       mm2
-
-
-            add         rdi,            32
-            add         rax,            16
-
-            lea         rsi,            [rsi+rdx]
-
-            sub         rcx,            1
-            jnz         .submby_loop
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-global sym(vp9_subtract_mbuv_mmx)
-sym(vp9_subtract_mbuv_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push rsi
-    push rdi
-    ; end prolog
-
-    ;short *udiff = diff + 256;
-    ;short *vdiff = diff + 320;
-    ;unsigned char *upred = pred + 256;
-    ;unsigned char *vpred = pred + 320;
-
-        ;unsigned char  *z    = usrc;
-        ;unsigned short *diff = udiff;
-        ;unsigned char  *Predictor= upred;
-
-            mov     rdi,        arg(0) ;diff
-            mov     rax,        arg(3) ;pred
-            mov     rsi,        arg(1) ;z = usrc
-            add     rdi,        256*2  ;diff = diff + 256 (shorts)
-            add     rax,        256    ;Predictor = pred + 256
-            movsxd  rdx,        dword ptr arg(4) ;stride;
-            pxor    mm7,        mm7
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-
-            add     rdi,        64
-            add     rax,        32
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-        ;unsigned char  *z    = vsrc;
-        ;unsigned short *diff = vdiff;
-        ;unsigned char  *Predictor= vpred;
-
-            mov     rdi,        arg(0) ;diff
-            mov     rax,        arg(3) ;pred
-            mov     rsi,        arg(2) ;z = vsrc
-            add     rdi,        320*2  ;diff = diff + 320 (shorts)
-            add     rax,        320    ;Predictor = pred + 320
-            movsxd  rdx,        dword ptr arg(4) ;stride;
-            pxor    mm7,        mm7
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-
-            add     rdi,        64
-            add     rax,        32
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ /dev/null
@@ -1,356 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
-;                            short *diff, unsigned char *Predictor,
-;                            int pitch);
-global sym(vp9_subtract_b_sse2_impl)
-sym(vp9_subtract_b_sse2_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov     rdi,        arg(2) ;diff
-        mov     rax,        arg(3) ;Predictor
-        mov     rsi,        arg(0) ;z
-        movsxd  rdx,        dword ptr arg(1);src_stride;
-        movsxd  rcx,        dword ptr arg(4);pitch
-        pxor    mm7,        mm7
-
-        movd    mm0,        [rsi]
-        movd    mm1,        [rax]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi],      mm0
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*2], mm0
-
-        movd    mm0,        [rsi+rdx*2]
-        movd    mm1,        [rax+rcx*2]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*4], mm0
-
-        lea     rsi,        [rsi+rdx*2]
-        lea     rcx,        [rcx+rcx*2]
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*2], mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
-global sym(vp9_subtract_mby_sse2)
-sym(vp9_subtract_mby_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-            mov         rsi,            arg(1) ;src
-            mov         rdi,            arg(0) ;diff
-
-            mov         rax,            arg(2) ;pred
-            movsxd      rdx,            dword ptr arg(3) ;stride
-
-            mov         rcx,            8      ; do two lines at one time
-
-.submby_loop:
-            movdqa      xmm0,           XMMWORD PTR [rsi]   ; src
-            movdqa      xmm1,           XMMWORD PTR [rax]   ; pred
-
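-            ; psubb wraps mod 256; recover signed 16-bit differences by
-            ; biasing both operands with 0x80, comparing to get a sign mask,
-            ; and interleaving that mask in as the high byte of each result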
-            movdqa      xmm2,           xmm0
-            psubb       xmm0,           xmm1
-
-            pxor        xmm1,           [GLOBAL(t80)]   ;convert to signed values
-            pxor        xmm2,           [GLOBAL(t80)]
-            pcmpgtb     xmm1,           xmm2            ; obtain sign information
-
-            movdqa      xmm2,    xmm0
-            movdqa      xmm3,    xmm1
-            punpcklbw   xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw   xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa      XMMWORD PTR [rdi],   xmm0
-            movdqa      XMMWORD PTR [rdi +16], xmm2
-
-            movdqa      xmm4,           XMMWORD PTR [rsi + rdx]
-            movdqa      xmm5,           XMMWORD PTR [rax + 16]
-
-            movdqa      xmm6,           xmm4
-            psubb       xmm4,           xmm5
-
-            pxor        xmm5,           [GLOBAL(t80)]   ;convert to signed values
-            pxor        xmm6,           [GLOBAL(t80)]
-            pcmpgtb     xmm5,           xmm6            ; obtain sign information
-
-            movdqa      xmm6,    xmm4
-            movdqa      xmm7,    xmm5
-            punpcklbw   xmm4,    xmm5            ; put sign back to subtraction
-            punpckhbw   xmm6,    xmm7            ; put sign back to subtraction
-
-            movdqa      XMMWORD PTR [rdi +32], xmm4
-            movdqa      XMMWORD PTR [rdi +48], xmm6
-
-            add         rdi,            64
-            add         rax,            32
-            lea         rsi,            [rsi+rdx*2]
-
-            sub         rcx,            1
-            jnz         .submby_loop
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-global sym(vp9_subtract_mbuv_sse2)
-sym(vp9_subtract_mbuv_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
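-            ; macroblock diff layout: Y occupies diff[0..255], U diff[256..319],
-            ; V diff[320..383] (shorts); pred uses the same element offsets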
-            mov     rdi,        arg(0) ;diff
-            mov     rax,        arg(3) ;pred
-            mov     rsi,        arg(1) ;z = usrc
-            add     rdi,        256*2  ;diff = diff + 256 (shorts)
-            add     rax,        256    ;Predictor = pred + 256
-            movsxd  rdx,        dword ptr arg(4) ;stride;
-            lea     rcx,        [rdx + rdx*2]
-
-            ;u
-            ;line 0 1
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi],   xmm0
-            movdqa     XMMWORD PTR [rdi +16],   xmm2
-
-            ;line 2 3
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+16]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 32],   xmm0
-            movdqa     XMMWORD PTR [rdi + 48],   xmm2
-
-            ;line 4 5
-            lea        rsi,     [rsi + rdx*4]
-
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax + 32]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 64],   xmm0
-            movdqa     XMMWORD PTR [rdi + 80],   xmm2
-
-            ;line 6 7
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+ 48]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 96],   xmm0
-            movdqa     XMMWORD PTR [rdi + 112],  xmm2
-
-            ;v
-            mov     rsi,        arg(2) ;z = vsrc
-            add     rdi,        64*2  ;diff = diff + 320 (shorts)
-            add     rax,        64    ;Predictor = pred + 320
-
-            ;line 0 1
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi],   xmm0
-            movdqa     XMMWORD PTR [rdi +16],   xmm2
-
-            ;line 2 3
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+16]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 32],   xmm0
-            movdqa     XMMWORD PTR [rdi + 48],   xmm2
-
-            ;line 4 5
-            lea        rsi,     [rsi + rdx*4]
-
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax + 32]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 64],   xmm0
-            movdqa     XMMWORD PTR [rdi + 80],   xmm2
-
-            ;line 6 7
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+ 48]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 96],   xmm0
-            movdqa     XMMWORD PTR [rdi + 112],  xmm2
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-t80:
-    times 16 db 0x80
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ /dev/null
@@ -1,207 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; void vp9_temporal_filter_apply_sse2 | arg
-;  (unsigned char  *frame1,           |  0
-;   unsigned int    stride,           |  1
-;   unsigned char  *frame2,           |  2
-;   unsigned int    block_size,       |  3
-;   int             strength,         |  4
-;   int             filter_weight,    |  5
-;   unsigned int   *accumulator,      |  6
-;   unsigned short *count)            |  7
-global sym(vp9_temporal_filter_apply_sse2)
-sym(vp9_temporal_filter_apply_sse2):
-
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ALIGN_STACK 16, rax
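-    ; byte offsets of locals in the 16-byte-aligned scratch area below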
-    %define block_size    0
-    %define strength      16
-    %define filter_weight 32
-    %define rounding_bit  48
-    %define rbp_backup    64
-    %define stack_size    80
-    sub         rsp,           stack_size
-    mov         [rsp + rbp_backup], rbp
-    ; end prolog
-
-        mov         rdx,            arg(3)
-        mov         [rsp + block_size], rdx
-        movd        xmm6,            arg(4)
-        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
-
-        ; calculate the rounding bit outside the loop
-        ; 0x8000 >> (16 - strength)
-        mov         rdx,            16
-        sub         rdx,            arg(4) ; 16 - strength
-        movd        xmm4,           rdx    ; can't use rdx w/ shift
-        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
-        psrlw       xmm5,           xmm4
-        movdqa      [rsp + rounding_bit], xmm5
-
-        mov         rsi,            arg(0) ; src/frame1
-        mov         rdx,            arg(2) ; predictor frame
-        mov         rdi,            arg(6) ; accumulator
-        mov         rax,            arg(7) ; count
-
-        ; dup the filter weight and store for later
-        movd        xmm0,           arg(5) ; filter_weight
-        pshuflw     xmm0,           xmm0, 0
-        punpcklwd   xmm0,           xmm0
-        movdqa      [rsp + filter_weight], xmm0
-
-        mov         rbp,            arg(1) ; stride
-        pxor        xmm7,           xmm7   ; zero for extraction
-
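-        ; rcx = end of the predictor block: 16*16 bytes for 16x16 blocks,
-        ; 8*8 bytes for 8x8; the loop runs until rdx (pred cursor) reaches it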
-        lea         rcx,            [rdx + 16*16*1]
-        cmp         dword ptr [rsp + block_size], 8
-        jne         .temporal_filter_apply_load_16
-        lea         rcx,            [rdx + 8*8*1]
-
-.temporal_filter_apply_load_8:
-        movq        xmm0,           [rsi]  ; first row
-        lea         rsi,            [rsi + rbp] ; += stride
-        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
-        movq        xmm1,           [rsi]  ; second row
-        lea         rsi,            [rsi + rbp] ; += stride
-        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
-        jmp         .temporal_filter_apply_load_finished
-
-.temporal_filter_apply_load_16:
-        movdqa      xmm0,           [rsi]  ; src (frame1)
-        lea         rsi,            [rsi + rbp] ; += stride
-        movdqa      xmm1,           xmm0
-        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
-        punpckhbw   xmm1,           xmm7   ; src[ 8-15]
-
-.temporal_filter_apply_load_finished:
-        movdqa      xmm2,           [rdx]  ; predictor (frame2)
-        movdqa      xmm3,           xmm2
-        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
-        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]
-
-        ; modifier = src_byte - pixel_value
-        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
-        psubw       xmm1,           xmm3   ; src - pred[ 8-15]
-
-        ; modifier *= modifier
-        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
-        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2
-
-        ; modifier *= 3
-        pmullw      xmm0,           [GLOBAL(_const_3w)]
-        pmullw      xmm1,           [GLOBAL(_const_3w)]
-
-        ; modifier += 0x8000 >> (16 - strength)
-        paddw       xmm0,           [rsp + rounding_bit]
-        paddw       xmm1,           [rsp + rounding_bit]
-
-        ; modifier >>= strength
-        psrlw       xmm0,           [rsp + strength]
-        psrlw       xmm1,           [rsp + strength]
-
-        ; modifier = 16 - modifier
-        ; saturation takes care of modifier > 16
-        movdqa      xmm3,           [GLOBAL(_const_16w)]
-        movdqa      xmm2,           [GLOBAL(_const_16w)]
-        psubusw     xmm3,           xmm1
-        psubusw     xmm2,           xmm0
-
-        ; modifier *= filter_weight
-        pmullw      xmm2,           [rsp + filter_weight]
-        pmullw      xmm3,           [rsp + filter_weight]
-
-        ; count
-        movdqa      xmm4,           [rax]
-        movdqa      xmm5,           [rax+16]
-        ; += modifier
-        paddw       xmm4,           xmm2
-        paddw       xmm5,           xmm3
-        ; write back
-        movdqa      [rax],          xmm4
-        movdqa      [rax+16],       xmm5
-        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))
-
-        ; load and extract the predictor up to shorts
-        pxor        xmm7,           xmm7
-        movdqa      xmm0,           [rdx]
-        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
-        movdqa      xmm1,           xmm0
-        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
-        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]
-
-        ; modifier *= pixel_value
-        pmullw      xmm0,           xmm2
-        pmullw      xmm1,           xmm3
-
-        ; expand to double words
-        movdqa      xmm2,           xmm0
-        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
-        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
-        movdqa      xmm3,           xmm1
-        punpcklwd   xmm1,           xmm7   ; [ 8-11]
-        punpckhwd   xmm3,           xmm7   ; [12-15]
-
-        ; accumulator
-        movdqa      xmm4,           [rdi]
-        movdqa      xmm5,           [rdi+16]
-        movdqa      xmm6,           [rdi+32]
-        movdqa      xmm7,           [rdi+48]
-        ; += modifier
-        paddd       xmm4,           xmm0
-        paddd       xmm5,           xmm2
-        paddd       xmm6,           xmm1
-        paddd       xmm7,           xmm3
-        ; write back
-        movdqa      [rdi],          xmm4
-        movdqa      [rdi+16],       xmm5
-        movdqa      [rdi+32],       xmm6
-        movdqa      [rdi+48],       xmm7
-        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
-
-        cmp         rdx,            rcx
-        je          .temporal_filter_apply_epilog
-        pxor        xmm7,           xmm7   ; zero for extraction
-        cmp         dword ptr [rsp + block_size], 16
-        je          .temporal_filter_apply_load_16
-        jmp         .temporal_filter_apply_load_8
-
-.temporal_filter_apply_epilog:
-    ; begin epilog
-    mov         rbp,            [rsp + rbp_backup]
-    add         rsp,            stack_size
-    pop         rsp
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-_const_3w:
-    times 8 dw 3
-align 16
-_const_top_bit:
-    times 8 dw 1<<15
-align 16
-_const_16w:
-    times 8 dw 16
--- a/vp8/encoder/x86/temporal_filter_x86.h
+++ /dev/null
@@ -1,27 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_TEMPORAL_FILTER_X86_H
-#define __INC_TEMPORAL_FILTER_X86_H
-
-#if HAVE_SSE2
-extern prototype_apply(vp9_temporal_filter_apply_sse2);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef  vp9_temporal_filter_apply
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
-
-#endif
-
-#endif
-
-#endif // __INC_TEMPORAL_FILTER_X86_H
--- a/vp8/encoder/x86/variance_impl_mmx.asm
+++ /dev/null
@@ -1,851 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
-global sym(vp9_get_mb_ss_mmx)
-sym(vp9_get_mb_ss_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 8
-    ; end prolog
-
-        mov         rax, arg(0) ;src_ptr
-        mov         rcx, 16
-        pxor        mm4, mm4
-
-.NEXTROW:
-        movq        mm0, [rax]
-        movq        mm1, [rax+8]
-        movq        mm2, [rax+16]
-        movq        mm3, [rax+24]
-        pmaddwd     mm0, mm0
-        pmaddwd     mm1, mm1
-        pmaddwd     mm2, mm2
-        pmaddwd     mm3, mm3
-
-        paddd       mm4, mm0
-        paddd       mm4, mm1
-        paddd       mm4, mm2
-        paddd       mm4, mm3
-
-        add         rax, 32
-        dec         rcx
-        ja          .NEXTROW
-        movq        QWORD PTR [rsp], mm4
-
-        ;return sum[0]+sum[1];
-        movsxd      rax, dword ptr [rsp]
-        movsxd      rcx, dword ptr [rsp+4]
-        add         rax, rcx
-
-
-    ; begin epilog
-    add rsp, 8
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_get8x8var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vp9_get8x8var_mmx)
-sym(vp9_get8x8var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push rsi
-    push rdi
-    push rbx
-    sub         rsp, 16
-    ; end prolog
-
-
-        pxor        mm5, mm5                    ; Blank mm5 (difference accumulator)
-        pxor        mm6, mm6                    ; Blank mm6 (zero for unpacking)
-        pxor        mm7, mm7                    ; Blank mm7 (squared-difference accumulator)
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-        ; Row 1
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-
-        ; Row 2
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 3
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 4
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 5
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 6
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 7
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 8
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Now accumulate the final results.
-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
-        movsx       rdx, WORD PTR [rsp+8]
-        movsx       rcx, WORD PTR [rsp+10]
-        movsx       rbx, WORD PTR [rsp+12]
-        movsx       rax, WORD PTR [rsp+14]
-        add         rdx, rcx
-        add         rbx, rax
-        add         rdx, rbx    ;XSum
-        movsxd      rax, DWORD PTR [rsp]
-        movsxd      rcx, DWORD PTR [rsp+4]
-        add         rax, rcx    ;XXSum
-        mov         rsi, arg(4) ;SSE
-        mov         rdi, arg(5) ;Sum
-        mov         dword ptr [rsi], eax
-        mov         dword ptr [rdi], edx
-        xor         rax, rax    ; return 0
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
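-
-; Illustrative C sketch of the 8x8 variance helper above, assuming the
-; argument names from the prototype comment; the sum of differences and
-; the sum of squared differences are written through Sum and SSE, and
-; the return value is always 0:
-;
-;    for (i = 0; i < 8; i++) {
-;        for (j = 0; j < 8; j++) {
-;            int diff = src_ptr[j] - ref_ptr[j];
-;            *Sum += diff;
-;            *SSE += diff * diff;
-;        }
-;        src_ptr += source_stride;
-;        ref_ptr += recon_stride;
-;    }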
-
-
-
-;unsigned int
-;vp9_get4x4var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vp9_get4x4var_mmx)
-sym(vp9_get4x4var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push rsi
-    push rdi
-    push rbx
-    sub         rsp, 16
-    ; end prolog
-
-
-        pxor        mm5, mm5                    ; Blank mm5
-        pxor        mm6, mm6                    ; Blank mm6
-        pxor        mm7, mm7                    ; Blank mm7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-        ; Row 1
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-
-        ; Row 2
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 3
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 4
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-
-        ; Now accumulate the final results.
-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
-        movsx       rdx, WORD PTR [rsp+8]
-        movsx       rcx, WORD PTR [rsp+10]
-        movsx       rbx, WORD PTR [rsp+12]
-        movsx       rax, WORD PTR [rsp+14]
-        add         rdx, rcx
-        add         rbx, rax
-        add         rdx, rbx    ;XSum
-        movsxd      rax, DWORD PTR [rsp]
-        movsxd      rcx, DWORD PTR [rsp+4]
-        add         rax, rcx    ;XXSum
-        mov         rsi, arg(4) ;SSE
-        mov         rdi, arg(5) ;Sum
-        mov         dword ptr [rsi], eax
-        mov         dword ptr [rdi], edx
-        xor         rax, rax    ; return 0
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;unsigned int
-;vp9_get4x4sse_cs_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride
-;)
-global sym(vp9_get4x4sse_cs_mmx)
-sym(vp9_get4x4sse_cs_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    push rbx
-    ; end prolog
-
-
-        pxor        mm6, mm6                    ; Blank mm6
-        pxor        mm7, mm7                    ; Blank mm7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-        ; Row 1
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 2
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 3
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        punpcklbw   mm1, mm6
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 4
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        movq        mm0,    mm7                 ;
-        psrlq       mm7,    32
-
-        paddd       mm0,    mm7
-        movq        rax,    mm0
-
-
-    ; begin epilog
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
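-
-; Unlike the helpers above, this routine returns the 4x4 sum of squared
-; differences directly in rax rather than writing through a pointer.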
-
-%define mmx_filter_shift            7
-
-;void vp9_filter_block2d_bil4x4_var_mmx
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned short *HFilter,
-;    unsigned short *VFilter,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_filter_block2d_bil4x4_var_mmx)
-sym(vp9_filter_block2d_bil4x4_var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-
-        pxor            mm6,            mm6                 ;
-        pxor            mm7,            mm7                 ;
-
-        mov             rax,            arg(4) ;HFilter             ;
-        mov             rdx,            arg(5) ;VFilter             ;
-
-        mov             rsi,            arg(0) ;ref_ptr              ;
-        mov             rdi,            arg(2) ;src_ptr              ;
-
-        mov             rcx,            4                   ;
-        pxor            mm0,            mm0                 ;
-
-        movd            mm1,            [rsi]               ;
-        movd            mm3,            [rsi+1]             ;
-
-        punpcklbw       mm1,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        movq            mm5,            mm1
-
-%if ABI_IS_32BIT
-        add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ;
-%else
-        movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line    ;
-        add             rsi, r8
-%endif
-
-.filter_block2d_bil4x4_var_mmx_loop:
-
-        movd            mm1,            [rsi]               ;
-        movd            mm3,            [rsi+1]             ;
-
-        punpcklbw       mm1,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        movq            mm3,            mm5                 ;
-
-        movq            mm5,            mm1                 ;
-        pmullw          mm3,            [rdx]               ;
-
-        pmullw          mm1,            [rdx+8]             ;
-        paddw           mm1,            mm3                 ;
-
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm1,            mmx_filter_shift    ;
-
-        movd            mm3,            [rdi]               ;
-        punpcklbw       mm3,            mm0                 ;
-
-        psubw           mm1,            mm3                 ;
-        paddw           mm6,            mm1                 ;
-
-        pmaddwd         mm1,            mm1                 ;
-        paddd           mm7,            mm1                 ;
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-        add             rsi,            r8
-        add             rdi,            r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .filter_block2d_bil4x4_var_mmx_loop       ;
-
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rdi,            arg(6) ;sum
-        mov             rsi,            arg(7) ;sumsquared
-
-        movd            dword ptr [rdi],          mm2                 ;
-        movd            dword ptr [rsi],          mm4                 ;
-
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
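-
-; Per output pixel, the two-tap bilinear filtering above follows this
-; pattern (illustrative; mmx_bi_rd supplies the rounding constant 64
-; and mmx_filter_shift is 7, as defined in this file):
-;
-;    filtered = (a * Filter[0] + b * Filter[1] + 64) >> 7;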
-
-
-
-
-;void vp9_filter_block2d_bil_var_mmx
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    unsigned short *HFilter,
-;    unsigned short *VFilter,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_filter_block2d_bil_var_mmx)
-sym(vp9_filter_block2d_bil_var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-        pxor            mm6,            mm6                 ;
-        pxor            mm7,            mm7                 ;
-        mov             rax,            arg(5) ;HFilter             ;
-
-        mov             rdx,            arg(6) ;VFilter             ;
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-
-        pxor            mm0,            mm0                 ;
-        movq            mm1,            [rsi]               ;
-
-        movq            mm3,            [rsi+1]             ;
-        movq            mm2,            mm1                 ;
-
-        movq            mm4,            mm3                 ;
-        punpcklbw       mm1,            mm0                 ;
-
-        punpckhbw       mm2,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        pmullw          mm2,            [rax]               ;
-        punpcklbw       mm3,            mm0                 ;
-
-        punpckhbw       mm4,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        pmullw          mm4,            [rax+8]             ;
-        paddw           mm1,            mm3                 ;
-
-        paddw           mm2,            mm4                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm2,            mmx_filter_shift    ;
-        movq            mm5,            mm1
-
-        packuswb        mm5,            mm2                 ;
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        add             rsi,            r8
-%endif
-
-.filter_block2d_bil_var_mmx_loop:
-
-        movq            mm1,            [rsi]               ;
-        movq            mm3,            [rsi+1]             ;
-
-        movq            mm2,            mm1                 ;
-        movq            mm4,            mm3                 ;
-
-        punpcklbw       mm1,            mm0                 ;
-        punpckhbw       mm2,            mm0                 ;
-
-        pmullw          mm1,            [rax]               ;
-        pmullw          mm2,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        pmullw          mm3,            [rax+8]             ;
-        pmullw          mm4,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm2,            mm4                 ;
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm1,            mmx_filter_shift    ;
-
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm2,            mmx_filter_shift    ;
-
-        movq            mm3,            mm5                 ;
-        movq            mm4,            mm5                 ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        movq            mm5,            mm1                 ;
-        packuswb        mm5,            mm2                 ;
-
-        pmullw          mm3,            [rdx]               ;
-        pmullw          mm4,            [rdx]               ;
-
-        pmullw          mm1,            [rdx+8]             ;
-        pmullw          mm2,            [rdx+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm2,            mm4                 ;
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        psraw           mm2,            mmx_filter_shift    ;
-
-        movq            mm3,            [rdi]               ;
-        movq            mm4,            mm3                 ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        psubw           mm1,            mm3                 ;
-        psubw           mm2,            mm4                 ;
-
-        paddw           mm6,            mm1                 ;
-        pmaddwd         mm1,            mm1                 ;
-
-        paddw           mm6,            mm2                 ;
-        pmaddwd         mm2,            mm2                 ;
-
-        paddd           mm7,            mm1                 ;
-        paddd           mm7,            mm2                 ;
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
-        add             rsi,            r8
-        add             rdi,            r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .filter_block2d_bil_var_mmx_loop       ;
-
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rdi,            arg(7) ;sum
-        mov             rsi,            arg(8) ;sumsquared
-
-        movd            dword ptr [rdi],          mm2                 ;
-        movd            dword ptr [rsi],          mm4                 ;
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-;short mmx_bi_rd[4] = { 64, 64, 64, 64};
-align 16
-mmx_bi_rd:
-    times 4 dw 64
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ /dev/null
@@ -1,1367 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define xmm_filter_shift            7
-
-;unsigned int vp9_get_mb_ss_sse2
-;(
-;    short *src_ptr
-;)
-global sym(vp9_get_mb_ss_sse2)
-sym(vp9_get_mb_ss_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 1
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-
-        mov         rax, arg(0) ;[src_ptr]
-        mov         rcx, 8
-        pxor        xmm4, xmm4
-
-.NEXTROW:
-        movdqa      xmm0, [rax]
-        movdqa      xmm1, [rax+16]
-        movdqa      xmm2, [rax+32]
-        movdqa      xmm3, [rax+48]
-        pmaddwd     xmm0, xmm0
-        pmaddwd     xmm1, xmm1
-        pmaddwd     xmm2, xmm2
-        pmaddwd     xmm3, xmm3
-
-        paddd       xmm0, xmm1
-        paddd       xmm2, xmm3
-        paddd       xmm4, xmm0
-        paddd       xmm4, xmm2
-
-        add         rax, 0x40
-        dec         rcx
-        ja          .NEXTROW
-
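-        ; fold the four 32-bit partial sums in xmm4 down to one scalar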
-        movdqa      xmm3,xmm4
-        psrldq      xmm4,8
-        paddd       xmm4,xmm3
-        movdqa      xmm3,xmm4
-        psrldq      xmm4,4
-        paddd       xmm4,xmm3
-        movq        rax,xmm4
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp9_get16x16var_sse2
-;(
-;    unsigned char   *  src_ptr,
-;    int             source_stride,
-;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
-;    unsigned int    *  SSE,
-;    int             *  Sum
-;)
-global sym(vp9_get16x16var_sse2)
-sym(vp9_get16x16var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov         rsi,            arg(0) ;[src_ptr]
-        mov         rdi,            arg(2) ;[ref_ptr]
-
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
-
-        ; Prefetch data
-        lea             rcx,    [rax+rax*2]
-        prefetcht0      [rsi]
-        prefetcht0      [rsi+rax]
-        prefetcht0      [rsi+rax*2]
-        prefetcht0      [rsi+rcx]
-        lea             rbx,    [rsi+rax*4]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rax]
-        prefetcht0      [rbx+rax*2]
-        prefetcht0      [rbx+rcx]
-
-        lea             rcx,    [rdx+rdx*2]
-        prefetcht0      [rdi]
-        prefetcht0      [rdi+rdx]
-        prefetcht0      [rdi+rdx*2]
-        prefetcht0      [rdi+rcx]
-        lea             rbx,    [rdi+rdx*4]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rdx]
-        prefetcht0      [rbx+rdx*2]
-        prefetcht0      [rbx+rcx]
-
-        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
-        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
-
-        pxor        xmm6,           xmm6                        ; clear xmm6 for accumulating sse
-        mov         rcx,            16
-
-.var16loop:
-        movdqu      xmm1,           XMMWORD PTR [rsi]
-        movdqu      xmm2,           XMMWORD PTR [rdi]
-
-        prefetcht0      [rsi+rax*8]
-        prefetcht0      [rdi+rdx*8]
-
-        movdqa      xmm3,           xmm1
-        movdqa      xmm4,           xmm2
-
-
-        punpcklbw   xmm1,           xmm0
-        punpckhbw   xmm3,           xmm0
-
-        punpcklbw   xmm2,           xmm0
-        punpckhbw   xmm4,           xmm0
-
-
-        psubw       xmm1,           xmm2
-        psubw       xmm3,           xmm4
-
-        paddw       xmm7,           xmm1
-        pmaddwd     xmm1,           xmm1
-
-        paddw       xmm7,           xmm3
-        pmaddwd     xmm3,           xmm3
-
-        paddd       xmm6,           xmm1
-        paddd       xmm6,           xmm3
-
-        add         rsi,            rax
-        add         rdi,            rdx
-
-        sub         rcx,            1
-        jnz         .var16loop
-
-
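-        ; horizontally reduce the accumulators: xmm7 (16-bit diff sums)
-        ; feeds Sum, xmm6 (32-bit squared-diff sums) feeds SSE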
-        movdqa      xmm1,           xmm6
-        pxor        xmm6,           xmm6
-
-        pxor        xmm5,           xmm5
-        punpcklwd   xmm6,           xmm7
-
-        punpckhwd   xmm5,           xmm7
-        psrad       xmm5,           16
-
-        psrad       xmm6,           16
-        paddd       xmm6,           xmm5
-
-        movdqa      xmm2,           xmm1
-        punpckldq   xmm1,           xmm0
-
-        punpckhdq   xmm2,           xmm0
-        movdqa      xmm7,           xmm6
-
-        paddd       xmm1,           xmm2
-        punpckldq   xmm6,           xmm0
-
-        punpckhdq   xmm7,           xmm0
-        paddd       xmm6,           xmm7
-
-        movdqa      xmm2,           xmm1
-        movdqa      xmm7,           xmm6
-
-        psrldq      xmm1,           8
-        psrldq      xmm6,           8
-
-        paddd       xmm7,           xmm6
-        paddd       xmm1,           xmm2
-
-        mov         rax,            arg(5) ;[Sum]
-        mov         rdi,            arg(4) ;[SSE]
-
-        movd DWORD PTR [rax],       xmm7
-        movd DWORD PTR [rdi],       xmm1
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    pop rbx
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
-;unsigned int vp9_get8x8var_sse2
-;(
-;    unsigned char   *  src_ptr,
-;    int             source_stride,
-;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
-;    unsigned int    *  SSE,
-;    int             *  Sum
-;)
-global sym(vp9_get8x8var_sse2)
-sym(vp9_get8x8var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-        mov         rsi,            arg(0) ;[src_ptr]
-        mov         rdi,            arg(2) ;[ref_ptr]
-
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
-
-        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
-        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
-
-        movq        xmm1,           QWORD PTR [rsi]
-        movq        xmm2,           QWORD PTR [rdi]
-
-        punpcklbw   xmm1,           xmm0
-        punpcklbw   xmm2,           xmm0
-
-        psubsw      xmm1,           xmm2
-        paddw       xmm7,           xmm1
-
-        pmaddwd     xmm1,           xmm1
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        movq        xmm2,           QWORD PTR[rsi + rax * 2]
-        movq        xmm3,           QWORD PTR[rdi + rdx * 2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-        movq        xmm2,           QWORD PTR[rsi + rax *2]
-        movq        xmm3,           QWORD PTR[rdi + rdx *2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-        movq        xmm2,           QWORD PTR[rsi + rax *2]
-        movq        xmm3,           QWORD PTR[rdi + rdx *2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        movdqa      xmm6,           xmm7
-        punpcklwd   xmm6,           xmm0
-
-        punpckhwd   xmm7,           xmm0
-        movdqa      xmm2,           xmm1
-
-        paddw       xmm6,           xmm7
-        punpckldq   xmm1,           xmm0
-
-        punpckhdq   xmm2,           xmm0
-        movdqa      xmm7,           xmm6
-
-        paddd       xmm1,           xmm2
-        punpckldq   xmm6,           xmm0
-
-        punpckhdq   xmm7,           xmm0
-        paddw       xmm6,           xmm7
-
-        movdqa      xmm2,           xmm1
-        movdqa      xmm7,           xmm6
-
-        psrldq      xmm1,           8
-        psrldq      xmm6,           8
-
-        paddw       xmm7,           xmm6
-        paddd       xmm1,           xmm2
-
-        mov         rax,            arg(5) ;[Sum]
-        mov         rdi,            arg(4) ;[SSE]
-
-        movq        rdx,            xmm7
-        movsx       rcx,            dx
-
-        mov  dword ptr [rax],       ecx
-        movd DWORD PTR [rdi],       xmm1
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_filter_block2d_bil_var_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int  xoffset,
-;    int  yoffset,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
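-;
-; The routine dispatches on the offsets: both nonzero takes the
-; two-pass path; xoffset=0 with yoffset!=0 takes the second-pass-only
-; path; xoffset!=0 with yoffset=0 takes the first-pass-only path; both
-; zero reduces to a plain full-pixel variance.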
-global sym(vp9_filter_block2d_bil_var_sse2)
-sym(vp9_filter_block2d_bil_var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    push rbx
-    ; end prolog
-
-        pxor            xmm6,           xmm6                 ;
-        pxor            xmm7,           xmm7                 ;
-
-        lea             rsi,            [GLOBAL(xmm_bi_rd)]  ; rounding
-        movdqa          xmm4,           XMMWORD PTR [rsi]
-
-        lea             rcx,            [GLOBAL(bilinear_filters_sse2)]
-        movsxd          rax,            dword ptr arg(5)     ; xoffset
-
-        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
-        je              filter_block2d_bil_var_sse2_sp_only
-
-        shl             rax,            5                    ; point to filter coeff with xoffset
-        lea             rax,            [rax + rcx]          ; HFilter
-
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
-        je              filter_block2d_bil_var_sse2_fp_only
-
-        shl             rdx,            5
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-
-        pxor            xmm0,           xmm0                 ;
-        movq            xmm1,           QWORD PTR [rsi]      ;
-        movq            xmm3,           QWORD PTR [rsi+1]    ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]                ;
-        punpcklbw       xmm3,           xmm0
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift     ;
-        movdqa          xmm5,           xmm1
-
-        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
-        lea             rsi,            [rsi + rbx]
-%if ABI_IS_32BIT=0
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-filter_block2d_bil_var_sse2_loop:
-        movq            xmm1,           QWORD PTR [rsi]               ;
-        movq            xmm3,           QWORD PTR [rsi+1]             ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4               ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movdqa          xmm3,           xmm5                 ;
-        movdqa          xmm5,           xmm1                 ;
-
-        pmullw          xmm3,           [rdx]               ;
-        pmullw          xmm1,           [rdx+16]             ;
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
-%if ABI_IS_32BIT
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_var_sse2_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_sp_only:
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; skip all if both xoffset=0 and yoffset=0
-        je              filter_block2d_bil_var_sse2_full_pixel
-
-        shl             rdx,            5
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                 ;
-        movq            xmm1,           QWORD PTR [rsi]      ;
-        punpcklbw       xmm1,           xmm0                 ;
-
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-        lea             rsi,            [rsi + rax]
-
-filter_block2d_bil_sp_only_loop:
-        movq            xmm3,           QWORD PTR [rsi]             ;
-        punpcklbw       xmm3,           xmm0                 ;
-        movdqa          xmm5,           xmm3
-
-        pmullw          xmm1,           [rdx]               ;
-        pmullw          xmm3,           [rdx+16]             ;
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        movdqa          xmm1,           xmm5                 ;
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_sp_only_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_full_pixel:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-        pxor            xmm0,           xmm0                 ;
-
-filter_block2d_bil_full_pixel_loop:
-        movq            xmm1,           QWORD PTR [rsi]               ;
-        punpcklbw       xmm1,           xmm0                 ;
-
-        movq            xmm2,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm2,           xmm0                 ;
-
-        psubw           xmm1,           xmm2                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_full_pixel_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_fp_only:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                 ;
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-
-filter_block2d_bil_fp_only_loop:
-        movq            xmm1,           QWORD PTR [rsi]       ;
-        movq            xmm3,           QWORD PTR [rsi+1]     ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4  ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]     ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-        lea             rsi,            [rsi + rdx]
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_fp_only_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
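-; shared tail: every path above jumps here to fold the diff and
-; squared-diff accumulators into the sum / sumsquared outputs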
-filter_block2d_bil_variance:
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(7) ; sum
-        mov             rdi,            arg(8) ; sumsquared
-
-        movd            [rsi],          mm2    ; xsum
-        movd            [rdi],          mm4    ; xxsum
-
-    ; begin epilog
-    pop rbx
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_half_horiz_vert_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_vert_variance8x_h_sse2)
-sym(vp9_half_horiz_vert_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
-        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s9
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3) horizontal line 1
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-%else
-        add             rsi, r8
-%endif
-
-.half_horiz_vert_variance8x_h_1:
-
-        movq            xmm1,           QWORD PTR [rsi]     ;
-        movq            xmm2,           QWORD PTR [rsi+1]   ;
-        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2) horizontal line i+1
-
-        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the above
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
-
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_vert_variance8x_h_1     ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
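All of the half-pel kernels in this file are built on pavgb, which computes
(a + b + 1) >> 1. A scalar sketch of the (1/2, 1/2) case handled above, with
hypothetical names; note that the cascaded pavgb rounding differs slightly
from an exact bilinear filter:

#include <stdint.h>

static uint8_t avg_rnd(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);   /* pavgb rounding */
}

static void half_horiz_vert_variance_c(const uint8_t *ref, int ref_stride,
                                       const uint8_t *src, int src_stride,
                                       int width, int height,
                                       int *sum, unsigned int *sse) {
  int s = 0;
  unsigned int q = 0;
  for (int i = 0; i < height; i++) {
    for (int j = 0; j < width; j++) {
      uint8_t top = avg_rnd(ref[j], ref[j + 1]);                /* row i   */
      uint8_t bot = avg_rnd(ref[ref_stride + j],
                            ref[ref_stride + j + 1]);           /* row i+1 */
      int diff = avg_rnd(top, bot) - src[j];
      s += diff;
      q += (unsigned int)(diff * diff);
    }
    ref += ref_stride;
    src += src_stride;
  }
  *sum = s;
  *sse = q;
}
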
-;void vp9_half_horiz_vert_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_vert_variance16x_h_sse2)
-sym(vp9_half_horiz_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) horizontal line 1
-
-        lea             rsi,            [rsi + rax]
-
-.half_horiz_vert_variance16x_h_1:
-        movdqu          xmm1,           XMMWORD PTR [rsi]     ;
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;
-        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2) horizontal line i+1
-
-        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the above
-
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-
-        movq            xmm3,           QWORD PTR [rdi+8]
-        punpcklbw       xmm3,           xmm0
-        psubw           xmm4,           xmm3
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_vert_variance16x_h_1    ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_half_vert_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_vert_variance8x_h_sse2)
-sym(vp9_half_vert_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-.half_vert_variance8x_h_1:
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s7 of row i
-        movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = s0,s1,s2..s7 of row i+1
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
-
-        sub             rcx,            1                   ;
-        jnz             .half_vert_variance8x_h_1          ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_half_vert_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_vert_variance16x_h_sse2)
-sym(vp9_half_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0)              ;ref_ptr
-
-        mov             rdi,            arg(2)              ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)    ;Height
-        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        lea             rsi,            [rsi + rax]
-        pxor            xmm0,           xmm0
-
-.half_vert_variance16x_h_1:
-        movdqu          xmm3,           XMMWORD PTR [rsi]
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm2,           QWORD PTR [rdi]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm5,           xmm2
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm4,           xmm2
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm3
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1
-        jnz             .half_vert_variance16x_h_1
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_half_horiz_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_variance8x_h_sse2)
-sym(vp9_half_horiz_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-
-        pxor            xmm0,           xmm0                ;
-.half_horiz_variance8x_h_1:
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s7
-        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s8
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_variance8x_h_1          ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_half_horiz_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_variance16x_h_sse2)
-sym(vp9_half_horiz_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse accumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-.half_horiz_variance16x_h_1:
-        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
-        movdqa          xmm1,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm1,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        psubw           xmm1,           xmm2
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm1
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm1,           xmm1
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm1
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_variance16x_h_1         ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-;    short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64 };
-align 16
-xmm_bi_rd:
-    times 8 dw 64
-align 16
-bilinear_filters_sse2:
-    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
-    dw 120, 120, 120, 120, 120, 120, 120, 120,  8,  8,  8,  8,  8,  8,  8,  8
-    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
-    dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
-    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
-    dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
-    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
-    dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
-    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
-    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
-    dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
-    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
-    dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
-    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
-    dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
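Each row of bilinear_filters_sse2 is one tap pair (128 - 8*i, 8*i) with every
tap replicated eight times, so one multiply-add pass covers eight pixels;
xmm_bi_rd is the rounding constant for the >> 7 normalization. A scalar sketch
of a single tap, with illustrative names:

#include <stdint.h>

#define BIL_SHIFT 7     /* matches xmm_filter_shift */
#define BIL_ROUND 64    /* matches xmm_bi_rd: 1 << (BIL_SHIFT - 1) */

static uint8_t bilinear_tap(uint8_t a, uint8_t b, int offset /* 0..15 */) {
  int w1 = 8 * offset;          /* second tap                           */
  int w0 = 128 - w1;            /* first tap; pair sums to 128 = 1 << 7 */
  return (uint8_t)((a * w0 + b * w1 + BIL_ROUND) >> BIL_SHIFT);
}
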
--- a/vp8/encoder/x86/variance_impl_ssse3.asm
+++ /dev/null
@@ -1,372 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define xmm_filter_shift            7
-
-
-;void vp9_filter_block2d_bil_var_ssse3
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int  xoffset,
-;    int  yoffset,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-;Note: The filter coefficient at offset=0 is 128. Since pmaddubsw treats its
-;second operand as signed bytes, the zero offset must be handled separately.
-global sym(vp9_filter_block2d_bil_var_ssse3)
-sym(vp9_filter_block2d_bil_var_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6
-        pxor            xmm7,           xmm7
-
-        lea             rcx,            [GLOBAL(bilinear_filters_ssse3)]
-        movsxd          rax,            dword ptr arg(5)     ; xoffset
-
-        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
-        je              .filter_block2d_bil_var_ssse3_sp_only
-
-        shl             rax,            4                    ; point to filter coeff with xoffset
-        lea             rax,            [rax + rcx]          ; HFilter
-
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
-        je              .filter_block2d_bil_var_ssse3_fp_only
-
-        shl             rdx,            4
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-
-        movdqu          xmm0,           XMMWORD PTR [rsi]
-        movdqu          xmm1,           XMMWORD PTR [rsi+1]
-        movdqa          xmm2,           xmm0
-
-        punpcklbw       xmm0,           xmm1
-        punpckhbw       xmm2,           xmm1
-        pmaddubsw       xmm0,           [rax]
-        pmaddubsw       xmm2,           [rax]
-
-        paddw           xmm0,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm0,           xmm_filter_shift
-        psraw           xmm2,           xmm_filter_shift
-
-        packuswb        xmm0,           xmm2
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-        lea             rsi,            [rsi + r8]
-%endif
-
-.filter_block2d_bil_var_ssse3_loop:
-        movdqu          xmm1,           XMMWORD PTR [rsi]
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]
-        movdqa          xmm3,           xmm1
-
-        punpcklbw       xmm1,           xmm2
-        punpckhbw       xmm3,           xmm2
-        pmaddubsw       xmm1,           [rax]
-        pmaddubsw       xmm3,           [rax]
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm1,           xmm_filter_shift
-        psraw           xmm3,           xmm_filter_shift
-        packuswb        xmm1,           xmm3
-
-        movdqa          xmm2,           xmm0
-        movdqa          xmm0,           xmm1
-        movdqa          xmm3,           xmm2
-
-        punpcklbw       xmm2,           xmm1
-        punpckhbw       xmm3,           xmm1
-        pmaddubsw       xmm2,           [rdx]
-        pmaddubsw       xmm3,           [rdx]
-
-        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm2,           xmm_filter_shift
-        psraw           xmm3,           xmm_filter_shift
-
-        movq            xmm1,           QWORD PTR [rdi]
-        pxor            xmm4,           xmm4
-        punpcklbw       xmm1,           xmm4
-        movq            xmm5,           QWORD PTR [rdi+8]
-        punpcklbw       xmm5,           xmm4
-
-        psubw           xmm2,           xmm1
-        psubw           xmm3,           xmm5
-        paddw           xmm6,           xmm2
-        paddw           xmm6,           xmm3
-        pmaddwd         xmm2,           xmm2
-        pmaddwd         xmm3,           xmm3
-        paddd           xmm7,           xmm2
-        paddd           xmm7,           xmm3
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rsi,            [rsi + r8]
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_var_ssse3_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_sp_only:
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; Both xoffset =0 and yoffset=0
-        je              .filter_block2d_bil_var_ssse3_full_pixel
-
-        shl             rdx,            4
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        movdqu          xmm1,           XMMWORD PTR [rsi]
-        movdqa          xmm0,           xmm1
-
-%if ABI_IS_32BIT=0
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        lea             rsi,            [rsi + rax]
-
-.filter_block2d_bil_sp_only_loop:
-        movdqu          xmm3,           XMMWORD PTR [rsi]
-        movdqa          xmm2,           xmm1
-        movdqa          xmm0,           xmm3
-
-        punpcklbw       xmm1,           xmm3
-        punpckhbw       xmm2,           xmm3
-        pmaddubsw       xmm1,           [rdx]
-        pmaddubsw       xmm2,           [rdx]
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm1,           xmm_filter_shift
-        psraw           xmm2,           xmm_filter_shift
-
-        movq            xmm3,           QWORD PTR [rdi]
-        pxor            xmm4,           xmm4
-        punpcklbw       xmm3,           xmm4
-        movq            xmm5,           QWORD PTR [rdi+8]
-        punpcklbw       xmm5,           xmm4
-
-        psubw           xmm1,           xmm3
-        psubw           xmm2,           xmm5
-        paddw           xmm6,           xmm1
-        paddw           xmm6,           xmm2
-        pmaddwd         xmm1,           xmm1
-        pmaddwd         xmm2,           xmm2
-        paddd           xmm7,           xmm1
-        paddd           xmm7,           xmm2
-
-        movdqa          xmm1,           xmm0
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-
-%if ABI_IS_32BIT
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_sp_only_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_full_pixel:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
-        pxor            xmm0,           xmm0
-
-.filter_block2d_bil_full_pixel_loop:
-        movq            xmm1,           QWORD PTR [rsi]
-        punpcklbw       xmm1,           xmm0
-        movq            xmm2,           QWORD PTR [rsi+8]
-        punpcklbw       xmm2,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]
-        punpcklbw       xmm3,           xmm0
-        movq            xmm4,           QWORD PTR [rdi+8]
-        punpcklbw       xmm4,           xmm0
-
-        psubw           xmm1,           xmm3
-        psubw           xmm2,           xmm4
-        paddw           xmm6,           xmm1
-        paddw           xmm6,           xmm2
-        pmaddwd         xmm1,           xmm1
-        pmaddwd         xmm2,           xmm2
-        paddd           xmm7,           xmm1
-        paddd           xmm7,           xmm2
-
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_full_pixel_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_fp_only:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0
-
-%if ABI_IS_32BIT=0
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-.filter_block2d_bil_fp_only_loop:
-        movdqu          xmm1,           XMMWORD PTR [rsi]
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]
-        movdqa          xmm3,           xmm1
-
-        punpcklbw       xmm1,           xmm2
-        punpckhbw       xmm3,           xmm2
-        pmaddubsw       xmm1,           [rax]
-        pmaddubsw       xmm3,           [rax]
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm1,           xmm_filter_shift
-        psraw           xmm3,           xmm_filter_shift
-
-        movq            xmm2,           QWORD PTR [rdi]
-        pxor            xmm4,           xmm4
-        punpcklbw       xmm2,           xmm4
-        movq            xmm5,           QWORD PTR [rdi+8]
-        punpcklbw       xmm5,           xmm4
-
-        psubw           xmm1,           xmm2
-        psubw           xmm3,           xmm5
-        paddw           xmm6,           xmm1
-        paddw           xmm6,           xmm3
-        pmaddwd         xmm1,           xmm1
-        pmaddwd         xmm3,           xmm3
-        paddd           xmm7,           xmm1
-        paddd           xmm7,           xmm3
-
-        lea             rsi,            [rsi + rdx]
-%if ABI_IS_32BIT
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_fp_only_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_variance:
-        pxor        xmm0,           xmm0
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(7) ;[Sum]
-        mov         rdi,            arg(8) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-align 16
-xmm_bi_rd:
-    times 8 dw 64
-align 16
-bilinear_filters_ssse3:
-    times 8 db 128, 0
-    times 8 db 120, 8
-    times 8 db 112, 16
-    times 8 db 104, 24
-    times 8 db  96, 32
-    times 8 db  88, 40
-    times 8 db  80, 48
-    times 8 db  72, 56
-    times 8 db  64, 64
-    times 8 db  56, 72
-    times 8 db  48, 80
-    times 8 db  40, 88
-    times 8 db  32, 96
-    times 8 db  24, 104
-    times 8 db  16, 112
-    times 8 db   8, 120
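The SSSE3 table interleaves each tap pair as bytes because pmaddubsw multiplies
an unsigned-byte operand against a signed-byte operand. That is the constraint
the header note refers to: the offset-0 pair (128, 0) cannot be encoded, since
128 exceeds the signed-byte maximum of 127, so that case takes the full-pixel
path instead. A minimal illustration with a hypothetical helper:

/* 128 > 127, so offset 0 fails this check and the code branches to
   .filter_block2d_bil_var_ssse3_full_pixel above. */
static int fits_pmaddubsw_tap(int w) {
  return w >= -128 && w <= 127;
}
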
--- a/vp8/encoder/x86/variance_mmx.c
+++ /dev/null
@@ -1,406 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/common/pragmas.h"
-#include "vpx_ports/mem.h"
-
-extern void filter_block1d_h6_mmx
-(
-  const unsigned char *src_ptr,
-  unsigned short *output_ptr,
-  unsigned int src_pixels_per_line,
-  unsigned int pixel_step,
-  unsigned int output_height,
-  unsigned int output_width,
-  short *vp7_filter
-);
-extern void filter_block1d_v6_mmx
-(
-  const short *src_ptr,
-  unsigned char *output_ptr,
-  unsigned int pixels_per_line,
-  unsigned int pixel_step,
-  unsigned int output_height,
-  unsigned int output_width,
-  short *vp7_filter
-);
-
-extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
-extern unsigned int vp9_get8x8var_mmx
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-extern unsigned int vp9_get4x4var_mmx
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-extern void vp9_filter_block2d_bil4x4_var_mmx
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  const short *HFilter,
-  const short *VFilter,
-  int *sum,
-  unsigned int *sumsquared
-);
-extern void vp9_filter_block2d_bil_var_mmx
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  const short *HFilter,
-  const short *VFilter,
-  int *sum,
-  unsigned int *sumsquared
-);
-
-
-unsigned int vp9_variance4x4_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 4));
-
-}
-
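The return expression here, and in the wrappers that follow, is the one-pass
variance identity Var = SSE - Sum*Sum/N, with N the pixel count of the block,
so the shift is log2(N): 4 for 4x4, 6 for 8x8, 7 for 16x8 and 8x16, 8 for
16x16. A sketch of the shared helper this could be, with a hypothetical name:

/* sum((x - mean)^2) == sum(x*x) - sum(x)*sum(x) / N, N a power of two. */
static unsigned int variance_from_sums(unsigned int sse, int sum,
                                       int log2_count) {
  return sse - (unsigned int)((sum * sum) >> log2_count);
}
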
-unsigned int vp9_variance8x8_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
-  *sse = var;
-
-  return (var - ((avg * avg) >> 6));
-
-}
-
-unsigned int vp9_mse16x16_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0, sse1, sse2, sse3, var;
-  int sum0, sum1, sum2, sum3;
-
-
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
-
-  var = sse0 + sse1 + sse2 + sse3;
-  *sse = var;
-  return var;
-}
-
-
-unsigned int vp9_variance16x16_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0, sse1, sse2, sse3, var;
-  int sum0, sum1, sum2, sum3, avg;
-
-
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
-
-  var = sse0 + sse1 + sse2 + sse3;
-  avg = sum0 + sum1 + sum2 + sum3;
-  *sse = var;
-  return (var - ((avg * avg) >> 8));
-}
-
-unsigned int vp9_variance16x8_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0, sse1, var;
-  int sum0, sum1, avg;
-
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-
-  var = sse0 + sse1;
-  avg = sum0 + sum1;
-  *sse = var;
-  return (var - ((avg * avg) >> 7));
-
-}
-
-
-unsigned int vp9_variance8x16_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0, sse1, var;
-  int sum0, sum1, avg;
-
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
-
-  var = sse0 + sse1;
-  avg = sum0 + sum1;
-  *sse = var;
-
-  return (var - ((avg * avg) >> 7));
-
-}
-
-
-
-
-///////////////////////////////////////////////////////////////////////////
-// the mmx function that does the bilinear filtering and var calculation //
-// in one pass                                                           //
-///////////////////////////////////////////////////////////////////////////
-DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
-  { 128, 128, 128, 128,  0,  0,  0,  0 },
-  { 120, 120, 120, 120,  8,  8,  8,  8 },
-  { 112, 112, 112, 112, 16, 16, 16, 16 },
-  { 104, 104, 104, 104, 24, 24, 24, 24 },
-  {  96, 96, 96, 96, 32, 32, 32, 32 },
-  {  88, 88, 88, 88, 40, 40, 40, 40 },
-  {  80, 80, 80, 80, 48, 48, 48, 48 },
-  {  72, 72, 72, 72, 56, 56, 56, 56 },
-  {  64, 64, 64, 64, 64, 64, 64, 64 },
-  {  56, 56, 56, 56, 72, 72, 72, 72 },
-  {  48, 48, 48, 48, 80, 80, 80, 80 },
-  {  40, 40, 40, 40, 88, 88, 88, 88 },
-  {  32, 32, 32, 32, 96, 96, 96, 96 },
-  {  24, 24, 24, 24, 104, 104, 104, 104 },
-  {  16, 16, 16, 16, 112, 112, 112, 112 },
-  {   8,  8,  8,  8, 120, 120, 120, 120 }
-};
-
-unsigned int vp9_sub_pixel_variance4x4_mmx
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum;
-  unsigned int xxsum;
-  vp9_filter_block2d_bil4x4_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum, &xxsum
-  );
-  *sse = xxsum;
-  return (xxsum - ((xsum * xsum) >> 4));
-}
-
-
-unsigned int vp9_sub_pixel_variance8x8_mmx
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-
-  int xsum;
-  unsigned int xxsum;
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 8,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum, &xxsum
-  );
-  *sse = xxsum;
-  return (xxsum - ((xsum * xsum) >> 6));
-}
-
-unsigned int vp9_sub_pixel_variance16x16_mmx
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-
-  int xsum0, xsum1;
-  unsigned int xxsum0, xxsum1;
-
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum0, &xxsum0
-  );
-
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr + 8, src_pixels_per_line,
-    dst_ptr + 8, dst_pixels_per_line, 16,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum1, &xxsum1
-  );
-
-  xsum0 += xsum1;
-  xxsum0 += xxsum1;
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 8));
-
-
-}
-
-unsigned int vp9_sub_pixel_mse16x16_mmx(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  vp9_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
-  return *sse;
-}
-
-unsigned int vp9_sub_pixel_variance16x8_mmx
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum0, xsum1;
-  unsigned int xxsum0, xxsum1;
-
-
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 8,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum0, &xxsum0
-  );
-
-
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr + 8, src_pixels_per_line,
-    dst_ptr + 8, dst_pixels_per_line, 8,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum1, &xxsum1
-  );
-
-  xsum0 += xsum1;
-  xxsum0 += xxsum1;
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp9_sub_pixel_variance8x16_mmx
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum;
-  unsigned int xxsum;
-  vp9_filter_block2d_bil_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum, &xxsum
-  );
-  *sse = xxsum;
-  return (xxsum - ((xsum * xsum) >> 7));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_h_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 0,
-                                         ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_v_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 8,
-                                         ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_hv_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 8,
-                                         ref_ptr, recon_stride, sse);
-}
--- a/vp8/encoder/x86/variance_sse2.c
+++ /dev/null
@@ -1,517 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/common/pragmas.h"
-#include "vpx_ports/mem.h"
-
-#define HALFNDX 8
-
-extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-
-extern void vp9_filter_block2d_bil4x4_var_mmx
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  const short *HFilter,
-  const short *VFilter,
-  int *sum,
-  unsigned int *sumsquared
-);
-
-extern unsigned int vp9_get4x4var_mmx
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-
-unsigned int vp9_get_mb_ss_sse2
-(
-  const short *src_ptr
-);
-unsigned int vp9_get16x16var_sse2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-unsigned int vp9_get8x8var_sse2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-void vp9_filter_block2d_bil_var_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int  xoffset,
-  int  yoffset,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_horiz_vert_variance8x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_horiz_vert_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_horiz_variance8x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_horiz_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_vert_variance8x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_vert_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-
-DECLARE_ALIGNED(16, extern short, vp9_bilinear_filters_mmx[16][8]);
-
-unsigned int vp9_variance4x4_wmt(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 4));
-
-}
-
-unsigned int vp9_variance8x8_wmt
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
-  *sse = var;
-  return (var - ((avg * avg) >> 6));
-
-}
-
-
-unsigned int vp9_variance16x16_wmt
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0;
-  int sum0;
-
-
-  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  *sse = sse0;
-  return (sse0 - ((sum0 * sum0) >> 8));
-}
-unsigned int vp9_mse16x16_wmt(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-
-  unsigned int sse0;
-  int sum0;
-  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  *sse = sse0;
-  return sse0;
-
-}
-
-
-unsigned int vp9_variance16x8_wmt
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0, sse1, var;
-  int sum0, sum1, avg;
-
-  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-
-  var = sse0 + sse1;
-  avg = sum0 + sum1;
-  *sse = var;
-  return (var - ((avg * avg) >> 7));
-
-}
-
-unsigned int vp9_variance8x16_wmt
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0, sse1, var;
-  int sum0, sum1, avg;
-
-  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
-
-  var = sse0 + sse1;
-  avg = sum0 + sum1;
-  *sse = var;
-  return (var - ((avg * avg) >> 7));
-
-}
-
-unsigned int vp9_sub_pixel_variance4x4_wmt
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum;
-  unsigned int xxsum;
-  vp9_filter_block2d_bil4x4_var_mmx(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line,
-    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
-    &xsum, &xxsum
-  );
-  *sse = xxsum;
-  return (xxsum - ((xsum * xsum) >> 4));
-}
-
-
-unsigned int vp9_sub_pixel_variance8x8_wmt
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum;
-  unsigned int xxsum;
-
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum, &xxsum);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum, &xxsum);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum, &xxsum);
-  } else {
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      xoffset, yoffset,
-      &xsum, &xxsum);
-  }
-
-  *sse = xxsum;
-  return (xxsum - ((xsum * xsum) >> 6));
-}
-
-unsigned int vp9_sub_pixel_variance16x16_wmt
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum0, xsum1;
-  unsigned int xxsum0, xxsum1;
-
-
-  // Note: we could avoid these if statements entirely if the calling
-  // function invoked the appropriate special-case function directly.
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else {
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      xoffset, yoffset,
-      &xsum0, &xxsum0
-    );
-
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr + 8, src_pixels_per_line,
-      dst_ptr + 8, dst_pixels_per_line, 16,
-      xoffset, yoffset,
-      &xsum1, &xxsum1
-    );
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-  }
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
-
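In the sub-pixel variants, xoffset and yoffset index one of 16 bilinear filter phases (vp9_bilinear_filters_mmx[16][8]), with HALFNDX (8) marking the half-pel phase that gets the cheaper dedicated kernels. A plausible scalar sketch of one horizontal phase, assuming (16 - off, off) tap weights with round-to-nearest; the exact coefficient pairs live in the filter table:

/* Scalar sketch of a first-order (bilinear) horizontal interpolation
 * phase; the SIMD paths read the real coefficients from
 * vp9_bilinear_filters_mmx rather than deriving them from off.
 */
static void bilinear_h_c(const unsigned char *src, int src_stride,
                         unsigned char *dst, int dst_stride,
                         int w, int h, int off) {
  int r, c;

  for (r = 0; r < h; ++r)
    for (c = 0; c < w; ++c) {
      const int a = src[r * src_stride + c];
      const int b = src[r * src_stride + c + 1];
      dst[r * dst_stride + c] =
          (unsigned char)((a * (16 - off) + b * off + 8) >> 4);
    }
}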
-unsigned int vp9_sub_pixel_mse16x16_wmt(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  vp9_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
-  return *sse;
-}
-
-unsigned int vp9_sub_pixel_variance16x8_wmt
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-
-) {
-  int xsum0, xsum1;
-  unsigned int xxsum0, xxsum1;
-
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else {
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      xoffset, yoffset,
-      &xsum0, &xxsum0);
-
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr + 8, src_pixels_per_line,
-      dst_ptr + 8, dst_pixels_per_line, 8,
-      xoffset, yoffset,
-      &xsum1, &xxsum1);
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-  }
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp9_sub_pixel_variance8x16_wmt
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum;
-  unsigned int xxsum;
-
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum, &xxsum);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum, &xxsum);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance8x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum, &xxsum);
-  } else {
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      xoffset, yoffset,
-      &xsum, &xxsum);
-  }
-
-  *sse = xxsum;
-  return (xxsum - ((xsum * xsum) >> 7));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_h_wmt(
-  const unsigned char *src_ptr,
-  int  src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int  dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  vp9_half_horiz_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_v_wmt(
-  const unsigned char *src_ptr,
-  int  src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int  dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-  vp9_half_vert_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_hv_wmt(
-  const unsigned char *src_ptr,
-  int  src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int  dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  vp9_half_horiz_vert_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
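The three halfpixvar entry points above exist because at the half-pel phase the bilinear filter collapses to a rounded average of two neighbors, which vectorizes very cheaply (SSE2's pavgb computes exactly this):

/* At the half-pel phase, (a * 8 + b * 8 + 8) >> 4 == (a + b + 1) >> 1:
 * a plain rounded average, which the dedicated SSE2 kernels exploit.
 */
static unsigned char half_pel_avg(unsigned char a, unsigned char b) {
  return (unsigned char)((a + b + 1) >> 1);
}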
--- a/vp8/encoder/x86/variance_ssse3.c
+++ /dev/null
@@ -1,151 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/common/pragmas.h"
-#include "vpx_ports/mem.h"
-
-#define HALFNDX 8
-
-extern unsigned int vp9_get16x16var_sse2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-extern void vp9_half_horiz_vert_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-extern void vp9_half_horiz_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-extern void vp9_half_vert_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-extern void vp9_filter_block2d_bil_var_ssse3
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int  xoffset,
-  int  yoffset,
-  int *sum,
-  unsigned int *sumsquared
-);
-
-unsigned int vp9_sub_pixel_variance16x16_ssse3
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  // Note: we could avoid these if statements entirely if the calling
-  // function invoked the appropriate special-case function directly.
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      &xsum0, &xxsum0);
-  } else {
-    vp9_filter_block2d_bil_var_ssse3(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 16,
-      xoffset, yoffset,
-      &xsum0, &xxsum0);
-  }
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
-
-unsigned int vp9_sub_pixel_variance16x8_ssse3
-(
-  const unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse
-
-) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      &xsum0, &xxsum0);
-  } else {
-    vp9_filter_block2d_bil_var_ssse3(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 8,
-      xoffset, yoffset,
-      &xsum0, &xxsum0);
-  }
-
-  *sse = xxsum0;
-  return (xxsum0 - ((xsum0 * xsum0) >> 7));
-}
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ /dev/null
@@ -1,114 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_ports/x86.h"
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-
-#if HAVE_MMX
-void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
-  vp9_short_fdct4x4_mmx(input,   output,    pitch);
-  vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch);
-}
-
-int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-int vp9_mbblock_error_mmx(MACROBLOCK *mb, int dc) {
-  short *coeff_ptr =  mb->block[0].coeff;
-  short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
-  return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
-}
-
-int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
-int vp9_mbuverror_mmx(MACROBLOCK *mb) {
-  short *s_ptr = &mb->coeff[256];
-  short *d_ptr = &mb->e_mbd.dqcoeff[256];
-  return vp9_mbuverror_mmx_impl(s_ptr, d_ptr);
-}
-
-void vp9_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
-                             short *diff, unsigned char *predictor,
-                             int pitch);
-void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) {
-  unsigned char *z = *(be->base_src) + be->src;
-  unsigned int  src_stride = be->src_stride;
-  short *diff = &be->src_diff[0];
-  unsigned char *predictor = &bd->predictor[0];
-  vp9_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
-}
-
-#endif
-
-#if HAVE_SSE2
-int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-int vp9_mbblock_error_xmm(MACROBLOCK *mb, int dc) {
-  short *coeff_ptr =  mb->block[0].coeff;
-  short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
-  return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
-}
-
-int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
-int vp9_mbuverror_xmm(MACROBLOCK *mb) {
-  short *s_ptr = &mb->coeff[256];
-  short *d_ptr = &mb->e_mbd.dqcoeff[256];
-  return vp9_mbuverror_xmm_impl(s_ptr, d_ptr);
-}
-
-void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
-                              short *diff, unsigned char *predictor,
-                              int pitch);
-void vp9_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) {
-  unsigned char *z = *(be->base_src) + be->src;
-  unsigned int  src_stride = be->src_stride;
-  short *diff = &be->src_diff[0];
-  unsigned char *predictor = &bd->predictor[0];
-  vp9_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
-}
-
-#endif
-
-void vp9_arch_x86_encoder_init(VP9_COMP *cpi) {
-#if CONFIG_RUNTIME_CPU_DETECT
-  int flags = x86_simd_caps();
-
-  /* Note:
-   *
-   * This platform can be built without runtime CPU detection as well. If
-   * you modify any of the function mappings present in this file, be sure
-   * to also update them in the static mappings (<arch>/filename_<arch>.h).
-   */
-
-  /* Override default functions with fastest ones for this CPU. */
-#if HAVE_SSE2
-  if (flags & HAS_SSE2) {
-    cpi->rtcd.temporal.apply                 = vp9_temporal_filter_apply_sse2;
-
-  }
-#endif
-
-#if HAVE_SSE3
-  if (flags & HAS_SSE3) {
-    cpi->rtcd.search.full_search             = vp9_full_search_sadx3;
-    cpi->rtcd.search.diamond_search          = vp9_diamond_search_sadx4;
-    cpi->rtcd.search.refining_search         = vp9_refining_search_sadx4;
-  }
-#endif
-
-
-#if HAVE_SSE4_1
-  if (flags & HAS_SSE4_1) {
-    cpi->rtcd.search.full_search             = vp9_full_search_sadx8;
-  }
-#endif
-
-#endif
-}
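This file is the runtime-CPU-detection (RTCD) half of the dispatch scheme: x86_simd_caps() probes the processor once, and the returned flag mask decides which SIMD implementations replace the portable defaults in the cpi->rtcd function-pointer table; the static mappings mentioned in the comment serve builds that skip detection. A minimal sketch of the pattern, with hypothetical names standing in for the table entries:

/* Minimal sketch of the RTCD pattern: a function-pointer table defaults
 * to portable C code, and init overwrites entries gated on a capability
 * bitmask. All names here are hypothetical stand-ins.
 */
typedef unsigned int (*row_sad_fn)(const unsigned char *a,
                                   const unsigned char *b, int n);

static unsigned int row_sad_c(const unsigned char *a,
                              const unsigned char *b, int n) {
  unsigned int sad = 0;
  int i;
  for (i = 0; i < n; ++i)
    sad += (a[i] > b[i]) ? (unsigned int)(a[i] - b[i])
                         : (unsigned int)(b[i] - a[i]);
  return sad;
}

/* Stand-in for a hand-written SIMD version. */
static unsigned int row_sad_fast(const unsigned char *a,
                                 const unsigned char *b, int n) {
  return row_sad_c(a, b, n);
}

#define MY_HAS_SSE2 0x01  /* stand-in for the real HAS_SSE2 flag */

struct my_rtcd { row_sad_fn sad; };

static void my_rtcd_init(struct my_rtcd *t, int caps) {
  t->sad = row_sad_c;        /* safe portable default */
  if (caps & MY_HAS_SSE2)
    t->sad = row_sad_fast;   /* override with the fastest available */
}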
--- a/vp8/exports_dec
+++ /dev/null
@@ -1,2 +1,0 @@
-data vpx_codec_vp8_dx_algo
-text vpx_codec_vp8_dx
--- a/vp8/exports_enc
+++ /dev/null
@@ -1,4 +1,0 @@
-data vpx_codec_vp8_cx_algo
-text vpx_codec_vp8_cx
-data vpx_codec_vp8x_cx_algo
-text vpx_codec_vp8x_cx
--- a/vp8/vp8_common.mk
+++ /dev/null
@@ -1,179 +1,0 @@
-##
-##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-##  Use of this source code is governed by a BSD-style license
-##  that can be found in the LICENSE file in the root of the source
-##  tree. An additional intellectual property rights grant can be found
-##  in the file PATENTS.  All contributing project authors may
-##  be found in the AUTHORS file in the root of the source tree.
-##
-
-VP8_COMMON_SRCS-yes += vp8_common.mk
-VP8_COMMON_SRCS-yes += common/type_aliases.h
-VP8_COMMON_SRCS-yes += common/pragmas.h
-VP8_COMMON_SRCS-yes += common/ppflags.h
-VP8_COMMON_SRCS-yes += common/onyx.h
-VP8_COMMON_SRCS-yes += common/onyxd.h
-VP8_COMMON_SRCS-yes += common/alloccommon.c
-VP8_COMMON_SRCS-yes += common/asm_com_offsets.c
-VP8_COMMON_SRCS-yes += common/blockd.c
-VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
-VP8_COMMON_SRCS-yes += common/debugmodes.c
-VP8_COMMON_SRCS-yes += common/entropy.c
-VP8_COMMON_SRCS-yes += common/entropymode.c
-VP8_COMMON_SRCS-yes += common/entropymv.c
-VP8_COMMON_SRCS-yes += common/extend.c
-VP8_COMMON_SRCS-yes += common/filter.c
-VP8_COMMON_SRCS-yes += common/filter.h
-VP8_COMMON_SRCS-yes += common/findnearmv.c
-VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
-VP8_COMMON_SRCS-yes += common/idctllm.c
-VP8_COMMON_SRCS-yes += common/alloccommon.h
-VP8_COMMON_SRCS-yes += common/blockd.h
-VP8_COMMON_SRCS-yes += common/common.h
-VP8_COMMON_SRCS-yes += common/common_types.h
-VP8_COMMON_SRCS-yes += common/entropy.h
-VP8_COMMON_SRCS-yes += common/entropymode.h
-VP8_COMMON_SRCS-yes += common/entropymv.h
-VP8_COMMON_SRCS-yes += common/extend.h
-VP8_COMMON_SRCS-yes += common/findnearmv.h
-VP8_COMMON_SRCS-yes += common/header.h
-VP8_COMMON_SRCS-yes += common/idct.h
-VP8_COMMON_SRCS-yes += common/invtrans.h
-VP8_COMMON_SRCS-yes += common/loopfilter.h
-VP8_COMMON_SRCS-yes += common/modecont.h
-VP8_COMMON_SRCS-yes += common/mv.h
-VP8_COMMON_SRCS-yes += common/onyxc_int.h
-VP8_COMMON_SRCS-yes += common/pred_common.h
-VP8_COMMON_SRCS-yes += common/pred_common.c
-VP8_COMMON_SRCS-yes += common/quant_common.h
-VP8_COMMON_SRCS-yes += common/reconinter.h
-VP8_COMMON_SRCS-yes += common/reconintra.h
-VP8_COMMON_SRCS-yes += common/reconintra4x4.h
-VP8_COMMON_SRCS-yes += common/rtcd.c
-VP8_COMMON_SRCS-yes += common/rtcd_defs.sh
-VP8_COMMON_SRCS-yes += common/sadmxn.h
-VP8_COMMON_SRCS-yes += common/seg_common.h
-VP8_COMMON_SRCS-yes += common/seg_common.c
-VP8_COMMON_SRCS-yes += common/setupintrarecon.h
-VP8_COMMON_SRCS-yes += common/subpixel.h
-VP8_COMMON_SRCS-yes += common/swapyv12buffer.h
-VP8_COMMON_SRCS-yes += common/systemdependent.h
-VP8_COMMON_SRCS-yes += common/treecoder.h
-VP8_COMMON_SRCS-yes += common/invtrans.c
-VP8_COMMON_SRCS-yes += common/loopfilter.c
-VP8_COMMON_SRCS-yes += common/loopfilter_filters.c
-VP8_COMMON_SRCS-yes += common/mbpitch.c
-VP8_COMMON_SRCS-yes += common/modecont.c
-VP8_COMMON_SRCS-yes += common/modecontext.c
-VP8_COMMON_SRCS-yes += common/mvref_common.c
-VP8_COMMON_SRCS-yes += common/mvref_common.h
-VP8_COMMON_SRCS-yes += common/quant_common.c
-VP8_COMMON_SRCS-yes += common/recon.c
-VP8_COMMON_SRCS-yes += common/reconinter.c
-VP8_COMMON_SRCS-yes += common/reconintra.c
-VP8_COMMON_SRCS-yes += common/reconintra4x4.c
-VP8_COMMON_SRCS-yes += common/setupintrarecon.c
-VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
-VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
-VP8_COMMON_SRCS-yes += common/treecoder.c
-VP8_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/implicit_segmentation.c
-
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/idct_x86.h
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/subpixel_x86.h
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.h
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/postproc_x86.h
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/x86_systemdependent.c
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
-VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h
-VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_8t_ssse3.asm
-VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
-ifeq ($(CONFIG_POSTPROC),yes)
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
-endif
-
-# common (c)
-ifeq ($(CONFIG_CSM),yes)
-VP8_COMMON_SRCS-yes += common/maskingmv.c
-VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/mask_sse3.asm
-endif
-
-VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filter_sse4.c
-ifeq ($(HAVE_SSE4_1),yes)
-vp8/common/x86/filter_sse4.c.o: CFLAGS += -msse4
-endif
-
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/filter_sse2.c
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/sadmxn_x86.c
-ifeq ($(HAVE_SSE2),yes)
-vp8/common/x86/filter_sse2.c.o: CFLAGS += -msse2
-vp8/common/x86/loopfilter_x86.c.o: CFLAGS += -msse2
-vp8/common/x86/sadmxn_x86.c.o: CFLAGS += -msse2
-endif
-
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/arm_systemdependent.c
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/bilinearfilter_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/bilinearfilter_arm.h
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/filter_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/idct_arm.h
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.h
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/recon_arm.h
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/reconintra_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/subpixel_arm.h
-
-# common (armv6)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/bilinearfilter_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/copymem8x4_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/copymem8x8_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/copymem16x16_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/dc_only_idct_add_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/iwalsh_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/filter_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/idct_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/loopfilter_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/recon_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/simpleloopfilter_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/sixtappredict8x4_v6$(ASM)
-
-# common (neon)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict4x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict8x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/copymem8x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/copymem8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/copymem16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/dc_only_idct_add_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/iwalsh_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/loopfilter_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/mbloopfilter_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon2b_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon4b_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/reconb_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/shortidct4x4llm_1_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/shortidct4x4llm_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict4x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict8x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon16x16mb_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/save_neon_reg$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon_neon.c
--- a/vp8/vp8_cx_iface.c
+++ /dev/null
@@ -1,1169 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx/vpx_codec.h"
-#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_version.h"
-#include "vp8/encoder/onyx_int.h"
-#include "vpx/vp8e.h"
-#include "vp8/encoder/firstpass.h"
-#include "vp8/common/onyx.h"
-#include <stdlib.h>
-#include <string.h>
-
-/* This value is a sentinel for determining whether the user has set a mode
- * directly through the deprecated VP8E_SET_ENCODING_MODE control.
- */
-#define NO_MODE_SET 255
-
-struct vp8_extracfg {
-  struct vpx_codec_pkt_list *pkt_list;
-  vp8e_encoding_mode      encoding_mode;               /** best, good, realtime            */
-  int                         cpu_used;                    /** available cpu percentage, in 1/16 units */
-  unsigned int                enable_auto_alt_ref;           /** whether the encoder may use an alternate reference frame */
-  unsigned int                noise_sensitivity;
-  unsigned int                Sharpness;
-  unsigned int                static_thresh;
-  unsigned int                token_partitions;
-  unsigned int                arnr_max_frames;    /* alt_ref Noise Reduction Max Frame Count */
-  unsigned int                arnr_strength;    /* alt_ref Noise Reduction Strength */
-  unsigned int                arnr_type;        /* alt_ref filter type */
-  unsigned int                experimental;
-  vp8e_tuning                 tuning;
-  unsigned int                cq_level;         /* constrained quality level */
-  unsigned int                rc_max_intra_bitrate_pct;
-
-};
-
-struct extraconfig_map {
-  int                 usage;
-  struct vp8_extracfg cfg;
-};
-
-static const struct extraconfig_map extracfg_map[] = {
-  {
-    0,
-    {
-      NULL,
-      VP8_BEST_QUALITY_ENCODING,  /* Encoding Mode */
-      0,                          /* cpu_used      */
-      0,                          /* enable_auto_alt_ref */
-      0,                          /* noise_sensitivity */
-      0,                          /* Sharpness */
-      0,                          /* static_thresh */
-      VP8_ONE_TOKENPARTITION,     /* token_partitions */
-      0,                          /* arnr_max_frames */
-      3,                          /* arnr_strength */
-      3,                          /* arnr_type*/
-      0,                          /* experimental mode */
-      0,                          /* tuning*/
-      10,                         /* cq_level */
-      0,                          /* rc_max_intra_bitrate_pct */
-    }
-  }
-};
-
-struct vpx_codec_alg_priv {
-  vpx_codec_priv_t        base;
-  vpx_codec_enc_cfg_t     cfg;
-  struct vp8_extracfg     vp8_cfg;
-  VP9_CONFIG              oxcf;
-  VP9_PTR             cpi;
-  unsigned char          *cx_data;
-  unsigned int            cx_data_sz;
-  vpx_image_t             preview_img;
-  unsigned int            next_frame_flag;
-  vp8_postproc_cfg_t      preview_ppcfg;
-  vpx_codec_pkt_list_decl(64) pkt_list;              // sized to accommodate the maximum number of lagged frames allowed
-  int                         deprecated_mode;
-  unsigned int                fixed_kf_cntr;
-};
-
-
-static vpx_codec_err_t
-update_error_state(vpx_codec_alg_priv_t                 *ctx,
-                   const struct vpx_internal_error_info *error) {
-  vpx_codec_err_t res;
-
-  if ((res = error->error_code))
-    ctx->base.err_detail = error->has_detail
-                           ? error->detail
-                           : NULL;
-
-  return res;
-}
-
-
-#undef ERROR
-#define ERROR(str) do {\
-    ctx->base.err_detail = str;\
-    return VPX_CODEC_INVALID_PARAM;\
-  } while(0)
-
-#define RANGE_CHECK(p,memb,lo,hi) do {\
-    if(!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
-      ERROR(#memb " out of range ["#lo".."#hi"]");\
-  } while(0)
-
-#define RANGE_CHECK_HI(p,memb,hi) do {\
-    if(!((p)->memb <= (hi))) \
-      ERROR(#memb " out of range [.."#hi"]");\
-  } while(0)
-
-#define RANGE_CHECK_LO(p,memb,lo) do {\
-    if(!((p)->memb >= (lo))) \
-      ERROR(#memb " out of range ["#lo"..]");\
-  } while(0)
-
-#define RANGE_CHECK_BOOL(p,memb) do {\
-    if(!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
-  } while(0)
-
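Each RANGE_CHECK* macro validates a single config member and, via ERROR, stores a generated message and returns VPX_CODEC_INVALID_PARAM. The `(p)->memb == lo || (p)->memb > (lo)` spelling of `>= lo` presumably sidesteps "comparison is always true" warnings when the member is unsigned and lo is 0. Expanded by hand (string literals shown already concatenated), RANGE_CHECK_HI(cfg, rc_max_quantizer, 63) becomes:

do {
  if (!((cfg)->rc_max_quantizer <= (63)))
    do {
      ctx->base.err_detail = "rc_max_quantizer out of range [..63]";
      return VPX_CODEC_INVALID_PARAM;
    } while (0);
} while (0);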
-static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
-                                       const vpx_codec_enc_cfg_t *cfg,
-                                       const struct vp8_extracfg *vp8_cfg) {
-  RANGE_CHECK(cfg, g_w,                   1, 16383); /* 14 bits available */
-  RANGE_CHECK(cfg, g_h,                   1, 16383); /* 14 bits available */
-  RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
-  RANGE_CHECK(cfg, g_timebase.num,        1, cfg->g_timebase.den);
-  RANGE_CHECK_HI(cfg, g_profile,          3);
-  RANGE_CHECK_HI(cfg, rc_max_quantizer,   63);
-  RANGE_CHECK_HI(cfg, rc_min_quantizer,   cfg->rc_max_quantizer);
-  RANGE_CHECK_HI(cfg, g_threads,          64);
-  RANGE_CHECK_HI(cfg, g_lag_in_frames,    MAX_LAG_BUFFERS);
-  RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_CQ);
-  RANGE_CHECK_HI(cfg, rc_undershoot_pct,  1000);
-  RANGE_CHECK_HI(cfg, rc_overshoot_pct,   1000);
-  RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
-  RANGE_CHECK(cfg, kf_mode,               VPX_KF_DISABLED, VPX_KF_AUTO);
-  // RANGE_CHECK_BOOL(cfg,                 g_delete_firstpassfile);
-  RANGE_CHECK_BOOL(cfg,                   rc_resize_allowed);
-  RANGE_CHECK_HI(cfg, rc_dropframe_thresh,   100);
-  RANGE_CHECK_HI(cfg, rc_resize_up_thresh,   100);
-  RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
-  RANGE_CHECK(cfg,        g_pass,         VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
-
-  /* VP8 does not support a lower bound on the keyframe interval in
-   * automatic keyframe placement mode.
-   */
-  if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist
-      && cfg->kf_min_dist > 0)
-    ERROR("kf_min_dist not supported in auto mode, use 0 "
-          "or kf_max_dist instead.");
-
-  RANGE_CHECK_BOOL(vp8_cfg,               enable_auto_alt_ref);
-  RANGE_CHECK(vp8_cfg, cpu_used,           -16, 16);
-
-  RANGE_CHECK(vp8_cfg, encoding_mode,      VP8_BEST_QUALITY_ENCODING, VP8_REAL_TIME_ENCODING);
-  RANGE_CHECK_HI(vp8_cfg, noise_sensitivity,  6);
-
-  RANGE_CHECK(vp8_cfg, token_partitions,   VP8_ONE_TOKENPARTITION, VP8_EIGHT_TOKENPARTITION);
-  RANGE_CHECK_HI(vp8_cfg, Sharpness,       7);
-  RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15);
-  RANGE_CHECK_HI(vp8_cfg, arnr_strength,   6);
-  RANGE_CHECK(vp8_cfg, arnr_type,       1, 3);
-  RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
-
-  if (cfg->g_pass == VPX_RC_LAST_PASS) {
-    size_t           packet_sz = sizeof(FIRSTPASS_STATS);
-    int              n_packets = cfg->rc_twopass_stats_in.sz / packet_sz;
-    FIRSTPASS_STATS *stats;
-
-    if (!cfg->rc_twopass_stats_in.buf)
-      ERROR("rc_twopass_stats_in.buf not set.");
-
-    if (cfg->rc_twopass_stats_in.sz % packet_sz)
-      ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
-
-    if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz)
-      ERROR("rc_twopass_stats_in requires at least two packets.");
-
-    stats = (void *)((char *)cfg->rc_twopass_stats_in.buf
-                     + (n_packets - 1) * packet_sz);
-
-    if ((int)(stats->count + 0.5) != n_packets - 1)
-      ERROR("rc_twopass_stats_in missing EOS stats packet");
-  }
-
-  return VPX_CODEC_OK;
-}
-
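The two-pass branch above assumes the stats buffer is a packed array of FIRSTPASS_STATS records whose final element is an EOS/summary packet carrying, in its count field, the number of frame packets that precede it. A standalone sketch of the same checks, with last_count passed in so the sketch does not depend on the struct layout:

/* Sketch of the two-pass stats-buffer validation: the buffer must split
 * evenly into packets, hold at least one frame packet plus the EOS
 * packet, and the EOS count must match the number of frame packets.
 */
static int stats_buf_ok(size_t sz, size_t packet_sz, double last_count) {
  const size_t n_packets = sz / packet_sz;

  if (sz % packet_sz) return 0;   /* truncated packet */
  if (n_packets < 2) return 0;    /* need a frame packet and the EOS */
  return (int)(last_count + 0.5) == (int)n_packets - 1;
}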
-
-static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
-                                    const vpx_image_t    *img) {
-  switch (img->fmt) {
-    case VPX_IMG_FMT_YV12:
-    case VPX_IMG_FMT_I420:
-    case VPX_IMG_FMT_VPXI420:
-    case VPX_IMG_FMT_VPXYV12:
-      break;
-    default:
-      ERROR("Invalid image format. Only YV12 and I420 images are supported");
-  }
-
-  if ((img->d_w != ctx->cfg.g_w) || (img->d_h != ctx->cfg.g_h))
-    ERROR("Image size must match encoder init configuration size");
-
-  return VPX_CODEC_OK;
-}
-
-
-static vpx_codec_err_t set_vp8e_config(VP9_CONFIG *oxcf,
-                                       vpx_codec_enc_cfg_t cfg,
-                                       struct vp8_extracfg vp8_cfg) {
-  oxcf->Version               = cfg.g_profile;
-  oxcf->Version              |= vp8_cfg.experimental ? 0x4 : 0;
-
-  oxcf->Width                 = cfg.g_w;
-  oxcf->Height                = cfg.g_h;
-  /* Derive the frame rate from the timebase; if the result is implausible, fall back to 30 below. */
-  oxcf->frame_rate             = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num);
-
-  if (oxcf->frame_rate > 180) {
-    oxcf->frame_rate = 30;
-  }
-
-  switch (cfg.g_pass) {
-    case VPX_RC_ONE_PASS:
-      oxcf->Mode = MODE_BESTQUALITY;
-      break;
-    case VPX_RC_FIRST_PASS:
-      oxcf->Mode = MODE_FIRSTPASS;
-      break;
-    case VPX_RC_LAST_PASS:
-      oxcf->Mode = MODE_SECONDPASS_BEST;
-      break;
-  }
-
-  if (cfg.g_pass == VPX_RC_FIRST_PASS) {
-    oxcf->allow_lag              = 0;
-    oxcf->lag_in_frames           = 0;
-  } else {
-    oxcf->allow_lag              = (cfg.g_lag_in_frames) > 0;
-    oxcf->lag_in_frames           = cfg.g_lag_in_frames;
-  }
-
-  // Only VBR is supported for now.
-  // The CBR code has been deprecated for the experimental phase.
-  // CQ mode is not yet tested.
-  oxcf->end_usage          = USAGE_LOCAL_FILE_PLAYBACK;
-  /*if (cfg.rc_end_usage == VPX_CQ)
-      oxcf->end_usage      = USAGE_CONSTRAINED_QUALITY;
-  else
-      oxcf->end_usage      = USAGE_LOCAL_FILE_PLAYBACK;*/
-
-  oxcf->target_bandwidth       = cfg.rc_target_bitrate;
-  oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
-
-  oxcf->best_allowed_q          = cfg.rc_min_quantizer;
-  oxcf->worst_allowed_q         = cfg.rc_max_quantizer;
-  oxcf->cq_level                = vp8_cfg.cq_level;
-  oxcf->fixed_q = -1;
-
-  oxcf->under_shoot_pct         = cfg.rc_undershoot_pct;
-  oxcf->over_shoot_pct          = cfg.rc_overshoot_pct;
-
-  oxcf->maximum_buffer_size     = cfg.rc_buf_sz;
-  oxcf->starting_buffer_level   = cfg.rc_buf_initial_sz;
-  oxcf->optimal_buffer_level    = cfg.rc_buf_optimal_sz;
-
-  oxcf->two_pass_vbrbias        = cfg.rc_2pass_vbr_bias_pct;
-  oxcf->two_pass_vbrmin_section  = cfg.rc_2pass_vbr_minsection_pct;
-  oxcf->two_pass_vbrmax_section  = cfg.rc_2pass_vbr_maxsection_pct;
-
-  oxcf->auto_key               = cfg.kf_mode == VPX_KF_AUTO
-                                 && cfg.kf_min_dist != cfg.kf_max_dist;
-  // oxcf->kf_min_dist         = cfg.kf_min_dis;
-  oxcf->key_freq               = cfg.kf_max_dist;
-
-  // oxcf->delete_first_pass_file = cfg.g_delete_firstpassfile;
-  // strcpy(oxcf->first_pass_file, cfg.g_firstpass_file);
-
-  oxcf->cpu_used               =  vp8_cfg.cpu_used;
-  oxcf->encode_breakout        =  vp8_cfg.static_thresh;
-  oxcf->play_alternate         =  vp8_cfg.enable_auto_alt_ref;
-  oxcf->noise_sensitivity      =  vp8_cfg.noise_sensitivity;
-  oxcf->Sharpness             =  vp8_cfg.Sharpness;
-
-  oxcf->two_pass_stats_in        =  cfg.rc_twopass_stats_in;
-  oxcf->output_pkt_list         =  vp8_cfg.pkt_list;
-
-  oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames;
-  oxcf->arnr_strength =  vp8_cfg.arnr_strength;
-  oxcf->arnr_type =      vp8_cfg.arnr_type;
-
-  oxcf->tuning = vp8_cfg.tuning;
-
-#if CONFIG_LOSSLESS
-  oxcf->lossless = cfg.lossless;
-#endif
-
-  /*
-      printf("Current VP8 Settings: \n");
-      printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
-      printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);
-      printf("Sharpness: %d\n",    oxcf->Sharpness);
-      printf("cpu_used: %d\n",  oxcf->cpu_used);
-      printf("Mode: %d\n",     oxcf->Mode);
-      printf("delete_first_pass_file: %d\n",  oxcf->delete_first_pass_file);
-      printf("auto_key: %d\n",  oxcf->auto_key);
-      printf("key_freq: %d\n", oxcf->key_freq);
-      printf("end_usage: %d\n", oxcf->end_usage);
-      printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);
-      printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);
-      printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);
-      printf("optimal_buffer_level: %d\n",  oxcf->optimal_buffer_level);
-      printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);
-      printf("fixed_q: %d\n",  oxcf->fixed_q);
-      printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q);
-      printf("best_allowed_q: %d\n", oxcf->best_allowed_q);
-      printf("two_pass_vbrbias: %d\n",  oxcf->two_pass_vbrbias);
-      printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
-      printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
-      printf("allow_lag: %d\n", oxcf->allow_lag);
-      printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
-      printf("play_alternate: %d\n", oxcf->play_alternate);
-      printf("Version: %d\n", oxcf->Version);
-      printf("encode_breakout: %d\n", oxcf->encode_breakout);
-  */
-  return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t       *ctx,
-                                       const vpx_codec_enc_cfg_t  *cfg) {
-  vpx_codec_err_t res;
-
-  if ((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h))
-    ERROR("Cannot change width or height after initialization");
-
-  /* Prevent increasing lag_in_frames. This check is stricter than it needs
-   * to be -- the limit is not increasing past the first lag_in_frames
-   * value, but we don't track the initial config, only the last successful
-   * config.
-   */
-  if ((cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames))
-    ERROR("Cannot increase lag_in_frames");
-
-  res = validate_config(ctx, cfg, &ctx->vp8_cfg);
-
-  if (!res) {
-    ctx->cfg = *cfg;
-    set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
-    vp9_change_config(ctx->cpi, &ctx->oxcf);
-  }
-
-  return res;
-}
-
-
-int vp9_reverse_trans(int q);
-
-
-static vpx_codec_err_t get_param(vpx_codec_alg_priv_t *ctx,
-                                 int                   ctrl_id,
-                                 va_list               args) {
-  void *arg = va_arg(args, void *);
-
-#define MAP(id, var) case id: *(RECAST(id, arg)) = var; break
-
-  if (!arg)
-    return VPX_CODEC_INVALID_PARAM;
-
-  switch (ctrl_id) {
-      MAP(VP8E_GET_LAST_QUANTIZER, vp9_get_quantizer(ctx->cpi));
-      MAP(VP8E_GET_LAST_QUANTIZER_64,
-          vp9_reverse_trans(vp9_get_quantizer(ctx->cpi)));
-  }
-
-  return VPX_CODEC_OK;
-#undef MAP
-}
-
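The local MAP macro turns each control ID into a switch case that moves the va_arg value in the appropriate direction, with RECAST/CAST (declared alongside the control IDs) recovering the per-ID argument type. Expanded by hand, the first mapping above becomes:

/* MAP(VP8E_GET_LAST_QUANTIZER, vp9_get_quantizer(ctx->cpi)), expanded: */
case VP8E_GET_LAST_QUANTIZER:
  *(RECAST(VP8E_GET_LAST_QUANTIZER, arg)) = vp9_get_quantizer(ctx->cpi);
  break;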
-
-static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
-                                 int                   ctrl_id,
-                                 va_list               args) {
-  vpx_codec_err_t     res  = VPX_CODEC_OK;
-  struct vp8_extracfg xcfg = ctx->vp8_cfg;
-
-#define MAP(id, var) case id: var = CAST(id, args); break;
-
-  switch (ctrl_id) {
-      MAP(VP8E_SET_ENCODING_MODE,         ctx->deprecated_mode);
-      MAP(VP8E_SET_CPUUSED,               xcfg.cpu_used);
-      MAP(VP8E_SET_ENABLEAUTOALTREF,      xcfg.enable_auto_alt_ref);
-      MAP(VP8E_SET_NOISE_SENSITIVITY,     xcfg.noise_sensitivity);
-      MAP(VP8E_SET_SHARPNESS,             xcfg.Sharpness);
-      MAP(VP8E_SET_STATIC_THRESHOLD,      xcfg.static_thresh);
-      MAP(VP8E_SET_TOKEN_PARTITIONS,      xcfg.token_partitions);
-
-      MAP(VP8E_SET_ARNR_MAXFRAMES,        xcfg.arnr_max_frames);
-      MAP(VP8E_SET_ARNR_STRENGTH,         xcfg.arnr_strength);
-      MAP(VP8E_SET_ARNR_TYPE,             xcfg.arnr_type);
-      MAP(VP8E_SET_TUNING,                xcfg.tuning);
-      MAP(VP8E_SET_CQ_LEVEL,              xcfg.cq_level);
-      MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct);
-
-  }
-
-  res = validate_config(ctx, &ctx->cfg, &xcfg);
-
-  if (!res) {
-    ctx->vp8_cfg = xcfg;
-    set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
-    vp9_change_config(ctx->cpi, &ctx->oxcf);
-  }
-
-  return res;
-#undef MAP
-}
-
-
-static vpx_codec_err_t vp8e_common_init(vpx_codec_ctx_t *ctx,
-                                        int              experimental) {
-  vpx_codec_err_t        res = VPX_DEC_OK;
-  struct vpx_codec_alg_priv *priv;
-  vpx_codec_enc_cfg_t       *cfg;
-  unsigned int               i;
-
-  VP9_PTR optr;
-
-  if (!ctx->priv) {
-    priv = calloc(1, sizeof(struct vpx_codec_alg_priv));
-
-    if (!priv) {
-      return VPX_CODEC_MEM_ERROR;
-    }
-
-    ctx->priv = &priv->base;
-    ctx->priv->sz = sizeof(*ctx->priv);
-    ctx->priv->iface = ctx->iface;
-    ctx->priv->alg_priv = priv;
-    ctx->priv->init_flags = ctx->init_flags;
-
-    if (ctx->config.enc) {
-      /* Update the reference to the config structure to an
-       * internal copy.
-       */
-      ctx->priv->alg_priv->cfg = *ctx->config.enc;
-      ctx->config.enc = &ctx->priv->alg_priv->cfg;
-    }
-
-    cfg =  &ctx->priv->alg_priv->cfg;
-
-    /* Select the extra vp8 configuration table based on the current
-     * usage value. If the current usage value isn't found, use the
-     * values for usage case 0.
-     */
-    for (i = 0;
-         extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
-         i++);
-
-    priv->vp8_cfg = extracfg_map[i].cfg;
-    priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
-    priv->vp8_cfg.experimental = experimental;
-
-    priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
-
-    if (priv->cx_data_sz < 4096) priv->cx_data_sz = 4096;
-
-    priv->cx_data = malloc(priv->cx_data_sz);
-
-    if (!priv->cx_data) {
-      return VPX_CODEC_MEM_ERROR;
-    }
-
-    priv->deprecated_mode = NO_MODE_SET;
-
-    vp9_initialize_enc();
-
-    res = validate_config(priv, &priv->cfg, &priv->vp8_cfg);
-
-    if (!res) {
-      set_vp8e_config(&ctx->priv->alg_priv->oxcf,
-                      ctx->priv->alg_priv->cfg,
-                      ctx->priv->alg_priv->vp8_cfg);
-      optr = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
-
-      if (!optr)
-        res = VPX_CODEC_MEM_ERROR;
-      else
-        ctx->priv->alg_priv->cpi = optr;
-    }
-  }
-
-  return res;
-}
-
-
-static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) {
-  return vp8e_common_init(ctx, 0);
-}
-
-
-#if CONFIG_EXPERIMENTAL
-static vpx_codec_err_t vp8e_exp_init(vpx_codec_ctx_t *ctx) {
-  return vp8e_common_init(ctx, 1);
-}
-#endif
-
-
-static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx) {
-
-  free(ctx->cx_data);
-  vp9_remove_compressor(&ctx->cpi);
-  free(ctx);
-  return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
-                                       YV12_BUFFER_CONFIG  *yv12) {
-  vpx_codec_err_t        res = VPX_CODEC_OK;
-  yv12->y_buffer = img->planes[VPX_PLANE_Y];
-  yv12->u_buffer = img->planes[VPX_PLANE_U];
-  yv12->v_buffer = img->planes[VPX_PLANE_V];
-
-  yv12->y_width  = img->d_w;
-  yv12->y_height = img->d_h;
-  yv12->uv_width = (1 + yv12->y_width) / 2;
-  yv12->uv_height = (1 + yv12->y_height) / 2;
-
-  yv12->y_stride = img->stride[VPX_PLANE_Y];
-  yv12->uv_stride = img->stride[VPX_PLANE_U];
-
-  yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
-  yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12); // REG_YUV = 0
-  return res;
-}
-
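image2yuvconfig wraps a 4:2:0 vpx_image_t in the codec's YV12_BUFFER_CONFIG without copying: the chroma planes are half the luma dimensions rounded up, and the border is inferred from the slack between the luma stride and the image width. Worked example: for a 65x65 image (img->w == 65) with a 128-byte luma stride, uv_width = uv_height = (1 + 65) / 2 = 33 and border = (128 - 65) / 2 = 31 (integer division).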
-static void pick_quickcompress_mode(vpx_codec_alg_priv_t  *ctx,
-                                    unsigned long          duration,
-                                    unsigned long          deadline) {
-  unsigned int new_qc;
-
-  /* Use best quality mode if no deadline is given. */
-  if (deadline)
-    new_qc = MODE_GOODQUALITY;
-  else
-    new_qc = MODE_BESTQUALITY;
-
-  if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
-    new_qc = MODE_FIRSTPASS;
-  else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS)
-    new_qc = (new_qc == MODE_BESTQUALITY)
-             ? MODE_SECONDPASS_BEST
-             : MODE_SECONDPASS;
-
-  if (ctx->oxcf.Mode != new_qc) {
-    ctx->oxcf.Mode = new_qc;
-    vp9_change_config(ctx->cpi, &ctx->oxcf);
-  }
-}
-
-
-static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t  *ctx,
-                                   const vpx_image_t     *img,
-                                   vpx_codec_pts_t        pts,
-                                   unsigned long          duration,
-                                   vpx_enc_frame_flags_t  flags,
-                                   unsigned long          deadline) {
-  vpx_codec_err_t res = VPX_CODEC_OK;
-
-  if (img)
-    res = validate_img(ctx, img);
-
-  pick_quickcompress_mode(ctx, duration, deadline);
-  vpx_codec_pkt_list_init(&ctx->pkt_list);
-
-  /* Handle Flags */
-  if (((flags & VP8_EFLAG_NO_UPD_GF) && (flags & VP8_EFLAG_FORCE_GF))
-      || ((flags & VP8_EFLAG_NO_UPD_ARF) && (flags & VP8_EFLAG_FORCE_ARF))) {
-    ctx->base.err_detail = "Conflicting flags.";
-    return VPX_CODEC_INVALID_PARAM;
-  }
-
-  if (flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF
-               | VP8_EFLAG_NO_REF_ARF)) {
-    int ref = 7;
-
-    if (flags & VP8_EFLAG_NO_REF_LAST)
-      ref ^= VP9_LAST_FLAG;
-
-    if (flags & VP8_EFLAG_NO_REF_GF)
-      ref ^= VP9_GOLD_FLAG;
-
-    if (flags & VP8_EFLAG_NO_REF_ARF)
-      ref ^= VP9_ALT_FLAG;
-
-    vp9_use_as_reference(ctx->cpi, ref);
-  }
-
-  if (flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF
-               | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF
-               | VP8_EFLAG_FORCE_ARF)) {
-    int upd = 7;
-
-    if (flags & VP8_EFLAG_NO_UPD_LAST)
-      upd ^= VP9_LAST_FLAG;
-
-    if (flags & VP8_EFLAG_NO_UPD_GF)
-      upd ^= VP9_GOLD_FLAG;
-
-    if (flags & VP8_EFLAG_NO_UPD_ARF)
-      upd ^= VP9_ALT_FLAG;
-
-    vp9_update_reference(ctx->cpi, upd);
-  }
-
-  if (flags & VP8_EFLAG_NO_UPD_ENTROPY) {
-    vp9_update_entropy(ctx->cpi, 0);
-  }
-
-  /* Handle fixed keyframe intervals */
-  if (ctx->cfg.kf_mode == VPX_KF_AUTO
-      && ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
-    if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {
-      flags |= VPX_EFLAG_FORCE_KF;
-      ctx->fixed_kf_cntr = 1;
-    }
-  }
-
-  /* Initialize the encoder instance on the first frame */
-  if (!res && ctx->cpi) {
-    unsigned int lib_flags;
-    YV12_BUFFER_CONFIG sd;
-    int64_t dst_time_stamp, dst_end_time_stamp;
-    unsigned long size, cx_data_sz;
-    unsigned char *cx_data;
-
-    /* Set up internal flags */
-    if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
-      ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1;
-
-    // if (ctx->base.init_flags & VPX_CODEC_USE_OUTPUT_PARTITION)
-    //    ((VP9_COMP *)ctx->cpi)->output_partition = 1;
-
-    /* Convert API flags to internal codec lib flags */
-    lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
-
-    /* vp8 uses 10,000,000 ticks/second as its time stamp unit */
-    dst_time_stamp    = pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
-    dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
-
-    if (img != NULL) {
-      res = image2yuvconfig(img, &sd);
-
-      if (vp9_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags,
-                                &sd, dst_time_stamp, dst_end_time_stamp)) {
-        VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
-        res = update_error_state(ctx, &cpi->common.error);
-      }
-
-      /* reset for next frame */
-      ctx->next_frame_flag = 0;
-    }
-
-    cx_data = ctx->cx_data;
-    cx_data_sz = ctx->cx_data_sz;
-    lib_flags = 0;
-
-    while (cx_data_sz >= ctx->cx_data_sz / 2 &&
-           -1 != vp9_get_compressed_data(ctx->cpi, &lib_flags, &size,
-                                         cx_data, &dst_time_stamp,
-                                         &dst_end_time_stamp, !img)) {
-      if (size) {
-        vpx_codec_pts_t    round, delta;
-        vpx_codec_cx_pkt_t pkt;
-        VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
-
-        /* Add the frame packet to the list of returned packets. */
-        round = 1000000 * ctx->cfg.g_timebase.num / 2 - 1;
-        delta = (dst_end_time_stamp - dst_time_stamp);
-        pkt.kind = VPX_CODEC_CX_FRAME_PKT;
-        pkt.data.frame.pts =
-          (dst_time_stamp * ctx->cfg.g_timebase.den + round)
-          / ctx->cfg.g_timebase.num / 10000000;
-        pkt.data.frame.duration =
-          (delta * ctx->cfg.g_timebase.den + round)
-          / ctx->cfg.g_timebase.num / 10000000;
-        pkt.data.frame.flags = lib_flags << 16;
-
-        if (lib_flags & FRAMEFLAGS_KEY)
-          pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
-
-        if (!cpi->common.show_frame) {
-          pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE;
-
-          // This timestamp should be as close as possible to the prior
-          // PTS so that a decoder scheduling by PTS will process this
-          // frame right after the previous frame was decoded.
-          // Invisible frames have no duration.
-          pkt.data.frame.pts = ((cpi->last_time_stamp_seen
-                                 * ctx->cfg.g_timebase.den + round)
-                                / ctx->cfg.g_timebase.num / 10000000) + 1;
-          pkt.data.frame.duration = 0;
-        }
-
-        if (cpi->droppable)
-          pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE;
-
-        /*if (cpi->output_partition)
-        {
-            int i;
-            const int num_partitions = 1;
-
-            pkt.data.frame.flags |= VPX_FRAME_IS_FRAGMENT;
-
-            for (i = 0; i < num_partitions; ++i)
-            {
-                pkt.data.frame.buf = cx_data;
-                pkt.data.frame.sz = cpi->partition_sz[i];
-                pkt.data.frame.partition_id = i;
-                // don't set the fragment bit for the last partition
-                if (i == (num_partitions - 1))
-                    pkt.data.frame.flags &= ~VPX_FRAME_IS_FRAGMENT;
-                vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
-                cx_data += cpi->partition_sz[i];
-                cx_data_sz -= cpi->partition_sz[i];
-            }
-        }
-        else*/
-        {
-          pkt.data.frame.buf = cx_data;
-          pkt.data.frame.sz  = size;
-          pkt.data.frame.partition_id = -1;
-          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
-          cx_data += size;
-          cx_data_sz -= size;
-        }
-
-        // printf("timestamp: %lld, duration: %d\n", pkt->data.frame.pts, pkt->data.frame.duration);
-      }
-    }
-  }
-
-  return res;
-}
-
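Internally the encoder runs on a fixed 10 MHz clock, so timestamps are rescaled from the caller's timebase on the way in and rescaled back, with the precomputed round term, when packets are emitted. Worked example with a 1/30 timebase: a frame at pts == 30 enters as dst_time_stamp = 30 * 10000000 * 1 / 30 = 10,000,000 ticks, i.e. exactly one second; on output, round = 1000000 * 1 / 2 - 1 = 499999, so pkt.data.frame.pts = (10000000 * 30 + 499999) / 1 / 10000000 = 30, recovering the original value.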
-
-static const vpx_codec_cx_pkt_t *vp8e_get_cxdata(vpx_codec_alg_priv_t  *ctx,
-                                                 vpx_codec_iter_t      *iter) {
-  return vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter);
-}
-
-static vpx_codec_err_t vp8e_set_reference(vpx_codec_alg_priv_t *ctx,
-                                          int ctr_id,
-                                          va_list args) {
-  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
-
-  if (data) {
-    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
-    YV12_BUFFER_CONFIG sd;
-
-    image2yuvconfig(&frame->img, &sd);
-    vp9_set_reference_enc(ctx->cpi, frame->frame_type, &sd);
-    return VPX_CODEC_OK;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-
-}
-
-static vpx_codec_err_t vp8e_get_reference(vpx_codec_alg_priv_t *ctx,
-                                          int ctr_id,
-                                          va_list args) {
-
-  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
-
-  if (data) {
-    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
-    YV12_BUFFER_CONFIG sd;
-
-    image2yuvconfig(&frame->img, &sd);
-    vp9_get_reference_enc(ctx->cpi, frame->frame_type, &sd);
-    return VPX_CODEC_OK;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-}
-
-static vpx_codec_err_t vp8e_set_previewpp(vpx_codec_alg_priv_t *ctx,
-                                          int ctr_id,
-                                          va_list args) {
-#if CONFIG_POSTPROC
-  vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
-  (void)ctr_id;
-
-  if (data) {
-    ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data);
-    return VPX_CODEC_OK;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-#else
-  (void)ctx;
-  (void)ctr_id;
-  (void)args;
-  return VPX_CODEC_INCAPABLE;
-#endif
-}
-
-
-static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx) {
-
-  YV12_BUFFER_CONFIG sd;
-  vp9_ppflags_t flags = {0};
-
-  if (ctx->preview_ppcfg.post_proc_flag) {
-    flags.post_proc_flag        = ctx->preview_ppcfg.post_proc_flag;
-    flags.deblocking_level      = ctx->preview_ppcfg.deblocking_level;
-    flags.noise_level           = ctx->preview_ppcfg.noise_level;
-  }
-
-  if (0 == vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags)) {
-
-    /*
-    vpx_img_wrap(&ctx->preview_img, VPX_IMG_FMT_YV12,
-        sd.y_width + 2*VP8BORDERINPIXELS,
-        sd.y_height + 2*VP8BORDERINPIXELS,
-        1,
-        sd.buffer_alloc);
-    vpx_img_set_rect(&ctx->preview_img,
-        VP8BORDERINPIXELS, VP8BORDERINPIXELS,
-        sd.y_width, sd.y_height);
-        */
-
-    ctx->preview_img.bps = 12;
-    ctx->preview_img.planes[VPX_PLANE_Y] = sd.y_buffer;
-    ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer;
-    ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer;
-
-    if (sd.clrtype == REG_YUV)
-      ctx->preview_img.fmt = VPX_IMG_FMT_I420;
-    else
-      ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420;
-
-    ctx->preview_img.x_chroma_shift = 1;
-    ctx->preview_img.y_chroma_shift = 1;
-
-    ctx->preview_img.d_w = sd.y_width;
-    ctx->preview_img.d_h = sd.y_height;
-    ctx->preview_img.stride[VPX_PLANE_Y] = sd.y_stride;
-    ctx->preview_img.stride[VPX_PLANE_U] = sd.uv_stride;
-    ctx->preview_img.stride[VPX_PLANE_V] = sd.uv_stride;
-    ctx->preview_img.w   = sd.y_width;
-    ctx->preview_img.h   = sd.y_height;
-
-    return &ctx->preview_img;
-  } else
-    return NULL;
-}
-
-static vpx_codec_err_t vp8e_update_entropy(vpx_codec_alg_priv_t *ctx,
-                                           int ctr_id,
-                                           va_list args) {
-  int update = va_arg(args, int);
-  vp9_update_entropy(ctx->cpi, update);
-  return VPX_CODEC_OK;
-
-}
-
-static vpx_codec_err_t vp8e_update_reference(vpx_codec_alg_priv_t *ctx,
-                                             int ctr_id,
-                                             va_list args) {
-  int update = va_arg(args, int);
-  vp9_update_reference(ctx->cpi, update);
-  return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t vp8e_use_reference(vpx_codec_alg_priv_t *ctx,
-                                          int ctr_id,
-                                          va_list args) {
-  int reference_flag = va_arg(args, int);
-  vp9_use_as_reference(ctx->cpi, reference_flag);
-  return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t vp8e_set_roi_map(vpx_codec_alg_priv_t *ctx,
-                                        int ctr_id,
-                                        va_list args) {
-  vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *);
-
-  if (data) {
-    vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
-
-    if (!vp9_set_roimap(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
-                        roi->delta_q, roi->delta_lf, roi->static_threshold))
-      return VPX_CODEC_OK;
-    else
-      return VPX_CODEC_INVALID_PARAM;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-}
-
-
-static vpx_codec_err_t vp8e_set_activemap(vpx_codec_alg_priv_t *ctx,
-                                          int ctr_id,
-                                          va_list args) {
-  vpx_active_map_t *data = va_arg(args, vpx_active_map_t *);
-
-  if (data) {
-
-    vpx_active_map_t *map = (vpx_active_map_t *)data;
-
-    if (!vp9_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols))
-      return VPX_CODEC_OK;
-    else
-      return VPX_CODEC_INVALID_PARAM;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-}
-
-static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
-                                          int ctr_id,
-                                          va_list args) {
-
-  vpx_scaling_mode_t *data =  va_arg(args, vpx_scaling_mode_t *);
-
-  if (data) {
-    int res;
-    vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
-    res = vp9_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
-                                scalemode.v_scaling_mode);
-
-    if (!res) {
-      /* Force the next frame to be a key frame so the new scaling mode takes effect. */
-      ctx->next_frame_flag |= FRAMEFLAGS_KEY;
-      return VPX_CODEC_OK;
-    } else
-      return VPX_CODEC_INVALID_PARAM;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-}
-
-
-static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {
-  {VP8_SET_REFERENCE,                 vp8e_set_reference},
-  {VP8_COPY_REFERENCE,                vp8e_get_reference},
-  {VP8_SET_POSTPROC,                  vp8e_set_previewpp},
-  {VP8E_UPD_ENTROPY,                  vp8e_update_entropy},
-  {VP8E_UPD_REFERENCE,                vp8e_update_reference},
-  {VP8E_USE_REFERENCE,                vp8e_use_reference},
-  {VP8E_SET_ROI_MAP,                  vp8e_set_roi_map},
-  {VP8E_SET_ACTIVEMAP,                vp8e_set_activemap},
-  {VP8E_SET_SCALEMODE,                vp8e_set_scalemode},
-  {VP8E_SET_ENCODING_MODE,            set_param},
-  {VP8E_SET_CPUUSED,                  set_param},
-  {VP8E_SET_NOISE_SENSITIVITY,        set_param},
-  {VP8E_SET_ENABLEAUTOALTREF,         set_param},
-  {VP8E_SET_SHARPNESS,                set_param},
-  {VP8E_SET_STATIC_THRESHOLD,         set_param},
-  {VP8E_SET_TOKEN_PARTITIONS,         set_param},
-  {VP8E_GET_LAST_QUANTIZER,           get_param},
-  {VP8E_GET_LAST_QUANTIZER_64,        get_param},
-  {VP8E_SET_ARNR_MAXFRAMES,           set_param},
-  {VP8E_SET_ARNR_STRENGTH,            set_param},
-  {VP8E_SET_ARNR_TYPE,                set_param},
-  {VP8E_SET_TUNING,                   set_param},
-  {VP8E_SET_CQ_LEVEL,                 set_param},
-  {VP8E_SET_MAX_INTRA_BITRATE_PCT,    set_param},
-  { -1, NULL},
-};
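
The library core searches this map linearly: an entry whose id is 0 acts as a wildcard (api1_ctrl_maps below relies on that) and the { -1, NULL} sentinel stops the walk. A simplified sketch of the dispatch, modelled loosely on vpx_codec_control_() and assuming the internal types from vpx/internal/vpx_codec_internal.h are in scope:

static vpx_codec_err_t dispatch_control(vpx_codec_ctrl_fn_map_t *maps,
                                        vpx_codec_alg_priv_t *priv,
                                        int ctrl_id, va_list args) {
  vpx_codec_ctrl_fn_map_t *entry;

  for (entry = maps; entry->fn; entry++) {
    if (!entry->ctrl_id || entry->ctrl_id == ctrl_id)
      return entry->fn(priv, ctrl_id, args);
  }
  return VPX_CODEC_ERROR;  /* no handler registered for this id */
}
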
-
-static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {
-  {
-    0,
-    {
-      0,                  /* g_usage */
-      0,                  /* g_threads */
-      0,                  /* g_profile */
-
-      320,                /* g_width */
-      240,                /* g_height */
-      {1, 30},            /* g_timebase */
-
-      0,                  /* g_error_resilient */
-
-      VPX_RC_ONE_PASS,    /* g_pass */
-
-      0,                  /* g_lag_in_frames */
-
-      0,                  /* rc_dropframe_thresh */
-      0,                  /* rc_resize_allowed */
-      60,                 /* rc_resize_down_thresh */
-      30,                 /* rc_resize_up_thresh */
-
-      VPX_VBR,            /* rc_end_usage */
-#if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION)
-      {0},                /* rc_twopass_stats_in */
-#endif
-      256,                /* rc_target_bandwidth */
-      4,                  /* rc_min_quantizer */
-      63,                 /* rc_max_quantizer */
-      100,                /* rc_undershoot_pct */
-      100,                /* rc_overshoot_pct */
-
-      6000,               /* rc_max_buffer_size */
-      4000,               /* rc_buffer_initial_size */
-      5000,               /* rc_buffer_optimal_size */
-
-      50,                 /* rc_two_pass_vbrbias  */
-      0,                  /* rc_two_pass_vbrmin_section */
-      400,                /* rc_two_pass_vbrmax_section */
-
-      /* keyframing settings (kf) */
-      VPX_KF_AUTO,        /* g_kfmode */
-      0,                  /* kf_min_dist */
-      9999,               /* kf_max_dist */
-
-#if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION)
-      1,                  /* g_delete_first_pass_file */
-      "vp8.fpf"           /* first pass filename */
-#endif
-    }
-  },
-  { -1, {NOT_IMPLEMENTED}}
-};
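
This table is what vpx_codec_enc_config_default() copies from: the leading 0 is the usage selector and the struct carries the defaults. A hedged caller-side sketch; the public field names in vpx/vpx_encoder.h are g_w, g_h and rc_target_bitrate, which the older comments above refer to as g_width, g_height and rc_target_bandwidth:

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

static vpx_codec_err_t default_cfg_640x480(vpx_codec_enc_cfg_t *cfg) {
  vpx_codec_err_t res =
      vpx_codec_enc_config_default(vpx_codec_vp8_cx(), cfg, 0 /* usage */);

  if (res == VPX_CODEC_OK) {
    cfg->g_w = 640;                /* override the 320x240 defaults */
    cfg->g_h = 480;
    cfg->rc_target_bitrate = 512;  /* kbit/s; the table defaults to 256 */
  }
  return res;
}
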
-
-
-#ifndef VERSION_STRING
-#define VERSION_STRING
-#endif
-CODEC_INTERFACE(vpx_codec_vp8_cx) = {
-  "WebM Project VP8 Encoder" VERSION_STRING,
-  VPX_CODEC_INTERNAL_ABI_VERSION,
-  VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR |
-  VPX_CODEC_CAP_OUTPUT_PARTITION,
-  /* vpx_codec_caps_t          caps; */
-  vp8e_init,          /* vpx_codec_init_fn_t       init; */
-  vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
-  vp8e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
-  NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
-  NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
-  {
-    NOT_IMPLEMENTED,    /* vpx_codec_peek_si_fn_t    peek_si; */
-    NOT_IMPLEMENTED,    /* vpx_codec_get_si_fn_t     get_si; */
-    NOT_IMPLEMENTED,    /* vpx_codec_decode_fn_t     decode; */
-    NOT_IMPLEMENTED,    /* vpx_codec_get_frame_fn_t  get_frame; */
-  },
-  {
-    vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t           cfg_maps; */
-    vp8e_encode,        /* vpx_codec_encode_fn_t             encode; */
-    vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t        get_cx_data; */
-    vp8e_set_config,    /* vpx_codec_enc_config_set_fn_t     cfg_set; */
-    NOT_IMPLEMENTED,    /* vpx_codec_get_global_headers_fn_t get_glob_hdrs; */
-    vp8e_get_preview,   /* vpx_codec_get_preview_frame_fn_t  get_preview; */
-  } /* encoder functions */
-};
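
Bringing the interface above to life goes through the generic encoder entry points. A hedged sketch of one encode/drain cycle; error handling is trimmed, and the context is assumed to have been opened with vpx_codec_enc_init(encoder, vpx_codec_vp8_cx(), &cfg, 0):

#include <stdio.h>
#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

static int encode_one(vpx_codec_ctx_t *encoder, const vpx_image_t *img,
                      vpx_codec_pts_t pts, FILE *out) {
  vpx_codec_iter_t iter = NULL;
  const vpx_codec_cx_pkt_t *pkt;

  if (vpx_codec_encode(encoder, img, pts, 1, 0, VPX_DL_GOOD_QUALITY))
    return -1;

  /* Drain all packets produced for this frame. */
  while ((pkt = vpx_codec_get_cx_data(encoder, &iter)) != NULL) {
    if (pkt->kind == VPX_CODEC_CX_FRAME_PKT)
      fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, out);
  }
  return 0;
}
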
-
-
-#if CONFIG_EXPERIMENTAL
-
-CODEC_INTERFACE(vpx_codec_vp8x_cx) = {
-  "VP8 Experimental Encoder" VERSION_STRING,
-  VPX_CODEC_INTERNAL_ABI_VERSION,
-  VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR,
-  /* vpx_codec_caps_t          caps; */
-  vp8e_exp_init,      /* vpx_codec_init_fn_t       init; */
-  vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
-  vp8e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
-  NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
-  NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
-  {
-    NOT_IMPLEMENTED,    /* vpx_codec_peek_si_fn_t    peek_si; */
-    NOT_IMPLEMENTED,    /* vpx_codec_get_si_fn_t     get_si; */
-    NOT_IMPLEMENTED,    /* vpx_codec_decode_fn_t     decode; */
-    NOT_IMPLEMENTED,    /* vpx_codec_get_frame_fn_t  get_frame; */
-  },
-  {
-    vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t           cfg_maps; */
-    vp8e_encode,        /* vpx_codec_encode_fn_t             encode; */
-    vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t        get_cx_data; */
-    vp8e_set_config,    /* vpx_codec_enc_config_set_fn_t     cfg_set; */
-    NOT_IMPLEMENTED,    /* vpx_codec_get_global_headers_fn_t get_glob_hdrs; */
-    vp8e_get_preview,   /* vpx_codec_get_preview_frame_fn_t  get_preview; */
-  } /* encoder functions */
-};
-#endif
-
-
-/*
- * BEGIN BACKWARDS COMPATIBILITY SHIM.
- */
-#define FORCE_KEY   2
-static vpx_codec_err_t api1_control(vpx_codec_alg_priv_t *ctx,
-                                    int                   ctrl_id,
-                                    va_list               args) {
-  vpx_codec_ctrl_fn_map_t *entry;
-
-  switch (ctrl_id) {
-    case VP8E_SET_FLUSHFLAG:
-      /* VP8 sample code did VP8E_SET_FLUSHFLAG followed by
-       * vpx_codec_get_cx_data() rather than vpx_codec_encode().
-       */
-      return vp8e_encode(ctx, NULL, 0, 0, 0, 0);
-    case VP8E_SET_FRAMETYPE:
-      ctx->base.enc.tbd |= FORCE_KEY;
-      return VPX_CODEC_OK;
-  }
-
-  for (entry = vp8e_ctf_maps; entry && entry->fn; entry++) {
-    if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) {
-      return entry->fn(ctx, ctrl_id, args);
-    }
-  }
-
-  return VPX_CODEC_ERROR;
-}
-
-
-static vpx_codec_ctrl_fn_map_t api1_ctrl_maps[] = {
-  {0, api1_control},
-  { -1, NULL}
-};
-
-
-static vpx_codec_err_t api1_encode(vpx_codec_alg_priv_t  *ctx,
-                                   const vpx_image_t     *img,
-                                   vpx_codec_pts_t        pts,
-                                   unsigned long          duration,
-                                   vpx_enc_frame_flags_t  flags,
-                                   unsigned long          deadline) {
-  int force = ctx->base.enc.tbd;
-
-  ctx->base.enc.tbd = 0;
-  return vp8e_encode
-         (ctx,
-          img,
-          pts,
-          duration,
-          flags | ((force & FORCE_KEY) ? VPX_EFLAG_FORCE_KF : 0),
-          deadline);
-}
-
-
-vpx_codec_iface_t vpx_enc_vp8_algo = {
-  "WebM Project VP8 Encoder (Deprecated API)" VERSION_STRING,
-  VPX_CODEC_INTERNAL_ABI_VERSION,
-  VPX_CODEC_CAP_ENCODER,
-  /* vpx_codec_caps_t          caps; */
-  vp8e_init,          /* vpx_codec_init_fn_t       init; */
-  vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
-  api1_ctrl_maps,     /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
-  NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
-  NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
-  {NOT_IMPLEMENTED},  /* decoder functions */
-  {
-    vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t           cfg_maps; */
-    api1_encode,        /* vpx_codec_encode_fn_t             encode; */
-    vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t        get_cx_data; */
-    vp8e_set_config,    /* vpx_codec_enc_config_set_fn_t     cfg_set; */
-    NOT_IMPLEMENTED,    /* vpx_codec_get_global_headers_fn_t get_glob_hdrs; */
-    vp8e_get_preview,   /* vpx_codec_get_preview_frame_fn_t  get_preview; */
-  } /* encoder functions */
-};
--- a/vp8/vp8_dx_iface.c
+++ /dev/null
@@ -1,717 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdlib.h>
-#include <string.h>
-#include "vpx/vpx_decoder.h"
-#include "vpx/vp8dx.h"
-#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_version.h"
-#include "common/onyxd.h"
-#include "decoder/onyxd_int.h"
-
-#define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
-typedef vpx_codec_stream_info_t  vp8_stream_info_t;
-
-/* Structures for handling memory allocations */
-typedef enum {
-  VP8_SEG_ALG_PRIV     = 256,
-  VP8_SEG_MAX
-} mem_seg_id_t;
-#define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0])))
-
-static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t);
-
-typedef struct {
-  unsigned int   id;
-  unsigned long  sz;
-  unsigned int   align;
-  unsigned int   flags;
-  unsigned long(*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t);
-} mem_req_t;
-
-static const mem_req_t vp8_mem_req_segs[] = {
-  {VP8_SEG_ALG_PRIV,    0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz},
-  {VP8_SEG_MAX, 0, 0, 0, NULL}
-};
-
-struct vpx_codec_alg_priv {
-  vpx_codec_priv_t        base;
-  vpx_codec_mmap_t        mmaps[NELEMENTS(vp8_mem_req_segs) - 1];
-  vpx_codec_dec_cfg_t     cfg;
-  vp8_stream_info_t       si;
-  int                     defer_alloc;
-  int                     decoder_init;
-  VP9D_PTR                pbi;
-  int                     postproc_cfg_set;
-  vp8_postproc_cfg_t      postproc_cfg;
-#if CONFIG_POSTPROC_VISUALIZER
-  unsigned int            dbg_postproc_flag;
-  int                     dbg_color_ref_frame_flag;
-  int                     dbg_color_mb_modes_flag;
-  int                     dbg_color_b_modes_flag;
-  int                     dbg_display_mv_flag;
-#endif
-  vpx_image_t             img;
-  int                     img_setup;
-  int                     img_avail;
-};
-
-static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si,
-                                 vpx_codec_flags_t flags) {
-  /* Although this size is constant, we can't place it directly in the
-   * requested-segments list, because that list must be defined before the
-   * private type is (so that the number of memory maps is known).
-   */
-  (void)si;
-  return sizeof(vpx_codec_alg_priv_t);
-}
-
-
-static void vp8_mmap_dtor(vpx_codec_mmap_t *mmap) {
-  free(mmap->priv);
-}
-
-static vpx_codec_err_t vp8_mmap_alloc(vpx_codec_mmap_t *mmap) {
-  vpx_codec_err_t  res;
-  unsigned int   align;
-
-  align = mmap->align ? mmap->align - 1 : 0;
-
-  if (mmap->flags & VPX_CODEC_MEM_ZERO)
-    mmap->priv = calloc(1, mmap->sz + align);
-  else
-    mmap->priv = malloc(mmap->sz + align);
-
-  res = (mmap->priv) ? VPX_CODEC_OK : VPX_CODEC_MEM_ERROR;
-  mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align);
-  mmap->dtor = vp8_mmap_dtor;
-  return res;
-}
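
The arithmetic above over-allocates by align - 1 bytes and rounds the raw pointer up to the next power-of-two boundary. A self-contained illustration of the same mask trick:

#include <stdint.h>

/* Round p up to the next multiple of alignment (a power of two). */
static void *align_up(void *p, unsigned int alignment) {
  uintptr_t mask = (uintptr_t)alignment - 1;
  return (void *)(((uintptr_t)p + mask) & ~mask);
}

/* e.g. align_up((void *)0x1001, 8) == (void *)0x1008, while a pointer
 * that is already aligned comes back unchanged. */
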
-
-static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si,
-                                          const vpx_codec_mmap_t *mmaps,
-                                          vpx_codec_flags_t init_flags) {
-  int i;
-  vpx_codec_err_t res = VPX_CODEC_OK;
-
-  for (i = 0; i < NELEMENTS(vp8_mem_req_segs) - 1; i++) {
-    /* Ensure the segment has been allocated */
-    if (!mmaps[i].base) {
-      res = VPX_CODEC_MEM_ERROR;
-      break;
-    }
-
-    /* Verify variable size segment is big enough for the current si. */
-    if (vp8_mem_req_segs[i].calc_sz) {
-      vpx_codec_dec_cfg_t cfg;
-
-      cfg.w = si->w;
-      cfg.h = si->h;
-
-      if (mmaps[i].sz < vp8_mem_req_segs[i].calc_sz(&cfg, init_flags)) {
-        res = VPX_CODEC_MEM_ERROR;
-        break;
-      }
-    }
-  }
-
-  return res;
-}
-
-static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) {
-  int i;
-
-  ctx->priv = mmap->base;
-  ctx->priv->sz = sizeof(*ctx->priv);
-  ctx->priv->iface = ctx->iface;
-  ctx->priv->alg_priv = mmap->base;
-
-  for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++)
-    ctx->priv->alg_priv->mmaps[i].id = vp8_mem_req_segs[i].id;
-
-  ctx->priv->alg_priv->mmaps[0] = *mmap;
-  ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
-  ctx->priv->init_flags = ctx->init_flags;
-
-  if (ctx->config.dec) {
-    /* Update the reference to the config structure to an internal copy. */
-    ctx->priv->alg_priv->cfg = *ctx->config.dec;
-    ctx->config.dec = &ctx->priv->alg_priv->cfg;
-  }
-}
-
-static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id) {
-  int i;
-
-  for (i = 0; i < NELEMENTS(ctx->mmaps); i++)
-    if (ctx->mmaps[i].id == id)
-      return ctx->mmaps[i].base;
-
-  return NULL;
-}
-static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx) {
-  /* nothing to clean up */
-}
-
-static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx) {
-  vpx_codec_err_t        res = VPX_CODEC_OK;
-
-  /* This function only allocates space for the vpx_codec_alg_priv_t
-   * structure. More memory may be required at the time the stream
-   * information becomes known.
-   */
-  if (!ctx->priv) {
-    vpx_codec_mmap_t mmap;
-
-    mmap.id = vp8_mem_req_segs[0].id;
-    mmap.sz = sizeof(vpx_codec_alg_priv_t);
-    mmap.align = vp8_mem_req_segs[0].align;
-    mmap.flags = vp8_mem_req_segs[0].flags;
-
-    res = vp8_mmap_alloc(&mmap);
-
-    if (!res) {
-      vp8_init_ctx(ctx, &mmap);
-
-      ctx->priv->alg_priv->defer_alloc = 1;
-      /* post-processing level initialized to do nothing */
-    }
-  }
-
-  return res;
-}
-
-static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) {
-  int i;
-
-  vp9_remove_decompressor(ctx->pbi);
-
-  for (i = NELEMENTS(ctx->mmaps) - 1; i >= 0; i--) {
-    if (ctx->mmaps[i].dtor)
-      ctx->mmaps[i].dtor(&ctx->mmaps[i]);
-  }
-
-  return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t vp8_peek_si(const uint8_t         *data,
-                                   unsigned int           data_sz,
-                                   vpx_codec_stream_info_t *si) {
-  vpx_codec_err_t res = VPX_CODEC_OK;
-
-  if (data + data_sz <= data)
-    res = VPX_CODEC_INVALID_PARAM;
-  else {
-    /* Parse the uncompressed part of the key frame header.
-     * 3 bytes:  version, frame type and first-partition offset
-     * 3 bytes:  sync code (0x9d, 0x01, 0x2a)
-     * 4 bytes:  image width and height in the lowest 14 bits
-     *           of each 2-byte value.
-     */
-    si->is_kf = 0;
-
-    if (data_sz >= 10 && !(data[0] & 0x01)) { /* I-Frame */
-      const uint8_t *c = data + 3;
-      si->is_kf = 1;
-
-      /* vet via sync code */
-      if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a)
-        res = VPX_CODEC_UNSUP_BITSTREAM;
-
-      si->w = (c[3] | (c[4] << 8)) & 0x3fff;
-      si->h = (c[5] | (c[6] << 8)) & 0x3fff;
-
-      /*printf("w=%d, h=%d\n", si->w, si->h);*/
-      if (!(si->h | si->w))
-        res = VPX_CODEC_UNSUP_BITSTREAM;
-    } else
-      res = VPX_CODEC_UNSUP_BITSTREAM;
-  }
-
-  return res;
-}
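
Applications reach this parser through the generic vpx_codec_peek_stream_info() entry point. A hedged sketch; buf/buf_sz are assumed to hold one compressed frame, and si.sz must be set before the call:

#include <stdio.h>
#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"

static void probe_stream(const uint8_t *buf, unsigned int buf_sz) {
  vpx_codec_stream_info_t si;

  si.sz = sizeof(si);
  if (vpx_codec_peek_stream_info(vpx_codec_vp8_dx(), buf, buf_sz, &si) ==
      VPX_CODEC_OK)
    printf("key frame: %u, %ux%u\n", si.is_kf, si.w, si.h);
}
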
-
-static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t    *ctx,
-                                  vpx_codec_stream_info_t *si) {
-  unsigned int sz;
-
-  if (si->sz >= sizeof(vp8_stream_info_t))
-    sz = sizeof(vp8_stream_info_t);
-  else
-    sz = sizeof(vpx_codec_stream_info_t);
-
-  memcpy(si, &ctx->si, sz);
-  si->sz = sz;
-
-  return VPX_CODEC_OK;
-}
-
-
-static vpx_codec_err_t
-update_error_state(vpx_codec_alg_priv_t                 *ctx,
-                   const struct vpx_internal_error_info *error) {
-  vpx_codec_err_t res;
-
-  if ((res = error->error_code))
-    ctx->base.err_detail = error->has_detail
-                           ? error->detail
-                           : NULL;
-
-  return res;
-}
-
-static void yuvconfig2image(vpx_image_t               *img,
-                            const YV12_BUFFER_CONFIG  *yv12,
-                            void                      *user_priv) {
-  /* vpx_img_wrap() doesn't allow specifying independent strides for
-   * the Y, U, and V planes, nor other alignment adjustments that
-   * might be representable by a YV12_BUFFER_CONFIG, so we just
-   * initialize all the fields.
-   */
-  img->fmt = yv12->clrtype == REG_YUV ?
-             VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
-  img->w = yv12->y_stride;
-  img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
-  img->d_w = yv12->y_width;
-  img->d_h = yv12->y_height;
-  img->x_chroma_shift = 1;
-  img->y_chroma_shift = 1;
-  img->planes[VPX_PLANE_Y] = yv12->y_buffer;
-  img->planes[VPX_PLANE_U] = yv12->u_buffer;
-  img->planes[VPX_PLANE_V] = yv12->v_buffer;
-  img->planes[VPX_PLANE_ALPHA] = NULL;
-  img->stride[VPX_PLANE_Y] = yv12->y_stride;
-  img->stride[VPX_PLANE_U] = yv12->uv_stride;
-  img->stride[VPX_PLANE_V] = yv12->uv_stride;
-  img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
-  img->bps = 12;
-  img->user_priv = user_priv;
-  img->img_data = yv12->buffer_alloc;
-  img->img_data_owner = 0;
-  img->self_allocd = 0;
-}
-
-static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
-                                  const uint8_t         *data,
-                                  unsigned int            data_sz,
-                                  void                    *user_priv,
-                                  long                    deadline) {
-  vpx_codec_err_t res = VPX_CODEC_OK;
-
-  ctx->img_avail = 0;
-
-  /* Determine the stream parameters. Note that we rely on peek_si to
-   * validate that we have a buffer that does not wrap around the top
-   * of the heap.
-   */
-  if (!ctx->si.h)
-    res = ctx->base.iface->dec.peek_si(data, data_sz, &ctx->si);
-
-  /* Perform deferred allocations, if required */
-  if (!res && ctx->defer_alloc) {
-    int i;
-
-    for (i = 1; !res && i < NELEMENTS(ctx->mmaps); i++) {
-      vpx_codec_dec_cfg_t cfg;
-
-      cfg.w = ctx->si.w;
-      cfg.h = ctx->si.h;
-      ctx->mmaps[i].id = vp8_mem_req_segs[i].id;
-      ctx->mmaps[i].sz = vp8_mem_req_segs[i].sz;
-      ctx->mmaps[i].align = vp8_mem_req_segs[i].align;
-      ctx->mmaps[i].flags = vp8_mem_req_segs[i].flags;
-
-      if (!ctx->mmaps[i].sz)
-        ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg,
-                                                       ctx->base.init_flags);
-
-      res = vp8_mmap_alloc(&ctx->mmaps[i]);
-    }
-
-    if (!res)
-      vp8_finalize_mmaps(ctx);
-
-    ctx->defer_alloc = 0;
-  }
-
-  /* Initialize the decoder instance on the first frame*/
-  if (!res && !ctx->decoder_init) {
-    res = vp8_validate_mmaps(&ctx->si, ctx->mmaps, ctx->base.init_flags);
-
-    if (!res) {
-      VP9D_CONFIG oxcf;
-      VP9D_PTR optr;
-
-      vp9_initialize_dec();
-
-      oxcf.Width = ctx->si.w;
-      oxcf.Height = ctx->si.h;
-      oxcf.Version = 9;
-      oxcf.postprocess = 0;
-      oxcf.max_threads = ctx->cfg.threads;
-      optr = vp9_create_decompressor(&oxcf);
-
-      /* If postprocessing was enabled by the application and a
-       * configuration has not been provided, default it.
-       */
-      if (!ctx->postproc_cfg_set
-          && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) {
-        ctx->postproc_cfg.post_proc_flag =
-          VP8_DEBLOCK | VP8_DEMACROBLOCK;
-        ctx->postproc_cfg.deblocking_level = 4;
-        ctx->postproc_cfg.noise_level = 0;
-      }
-
-      if (!optr)
-        res = VPX_CODEC_ERROR;
-      else
-        ctx->pbi = optr;
-    }
-
-    ctx->decoder_init = 1;
-  }
-
-  if (!res && ctx->pbi) {
-    YV12_BUFFER_CONFIG sd;
-    int64_t time_stamp = 0, time_end_stamp = 0;
-    vp9_ppflags_t flags = {0};
-
-    if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) {
-      flags.post_proc_flag = ctx->postproc_cfg.post_proc_flag
-#if CONFIG_POSTPROC_VISUALIZER
-
-                             | ((ctx->dbg_color_ref_frame_flag != 0) ? VP9D_DEBUG_CLR_FRM_REF_BLKS : 0)
-                             | ((ctx->dbg_color_mb_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
-                             | ((ctx->dbg_color_b_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
-                             | ((ctx->dbg_display_mv_flag != 0) ? VP9D_DEBUG_DRAW_MV : 0)
-#endif
-;
-      flags.deblocking_level      = ctx->postproc_cfg.deblocking_level;
-      flags.noise_level           = ctx->postproc_cfg.noise_level;
-#if CONFIG_POSTPROC_VISUALIZER
-      flags.display_ref_frame_flag = ctx->dbg_color_ref_frame_flag;
-      flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
-      flags.display_b_modes_flag  = ctx->dbg_color_b_modes_flag;
-      flags.display_mv_flag       = ctx->dbg_display_mv_flag;
-#endif
-    }
-
-    if (vp9_receive_compressed_data(ctx->pbi, data_sz, data, deadline)) {
-      VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
-      res = update_error_state(ctx, &pbi->common.error);
-    }
-
-    if (!res && 0 == vp9_get_raw_frame(ctx->pbi, &sd, &time_stamp,
-                                       &time_end_stamp, &flags)) {
-      yuvconfig2image(&ctx->img, &sd, user_priv);
-      ctx->img_avail = 1;
-    }
-  }
-
-  return res;
-}
-
-static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t  *ctx,
-                                  vpx_codec_iter_t      *iter) {
-  vpx_image_t *img = NULL;
-
-  if (ctx->img_avail) {
-    /* iter acts as a flip-flop, so an image is only returned on the
-     * first call to get_frame.
-     */
-    if (!(*iter)) {
-      img = &ctx->img;
-      *iter = img;
-    }
-  }
-
-  return img;
-}
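
The flip-flop iterator is why callers always loop, even though this decoder yields at most one image per decode call. A hedged sketch against the public API; handle_image is a hypothetical consumer supplied by the application:

static int decode_and_drain(vpx_codec_ctx_t *decoder,
                            const uint8_t *frame_buf, unsigned int frame_sz) {
  vpx_codec_iter_t iter = NULL;
  vpx_image_t *img;

  if (vpx_codec_decode(decoder, frame_buf, frame_sz, NULL, 0))
    return -1;

  while ((img = vpx_codec_get_frame(decoder, &iter)) != NULL)
    handle_image(img);  /* hypothetical consumer of the I420 planes */
  return 0;
}
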
-
-
-static
-vpx_codec_err_t vp8_xma_get_mmap(const vpx_codec_ctx_t      *ctx,
-                                 vpx_codec_mmap_t           *mmap,
-                                 vpx_codec_iter_t           *iter) {
-  vpx_codec_err_t     res;
-  const mem_req_t  *seg_iter = *iter;
-
-  /* Get address of next segment request */
-  do {
-    if (!seg_iter)
-      seg_iter = vp8_mem_req_segs;
-    else if (seg_iter->id != VP8_SEG_MAX)
-      seg_iter++;
-
-    *iter = (vpx_codec_iter_t)seg_iter;
-
-    if (seg_iter->id != VP8_SEG_MAX) {
-      mmap->id = seg_iter->id;
-      mmap->sz = seg_iter->sz;
-      mmap->align = seg_iter->align;
-      mmap->flags = seg_iter->flags;
-
-      if (!seg_iter->sz)
-        mmap->sz = seg_iter->calc_sz(ctx->config.dec, ctx->init_flags);
-
-      res = VPX_CODEC_OK;
-    } else
-      res = VPX_CODEC_LIST_END;
-  } while (!mmap->sz && res != VPX_CODEC_LIST_END);
-
-  return res;
-}
-
-static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t         *ctx,
-                                        const vpx_codec_mmap_t  *mmap) {
-  vpx_codec_err_t res = VPX_CODEC_MEM_ERROR;
-  int i, done;
-
-  if (!ctx->priv) {
-    if (mmap->id == VP8_SEG_ALG_PRIV) {
-      if (!ctx->priv) {
-        vp8_init_ctx(ctx, mmap);
-        res = VPX_CODEC_OK;
-      }
-    }
-  }
-
-  done = 1;
-
-  if (!res && ctx->priv->alg_priv) {
-    for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++) {
-      if (ctx->priv->alg_priv->mmaps[i].id == mmap->id)
-        if (!ctx->priv->alg_priv->mmaps[i].base) {
-          ctx->priv->alg_priv->mmaps[i] = *mmap;
-          res = VPX_CODEC_OK;
-        }
-
-      done &= (ctx->priv->alg_priv->mmaps[i].base != NULL);
-    }
-  }
-
-  if (done && !res) {
-    vp8_finalize_mmaps(ctx->priv->alg_priv);
-    res = ctx->iface->init(ctx);
-  }
-
-  return res;
-}
-
-static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
-                                       YV12_BUFFER_CONFIG  *yv12) {
-  vpx_codec_err_t        res = VPX_CODEC_OK;
-  yv12->y_buffer = img->planes[VPX_PLANE_Y];
-  yv12->u_buffer = img->planes[VPX_PLANE_U];
-  yv12->v_buffer = img->planes[VPX_PLANE_V];
-
-  yv12->y_width  = img->d_w;
-  yv12->y_height = img->d_h;
-  yv12->uv_width = yv12->y_width / 2;
-  yv12->uv_height = yv12->y_height / 2;
-
-  yv12->y_stride = img->stride[VPX_PLANE_Y];
-  yv12->uv_stride = img->stride[VPX_PLANE_U];
-
-  yv12->border  = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
-  yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 ||
-                   img->fmt == VPX_IMG_FMT_VPXYV12);
-
-  return res;
-}
-
-
-static vpx_codec_err_t vp9_set_reference(vpx_codec_alg_priv_t *ctx,
-                                         int ctr_id,
-                                         va_list args) {
-  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
-
-  if (data) {
-    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
-    YV12_BUFFER_CONFIG sd;
-
-    image2yuvconfig(&frame->img, &sd);
-
-    return vp9_set_reference_dec(ctx->pbi, frame->frame_type, &sd);
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-}
-
-static vpx_codec_err_t vp9_get_reference(vpx_codec_alg_priv_t *ctx,
-                                         int ctr_id,
-                                         va_list args) {
-  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
-
-  if (data) {
-    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
-    YV12_BUFFER_CONFIG sd;
-
-    image2yuvconfig(&frame->img, &sd);
-
-    return vp9_get_reference_dec(ctx->pbi, frame->frame_type, &sd);
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-}
-
-static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
-                                        int ctr_id,
-                                        va_list args) {
-#if CONFIG_POSTPROC
-  vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
-
-  if (data) {
-    ctx->postproc_cfg_set = 1;
-    ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data);
-    return VPX_CODEC_OK;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-#else
-  return VPX_CODEC_INCAPABLE;
-#endif
-}
-
-static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx,
-                                           int ctrl_id,
-                                           va_list args) {
-#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
-  int data = va_arg(args, int);
-
-#define MAP(id, var) case id: var = data; break;
-
-  switch (ctrl_id) {
-      MAP(VP8_SET_DBG_COLOR_REF_FRAME,   ctx->dbg_color_ref_frame_flag);
-      MAP(VP8_SET_DBG_COLOR_MB_MODES,    ctx->dbg_color_mb_modes_flag);
-      MAP(VP8_SET_DBG_COLOR_B_MODES,     ctx->dbg_color_b_modes_flag);
-      MAP(VP8_SET_DBG_DISPLAY_MV,        ctx->dbg_display_mv_flag);
-  }
-
-  return VPX_CODEC_OK;
-#else
-  return VPX_CODEC_INCAPABLE;
-#endif
-}
-
-static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
-                                                int ctrl_id,
-                                                va_list args) {
-  int *update_info = va_arg(args, int *);
-  VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
-
-  if (update_info) {
-    *update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME
-                   + pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME
-                   + pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME;
-
-    return VPX_CODEC_OK;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-}
-
-
-static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
-                                               int ctrl_id,
-                                               va_list args) {
-  int *corrupted = va_arg(args, int *);
-
-  if (corrupted) {
-    VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
-    *corrupted = pbi->common.frame_to_show->corrupted;
-
-    return VPX_CODEC_OK;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-}
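
Both query controls above return their result through a pointer out-argument. A hedged caller-side sketch, with the reference-frame flags from vpx/vp8.h:

#include <stdio.h>

static void report_decoder_state(vpx_codec_ctx_t *decoder) {
  int ref_updates = 0, corrupted = 0;

  vpx_codec_control(decoder, VP8D_GET_LAST_REF_UPDATES, &ref_updates);
  vpx_codec_control(decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted);

  if (ref_updates & VP8_GOLD_FRAME)
    printf("last frame refreshed the golden buffer\n");
  if (corrupted)
    printf("shown frame had decode errors\n");
}
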
-
-static vpx_codec_ctrl_fn_map_t ctf_maps[] = {
-  {VP8_SET_REFERENCE,             vp9_set_reference},
-  {VP8_COPY_REFERENCE,            vp9_get_reference},
-  {VP8_SET_POSTPROC,              vp8_set_postproc},
-  {VP8_SET_DBG_COLOR_REF_FRAME,   vp8_set_dbg_options},
-  {VP8_SET_DBG_COLOR_MB_MODES,    vp8_set_dbg_options},
-  {VP8_SET_DBG_COLOR_B_MODES,     vp8_set_dbg_options},
-  {VP8_SET_DBG_DISPLAY_MV,        vp8_set_dbg_options},
-  {VP8D_GET_LAST_REF_UPDATES,     vp8_get_last_ref_updates},
-  {VP8D_GET_FRAME_CORRUPTED,      vp8_get_frame_corrupted},
-  { -1, NULL},
-};
-
-
-#ifndef VERSION_STRING
-#define VERSION_STRING
-#endif
-CODEC_INTERFACE(vpx_codec_vp8_dx) = {
-  "WebM Project VP8 Decoder" VERSION_STRING,
-  VPX_CODEC_INTERNAL_ABI_VERSION,
-  VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC |
-  VPX_CODEC_CAP_INPUT_PARTITION,
-  /* vpx_codec_caps_t          caps; */
-  vp8_init,         /* vpx_codec_init_fn_t       init; */
-  vp8_destroy,      /* vpx_codec_destroy_fn_t    destroy; */
-  ctf_maps,         /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
-  vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t   get_mmap; */
-  vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t   set_mmap; */
-  {
-    vp8_peek_si,      /* vpx_codec_peek_si_fn_t    peek_si; */
-    vp8_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
-    vp8_decode,       /* vpx_codec_decode_fn_t     decode; */
-    vp8_get_frame,    /* vpx_codec_get_frame_fn_t  get_frame; */
-  },
-  {
-    /* encoder functions */
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED
-  }
-};
-
-/*
- * BEGIN BACKWARDS COMPATIBILITY SHIM.
- */
-vpx_codec_iface_t vpx_codec_vp8_algo = {
-  "WebM Project VP8 Decoder (Deprecated API)" VERSION_STRING,
-  VPX_CODEC_INTERNAL_ABI_VERSION,
-  VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC,
-  /* vpx_codec_caps_t          caps; */
-  vp8_init,         /* vpx_codec_init_fn_t       init; */
-  vp8_destroy,      /* vpx_codec_destroy_fn_t    destroy; */
-  ctf_maps,         /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
-  vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t   get_mmap; */
-  vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t   set_mmap; */
-  {
-    vp8_peek_si,      /* vpx_codec_peek_si_fn_t    peek_si; */
-    vp8_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
-    vp8_decode,       /* vpx_codec_decode_fn_t     decode; */
-    vp8_get_frame,    /* vpx_codec_get_frame_fn_t  get_frame; */
-  },
-  {
-    /* encoder functions */
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED,
-    NOT_IMPLEMENTED
-  }
-};
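
For completeness, a hedged sketch of opening the non-deprecated interface above through the generic decoder entry points:

static vpx_codec_err_t open_decoder(vpx_codec_ctx_t *decoder) {
  vpx_codec_dec_cfg_t cfg = { 0 };  /* threads/w/h: let the stream decide */

  return vpx_codec_dec_init(decoder, vpx_codec_vp8_dx(), &cfg, 0);
}
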
--- a/vp8/vp8cx.mk
+++ /dev/null
@@ -1,120 +1,0 @@
-##
-##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-##  Use of this source code is governed by a BSD-style license
-##  that can be found in the LICENSE file in the root of the source
-##  tree. An additional intellectual property rights grant can be found
-##  in the file PATENTS.  All contributing project authors may
-##  be found in the AUTHORS file in the root of the source tree.
-##
-
-
-include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
-
-VP8_CX_EXPORTS += exports_enc
-
-VP8_CX_SRCS-yes += $(VP8_COMMON_SRCS-yes)
-VP8_CX_SRCS-no  += $(VP8_COMMON_SRCS-no)
-VP8_CX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes)
-VP8_CX_SRCS_REMOVE-no  += $(VP8_COMMON_SRCS_REMOVE-no)
-
-ifeq ($(ARCH_ARM),yes)
-  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx_arm.mk
-endif
-
-VP8_CX_SRCS-yes += vp8_cx_iface.c
-
-# encoder
-#INCLUDES += algo/vpx_common/vpx_mem/include
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += algo/vpx_ref/cpu_id/include
-#INCLUDES += common
-#INCLUDES += encoder
-
-VP8_CX_SRCS-yes += encoder/asm_enc_offsets.c
-VP8_CX_SRCS-yes += encoder/bitstream.c
-VP8_CX_SRCS-yes += encoder/boolhuff.c
-VP8_CX_SRCS-yes += encoder/dct.c
-VP8_CX_SRCS-yes += encoder/encodeframe.c
-VP8_CX_SRCS-yes += encoder/encodeintra.c
-VP8_CX_SRCS-yes += encoder/encodemb.c
-VP8_CX_SRCS-yes += encoder/encodemv.c
-VP8_CX_SRCS-yes += encoder/firstpass.c
-VP8_CX_SRCS-yes += encoder/generic/csystemdependent.c
-VP8_CX_SRCS-yes += encoder/block.h
-VP8_CX_SRCS-yes += encoder/boolhuff.h
-VP8_CX_SRCS-yes += encoder/bitstream.h
-VP8_CX_SRCS-yes += encoder/encodeintra.h
-VP8_CX_SRCS-yes += encoder/encodemb.h
-VP8_CX_SRCS-yes += encoder/encodemv.h
-VP8_CX_SRCS-yes += encoder/firstpass.h
-VP8_CX_SRCS-yes += encoder/lookahead.c
-VP8_CX_SRCS-yes += encoder/lookahead.h
-VP8_CX_SRCS-yes += encoder/mcomp.h
-VP8_CX_SRCS-yes += encoder/modecosts.h
-VP8_CX_SRCS-yes += encoder/onyx_int.h
-VP8_CX_SRCS-yes += encoder/psnr.h
-VP8_CX_SRCS-yes += encoder/quantize.h
-VP8_CX_SRCS-yes += encoder/ratectrl.h
-VP8_CX_SRCS-yes += encoder/rdopt.h
-VP8_CX_SRCS-yes += encoder/tokenize.h
-VP8_CX_SRCS-yes += encoder/treewriter.h
-VP8_CX_SRCS-yes += encoder/variance.h
-VP8_CX_SRCS-yes += encoder/mcomp.c
-VP8_CX_SRCS-yes += encoder/modecosts.c
-VP8_CX_SRCS-yes += encoder/onyx_if.c
-VP8_CX_SRCS-yes += encoder/picklpf.c
-VP8_CX_SRCS-yes += encoder/psnr.c
-VP8_CX_SRCS-yes += encoder/quantize.c
-VP8_CX_SRCS-yes += encoder/ratectrl.c
-VP8_CX_SRCS-yes += encoder/rdopt.c
-VP8_CX_SRCS-yes += encoder/sad_c.c
-VP8_CX_SRCS-yes += encoder/satd_c.c
-VP8_CX_SRCS-yes += encoder/segmentation.c
-VP8_CX_SRCS-yes += encoder/segmentation.h
-VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/ssim.c
-VP8_CX_SRCS-yes += encoder/tokenize.c
-VP8_CX_SRCS-yes += encoder/treewriter.c
-VP8_CX_SRCS-yes += encoder/variance_c.c
-ifeq ($(CONFIG_POSTPROC),yes)
-VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h
-VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
-endif
-VP8_CX_SRCS-yes += encoder/temporal_filter.c
-VP8_CX_SRCS-yes += encoder/temporal_filter.h
-VP8_CX_SRCS-yes += encoder/mbgraph.c
-VP8_CX_SRCS-yes += encoder/mbgraph.h
-
-
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/mcomp_x86.h
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_x86.h
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/temporal_filter_x86.h
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/x86_csystemdependent.c
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_mmx.c
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_impl_mmx.asm
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/sad_mmx.asm
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_sse2.c
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
-VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
-VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
-VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm
-
-
-VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
--- a/vp8/vp8cx_arm.mk
+++ /dev/null
@@ -1,63 +1,0 @@
-##
-##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-##  Use of this source code is governed by a BSD-style license
-##  that can be found in the LICENSE file in the root of the source
-##  tree. An additional intellectual property rights grant can be found
-##  in the file PATENTS.  All contributing project authors may
-##  be found in the AUTHORS file in the root of the source tree.
-##
-
-
-#VP8_CX_SRCS list is modified according to different platforms.
-
-#File list for arm
-# encoder
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/arm_csystemdependent.c
-
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/dct_arm.c
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/dct_arm.h
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/encodemb_arm.h
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/quantize_arm.c
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/quantize_arm.h
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/variance_arm.c
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/variance_arm.h
-
-#File list for armv5te
-# encoder
-VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
-VP8_CX_SRCS_REMOVE-$(HAVE_ARMV5TE)  += encoder/boolhuff.c
-VP8_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/boolhuff_armv5te$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_armv5$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_mbrow_armv5$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_partitions_armv5$(ASM)
-
-#File list for armv6
-# encoder
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_subtract_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance8x8_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/walsh_v6$(ASM)
-
-#File list for neon
-# encoder
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/fastquantizeb_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/picklpf_arm.c
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/sad8_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/sad16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/shortfdct_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/subtract_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/variance_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_memcpy_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
--- a/vp8/vp8dx.mk
+++ /dev/null
@@ -1,71 +1,0 @@
-##
-##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-##  Use of this source code is governed by a BSD-style license
-##  that can be found in the LICENSE file in the root of the source
-##  tree. An additional intellectual property rights grant can be found
-##  in the file PATENTS.  All contributing project authors may
-##  be found in the AUTHORS file in the root of the source tree.
-##
-
-
-include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
-
-VP8_DX_EXPORTS += exports_dec
-
-VP8_DX_SRCS-yes += $(VP8_COMMON_SRCS-yes)
-VP8_DX_SRCS-no  += $(VP8_COMMON_SRCS-no)
-VP8_DX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes)
-VP8_DX_SRCS_REMOVE-no  += $(VP8_COMMON_SRCS_REMOVE-no)
-
-ifeq ($(ARCH_ARM),yes)
-  include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8dx_arm.mk
-endif
-
-VP8_DX_SRCS-yes += vp8_dx_iface.c
-
-# common
-#define ARM
-#define DISABLE_THREAD
-
-#INCLUDES += algo/vpx_common/vpx_mem/include
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += decoder
-
-
-
-# decoder
-#define ARM
-#define DISABLE_THREAD
-
-#INCLUDES += algo/vpx_common/vpx_mem/include
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += decoder
-
-VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c
-VP8_DX_SRCS-yes += decoder/dboolhuff.c
-VP8_DX_SRCS-yes += decoder/decodemv.c
-VP8_DX_SRCS-yes += decoder/decodframe.c
-VP8_DX_SRCS-yes += decoder/dequantize.c
-VP8_DX_SRCS-yes += decoder/detokenize.c
-VP8_DX_SRCS-yes += decoder/dboolhuff.h
-VP8_DX_SRCS-yes += decoder/decodemv.h
-VP8_DX_SRCS-yes += decoder/dequantize.h
-VP8_DX_SRCS-yes += decoder/detokenize.h
-VP8_DX_SRCS-yes += decoder/onyxd_int.h
-VP8_DX_SRCS-yes += decoder/treereader.h
-VP8_DX_SRCS-yes += decoder/onyxd_if.c
-VP8_DX_SRCS-yes += decoder/idct_blk.c
-
-VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))
-
-VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/x86_dsystemdependent.c
-VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/dequantize_mmx.asm
-VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/idct_blk_mmx.c
-VP8_DX_SRCS-$(HAVE_SSE2) += decoder/x86/idct_blk_sse2.c
--- a/vp8/vp8dx_arm.mk
+++ /dev/null
@@ -1,29 +1,0 @@
-##
-##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-##  Use of this source code is governed by a BSD-style license
-##  that can be found in the LICENSE file in the root of the source
-##  tree. An additional intellectual property rights grant can be found
-##  in the file PATENTS.  All contributing project authors may
-##  be found in the AUTHORS file in the root of the source tree.
-##
-
-
-#VP8_DX_SRCS list is modified according to different platforms.
-
-VP8_DX_SRCS-$(ARCH_ARM)  += decoder/arm/dequantize_arm.c
-
-#File list for armv6
-VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequant_idct_v6$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequantize_v6$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/idct_blk_v6.c
-
-#File list for neon
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequant_idct_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequantizeb_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_blk_neon.c
--- /dev/null
+++ b/vp9/common/alloccommon.c
@@ -1,0 +1,220 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "onyxc_int.h"
+#include "findnearmv.h"
+#include "entropymode.h"
+#include "entropymv.h"
+#include "systemdependent.h"
+
+
+void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base) {
+  int stride = cpi->mode_info_stride;
+  int i;
+
+  // Clear down top border row
+  vpx_memset(mi_base, 0, sizeof(MODE_INFO) * cpi->mode_info_stride);
+
+  // Clear left border column
+  for (i = 1; i < cpi->mb_rows + 1; i++) {
+    vpx_memset(&mi_base[i * stride], 0, sizeof(MODE_INFO));
+  }
+}
+
+void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi) {
+  int i, j;
+
+  // For each in-image mode_info element, set the in-image flag to 1.
+  for (i = 0; i < cpi->mb_rows; i++) {
+    for (j = 0; j < cpi->mb_cols; j++) {
+      mi->mbmi.mb_in_image = 1;
+      mi++;   // Next element in the row
+    }
+
+    mi++;       // Step over border element at start of next row
+  }
+}
+
+void vp9_de_alloc_frame_buffers(VP9_COMMON *oci) {
+  int i;
+
+  for (i = 0; i < NUM_YV12_BUFFERS; i++)
+    vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
+
+  vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
+  vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
+
+  vpx_free(oci->above_context);
+  vpx_free(oci->mip);
+  vpx_free(oci->prev_mip);
+
+  oci->above_context = 0;
+  oci->mip = 0;
+  oci->prev_mip = 0;
+}
+
+int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
+  int i;
+
+  vp9_de_alloc_frame_buffers(oci);
+
+  /* our internal buffers are always multiples of 16 */
+  if ((width & 0xf) != 0)
+    width += 16 - (width & 0xf);
+
+  if ((height & 0xf) != 0)
+    height += 16 - (height & 0xf);
+
+  for (i = 0; i < NUM_YV12_BUFFERS; i++) {
+    oci->fb_idx_ref_cnt[i] = 0;
+    oci->yv12_fb[i].flags = 0;
+    if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0) {
+      vp9_de_alloc_frame_buffers(oci);
+      return 1;
+    }
+  }
+
+  oci->new_fb_idx = 0;
+  oci->lst_fb_idx = 1;
+  oci->gld_fb_idx = 2;
+  oci->alt_fb_idx = 3;
+
+  oci->fb_idx_ref_cnt[0] = 1;
+  oci->fb_idx_ref_cnt[1] = 1;
+  oci->fb_idx_ref_cnt[2] = 1;
+  oci->fb_idx_ref_cnt[3] = 1;
+
+  if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame,   width, 16, VP8BORDERINPIXELS) < 0) {
+    vp9_de_alloc_frame_buffers(oci);
+    return 1;
+  }
+
+  if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0) {
+    vp9_de_alloc_frame_buffers(oci);
+    return 1;
+  }
+
+  oci->mb_rows = height >> 4;
+  oci->mb_cols = width >> 4;
+  oci->MBs = oci->mb_rows * oci->mb_cols;
+  oci->mode_info_stride = oci->mb_cols + 1;
+  oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+
+  if (!oci->mip) {
+    vp9_de_alloc_frame_buffers(oci);
+    return 1;
+  }
+
+  oci->mi = oci->mip + oci->mode_info_stride + 1;
+
+  /* allocate memory for last frame MODE_INFO array */
+
+  oci->prev_mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+
+  if (!oci->prev_mip) {
+    vp9_de_alloc_frame_buffers(oci);
+    return 1;
+  }
+
+  oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1;
+
+  oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
+
+  if (!oci->above_context) {
+    vp9_de_alloc_frame_buffers(oci);
+    return 1;
+  }
+
+  vp9_update_mode_info_border(oci, oci->mip);
+  vp9_update_mode_info_in_image(oci, oci->mi);
+
+  return 0;
+}
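
The width/height adjustment at the top of vp9_alloc_frame_buffers() is the usual round-up to a multiple of 16; an equivalent branch-free form, shown only as illustration:

static int align_to_16(int v) {
  return (v + 15) & ~15;  /* align_to_16(17) == 32, align_to_16(32) == 32 */
}

/* mb_rows and mb_cols then fall out of the aligned dimensions shifted
 * right by 4, exactly as the height >> 4 above computes. */
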
+void vp9_setup_version(VP9_COMMON *cm) {
+  if (cm->version & 0x4) {
+    if (!CONFIG_EXPERIMENTAL)
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Bitstream was created by an experimental "
+                         "encoder");
+    cm->experimental = 1;
+  }
+
+  switch (cm->version & 0x3) {
+    case 0:
+      cm->no_lpf = 0;
+      cm->filter_type = NORMAL_LOOPFILTER;
+      cm->use_bilinear_mc_filter = 0;
+      cm->full_pixel = 0;
+      break;
+    case 1:
+      cm->no_lpf = 0;
+      cm->filter_type = SIMPLE_LOOPFILTER;
+      cm->use_bilinear_mc_filter = 1;
+      cm->full_pixel = 0;
+      break;
+    case 2:
+    case 3:
+      cm->no_lpf = 1;
+      cm->filter_type = NORMAL_LOOPFILTER;
+      cm->use_bilinear_mc_filter = 1;
+      cm->full_pixel = 0;
+      break;
+      // Full pel only code deprecated in experimental code base
+      // case 3:
+      //    cm->no_lpf = 1;
+      //    cm->filter_type = SIMPLE_LOOPFILTER;
+      //    cm->use_bilinear_mc_filter = 1;
+      //    cm->full_pixel = 1;
+      //    break;
+  }
+}
+void vp9_create_common(VP9_COMMON *oci) {
+  vp9_machine_specific_config(oci);
+
+  vp9_init_mbmode_probs(oci);
+
+  vp9_default_bmode_probs(oci->fc.bmode_prob);
+
+  oci->txfm_mode = ONLY_4X4;
+  oci->mb_no_coeff_skip = 1;
+  oci->comp_pred_mode = HYBRID_PREDICTION;
+  oci->no_lpf = 0;
+  oci->filter_type = NORMAL_LOOPFILTER;
+  oci->use_bilinear_mc_filter = 0;
+  oci->full_pixel = 0;
+  oci->clr_type = REG_YUV;
+  oci->clamp_type = RECON_CLAMP_REQUIRED;
+
+  /* Initialise reference frame sign bias structure to defaults */
+  vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+
+  /* Default disable buffer to buffer copying */
+  oci->copy_buffer_to_gf = 0;
+  oci->copy_buffer_to_arf = 0;
+  oci->kf_ymode_probs_update = 0;
+}
+
+void vp9_remove_common(VP9_COMMON *oci) {
+  vp9_de_alloc_frame_buffers(oci);
+}
+
+void vp9_initialize_common() {
+  vp9_coef_tree_initialize();
+
+  vp9_entropy_mode_init();
+
+  vp9_entropy_mv_init();
+}
--- /dev/null
+++ b/vp9/common/alloccommon.h
@@ -1,0 +1,26 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ALLOCCOMMON_H
+#define __INC_ALLOCCOMMON_H
+
+#include "onyxc_int.h"
+
+void vp9_create_common(VP9_COMMON *oci);
+void vp9_remove_common(VP9_COMMON *oci);
+void vp9_de_alloc_frame_buffers(VP9_COMMON *oci);
+int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height);
+void vp9_setup_version(VP9_COMMON *oci);
+
+void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base);
+void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi);
+
+#endif
--- /dev/null
+++ b/vp9/common/arm/arm_systemdependent.c
@@ -1,0 +1,92 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/arm.h"
+#include "vp9/common/pragmas.h"
+#include "vp9/common/subpixel.h"
+#include "vp9/common/loopfilter.h"
+#include "vp9/common/recon.h"
+#include "vp9/common/idct.h"
+#include "vp9/common/onyxc_int.h"
+
+void vp9_arch_arm_common_init(VP9_COMMON *ctx) {
+#if CONFIG_RUNTIME_CPU_DETECT
+  VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
+  int flags = arm_cpu_caps();
+  rtcd->flags = flags;
+
+  /* Override default functions with fastest ones for this CPU. */
+#if HAVE_ARMV5TE
+  if (flags & HAS_EDSP) {
+  }
+#endif
+
+// The commented functions need to be re-written for vpx.
+#if HAVE_ARMV6
+  if (flags & HAS_MEDIA) {
+    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_armv6;
+    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_armv6;
+    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_armv6;
+    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict_armv6;
+
+    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_armv6;
+    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_armv6;
+    rtcd->subpix.bilinear8x4   = vp9_bilinear_predict8x4_armv6;
+    rtcd->subpix.bilinear4x4   = vp9_bilinear_predict4x4_armv6;
+
+    // rtcd->idct.idct1        = vp9_short_idct4x4llm_1_v6;
+    // rtcd->idct.idct16       = vp9_short_idct4x4llm_v6_dual;
+    // rtcd->idct.iwalsh1      = vp9_short_inv_walsh4x4_1_v6;
+    // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_v6;
+
+    rtcd->recon.copy16x16   = vp9_copy_mem16x16_v6;
+    rtcd->recon.copy8x8     = vp9_copy_mem8x8_v6;
+    rtcd->recon.copy8x4     = vp9_copy_mem8x4_v6;
+    rtcd->recon.recon       = vp9_recon_b_armv6;
+    rtcd->recon.recon2      = vp9_recon2b_armv6;
+    rtcd->recon.recon4      = vp9_recon4b_armv6;
+  }
+#endif
+
+#if HAVE_ARMV7
+  if (flags & HAS_NEON) {
+    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_neon;
+    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_neon;
+    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_neon;
+    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict_neon;
+
+    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_neon;
+    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_neon;
+    rtcd->subpix.bilinear8x4   = vp9_bilinear_predict8x4_neon;
+    rtcd->subpix.bilinear4x4   = vp9_bilinear_predict4x4_neon;
+
+    // rtcd->idct.idct1        = vp9_short_idct4x4llm_1_neon;
+    // rtcd->idct.idct16       = vp9_short_idct4x4llm_neon;
+    // rtcd->idct.iwalsh1      = vp9_short_inv_walsh4x4_1_neon;
+    // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_neon;
+
+    rtcd->recon.copy16x16   = vp9_copy_mem16x16_neon;
+    rtcd->recon.copy8x8     = vp9_copy_mem8x8_neon;
+    rtcd->recon.copy8x4     = vp9_copy_mem8x4_neon;
+    rtcd->recon.recon       = vp9_recon_b_neon;
+    rtcd->recon.recon2      = vp9_recon2b_neon;
+    rtcd->recon.recon4      = vp9_recon4b_neon;
+    rtcd->recon.recon_mb    = vp9_recon_mb_neon;
+    rtcd->recon.build_intra_predictors_mby =
+      vp9_build_intra_predictors_mby_neon;
+    rtcd->recon.build_intra_predictors_mby_s =
+      vp9_build_intra_predictors_mby_s_neon;
+  }
+#endif
+
+#endif
+}
--- /dev/null
+++ b/vp9/common/arm/armv6/bilinearfilter_v6.asm
@@ -1,0 +1,237 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_filter_block2d_bil_first_pass_armv6|
+    EXPORT  |vp9_filter_block2d_bil_second_pass_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;-------------------------------------
+; r0    unsigned char  *src_ptr,
+; r1    unsigned short *dst_ptr,
+; r2    unsigned int    src_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp9_filter
+;-------------------------------------
+; The output is stored transposed in the output array to make the second-pass filtering easy.
+|vp9_filter_block2d_bil_first_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp9_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    mov     r12, r3                         ; outer-loop counter
+
+    add     r7, r2, r4                      ; preload next row
+    pld     [r0, r7]
+
+    sub     r2, r2, r4                      ; src increment for height loop
+
+    ldr     r5, [r11]                       ; load up filter coefficients
+
+    mov     r3, r3, lsl #1                  ; height*2
+    add     r3, r3, #2                      ; plus 2 to make the output stride 4-byte aligned, since height is actually (height+1)
+
+    mov     r11, r1                         ; save dst_ptr for each row
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_1st_filter
+
+|bil_height_loop_1st_v6|
+    ldrb    r6, [r0]                        ; load source data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    mov     lr, r4, lsr #2                  ; 4-in-parallel loop counter
+
+|bil_width_loop_1st_v6|
+    ldrb    r9, [r0, #3]
+    ldrb    r10, [r0, #4]
+
+    pkhbt   r6, r6, r7, lsl #16             ; src[1] | src[0]
+    pkhbt   r7, r7, r8, lsl #16             ; src[2] | src[1]
+
+    smuad   r6, r6, r5                      ; apply the filter
+    pkhbt   r8, r8, r9, lsl #16             ; src[3] | src[2]
+    smuad   r7, r7, r5
+    pkhbt   r9, r9, r10, lsl #16            ; src[4] | src[3]
+
+    smuad   r8, r8, r5
+    smuad   r9, r9, r5
+
+    add     r0, r0, #4
+    subs    lr, lr, #1
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #16, r6, asr #7
+    usat    r7, #16, r7, asr #7
+
+    strh    r6, [r1], r3                    ; result is transposed and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strh    r7, [r1], r3
+    add     r9, r9, #0x40
+    usat    r8, #16, r8, asr #7
+    usat    r9, #16, r9, asr #7
+
+    strh    r8, [r1], r3                    ; result is transposed and stored
+
+    ldrneb  r6, [r0]                        ; load source data
+    strh    r9, [r1], r3
+
+    ldrneb  r7, [r0, #1]
+    ldrneb  r8, [r0, #2]
+
+    bne     bil_width_loop_1st_v6
+
+    add     r0, r0, r2                      ; move to next input row
+    subs    r12, r12, #1
+
+    add     r9, r2, r4, lsl #1              ; adding back block width
+    pld     [r0, r9]                        ; preload next row
+
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_1st_v6
+
+    ldmia   sp!, {r4 - r11, pc}
+
+|bil_null_1st_filter|
+|bil_height_loop_null_1st|
+    mov     lr, r4, lsr #2                  ; loop counter
+
+|bil_width_loop_null_1st|
+    ldrb    r6, [r0]                        ; load data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    ldrb    r9, [r0, #3]
+
+    strh    r6, [r1], r3                    ; store it to immediate buffer
+    add     r0, r0, #4
+    strh    r7, [r1], r3
+    subs    lr, lr, #1
+    strh    r8, [r1], r3
+    strh    r9, [r1], r3
+
+    bne     bil_width_loop_null_1st
+
+    subs    r12, r12, #1
+    add     r0, r0, r2                      ; move to next input line
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_null_1st
+
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP  ; |vp9_filter_block2d_bil_first_pass_armv6|
+
+
+;---------------------------------
+; r0    unsigned short *src_ptr,
+; r1    unsigned char  *dst_ptr,
+; r2    int             dst_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp9_filter
+;---------------------------------
+|vp9_filter_block2d_bil_second_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp9_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    ldr     r5, [r11]                       ; load up filter coefficients
+    mov     r12, r4                         ; outer-loop counter = width, since we work on a transposed data matrix
+    mov     r11, r1
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_2nd_filter
+
+|bil_height_loop_2nd|
+    ldr     r6, [r0]                        ; load the data
+    ldr     r8, [r0, #4]
+    ldrh    r10, [r0, #8]
+    mov     lr, r3, lsr #2                  ; loop counter
+
+|bil_width_loop_2nd|
+    pkhtb   r7, r6, r8                      ; src[1] | src[2]
+    pkhtb   r9, r8, r10                     ; src[3] | src[4]
+
+    smuad   r6, r6, r5                      ; apply filter
+    smuad   r8, r8, r5                      ; apply filter
+
+    subs    lr, lr, #1
+
+    smuadx  r7, r7, r5                      ; apply filter
+    smuadx  r9, r9, r5                      ; apply filter
+
+    add     r0, r0, #8
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #8, r6, asr #7
+    usat    r7, #8, r7, asr #7
+    strb    r6, [r1], r2                    ; the result is transposed back and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strb    r7, [r1], r2
+    add     r9, r9, #0x40
+    usat    r8, #8, r8, asr #7
+    usat    r9, #8, r9, asr #7
+    strb    r8, [r1], r2                    ; the result is transposed back and stored
+
+    ldrne   r6, [r0]                        ; load data
+    strb    r9, [r1], r2
+    ldrne   r8, [r0, #4]
+    ldrneh  r10, [r0, #8]
+
+    bne     bil_width_loop_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4                      ; update src for next row
+    add     r11, r11, #1
+    mov     r1, r11
+
+    bne     bil_height_loop_2nd
+    ldmia   sp!, {r4 - r11, pc}
+
+|bil_null_2nd_filter|
+|bil_height_loop_null_2nd|
+    mov     lr, r3, lsr #2
+
+|bil_width_loop_null_2nd|
+    ldr     r6, [r0], #4                    ; load data
+    subs    lr, lr, #1
+    ldr     r8, [r0], #4
+
+    strb    r6, [r1], r2                    ; store data
+    mov     r7, r6, lsr #16
+    strb    r7, [r1], r2
+    mov     r9, r8, lsr #16
+    strb    r8, [r1], r2
+    strb    r9, [r1], r2
+
+    bne     bil_width_loop_null_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4
+    add     r11, r11, #1
+    mov     r1, r11
+
+    bne     bil_height_loop_null_2nd
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP  ; |vp9_filter_block2d_bil_second_pass_armv6|
+
+    END
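
For orientation, the two routines above implement separable bilinear interpolation
as two passes: the first pass filters horizontally into a 16-bit intermediate
buffer stored transposed, and the second pass runs the same inner loop over that
buffer (so it effectively filters vertically) and saturates to 8 bits, which
transposes the block back. A minimal C sketch of the arithmetic, assuming 7-bit
taps that sum to 128 (the coef==128 test above short-circuits the identity
filter); function and parameter names here are illustrative, not from the source:

    #include <stdint.h>

    /* First pass: filters (out_height + 1) rows so the vertical pass has
       one extra sample per column; results are stored transposed. */
    static void bil_first_pass_c(const uint8_t *src, uint16_t *dst,
                                 unsigned src_pitch, unsigned out_height,
                                 unsigned width, const int16_t *filter) {
        unsigned rows = out_height + 1;
        for (unsigned r = 0; r < rows; r++) {
            for (unsigned c = 0; c < width; c++) {
                int sum = src[c] * filter[0] + src[c + 1] * filter[1];
                dst[c * rows + r] = (uint16_t)((sum + 0x40) >> 7); /* round, >>7 */
            }
            src += src_pitch;
        }
    }

    /* Second pass: walking the transposed rows walks output columns;
       taps are non-negative, so only the upper clamp is needed. */
    static void bil_second_pass_c(const uint16_t *src, uint8_t *dst,
                                  int dst_pitch, unsigned out_height,
                                  unsigned width, const int16_t *filter) {
        unsigned rows = out_height + 1;
        for (unsigned c = 0; c < width; c++) {
            for (unsigned r = 0; r < out_height; r++) {
                int v = (src[r] * filter[0] + src[r + 1] * filter[1] + 0x40) >> 7;
                dst[r * dst_pitch + c] = (uint8_t)(v > 255 ? 255 : v);
            }
            src += rows;    /* next transposed column */
        }
    }
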
--- /dev/null
+++ b/vp9/common/arm/armv6/copymem16x16_v6.asm
@@ -1,0 +1,186 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_copy_mem16x16_v6|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void vp9_copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem16x16_v6| PROC
+    stmdb       sp!, {r4 - r7}
+    ;push   {r4-r7}
+
+    ;preload
+    pld     [r0, #31]                ; preload for next 16x16 block
+
+    ands    r4, r0, #15
+    beq     copy_mem16x16_fast
+
+    ands    r4, r0, #7
+    beq     copy_mem16x16_8
+
+    ands    r4, r0, #3
+    beq     copy_mem16x16_4
+
+    ;copy one byte each time
+    ldrb    r4, [r0]
+    ldrb    r5, [r0, #1]
+    ldrb    r6, [r0, #2]
+    ldrb    r7, [r0, #3]
+
+    mov     r12, #16
+
+copy_mem16x16_1_loop
+    strb    r4, [r2]
+    strb    r5, [r2, #1]
+    strb    r6, [r2, #2]
+    strb    r7, [r2, #3]
+
+    ldrb    r4, [r0, #4]
+    ldrb    r5, [r0, #5]
+    ldrb    r6, [r0, #6]
+    ldrb    r7, [r0, #7]
+
+    subs    r12, r12, #1
+
+    strb    r4, [r2, #4]
+    strb    r5, [r2, #5]
+    strb    r6, [r2, #6]
+    strb    r7, [r2, #7]
+
+    ldrb    r4, [r0, #8]
+    ldrb    r5, [r0, #9]
+    ldrb    r6, [r0, #10]
+    ldrb    r7, [r0, #11]
+
+    strb    r4, [r2, #8]
+    strb    r5, [r2, #9]
+    strb    r6, [r2, #10]
+    strb    r7, [r2, #11]
+
+    ldrb    r4, [r0, #12]
+    ldrb    r5, [r0, #13]
+    ldrb    r6, [r0, #14]
+    ldrb    r7, [r0, #15]
+
+    add     r0, r0, r1
+
+    strb    r4, [r2, #12]
+    strb    r5, [r2, #13]
+    strb    r6, [r2, #14]
+    strb    r7, [r2, #15]
+
+    add     r2, r2, r3
+
+    ldrneb  r4, [r0]
+    ldrneb  r5, [r0, #1]
+    ldrneb  r6, [r0, #2]
+    ldrneb  r7, [r0, #3]
+
+    pld     [r0, #31]               ; preload for next 16x16 block
+
+    bne     copy_mem16x16_1_loop
+
+    ldmia       sp!, {r4 - r7}
+    ;pop        {r4-r7}
+    mov     pc, lr
+
+;copy 4 bytes each time
+copy_mem16x16_4
+    ldr     r4, [r0]
+    ldr     r5, [r0, #4]
+    ldr     r6, [r0, #8]
+    ldr     r7, [r0, #12]
+
+    mov     r12, #16
+
+copy_mem16x16_4_loop
+    subs    r12, r12, #1
+    add     r0, r0, r1
+
+    str     r4, [r2]
+    str     r5, [r2, #4]
+    str     r6, [r2, #8]
+    str     r7, [r2, #12]
+
+    add     r2, r2, r3
+
+    ldrne   r4, [r0]
+    ldrne   r5, [r0, #4]
+    ldrne   r6, [r0, #8]
+    ldrne   r7, [r0, #12]
+
+    pld     [r0, #31]               ; preload for next 16x16 block
+
+    bne     copy_mem16x16_4_loop
+
+    ldmia       sp!, {r4 - r7}
+    ;pop        {r4-r7}
+    mov     pc, lr
+
+;copy 8 bytes each time
+copy_mem16x16_8
+    sub     r1, r1, #16
+    sub     r3, r3, #16
+
+    mov     r12, #16
+
+copy_mem16x16_8_loop
+    ldmia   r0!, {r4-r5}
+    ;ldm        r0, {r4-r5}
+    ldmia   r0!, {r6-r7}
+
+    add     r0, r0, r1
+
+    stmia   r2!, {r4-r5}
+    subs    r12, r12, #1
+    ;stm        r2, {r4-r5}
+    stmia   r2!, {r6-r7}
+
+    add     r2, r2, r3
+
+    pld     [r0, #31]               ; preload for next 16x16 block
+    bne     copy_mem16x16_8_loop
+
+    ldmia       sp!, {r4 - r7}
+    ;pop        {r4-r7}
+    mov     pc, lr
+
+;copy 16 bytes each time
+copy_mem16x16_fast
+    ;sub        r1, r1, #16
+    ;sub        r3, r3, #16
+
+    mov     r12, #16
+
+copy_mem16x16_fast_loop
+    ldmia   r0, {r4-r7}
+    ;ldm        r0, {r4-r7}
+    add     r0, r0, r1
+
+    subs    r12, r12, #1
+    stmia   r2, {r4-r7}
+    ;stm        r2, {r4-r7}
+    add     r2, r2, r3
+
+    pld     [r0, #31]               ; preload for next 16x16 block
+    bne     copy_mem16x16_fast_loop
+
+    ldmia       sp!, {r4 - r7}
+    ;pop        {r4-r7}
+    mov     pc, lr
+
+    ENDP  ; |vp9_copy_mem16x16_v6|
+
+    END
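
Functionally this (like the 8x4 and 8x8 variants that follow) is just a strided
block copy; the alignment tests at entry exist because ARMv6 word and
multi-register loads want naturally aligned addresses, so the code picks the
widest transfer the source alignment allows (16, 8, 4 or 1 byte per access) and
appears to assume the destination shares that alignment. A hedged C equivalent
with an illustrative name:

    #include <stdint.h>
    #include <string.h>

    /* What vp9_copy_mem16x16_v6 computes; the asm unrolls the memcpy into
       byte/word/multi-word transfers chosen from (src & 15/7/3). */
    static void copy_mem16x16_c(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride) {
        for (int r = 0; r < 16; r++) {
            memcpy(dst, src, 16);
            src += src_stride;
            dst += dst_stride;
        }
    }
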
--- /dev/null
+++ b/vp9/common/arm/armv6/copymem8x4_v6.asm
@@ -1,0 +1,128 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_copy_mem8x4_v6|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void vp9_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem8x4_v6| PROC
+    ;push   {r4-r5}
+    stmdb  sp!, {r4-r5}
+
+    ;preload
+    pld     [r0]
+    pld     [r0, r1]
+    pld     [r0, r1, lsl #1]
+
+    ands    r4, r0, #7
+    beq     copy_mem8x4_fast
+
+    ands    r4, r0, #3
+    beq     copy_mem8x4_4
+
+    ;copy 1 byte each time
+    ldrb    r4, [r0]
+    ldrb    r5, [r0, #1]
+
+    mov     r12, #4
+
+copy_mem8x4_1_loop
+    strb    r4, [r2]
+    strb    r5, [r2, #1]
+
+    ldrb    r4, [r0, #2]
+    ldrb    r5, [r0, #3]
+
+    subs    r12, r12, #1
+
+    strb    r4, [r2, #2]
+    strb    r5, [r2, #3]
+
+    ldrb    r4, [r0, #4]
+    ldrb    r5, [r0, #5]
+
+    strb    r4, [r2, #4]
+    strb    r5, [r2, #5]
+
+    ldrb    r4, [r0, #6]
+    ldrb    r5, [r0, #7]
+
+    add     r0, r0, r1
+
+    strb    r4, [r2, #6]
+    strb    r5, [r2, #7]
+
+    add     r2, r2, r3
+
+    ldrneb  r4, [r0]
+    ldrneb  r5, [r0, #1]
+
+    bne     copy_mem8x4_1_loop
+
+    ldmia       sp!, {r4 - r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+;copy 4 bytes each time
+copy_mem8x4_4
+    ldr     r4, [r0]
+    ldr     r5, [r0, #4]
+
+    mov     r12, #4
+
+copy_mem8x4_4_loop
+    subs    r12, r12, #1
+    add     r0, r0, r1
+
+    str     r4, [r2]
+    str     r5, [r2, #4]
+
+    add     r2, r2, r3
+
+    ldrne   r4, [r0]
+    ldrne   r5, [r0, #4]
+
+    bne     copy_mem8x4_4_loop
+
+    ldmia  sp!, {r4-r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+;copy 8 bytes each time
+copy_mem8x4_fast
+    ;sub        r1, r1, #8
+    ;sub        r3, r3, #8
+
+    mov     r12, #4
+
+copy_mem8x4_fast_loop
+    ldmia   r0, {r4-r5}
+    ;ldm        r0, {r4-r5}
+    add     r0, r0, r1
+
+    subs    r12, r12, #1
+    stmia   r2, {r4-r5}
+    ;stm        r2, {r4-r5}
+    add     r2, r2, r3
+
+    bne     copy_mem8x4_fast_loop
+
+    ldmia  sp!, {r4-r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+    ENDP  ; |vp9_copy_mem8x4_v6|
+
+    END
--- /dev/null
+++ b/vp9/common/arm/armv6/copymem8x8_v6.asm
@@ -1,0 +1,128 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_copy_mem8x8_v6|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void vp9_copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem8x8_v6| PROC
+    ;push   {r4-r5}
+    stmdb  sp!, {r4-r5}
+
+    ;preload
+    pld     [r0]
+    pld     [r0, r1]
+    pld     [r0, r1, lsl #1]
+
+    ands    r4, r0, #7
+    beq     copy_mem8x8_fast
+
+    ands    r4, r0, #3
+    beq     copy_mem8x8_4
+
+    ;copy 1 byte each time
+    ldrb    r4, [r0]
+    ldrb    r5, [r0, #1]
+
+    mov     r12, #8
+
+copy_mem8x8_1_loop
+    strb    r4, [r2]
+    strb    r5, [r2, #1]
+
+    ldrb    r4, [r0, #2]
+    ldrb    r5, [r0, #3]
+
+    subs    r12, r12, #1
+
+    strb    r4, [r2, #2]
+    strb    r5, [r2, #3]
+
+    ldrb    r4, [r0, #4]
+    ldrb    r5, [r0, #5]
+
+    strb    r4, [r2, #4]
+    strb    r5, [r2, #5]
+
+    ldrb    r4, [r0, #6]
+    ldrb    r5, [r0, #7]
+
+    add     r0, r0, r1
+
+    strb    r4, [r2, #6]
+    strb    r5, [r2, #7]
+
+    add     r2, r2, r3
+
+    ldrneb  r4, [r0]
+    ldrneb  r5, [r0, #1]
+
+    bne     copy_mem8x8_1_loop
+
+    ldmia       sp!, {r4 - r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+;copy 4 bytes each time
+copy_mem8x8_4
+    ldr     r4, [r0]
+    ldr     r5, [r0, #4]
+
+    mov     r12, #8
+
+copy_mem8x8_4_loop
+    subs    r12, r12, #1
+    add     r0, r0, r1
+
+    str     r4, [r2]
+    str     r5, [r2, #4]
+
+    add     r2, r2, r3
+
+    ldrne   r4, [r0]
+    ldrne   r5, [r0, #4]
+
+    bne     copy_mem8x8_4_loop
+
+    ldmia       sp!, {r4 - r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+;copy 8 bytes each time
+copy_mem8x8_fast
+    ;sub        r1, r1, #8
+    ;sub        r3, r3, #8
+
+    mov     r12, #8
+
+copy_mem8x8_fast_loop
+    ldmia   r0, {r4-r5}
+    ;ldm        r0, {r4-r5}
+    add     r0, r0, r1
+
+    subs    r12, r12, #1
+    stmia   r2, {r4-r5}
+    ;stm        r2, {r4-r5}
+    add     r2, r2, r3
+
+    bne     copy_mem8x8_fast_loop
+
+    ldmia  sp!, {r4-r5}
+    ;pop        {r4-r5}
+    mov     pc, lr
+
+    ENDP  ; |vp9_copy_mem8x8_v6|
+
+    END
--- /dev/null
+++ b/vp9/common/arm/armv6/dc_only_idct_add_v6.asm
@@ -1,0 +1,67 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+    EXPORT  |vp8_dc_only_idct_add_v6|
+
+    AREA    |.text|, CODE, READONLY
+
+;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
+;                             unsigned char *dst_ptr, int pitch, int stride)
+; r0  input_dc
+; r1  pred_ptr
+; r2  dst_ptr
+; r3  pitch
+; sp  stride
+
+|vp8_dc_only_idct_add_v6| PROC
+    stmdb       sp!, {r4 - r7, lr}
+
+    add         r0, r0, #4                ; input_dc += 4
+    ldr         r12, c0x0000FFFF
+    ldr         r4, [r1], r3
+    ldr         r6, [r1], r3
+    and         r0, r12, r0, asr #3       ; a1 = ((input_dc + 4) >> 3) & 0xFFFF
+    ldr         lr, [sp, #20]
+    orr         r0, r0, r0, lsl #16       ; a1 | a1
+
+    uxtab16     r5, r0, r4                ; a1+2 | a1+0
+    uxtab16     r4, r0, r4, ror #8        ; a1+3 | a1+1
+    uxtab16     r7, r0, r6
+    uxtab16     r6, r0, r6, ror #8
+    usat16      r5, #8, r5
+    usat16      r4, #8, r4
+    usat16      r7, #8, r7
+    usat16      r6, #8, r6
+    orr         r5, r5, r4, lsl #8
+    orr         r7, r7, r6, lsl #8
+    ldr         r4, [r1], r3
+    ldr         r6, [r1]
+    str         r5, [r2], lr
+    str         r7, [r2], lr
+
+    uxtab16     r5, r0, r4
+    uxtab16     r4, r0, r4, ror #8
+    uxtab16     r7, r0, r6
+    uxtab16     r6, r0, r6, ror #8
+    usat16      r5, #8, r5
+    usat16      r4, #8, r4
+    usat16      r7, #8, r7
+    usat16      r6, #8, r6
+    orr         r5, r5, r4, lsl #8
+    orr         r7, r7, r6, lsl #8
+    str         r5, [r2], lr
+    str         r7, [r2]
+
+    ldmia       sp!, {r4 - r7, pc}
+
+    ENDP  ; |vp8_dc_only_idct_add_v6|
+
+; Constant Pool
+c0x0000FFFF DCD 0x0000FFFF
+    END
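
In scalar terms the routine above computes a1 = (input_dc + 4) >> 3 once and
adds it, with 8-bit saturation (the uxtab16/usat16 pairs, two pixels at a time),
to every predictor pixel of the 4x4 block. A C sketch of that arithmetic:

    #include <stdint.h>

    /* Scalar equivalent of vp8_dc_only_idct_add_v6: one rounded, shifted
       DC term added to each 4x4 predictor pixel, saturated to 8 bits. */
    static void dc_only_idct_add_c(short input_dc, const uint8_t *pred_ptr,
                                   uint8_t *dst_ptr, int pitch, int stride) {
        int a1 = (input_dc + 4) >> 3;
        for (int r = 0; r < 4; r++) {
            for (int c = 0; c < 4; c++) {
                int v = pred_ptr[c] + a1;
                dst_ptr[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
            }
            pred_ptr += pitch;
            dst_ptr += stride;
        }
    }
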
--- /dev/null
+++ b/vp9/common/arm/armv6/filter_v6.asm
@@ -1,0 +1,624 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_filter_block2d_first_pass_armv6|
+    EXPORT  |vp9_filter_block2d_first_pass_16x16_armv6|
+    EXPORT  |vp9_filter_block2d_first_pass_8x8_armv6|
+    EXPORT  |vp9_filter_block2d_second_pass_armv6|
+    EXPORT  |vp9_filter4_block2d_second_pass_armv6|
+    EXPORT  |vp9_filter_block2d_first_pass_only_armv6|
+    EXPORT  |vp9_filter_block2d_second_pass_only_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+;-------------------------------------
+; r0    unsigned char *src_ptr
+; r1    short         *output_ptr
+; r2    unsigned int src_pixels_per_line
+; r3    unsigned int output_width
+; stack unsigned int output_height
+; stack const short *vp9_filter
+;-------------------------------------
+; Filter the input and put the result in the output array. Apply the 6-tap FIR filter
+; with the output being a 2-byte value and the input being a 1-byte value.
+|vp9_filter_block2d_first_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp9_filter address
+    ldr     r7, [sp, #36]                   ; output height
+
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
+    add     r12, r3, #16                    ; square off the output
+    sub     sp, sp, #4
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r1, [sp]                        ; push destination to stack
+    mov     r7, r7, lsl #16                 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_6|
+    ldrb    r8, [r0, #-2]                   ; load source data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+    orr     r7, r7, r3, lsr #2              ; construct loop counter
+
+|width_loop_1st_6|
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+    smuad   lr, lr, r4                      ; apply the filter
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    sub     r7, r7, #1
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r11, r10, r6, r8
+
+    ands    r10, r7, #0xff                  ; test loop counter
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r11, r11, #0x40
+    ldrneb  r9, [r0, #-1]
+    usat    r11, #8, r11, asr #7
+
+    strh    lr, [r1], r12                   ; result is transposed and stored, which
+                                            ; will make second pass filtering easier.
+    ldrneb  r10, [r0], #2
+    strh    r11, [r1], r12
+
+    bne     width_loop_1st_6
+
+    ldr     r1, [sp]                        ; load and update dst address
+    subs    r7, r7, #0x10000
+    add     r0, r0, r2                      ; move to next input line
+
+    add     r1, r1, #2                      ; move over to next column
+    str     r1, [sp]
+
+    bne     height_loop_1st_6
+
+    add     sp, sp, #4
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+; --------------------------
+; 16x16 version
+; -----------------------------
+|vp9_filter_block2d_first_pass_16x16_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp9_filter address
+    ldr     r7, [sp, #36]                   ; output height
+
+    add     r4, r2, #18                     ; preload next row
+    pld     [r0, r4]
+
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
+    add     r12, r3, #16                    ; square off the output
+    sub     sp, sp, #4
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r1, [sp]                        ; push destination to stack
+    mov     r7, r7, lsl #16                 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_16_6|
+    ldrb    r8, [r0, #-2]                   ; load source data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+    orr     r7, r7, r3, lsr #2              ; construct loop counter
+
+|width_loop_1st_16_6|
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+    smuad   lr, lr, r4                      ; apply the filter
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    sub     r7, r7, #1
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r11, r10, r6, r8
+
+    ands    r10, r7, #0xff                  ; test loop counter
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r11, r11, #0x40
+    ldrneb  r9, [r0, #-1]
+    usat    r11, #8, r11, asr #7
+
+    strh    lr, [r1], r12                   ; result is transposed and stored, which
+                                            ; will make second pass filtering easier.
+    ldrneb  r10, [r0], #2
+    strh    r11, [r1], r12
+
+    bne     width_loop_1st_16_6
+
+    ldr     r1, [sp]                        ; load and update dst address
+    subs    r7, r7, #0x10000
+    add     r0, r0, r2                      ; move to next input line
+
+    add     r11, r2, #34                    ; adding back block width (=16)
+    pld     [r0, r11]                       ; preload next row
+
+    add     r1, r1, #2                      ; move over to next column
+    str     r1, [sp]
+
+    bne     height_loop_1st_16_6
+
+    add     sp, sp, #4
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+; --------------------------
+; 8x8 version
+; -----------------------------
+|vp9_filter_block2d_first_pass_8x8_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp9_filter address
+    ldr     r7, [sp, #36]                   ; output height
+
+    add     r4, r2, #10                     ; preload next row
+    pld     [r0, r4]
+
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
+    add     r12, r3, #16                    ; square off the output
+    sub     sp, sp, #4
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r1, [sp]                        ; push destination to stack
+    mov     r7, r7, lsl #16                 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_8_6|
+    ldrb    r8, [r0, #-2]                   ; load source data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+    orr     r7, r7, r3, lsr #2              ; construct loop counter
+
+|width_loop_1st_8_6|
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+    smuad   lr, lr, r4                      ; apply the filter
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    sub     r7, r7, #1
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r11, r10, r6, r8
+
+    ands    r10, r7, #0xff                  ; test loop counter
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r11, r11, #0x40
+    ldrneb  r9, [r0, #-1]
+    usat    r11, #8, r11, asr #7
+
+    strh    lr, [r1], r12                   ; result is transposed and stored, which
+                                            ; will make second pass filtering easier.
+    ldrneb  r10, [r0], #2
+    strh    r11, [r1], r12
+
+    bne     width_loop_1st_8_6
+
+    ldr     r1, [sp]                        ; load and update dst address
+    subs    r7, r7, #0x10000
+    add     r0, r0, r2                      ; move to next input line
+
+    add     r11, r2, #18                    ; adding back block width (=8)
+    pld     [r0, r11]                       ; preload next row
+
+    add     r1, r1, #2                      ; move over to next column
+    str     r1, [sp]
+
+    bne     height_loop_1st_8_6
+
+    add     sp, sp, #4
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+;---------------------------------
+; r0    short         *src_ptr,
+; r1    unsigned char *output_ptr,
+; r2    unsigned int output_pitch,
+; r3    unsigned int cnt,
+; stack const short *vp9_filter
+;---------------------------------
+|vp9_filter_block2d_second_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #36]                  ; vp9_filter address
+    sub     sp, sp, #4
+    mov     r7, r3, lsl #16                 ; height is top part of counter
+    str     r1, [sp]                        ; push destination to stack
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    pkhbt   r12, r5, r4                     ; pack the filter differently
+    pkhbt   r11, r6, r5
+
+    sub     r0, r0, #4                      ; offset input buffer
+
+|height_loop_2nd|
+    ldr     r8, [r0]                        ; load the data
+    ldr     r9, [r0, #4]
+    orr     r7, r7, r3, lsr #1              ; loop counter
+
+|width_loop_2nd|
+    smuad   lr, r4, r8                      ; apply filter
+    sub     r7, r7, #1
+    smulbt  r8, r4, r8
+
+    ldr     r10, [r0, #8]
+
+    smlad   lr, r5, r9, lr
+    smladx  r8, r12, r9, r8
+
+    ldrh    r9, [r0, #12]
+
+    smlad   lr, r6, r10, lr
+    smladx  r8, r11, r10, r8
+
+    add     r0, r0, #4
+    smlatb  r10, r6, r9, r8
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ands    r8, r7, #0xff
+    usat    lr, #8, lr, asr #7
+    add     r10, r10, #0x40
+    strb    lr, [r1], r2                    ; the result is transposed back and stored
+    usat    r10, #8, r10, asr #7
+
+    ldrne   r8, [r0]                        ; load data for next loop
+    ldrne   r9, [r0, #4]
+    strb    r10, [r1], r2
+
+    bne     width_loop_2nd
+
+    ldr     r1, [sp]                        ; update dst for next loop
+    subs    r7, r7, #0x10000
+    add     r0, r0, #16                     ; update src for next loop
+    add     r1, r1, #1
+    str     r1, [sp]
+
+    bne     height_loop_2nd
+
+    add     sp, sp, #4
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+;---------------------------------
+; r0    short         *src_ptr,
+; r1    unsigned char *output_ptr,
+; r2    unsigned int output_pitch,
+; r3    unsigned int cnt,
+; stack const short *vp9_filter
+;---------------------------------
+|vp9_filter4_block2d_second_pass_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #36]                  ; vp9_filter address
+    mov     r7, r3, lsl #16                 ; height is top part of counter
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    add     lr, r1, r3                      ; save final destination pointer
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    pkhbt   r12, r5, r4                     ; pack the filter differently
+    pkhbt   r11, r6, r5
+    mov     r4, #0x40                       ; rounding factor (for smlad{x})
+
+|height_loop_2nd_4|
+    ldrd    r8, [r0, #-4]                   ; load the data
+    orr     r7, r7, r3, lsr #1              ; loop counter
+
+|width_loop_2nd_4|
+    ldr     r10, [r0, #4]!
+    smladx  r6, r9, r12, r4                 ; apply filter
+    pkhbt   r8, r9, r8
+    smlad   r5, r8, r12, r4
+    pkhbt   r8, r10, r9
+    smladx  r6, r10, r11, r6
+    sub     r7, r7, #1
+    smlad   r5, r8, r11, r5
+
+    mov     r8, r9                          ; shift the data for the next loop
+    mov     r9, r10
+
+    usat    r6, #8, r6, asr #7              ; shift and clamp
+    usat    r5, #8, r5, asr #7
+
+    strb    r5, [r1], r2                    ; the result is transposed back and stored
+    tst     r7, #0xff
+    strb    r6, [r1], r2
+
+    bne     width_loop_2nd_4
+
+    subs    r7, r7, #0x10000
+    add     r0, r0, #16                     ; update src for next loop
+    sub     r1, lr, r7, lsr #16             ; update dst for next loop
+
+    bne     height_loop_2nd_4
+
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+;------------------------------------
+; r0    unsigned char *src_ptr
+; r1    unsigned char *output_ptr,
+; r2    unsigned int src_pixels_per_line
+; r3    unsigned int cnt,
+; stack unsigned int output_pitch,
+; stack const short *vp9_filter
+;------------------------------------
+|vp9_filter_block2d_first_pass_only_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    add     r7, r2, r3                      ; preload next row
+    add     r7, r7, #2
+    pld     [r0, r7]
+
+    ldr     r4, [sp, #36]                   ; output pitch
+    ldr     r11, [sp, #40]                  ; HFilter address
+    sub     sp, sp, #8
+
+    mov     r7, r3
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    sub     r4, r4, r3
+    str     r4, [sp]                        ; save modified output pitch
+    str     r2, [sp, #4]
+
+    mov     r2, #0x40
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+; six tap filter
+|height_loop_1st_only_6|
+    ldrb    r8, [r0, #-2]                   ; load data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+
+    mov     r12, r3, lsr #1                 ; loop counter
+
+|width_loop_1st_only_6|
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+;;  smuad   lr, lr, r4
+    smlad   lr, lr, r4, r2
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+;;  smuad   r8, r8, r4
+    smlad   r8, r8, r4, r2
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    subs    r12, r12, #1
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r10, r10, r6, r8
+
+;;  add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+;;  add     r10, r10, #0x40
+    strb    lr, [r1], #1                    ; store the result
+    usat    r10, #8, r10, asr #7
+
+    ldrneb  r9, [r0, #-1]
+    strb    r10, [r1], #1
+    ldrneb  r10, [r0], #2
+
+    bne     width_loop_1st_only_6
+
+    ldr     lr, [sp]                        ; load back output pitch
+    ldr     r12, [sp, #4]                   ; load back src increment (src_pixels_per_line - width)
+    subs    r7, r7, #1
+    add     r0, r0, r12                     ; update src for next loop
+
+    add     r11, r12, r3                    ; preload next row
+    add     r11, r11, #2
+    pld     [r0, r11]
+
+    add     r1, r1, lr                      ; update dst for next loop
+
+    bne     height_loop_1st_only_6
+
+    add     sp, sp, #8
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP  ; |vp9_filter_block2d_first_pass_only_armv6|
+
+
+;------------------------------------
+; r0    unsigned char *src_ptr,
+; r1    unsigned char *output_ptr,
+; r2    unsigned int src_pixels_per_line
+; r3    unsigned int cnt,
+; stack unsigned int output_pitch,
+; stack const short *vp9_filter
+;------------------------------------
+|vp9_filter_block2d_second_pass_only_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; VFilter address
+    ldr     r12, [sp, #36]                  ; output pitch
+
+    mov     r7, r3, lsl #16                 ; height is top part of counter
+    sub     r0, r0, r2, lsl #1              ; need 6 elements for filtering, 2 before, 3 after
+
+    sub     sp, sp, #8
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r0, [sp]                        ; save r0 to stack
+    str     r1, [sp, #4]                    ; save dst to stack
+
+; six tap filter
+|width_loop_2nd_only_6|
+    ldrb    r8, [r0], r2                    ; load data
+    orr     r7, r7, r3                      ; loop counter
+    ldrb    r9, [r0], r2
+    ldrb    r10, [r0], r2
+
+|height_loop_2nd_only_6|
+    ; filter the first column in this inner loop, then move to the next column.
+    ldrb    r11, [r0], r2
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0], r2
+
+    smuad   lr, lr, r4
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0], r2
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0]
+
+    sub     r7, r7, #2
+    sub     r0, r0, r2, lsl #2
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r10, r10, r6, r8
+
+    ands    r9, r7, #0xff
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0], r2                    ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r10, r10, #0x40
+    strb    lr, [r1], r12                   ; store the result for the column
+    usat    r10, #8, r10, asr #7
+
+    ldrneb  r9, [r0], r2
+    strb    r10, [r1], r12
+    ldrneb  r10, [r0], r2
+
+    bne     height_loop_2nd_only_6
+
+    ldr     r0, [sp]
+    ldr     r1, [sp, #4]
+    subs    r7, r7, #0x10000
+    add     r0, r0, #1                      ; move to filter next column
+    str     r0, [sp]
+    add     r1, r1, #1
+    str     r1, [sp, #4]
+
+    bne     width_loop_2nd_only_6
+
+    add     sp, sp, #8
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP  ; |vp9_filter_block2d_second_pass_only_armv6|
+
+    END
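
All the first-pass variants in this file share one inner computation: a 6-tap
FIR over source bytes, rounded by 64, shifted by 7, saturated to 8 bits, and
stored as shorts transposed (with a padded stride to "square off" the buffer)
so the second pass can filter the former columns with the same sequential loop.
A C sketch of the first pass under those assumptions, ignoring the stride
padding and the dual accumulators the asm keeps in flight; names are
illustrative:

    #include <stdint.h>

    static uint8_t clamp8(int v) {
        return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* 6-tap horizontal FIR with transposed 16-bit output; src points at
       least 2 pixels into each row, as in the asm (ldrb [r0, #-2]). */
    static void first_pass_6tap_c(const uint8_t *src, int16_t *out,
                                  unsigned src_pixels_per_line,
                                  unsigned width, unsigned height,
                                  const int16_t *filter) {
        for (unsigned r = 0; r < height; r++) {
            for (unsigned c = 0; c < width; c++) {
                int sum = 0;
                for (int k = -2; k <= 3; k++)            /* taps span [-2, 3] */
                    sum += src[(int)c + k] * filter[k + 2];
                out[c * height + r] = clamp8((sum + 0x40) >> 7); /* transposed */
            }
            src += src_pixels_per_line;
        }
    }

The second pass repeats the same dot product over the transposed int16 buffer
and writes bytes at output_pitch, which transposes the block back.
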
--- /dev/null
+++ b/vp9/common/arm/armv6/idct_v6.asm
@@ -1,0 +1,345 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+;                   r0  r1  r2  r3  r4  r5  r6  r7  r8  r9  r10 r11 r12     r14
+    EXPORT  |vp8_short_idct4x4llm_1_v6|
+    EXPORT  |vp8_short_idct4x4llm_v6|
+    EXPORT  |vp8_short_idct4x4llm_v6_scott|
+    EXPORT  |vp8_short_idct4x4llm_v6_dual|
+
+    AREA    |.text|, CODE, READONLY
+
+;********************************************************************************
+;*  void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
+;*      r0  INT16 * input
+;*      r1  INT16 * output
+;*      r2  INT32 pitch
+;*  bench:  3/5
+;********************************************************************************
+
+|vp8_short_idct4x4llm_1_v6| PROC         ;   cycles  in  out pit
+            ;
+    ldrsh   r0, [r0]    ; load input[0] 1, r0 un 2
+    add r0, r0, #4  ;   1   +4
+    stmdb   sp!, {r4, r5, lr}   ; make room for wide writes 1                   backup
+    mov r0, r0, asr #3  ; (input[0] + 4) >> 3   1, r0 req`d ^1  >> 3
+    pkhbt   r4, r0, r0, lsl #16 ; pack r0 into r4   1, r0 req`d ^1                  pack
+    mov r5, r4  ; expand                        expand
+
+    strd    r4, [r1], r2    ; *output = r0, post inc    1
+    strd    r4, [r1], r2    ;   1
+    strd    r4, [r1], r2    ;   1
+    strd    r4, [r1]    ;   1
+            ;
+    ldmia   sp!, {r4, r5, pc}   ; replace vars, return                      restore
+    ENDP        ; |vp8_short_idct4x4llm_1_v6|
+;********************************************************************************
+;********************************************************************************
+;********************************************************************************
+
+;********************************************************************************
+;*  void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
+;*      r0  INT16 * input
+;*      r1  INT16 * output
+;*      r2  INT32 pitch
+;*  bench:
+;********************************************************************************
+
+|vp8_short_idct4x4llm_v6| PROC           ;   cycles  in  out pit
+            ;
+    stmdb   sp!, {r4-r11, lr}   ; backup registers  1                   backup
+            ;
+    mov r4, #0x00004E00 ;   1                   cst
+    orr r4, r4, #0x0000007B ; cospi8sqrt2minus1
+    mov r5, #0x00008A00 ;   1                       cst
+    orr r5, r5, #0x0000008C ; sinpi8sqrt2
+            ;
+    mov r6, #4  ; i=4   1                           i
+loop1           ;
+    ldrsh   r12, [r0, #8]   ; input[4]  1, r12 unavail 2                                                    [4]
+    ldrsh   r3, [r0, #24]   ; input[12] 1, r3 unavail 2             [12]
+    ldrsh   r8, [r0, #16]   ; input[8]  1, r8 unavail 2                                 [8]
+    ldrsh   r7, [r0], #0x2  ; input[0]  1, r7 unavail 2 ++                          [0]
+    smulwb  r10, r5, r12    ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1                                          t1
+    smulwb  r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16  1, r11 un 2, r3/r4 ^1                                               t2
+    add r9, r7, r8  ; a1 = [0] + [8]    1                                       a1
+    sub r7, r7, r8  ; b1 = [0] - [8]    1                               b1
+    add r11, r3, r11    ; temp2 1
+    rsb r11, r11, r10   ; c1 = temp1 - temp2    1                                               c1
+    smulwb  r3, r5, r3  ; ([12] * sinpi8sqrt2) >> 16    1, r3 un 2, r3/r5 ^ 1               t2
+    smulwb  r10, r4, r12    ; ([4] * cospi8sqrt2minus1) >> 16   1, r10 un 2, r12/r4 ^1                                          t1
+    add r8, r7, r11 ; b1 + c1   1                                   b+c
+    strh    r8, [r1, r2]    ; out[pitch] = b1+c1    1
+    sub r7, r7, r11 ; b1 - c1   1                               b-c
+    add r10, r12, r10   ; temp1 1
+    add r3, r10, r3 ; d1 = temp1 + temp2    1               d1
+    add r10, r9, r3 ; a1 + d1   1                                           a+d
+    sub r3, r9, r3  ; a1 - d1   1               a-d
+    add r8, r2, r2  ; pitch * 2 1                                   p*2
+    strh    r7, [r1, r8]    ; out[pitch*2] = b1-c1  1
+    add r7, r2, r2, lsl #1  ; pitch * 3 1                               p*3
+    strh    r3, [r1, r7]    ; out[pitch*3] = a1-d1  1
+    subs    r6, r6, #1  ; i--   1                           --
+    strh    r10, [r1], #0x2 ; out[0] = a1+d1    1       ++
+    bne loop1   ; if i>0, continue
+            ;
+    sub r1, r1, #8  ; set up out for next loop  1       -4
+            ; for this iteration, input=prev output
+    mov r6, #4  ; i=4   1                           i
+;   b   returnfull
+loop2           ;
+    ldrsh   r11, [r1, #2]   ; input[1]  1, r11 un 2                                             [1]
+    ldrsh   r8, [r1, #6]    ; input[3]  1, r8 un 2                                  [3]
+    ldrsh   r3, [r1, #4]    ; input[2]  1, r3 un 2              [2]
+    ldrsh   r0, [r1]    ; input[0]  1, r0 un 2  [0]
+    smulwb  r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1                                       t1
+    smulwb  r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16   1, r10 un 2, r4/r8 ^1                                           t2
+    add r7, r0, r3  ; a1 = [0] + [2]    1                               a1
+    sub r0, r0, r3  ; b1 = [0] - [2]    1   b1
+    add r10, r8, r10    ; temp2 1
+    rsb r9, r10, r9 ; c1 = temp1 - temp2    1                                       c1
+    smulwb  r8, r5, r8  ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1                                    t2
+    smulwb  r10, r4, r11    ; ([1] * cospi8sqrt2minus1) >> 16   1, r10 un 2, r4/r11 ^1                                          t1
+    add r3, r0, r9  ; b1+c1 1               b+c
+    add r3, r3, #4  ; b1+c1+4   1               +4
+    add r10, r11, r10   ; temp1 1
+    mov r3, r3, asr #3  ; b1+c1+4 >> 3  1, r3 ^1                >>3
+    strh    r3, [r1, #2]    ; out[1] = b1+c1    1
+    add r10, r10, r8    ; d1 = temp1 + temp2    1                                           d1
+    add r3, r7, r10 ; a1+d1 1               a+d
+    add r3, r3, #4  ; a1+d1+4   1               +4
+    sub r7, r7, r10 ; a1-d1 1                               a-d
+    add r7, r7, #4  ; a1-d1+4   1                               +4
+    mov r3, r3, asr #3  ; a1+d1+4 >> 3  1, r3 ^1                >>3
+    mov r7, r7, asr #3  ; a1-d1+4 >> 3  1, r7 ^1                                >>3
+    strh    r7, [r1, #6]    ; out[3] = a1-d1    1
+    sub r0, r0, r9  ; b1-c1 1   b-c
+    add r0, r0, #4  ; b1-c1+4   1   +4
+    subs    r6, r6, #1  ; i--   1                           --
+    mov r0, r0, asr #3  ; b1-c1+4 >> 3  1, r0 ^1    >>3
+    strh    r0, [r1, #4]    ; out[2] = b1-c1    1
+    strh    r3, [r1], r2    ; out[0] = a1+d1    1
+;   add r1, r1, r2  ; out += pitch  1       ++
+    bne loop2   ; if i>0, continue
+returnfull          ;
+    ldmia   sp!, {r4 - r11, pc} ; replace vars, return                      restore
+    ENDP
+
+;********************************************************************************
+;********************************************************************************
+;********************************************************************************
+
+;********************************************************************************
+;*  void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
+;*      r0  INT16 * input
+;*      r1  INT16 * output
+;*      r2  INT32 pitch
+;*  bench:
+;********************************************************************************
+
+|vp8_short_idct4x4llm_v6_scott| PROC         ;   cycles  in  out pit
+;   mov r0, #0  ;
+;   ldr r0, [r0]    ;
+    stmdb   sp!, {r4 - r11, lr} ; backup registers  1                   backup
+            ;
+    mov r3, #0x00004E00 ;                   cos
+    orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
+    mov r4, #0x00008A00 ;                       sin
+    orr r4, r4, #0x0000008C ; sinpi8sqrt2
+            ;
+    mov r5, #0x2    ; i                         i
+            ;
+short_idct4x4llm_v6_scott_loop1          ;
+    ldr r10, [r0, #(4*2)]   ; i5 | i4                                               5,4
+    ldr r11, [r0, #(12*2)]  ; i13 | i12                                                 13,12
+            ;
+    smulwb  r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16)                             lt1
+    smulwb  r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16)                                  lt2
+            ;
+    smulwb  r12, r3, r10    ; ((ip[4] * cospi8sqrt2minus1) >> 16)                                                        l2t2
+    smulwb  r14, r4, r11    ; ((ip[12] * sinpi8sqrt2) >> 16)                                                                l2t1
+            ;
+    add r6, r6, r7  ; partial c1                                lt1-lt2
+    add r12, r12, r14   ; partial d1                                                        l2t2+l2t1
+            ;
+    smulwt  r14, r4, r10    ; ((ip[5] * sinpi8sqrt2) >> 16)                                                             ht1
+    smulwt  r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16)                                  ht2
+            ;
+    smulwt  r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16)                                       h2t1
+    smulwt  r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16)                                            h2t2
+            ;
+    add r7, r14, r7 ; partial c1_2                                  ht1+ht2
+    sub r8, r8, r9  ; partial d1_2                                      h2t1-h2t2
+            ;
+    pkhbt   r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1                               pack
+    pkhbt   r12, r12, r8, lsl #16   ; partial d1_2 | partial d1_1                                                       pack
+            ;
+    usub16  r6, r6, r10 ; c1_2 | c1_1                               c
+    uadd16  r12, r12, r11   ; d1_2 | d1_1                                                       d
+            ;
+    ldr r10, [r0, #0]   ; i1 | i0                                               1,0
+    ldr r11, [r0, #(8*2)]   ; i9 | i8                                                   9,8
+            ;
+;;;;;;  add r0, r0, #0x4    ;       +4
+;;;;;;  add r1, r1, #0x4    ;           +4
+            ;
+    uadd16  r8, r10, r11    ; i1 + i9 | i0 + i8 aka a1                                      a
+    usub16  r9, r10, r11    ; i1 - i9 | i0 - i8 aka b1                                          b
+            ;
+    uadd16  r7, r8, r12 ; a1 + d1 pair                                  a+d
+    usub16  r14, r8, r12    ; a1 - d1 pair                                                              a-d
+            ;
+    str r7, [r1]    ; op[0] = a1 + d1
+    str r14, [r1, r2]   ; op[pitch*3] = a1 - d1
+            ;
+    add r0, r0, #0x4    ; op[pitch] = b1 + c1       ++
+    add r1, r1, #0x4    ; op[pitch*2] = b1 - c1         ++
+            ;
+    subs    r5, r5, #0x1    ;                           --
+    bne short_idct4x4llm_v6_scott_loop1  ;
+            ;
+    sub r1, r1, #16 ; reset output ptr
+    mov r5, #0x4    ;
+    mov r0, r1  ; input = output
+            ;
+short_idct4x4llm_v6_scott_loop2          ;
+            ;
+    subs    r5, r5, #0x1    ;
+    bne short_idct4x4llm_v6_scott_loop2  ;
+            ;
+    ldmia   sp!, {r4 - r11, pc} ;
+    ENDP        ;
+            ;
+;********************************************************************************
+;********************************************************************************
+;********************************************************************************
+
+;********************************************************************************
+;*  void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
+;*      r0  INT16 * input
+;*      r1  INT16 * output
+;*      r2  INT32 pitch
+;*  bench:
+;********************************************************************************
+
+|vp8_short_idct4x4llm_v6_dual| PROC          ;   cycles  in  out pit
+            ;
+    stmdb   sp!, {r4-r11, lr}   ; backup registers  1                   backup
+    mov r3, #0x00004E00 ;                   cos
+    orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
+    mov r4, #0x00008A00 ;                       sin
+    orr r4, r4, #0x0000008C ; sinpi8sqrt2
+    mov r5, #0x2    ; i=2                           i
+loop1_dual
+    ldr r6, [r0, #(4*2)]    ; i5 | i4                               5|4
+    ldr r12, [r0, #(12*2)]  ; i13 | i12                                                     13|12
+    ldr r14, [r0, #(8*2)]   ; i9 | i8                                                               9|8
+
+    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16                                         5c
+    smulwb  r7, r3, r6  ; (ip[4] * cospi8sqrt2minus1) >> 16                                 4c
+    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16                                               5s
+    smulwb  r8, r4, r6  ; (ip[4] * sinpi8sqrt2) >> 16                                       4s
+    pkhbt   r7, r7, r9, lsl #16 ; 5c | 4c
+    smulwt  r11, r3, r12    ; (ip[13] * cospi8sqrt2minus1) >> 16                                                    13c
+    pkhbt   r8, r8, r10, lsl #16    ; 5s | 4s
+    uadd16  r6, r6, r7  ; 5c+5 | 4c+4
+    smulwt  r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16                                  13s
+    smulwb  r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16                                            12c
+    smulwb  r10, r4, r12    ; (ip[12] * sinpi8sqrt2) >> 16                                              12s
+    subs    r5, r5, #0x1    ; i--                           --
+    pkhbt   r9, r9, r11, lsl #16    ; 13c | 12c
+    ldr r11, [r0], #0x4 ; i1 | i0       ++                                          1|0
+    pkhbt   r10, r10, r7, lsl #16   ; 13s | 12s
+    uadd16  r7, r12, r9 ; 13c+13 | 12c+12
+    usub16  r7, r8, r7  ; c                                 c
+    uadd16  r6, r6, r10 ; d                             d
+    uadd16  r10, r11, r14   ; a                                             a
+    usub16  r8, r11, r14    ; b                                     b
+    uadd16  r9, r10, r6 ; a+d                                           a+d
+    usub16  r10, r10, r6    ; a-d                                               a-d
+    uadd16  r6, r8, r7  ; b+c                               b+c
+    usub16  r7, r8, r7  ; b-c                                   b-c
+    str r6, [r1, r2]    ; o5 | o4
+    add r6, r2, r2  ; pitch * 2                             p2
+    str r7, [r1, r6]    ; o9 | o8
+    add r6,  r6, r2 ; pitch * 3                             p3
+    str r10, [r1, r6]   ; o13 | o12
+    str r9, [r1], #0x4  ; o1 | o0           ++
+    bne loop1_dual  ;
+    mov r5, #0x2    ; i=2                           i
+    sub r0, r1, #8  ; reset input/output        i/o
+loop2_dual
+    ldr r6, [r0, r2]    ; i5 | i4                               5|4
+    ldr r1, [r0]    ; i1 | i0           1|0
+    ldr r12, [r0, #0x4] ; i3 | i2                                                       3|2
+    add r14, r2, #0x4   ; pitch + 2                                                             p+2
+    ldr r14, [r0, r14]  ; i7 | i6                                                               7|6
+    smulwt  r9, r3, r6  ; (ip[5] * cospi8sqrt2minus1) >> 16                                         5c
+    smulwt  r7, r3, r1  ; (ip[1] * cospi8sqrt2minus1) >> 16                                 1c
+    smulwt  r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16                                               5s
+    smulwt  r8, r4, r1  ; (ip[1] * sinpi8sqrt2) >> 16                                       1s
+    pkhbt   r11, r6, r1, lsl #16    ; i0 | i4                                                   0|4
+    pkhbt   r7, r9, r7, lsl #16 ; 1c | 5c
+    pkhbt   r8, r10, r8, lsl #16    ; 1s | 5s = temp1                                       tc1
+    pkhtb   r1, r1, r6, asr #16 ; i1 | i5           1|5
+    uadd16  r1, r7, r1  ; 1c+1 | 5c+5 = temp2 (d)           td2
+    pkhbt   r9, r14, r12, lsl #16   ; i2 | i6                                           2|6
+    uadd16  r10, r11, r9    ; a                                             a
+    usub16  r9, r11, r9 ; b                                         b
+    pkhtb   r6, r12, r14, asr #16   ; i3 | i7                               3|7
+    subs    r5, r5, #0x1    ; i--                           --
+    smulwt  r7, r3, r6  ; (ip[3] * cospi8sqrt2minus1) >> 16                                 3c
+    smulwt  r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16                                                   3s
+    smulwb  r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16                                                     7c
+    smulwb  r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16                                                               7s
+
+    pkhbt   r7, r12, r7, lsl #16    ; 3c | 7c
+    pkhbt   r11, r14, r11, lsl #16  ; 3s | 7s = temp1 (d)                                                   td1
+    uadd16  r6, r7, r6  ; 3c+3 | 7c+7 = temp2  (c)                              tc2
+    usub16  r12, r8, r6 ; c (o1 | o5)                                                       c
+    uadd16  r6, r11, r1 ; d (o3 | o7)                               d
+    uadd16  r7, r10, r6 ; a+d                                   a+d
+    mov r8, #0x4    ; set up 4's                                        4
+    orr r8, r8, #0x40000    ;                                       4|4
+    usub16  r6, r10, r6 ; a-d                               a-d
+    uadd16  r6, r6, r8  ; a-d+4                             3|7
+    uadd16  r7, r7, r8  ; a+d+4                                 0|4
+    uadd16  r10, r9, r12    ; b+c                                               b+c
+    usub16  r1, r9, r12 ; b-c           b-c
+    uadd16  r10, r10, r8    ; b+c+4                                             1|5
+    uadd16  r1, r1, r8  ; b-c+4         2|6
+    mov r8, r10, asr #19    ; o1 >> 3
+    strh    r8, [r0, #2]    ; o1
+    mov r8, r1, asr #19 ; o2 >> 3
+    strh    r8, [r0, #4]    ; o2
+    mov r8, r6, asr #19 ; o3 >> 3
+    strh    r8, [r0, #6]    ; o3
+    mov r8, r7, asr #19 ; o0 >> 3
+    strh    r8, [r0], r2    ; o0        +p
+    sxth    r10, r10    ;
+    mov r8, r10, asr #3 ; o5 >> 3
+    strh    r8, [r0, #2]    ; o5
+    sxth    r1, r1  ;
+    mov r8, r1, asr #3  ; o6 >> 3
+    strh    r8, [r0, #4]    ; o6
+    sxth    r6, r6  ;
+    mov r8, r6, asr #3  ; o7 >> 3
+    strh    r8, [r0, #6]    ; o7
+    sxth    r7, r7  ;
+    mov r8, r7, asr #3  ; o4 >> 3
+    strh    r8, [r0], r2    ; o4        +p
+;;;;;   subs    r5, r5, #0x1    ; i--                           --
+    bne loop2_dual  ;
+            ;
+    ldmia   sp!, {r4 - r11, pc} ; replace vars, return                      restore
+    ENDP
+
+    END
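
All four variants above implement the LLM factorization of the 4x4 inverse DCT.
The constants built at the top of each PROC are cospi8sqrt2minus1 = 0x4E7B
(20091) and sinpi8sqrt2 = 0x8A8C (35468), and smulw{b,t} multiplies the 32-bit
operand by the signed bottom/top 16-bit half of the other and keeps the top 32
bits, i.e. (a * b) >> 16. The arithmetic mirrors the classic scalar reference
loop:

    /* Scalar reference for the 4x4 LLM IDCT: a column pass into the output
       buffer, then a row pass with +4 rounding and >>3, matching the
       loop1/loop2 split in the asm above. */
    static const int cospi8sqrt2minus1 = 20091;  /* 0x4E7B */
    static const int sinpi8sqrt2 = 35468;        /* 0x8A8C */

    static void short_idct4x4llm_c(short *input, short *output, int pitch) {
        short *ip = input, *op = output;
        int shortpitch = pitch >> 1;
        int i, a1, b1, c1, d1, temp1, temp2;

        for (i = 0; i < 4; i++) {                  /* column (vertical) pass */
            a1 = ip[0] + ip[8];
            b1 = ip[0] - ip[8];
            temp1 = (ip[4] * sinpi8sqrt2) >> 16;
            temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
            c1 = temp1 - temp2;
            temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
            temp2 = (ip[12] * sinpi8sqrt2) >> 16;
            d1 = temp1 + temp2;
            op[shortpitch * 0] = (short)(a1 + d1);
            op[shortpitch * 3] = (short)(a1 - d1);
            op[shortpitch * 1] = (short)(b1 + c1);
            op[shortpitch * 2] = (short)(b1 - c1);
            ip++;
            op++;
        }

        ip = output;
        op = output;
        for (i = 0; i < 4; i++) {                  /* row pass, rounded */
            a1 = ip[0] + ip[2];
            b1 = ip[0] - ip[2];
            temp1 = (ip[1] * sinpi8sqrt2) >> 16;
            temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
            c1 = temp1 - temp2;
            temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
            temp2 = (ip[3] * sinpi8sqrt2) >> 16;
            d1 = temp1 + temp2;
            op[0] = (short)((a1 + d1 + 4) >> 3);
            op[3] = (short)((a1 - d1 + 4) >> 3);
            op[1] = (short)((b1 + c1 + 4) >> 3);
            op[2] = (short)((b1 - c1 + 4) >> 3);
            ip += shortpitch;
            op += shortpitch;
        }
    }
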
--- /dev/null
+++ b/vp9/common/arm/armv6/iwalsh_v6.asm
@@ -1,0 +1,152 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_short_inv_walsh4x4_v6|
+    EXPORT |vp8_short_inv_walsh4x4_1_v6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
+|vp8_short_inv_walsh4x4_v6| PROC
+
+    stmdb       sp!, {r4 - r11, lr}
+
+    ldr         r2, [r0], #4         ; [1  |  0]
+    ldr         r3, [r0], #4         ; [3  |  2]
+    ldr         r4, [r0], #4         ; [5  |  4]
+    ldr         r5, [r0], #4         ; [7  |  6]
+    ldr         r6, [r0], #4         ; [9  |  8]
+    ldr         r7, [r0], #4         ; [11 | 10]
+    ldr         r8, [r0], #4         ; [13 | 12]
+    ldr         r9, [r0]             ; [15 | 14]
+
+    qadd16      r10, r2, r8          ; a1 [1+13  |  0+12]
+    qadd16      r11, r4, r6          ; b1 [5+9   |  4+8]
+    qsub16      r12, r4, r6          ; c1 [5-9   |  4-8]
+    qsub16      lr, r2, r8           ; d1 [1-13  |  0-12]
+
+    qadd16      r2, r10, r11         ; a1 + b1 [1  |  0]
+    qadd16      r4, r12, lr          ; c1 + d1 [5  |  4]
+    qsub16      r6, r10, r11         ; a1 - b1 [9  |  8]
+    qsub16      r8, lr, r12          ; d1 - c1 [13 | 12]
+
+    qadd16      r10, r3, r9          ; a1 [3+15  |  2+14]
+    qadd16      r11, r5, r7          ; b1 [7+11  |  6+10]
+    qsub16      r12, r5, r7          ; c1 [7-11  |  6-10]
+    qsub16      lr, r3, r9           ; d1 [3-15  |  2-14]
+
+    qadd16      r3, r10, r11         ; a1 + b1 [3  |  2]
+    qadd16      r5, r12, lr          ; c1 + d1 [7  |  6]
+    qsub16      r7, r10, r11         ; a1 - b1 [11 | 10]
+    qsub16      r9, lr, r12          ; d1 - c1 [15 | 14]
+
+    ; first transform complete
+
+    qsubaddx    r10, r2, r3          ; [c1|a1] [1-2   |   0+3]
+    qaddsubx    r11, r2, r3          ; [b1|d1] [1+2   |   0-3]
+    qsubaddx    r12, r4, r5          ; [c1|a1] [5-6   |   4+7]
+    qaddsubx    lr, r4, r5           ; [b1|d1] [5+6   |   4-7]
+
+    qaddsubx    r2, r10, r11         ; [b2|c2] [c1+d1 | a1-b1]
+    qaddsubx    r3, r11, r10         ; [a2|d2] [b1+a1 | d1-c1]
+    ldr         r10, c0x00030003
+    qaddsubx    r4, r12, lr          ; [b2|c2] [c1+d1 | a1-b1]
+    qaddsubx    r5, lr, r12          ; [a2|d2] [b1+a1 | d1-c1]
+
+    qadd16      r2, r2, r10          ; [b2+3|c2+3]
+    qadd16      r3, r3, r10          ; [a2+3|d2+3]
+    qadd16      r4, r4, r10          ; [b2+3|c2+3]
+    qadd16      r5, r5, r10          ; [a2+3|d2+3]
+
+    asr         r12, r2, #3          ; [1  |  x]
+    pkhtb       r12, r12, r3, asr #19; [1  |  0]
+    lsl         lr, r3, #16          ; [~3 |  x]
+    lsl         r2, r2, #16          ; [~2 |  x]
+    asr         lr, lr, #3           ; [3  |  x]
+    pkhtb       lr, lr, r2, asr #19  ; [3  |  2]
+
+    asr         r2, r4, #3           ; [5  |  x]
+    pkhtb       r2, r2, r5, asr #19  ; [5  |  4]
+    lsl         r3, r5, #16          ; [~7 |  x]
+    lsl         r4, r4, #16          ; [~6 |  x]
+    asr         r3, r3, #3           ; [7  |  x]
+    pkhtb       r3, r3, r4, asr #19  ; [7  |  6]
+
+    str         r12, [r1], #4
+    str         lr, [r1], #4
+    str         r2, [r1], #4
+    str         r3, [r1], #4
+
+    qsubaddx    r2, r6, r7           ; [c1|a1] [9-10  |  8+11]
+    qaddsubx    r3, r6, r7           ; [b1|d1] [9+10  |  8-11]
+    qsubaddx    r4, r8, r9           ; [c1|a1] [13-14 | 12+15]
+    qaddsubx    r5, r8, r9           ; [b1|d1] [13+14 | 12-15]
+
+    qaddsubx    r6, r2, r3           ; [b2|c2] [c1+d1 | a1-b1]
+    qaddsubx    r7, r3, r2           ; [a2|d2] [b1+a1 | d1-c1]
+    qaddsubx    r8, r4, r5           ; [b2|c2] [c1+d1 | a1-b1]
+    qaddsubx    r9, r5, r4           ; [a2|d2] [b1+a1 | d1-c1]
+
+    qadd16      r6, r6, r10          ; [b2+3|c2+3]
+    qadd16      r7, r7, r10          ; [a2+3|d2+3]
+    qadd16      r8, r8, r10          ; [b2+3|c2+3]
+    qadd16      r9, r9, r10          ; [a2+3|d2+3]
+
+    asr         r2, r6, #3           ; [9  |  x]
+    pkhtb       r2, r2, r7, asr #19  ; [9  |  8]
+    lsl         r3, r7, #16          ; [~11|  x]
+    lsl         r4, r6, #16          ; [~10|  x]
+    asr         r3, r3, #3           ; [11 |  x]
+    pkhtb       r3, r3, r4, asr #19  ; [11 | 10]
+
+    asr         r4, r8, #3           ; [13 |  x]
+    pkhtb       r4, r4, r9, asr #19  ; [13 | 12]
+    lsl         r5, r9, #16          ; [~15|  x]
+    lsl         r6, r8, #16          ; [~14|  x]
+    asr         r5, r5, #3           ; [15 |  x]
+    pkhtb       r5, r5, r6, asr #19  ; [15 | 14]
+
+    str         r2, [r1], #4
+    str         r3, [r1], #4
+    str         r4, [r1], #4
+    str         r5, [r1]
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_short_inv_walsh4x4_v6|
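The qadd16/qsub16 pairs above implement the standard 4x4 inverse Walsh-Hadamard transform: a column pass of sum/difference butterflies followed by a row pass with a +3 bias and >> 3. A C sketch equivalent to what the SIMD code computes (it mirrors the codec's generic C path; treat it as illustrative, not a copy of the source):

    void short_inv_walsh4x4_sketch(const short *input, short *output) {
        int i, a1, b1, c1, d1;
        const short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++) {          /* column butterflies */
            a1 = ip[0] + ip[12];
            b1 = ip[4] + ip[8];
            c1 = ip[4] - ip[8];
            d1 = ip[0] - ip[12];
            op[0]  = (short)(a1 + b1);
            op[4]  = (short)(c1 + d1);
            op[8]  = (short)(a1 - b1);
            op[12] = (short)(d1 - c1);
            ip++; op++;
        }
        op = output;
        for (i = 0; i < 4; i++) {          /* row butterflies + rounding */
            a1 = op[0] + op[3];
            b1 = op[1] + op[2];
            c1 = op[1] - op[2];
            d1 = op[0] - op[3];
            op[0] = (short)((a1 + b1 + 3) >> 3);
            op[1] = (short)((c1 + d1 + 3) >> 3);
            op[2] = (short)((a1 - b1 + 3) >> 3);
            op[3] = (short)((d1 - c1 + 3) >> 3);
            op += 4;
        }
    }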
+
+
+;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
+|vp8_short_inv_walsh4x4_1_v6| PROC
+
+    ldrsh       r2, [r0]             ; [0]
+    add         r2, r2, #3           ; [0] + 3
+    asr         r2, r2, #3           ; a1 ([0]+3) >> 3
+    lsl         r2, r2, #16          ; [a1 |  x]
+    orr         r2, r2, r2, lsr #16  ; [a1 | a1]
+
+    str         r2, [r1], #4
+    str         r2, [r1], #4
+    str         r2, [r1], #4
+    str         r2, [r1], #4
+    str         r2, [r1], #4
+    str         r2, [r1], #4
+    str         r2, [r1], #4
+    str         r2, [r1]
+
+    bx          lr
+    ENDP        ; |vp8_short_inv_walsh4x4_1_v6|
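The DC-only variant above rounds the single input coefficient and broadcasts it; the assembly builds the 32-bit [a1 | a1] pair once and issues eight word stores. A sketch:

    /* All 16 outputs get ((input[0] + 3) >> 3). */
    void short_inv_walsh4x4_1_sketch(const short *input, short *output) {
        short a1 = (short)((input[0] + 3) >> 3);
        for (int i = 0; i < 16; i++)
            output[i] = a1;
    }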
+
+; Constant Pool
+c0x00030003 DCD 0x00030003
+    END
--- /dev/null
+++ b/vp9/common/arm/armv6/loopfilter_v6.asm
@@ -1,0 +1,1282 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp9_loop_filter_horizontal_edge_armv6|
+    EXPORT |vp9_mbloop_filter_horizontal_edge_armv6|
+    EXPORT |vp9_loop_filter_vertical_edge_armv6|
+    EXPORT |vp9_mbloop_filter_vertical_edge_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+    MACRO
+    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+    ; a0: 03 02 01 00
+    ; a1: 13 12 11 10
+    ; a2: 23 22 21 20
+    ; a3: 33 32 31 30
+    ;     b3 b2 b1 b0
+
+    uxtb16      $b1, $a1                    ; xx 12 xx 10
+    uxtb16      $b0, $a0                    ; xx 02 xx 00
+    uxtb16      $b3, $a3                    ; xx 32 xx 30
+    uxtb16      $b2, $a2                    ; xx 22 xx 20
+    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
+    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20
+
+    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
+    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
+    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
+    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
+    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
+    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21
+
+    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
+    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3
+
+    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
+    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
+    MEND
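In scalar terms the macro is a 4x4 byte-matrix transpose: each input word is a row of four pixels and each output word is a column. A C sketch (array names are illustrative):

    /* Equivalent of TRANSPOSE_MATRIX on a 4x4 block of bytes:
     * output word c collects byte c of every input row. */
    static void transpose4x4(const unsigned char a[4][4],
                             unsigned char b[4][4]) {
        for (int r = 0; r < 4; r++)
            for (int c = 0; c < 4; c++)
                b[c][r] = a[r][c];
    }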
+
+
+src         RN  r0
+pstep       RN  r1
+count       RN  r5
+
+;r0     unsigned char *src_ptr,
+;r1     int src_pixel_step,
+;r2     const char *blimit,
+;r3     const char *limit,
+;stack  const char *thresh,
+;stack  int  count
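In C terms, the register/stack map above corresponds to the following prototype (a sketch inferred from these comments, not a declaration copied from a header):

    void vp9_loop_filter_horizontal_edge_armv6(unsigned char *src_ptr,
                                               int src_pixel_step,
                                               const char *blimit,
                                               const char *limit,
                                               const char *thresh,
                                               int count);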
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_loop_filter_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+    ldr         count, [sp, #40]            ; count for 8-in-parallel
+    ldr         r6, [sp, #36]               ; load thresh address
+    sub         sp, sp, #16                 ; create temp buffer
+
+    ldr         r9, [src], pstep            ; p3
+    ldrb        r4, [r2]                    ; blimit
+    ldr         r10, [src], pstep           ; p2
+    ldrb        r2, [r3]                    ; limit
+    ldr         r11, [src], pstep           ; p1
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r6]                    ; thresh
+    orr         r2, r2, r2, lsl #8
+    mov         count, count, lsl #1        ; 4-in-parallel
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
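The orr ladder above splats each single-byte threshold (blimit, limit, thresh) into all four byte lanes of a word, so the uqsub8 comparisons below can test four pixels at once. Equivalently (hypothetical helper):

    /* b | b<<8 | b<<16 | b<<24 */
    static unsigned int splat4(unsigned char b) { return b * 0x01010101u; }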
+
+|Hnext8|
+    ; vp9_filter_mask() function
+    ; calculate breakout conditions
+    ldr         r12, [src], pstep           ; p0
+
+    uqsub8      r6, r9, r10                 ; p3 - p2
+    uqsub8      r7, r10, r9                 ; p2 - p3
+    uqsub8      r8, r10, r11                ; p2 - p1
+    uqsub8      r10, r11, r10               ; p1 - p2
+
+    orr         r6, r6, r7                  ; abs (p3-p2)
+    orr         r8, r8, r10                 ; abs (p2-p1)
+    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp9_filter_mask
+    uqsub8      r8, r8, r2                  ; compare to limit
+    uqsub8      r6, r11, r12                ; p1 - p0
+    orr         lr, lr, r8
+    uqsub8      r7, r12, r11                ; p0 - p1
+    ldr         r9, [src], pstep            ; q0
+    ldr         r10, [src], pstep           ; q1
+    orr         r6, r6, r7                  ; abs (p1-p0)
+    uqsub8      r7, r6, r2                  ; compare to limit
+    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
+    orr         lr, lr, r7
+
+    uqsub8      r6, r11, r10                ; p1 - q1
+    uqsub8      r7, r10, r11                ; q1 - p1
+    uqsub8      r11, r12, r9                ; p0 - q0
+    uqsub8      r12, r9, r12                ; q0 - p0
+    orr         r6, r6, r7                  ; abs (p1-q1)
+    ldr         r7, c0x7F7F7F7F
+    orr         r12, r11, r12               ; abs (p0-q0)
+    ldr         r11, [src], pstep           ; q2
+    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
+    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
+    uqsub8      r7, r9, r10                 ; q0 - q1
+    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
+    uqsub8      r6, r10, r9                 ; q1 - q0
+    uqsub8      r12, r12, r4                ; compare to flimit
+    uqsub8      r9, r11, r10                ; q2 - q1
+
+    orr         lr, lr, r12
+
+    ldr         r12, [src], pstep           ; q3
+    uqsub8      r10, r10, r11               ; q1 - q2
+    orr         r6, r7, r6                  ; abs (q1-q0)
+    orr         r10, r9, r10                ; abs (q2-q1)
+    uqsub8      r7, r6, r2                  ; compare to limit
+    uqsub8      r10, r10, r2                ; compare to limit
+    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
+    orr         lr, lr, r7
+    orr         lr, lr, r10
+
+    uqsub8      r10, r12, r11               ; q3 - q2
+    uqsub8      r9, r11, r12                ; q2 - q3
+
+    mvn         r11, #0                     ; r11 == -1
+
+    orr         r10, r10, r9                ; abs (q3-q2)
+    uqsub8      r10, r10, r2                ; compare to limit
+
+    mov         r12, #0
+    orr         lr, lr, r10
+    sub         src, src, pstep, lsl #2
+
+    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
+    sel         lr, r11, r12                ; filter mask: lr
+
+    cmp         lr, #0
+    beq         hskip_filter                 ; skip filtering
+
+    sub         src, src, pstep, lsl #1     ; move src pointer down by 6 lines
+
+    ;vp8_hevmask() function
+    ;calculate high edge variance
+    orr         r10, r6, r8                 ; calculate vp8_hevmask
+
+    ldr         r7, [src], pstep            ; p1
+
+    usub8       r10, r12, r10               ; use usub8 instead of ssub8
+    sel         r6, r12, r11                ; obtain vp8_hevmask: r6
+
+    ;vp9_filter() function
+    ldr         r8, [src], pstep            ; p0
+    ldr         r12, c0x80808080
+    ldr         r9, [src], pstep            ; q0
+    ldr         r10, [src], pstep           ; q1
+
+    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
+    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
+    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
+    eor         r10, r10, r12               ; q1 offset to convert to a signed value
+
+    str         r9, [sp]                    ; store qs0 temporarily
+    str         r8, [sp, #4]                ; store ps0 temporarily
+    str         r10, [sp, #8]               ; store qs1 temporarily
+    str         r7, [sp, #12]               ; store ps1 temporarily
+
+    qsub8       r7, r7, r10                 ; vp9_signed_char_clamp(ps1-qs1)
+    qsub8       r8, r9, r8                  ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+
+    and         r7, r7, r6                  ; vp9_filter (r7) &= hev
+
+    qadd8       r7, r7, r8
+    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8
+
+    qadd8       r7, r7, r8
+    ldr         r10, c0x04040404
+
+    qadd8       r7, r7, r8
+    and         r7, r7, lr                  ; vp9_filter &= mask;
+
+    ;modify code for vp8 -- Filter1 = vp9_filter (r7)
+    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3)
+    qadd8       r7 , r7 , r10               ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4)
+
+    mov         r9, #0
+    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
+    shadd8      r7 , r7 , r9                ; vp9_filter >>= 3
+    shadd8      r8 , r8 , r9
+    shadd8      r7 , r7 , r9
+    shadd8      lr , r8 , r9                ; lr: Filter2
+    shadd8      r7 , r7 , r9                ; r7: filter
+
+    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
+    ;sel        lr, r11, r9
+    ;usub8      r8, r10, r8
+    ;sel        r8, r11, r9
+    ;and        r8, r8, lr                  ; -1 for each element that equals 4
+
+    ;calculate output
+    ;qadd8      lr, r8, r7                  ; u = vp9_signed_char_clamp(s + vp9_filter)
+
+    ldr         r8, [sp]                    ; load qs0
+    ldr         r9, [sp, #4]                ; load ps0
+
+    ldr         r10, c0x01010101
+
+    qsub8       r8 ,r8, r7                  ; u = vp9_signed_char_clamp(qs0 - vp9_filter)
+    qadd8       r9, r9, lr                  ; u = vp9_signed_char_clamp(ps0 + Filter2)
+
+    ;end of modification for vp8
+
+    mov         lr, #0
+    sadd8       r7, r7 , r10                ; vp9_filter += 1
+    shadd8      r7, r7, lr                  ; vp9_filter >>= 1
+
+    ldr         r11, [sp, #12]              ; load ps1
+    ldr         r10, [sp, #8]               ; load qs1
+
+    bic         r7, r7, r6                  ; vp9_filter &= ~hev
+    sub         src, src, pstep, lsl #2
+
+    qadd8       r11, r11, r7                ; u = vp9_signed_char_clamp(ps1 + vp9_filter)
+    qsub8       r10, r10,r7                 ; u = vp9_signed_char_clamp(qs1 - vp9_filter)
+
+    eor         r11, r11, r12               ; *op1 = u^0x80
+    str         r11, [src], pstep           ; store op1
+    eor         r9, r9, r12                 ; *op0 = u^0x80
+    str         r9, [src], pstep            ; store op0 result
+    eor         r8, r8, r12                 ; *oq0 = u^0x80
+    str         r8, [src], pstep            ; store oq0 result
+    eor         r10, r10, r12               ; *oq1 = u^0x80
+    str         r10, [src], pstep           ; store oq1
+
+    sub         src, src, pstep, lsl #1
+
+|hskip_filter|
+    add         src, src, #4
+    sub         src, src, pstep, lsl #2
+
+    subs        count, count, #1
+
+    ldrne       r9, [src], pstep            ; p3
+    ldrne       r10, [src], pstep           ; p2
+    ldrne       r11, [src], pstep           ; p1
+
+    bne         Hnext8
+
+    add         sp, sp, #16
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp9_loop_filter_horizontal_edge_armv6|
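Summarizing the two stages above in scalar form: the mask requires every neighbour delta (|p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2|) to stay within limit and |p0-q0|*2 + |p1-q1|/2 to stay within blimit; hev is set where |p1-p0| or |q1-q0| exceeds thresh. The filter update is then, per pixel, roughly the following (a hedged C sketch: the SIMD code saturates at every qadd8 step, collapsed here into single clamps, and sclamp is a hypothetical helper):

    static int sclamp(int v) { return v < -128 ? -128 : v > 127 ? 127 : v; }

    /* ps1, ps0, qs0, qs1 are pixels xor'ed with 0x80 (signed form);
     * mask and hev are 0 or all-ones per pixel, as computed above. */
    static void loop_filter_pixel(signed char *ps1, signed char *ps0,
                                  signed char *qs0, signed char *qs1,
                                  int mask, int hev) {
        int f  = sclamp((sclamp(*ps1 - *qs1) & hev)
                        + 3 * (*qs0 - *ps0)) & mask;
        int f1 = sclamp(f + 4) >> 3;            /* applied to q0 */
        int f2 = sclamp(f + 3) >> 3;            /* applied to p0 */
        *qs0 = (signed char)sclamp(*qs0 - f1);
        *ps0 = (signed char)sclamp(*ps0 + f2);
        f = ((f1 + 1) >> 1) & ~hev;             /* outer taps skip hev pixels */
        *ps1 = (signed char)sclamp(*ps1 + f);
        *qs1 = (signed char)sclamp(*qs1 - f);
    }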
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_mbloop_filter_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+    ldr         count, [sp, #40]            ; count for 8-in-parallel
+    ldr         r6, [sp, #36]               ; load thresh address
+    sub         sp, sp, #16                 ; create temp buffer
+
+    ldr         r9, [src], pstep            ; p3
+    ldrb        r4, [r2]                    ; blimit
+    ldr         r10, [src], pstep           ; p2
+    ldrb        r2, [r3]                    ; limit
+    ldr         r11, [src], pstep           ; p1
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r6]                    ; thresh
+    orr         r2, r2, r2, lsl #8
+    mov         count, count, lsl #1        ; 4-in-parallel
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
+
+|MBHnext8|
+
+    ; vp9_filter_mask() function
+    ; calculate breakout conditions
+    ldr         r12, [src], pstep           ; p0
+
+    uqsub8      r6, r9, r10                 ; p3 - p2
+    uqsub8      r7, r10, r9                 ; p2 - p3
+    uqsub8      r8, r10, r11                ; p2 - p1
+    uqsub8      r10, r11, r10               ; p1 - p2
+
+    orr         r6, r6, r7                  ; abs (p3-p2)
+    orr         r8, r8, r10                 ; abs (p2-p1)
+    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp9_filter_mask
+    uqsub8      r8, r8, r2                  ; compare to limit
+
+    uqsub8      r6, r11, r12                ; p1 - p0
+    orr         lr, lr, r8
+    uqsub8      r7, r12, r11                ; p0 - p1
+    ldr         r9, [src], pstep            ; q0
+    ldr         r10, [src], pstep           ; q1
+    orr         r6, r6, r7                  ; abs (p1-p0)
+    uqsub8      r7, r6, r2                  ; compare to limit
+    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
+    orr         lr, lr, r7
+
+    uqsub8      r6, r11, r10                ; p1 - q1
+    uqsub8      r7, r10, r11                ; q1 - p1
+    uqsub8      r11, r12, r9                ; p0 - q0
+    uqsub8      r12, r9, r12                ; q0 - p0
+    orr         r6, r6, r7                  ; abs (p1-q1)
+    ldr         r7, c0x7F7F7F7F
+    orr         r12, r11, r12               ; abs (p0-q0)
+    ldr         r11, [src], pstep           ; q2
+    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
+    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
+    uqsub8      r7, r9, r10                 ; q0 - q1
+    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
+    uqsub8      r6, r10, r9                 ; q1 - q0
+    uqsub8      r12, r12, r4                ; compare to flimit
+    uqsub8      r9, r11, r10                ; q2 - q1
+
+    orr         lr, lr, r12
+
+    ldr         r12, [src], pstep           ; q3
+
+    uqsub8      r10, r10, r11               ; q1 - q2
+    orr         r6, r7, r6                  ; abs (q1-q0)
+    orr         r10, r9, r10                ; abs (q2-q1)
+    uqsub8      r7, r6, r2                  ; compare to limit
+    uqsub8      r10, r10, r2                ; compare to limit
+    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
+    orr         lr, lr, r7
+    orr         lr, lr, r10
+
+    uqsub8      r10, r12, r11               ; q3 - q2
+    uqsub8      r9, r11, r12                ; q2 - q3
+
+    mvn         r11, #0                     ; r11 == -1
+
+    orr         r10, r10, r9                ; abs (q3-q2)
+    uqsub8      r10, r10, r2                ; compare to limit
+
+    mov         r12, #0
+
+    orr         lr, lr, r10
+
+    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
+    sel         lr, r11, r12                ; filter mask: lr
+
+    cmp         lr, #0
+    beq         mbhskip_filter               ; skip filtering
+
+    ;vp8_hevmask() function
+    ;calculate high edge variance
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 6 lines
+    sub         src, src, pstep, lsl #1
+
+    orr         r10, r6, r8
+    ldr         r7, [src], pstep            ; p1
+
+    usub8       r10, r12, r10
+    sel         r6, r12, r11                ; hev mask: r6
+
+    ;vp8_mbfilter() function
+    ;p2, q2 are only needed at the end. Don't need to load them in now.
+    ldr         r8, [src], pstep            ; p0
+    ldr         r12, c0x80808080
+    ldr         r9, [src], pstep            ; q0
+    ldr         r10, [src]                  ; q1
+
+    eor         r7, r7, r12                 ; ps1
+    eor         r8, r8, r12                 ; ps0
+    eor         r9, r9, r12                 ; qs0
+    eor         r10, r10, r12               ; qs1
+
+    qsub8       r12, r9, r8                 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+    str         r7, [sp, #12]               ; store ps1 temporarily
+    qsub8       r7, r7, r10                 ; vp9_signed_char_clamp(ps1-qs1)
+    str         r10, [sp, #8]               ; store qs1 temporarily
+    qadd8       r7, r7, r12
+    str         r9, [sp]                    ; store qs0 temporarily
+    qadd8       r7, r7, r12
+    str         r8, [sp, #4]                ; store ps0 temporarily
+    qadd8       r7, r7, r12                 ; vp9_filter: r7
+
+    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
+    ldr         r9, c0x04040404
+
+    and         r7, r7, lr                  ; vp9_filter &= mask (lr is free)
+
+    mov         r12, r7                     ; Filter2: r12
+    and         r12, r12, r6                ; Filter2 &= hev
+
+    ;modify code for vp8
+    ;save bottom 3 bits so that we round one side +4 and the other +3
+    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4)
+    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3)
+
+    mov         r10, #0
+    shadd8      r8 , r8 , r10               ; Filter1 >>= 3
+    shadd8      r12 , r12 , r10             ; Filter2 >>= 3
+    shadd8      r8 , r8 , r10
+    shadd8      r12 , r12 , r10
+    shadd8      r8 , r8 , r10               ; r8: Filter1
+    shadd8      r12 , r12 , r10             ; r12: Filter2
+
+    ldr         r9, [sp]                    ; load qs0
+    ldr         r11, [sp, #4]               ; load ps0
+
+    qsub8       r9 , r9, r8                 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1)
+    qadd8       r11, r11, r12               ; ps0 = vp9_signed_char_clamp(ps0 + Filter2)
+
+    ;save bottom 3 bits so that we round one side +4 and the other +3
+    ;and            r8, r12, r10                ; s = Filter2 & 7 (s: r8)
+    ;qadd8      r12 , r12 , r9              ; Filter2 = vp9_signed_char_clamp(Filter2+4)
+    ;mov            r10, #0
+    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
+    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
+    ;sel            lr, r11, r10
+    ;shadd8     r12 , r12 , r10
+    ;usub8      r8, r9, r8
+    ;sel            r8, r11, r10
+    ;ldr            r9, [sp]                    ; load qs0
+    ;ldr            r11, [sp, #4]               ; load ps0
+    ;shadd8     r12 , r12 , r10
+    ;and            r8, r8, lr                  ; -1 for each element that equals 4
+    ;qadd8      r10, r8, r12                ; u = vp9_signed_char_clamp(s + Filter2)
+    ;qsub8      r9 , r9, r12                ; qs0 = vp9_signed_char_clamp(qs0 - Filter2)
+    ;qadd8      r11, r11, r10               ; ps0 = vp9_signed_char_clamp(ps0 + u)
+
+    ;end of modification for vp8
+
+    bic         r12, r7, r6                 ; vp9_filter &= ~hev    ( r6 is free)
+    ;mov        r12, r7
+
+    ;roughly 3/7th difference across boundary
+    mov         lr, #0x1b                   ; 27
+    mov         r7, #0x3f                   ; 63
+
+    sxtb16      r6, r12
+    sxtb16      r10, r12, ror #8
+    smlabb      r8, r6, lr, r7
+    smlatb      r6, r6, lr, r7
+    smlabb      r7, r10, lr, r7
+    smultb      r10, r10, lr
+    ssat        r8, #8, r8, asr #7
+    ssat        r6, #8, r6, asr #7
+    add         r10, r10, #63
+    ssat        r7, #8, r7, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    ldr         lr, c0x80808080
+
+    pkhbt       r6, r8, r6, lsl #16
+    pkhbt       r10, r7, r10, lsl #16
+    uxtb16      r6, r6
+    uxtb16      r10, r10
+
+    sub         src, src, pstep
+
+    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7)
+
+    qsub8       r8, r9, r10                 ; s = vp9_signed_char_clamp(qs0 - u)
+    qadd8       r10, r11, r10               ; s = vp9_signed_char_clamp(ps0 + u)
+    eor         r8, r8, lr                  ; *oq0 = s^0x80
+    str         r8, [src]                   ; store *oq0
+    sub         src, src, pstep
+    eor         r10, r10, lr                ; *op0 = s^0x80
+    str         r10, [src]                  ; store *op0
+
+    ;roughly 2/7th difference across boundary
+    mov         lr, #0x12                   ; 18
+    mov         r7, #0x3f                   ; 63
+
+    sxtb16      r6, r12
+    sxtb16      r10, r12, ror #8
+    smlabb      r8, r6, lr, r7
+    smlatb      r6, r6, lr, r7
+    smlabb      r9, r10, lr, r7
+    smlatb      r10, r10, lr, r7
+    ssat        r8, #8, r8, asr #7
+    ssat        r6, #8, r6, asr #7
+    ssat        r9, #8, r9, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    ldr         lr, c0x80808080
+
+    pkhbt       r6, r8, r6, lsl #16
+    pkhbt       r10, r9, r10, lsl #16
+
+    ldr         r9, [sp, #8]                ; load qs1
+    ldr         r11, [sp, #12]              ; load ps1
+
+    uxtb16      r6, r6
+    uxtb16      r10, r10
+
+    sub         src, src, pstep
+
+    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7)
+
+    qadd8       r11, r11, r10               ; s = vp9_signed_char_clamp(ps1 + u)
+    qsub8       r8, r9, r10                 ; s = vp9_signed_char_clamp(qs1 - u)
+    eor         r11, r11, lr                ; *op1 = s^0x80
+    str         r11, [src], pstep           ; store *op1
+    eor         r8, r8, lr                  ; *oq1 = s^0x80
+    add         src, src, pstep, lsl #1
+
+    mov         r7, #0x3f                   ; 63
+
+    str         r8, [src], pstep            ; store *oq1
+
+    ;roughly 1/7th difference across boundary
+    mov         lr, #0x9                    ; 9
+    ldr         r9, [src]                   ; load q2
+
+    sxtb16      r6, r12
+    sxtb16      r10, r12, ror #8
+    smlabb      r8, r6, lr, r7
+    smlatb      r6, r6, lr, r7
+    smlabb      r12, r10, lr, r7
+    smlatb      r10, r10, lr, r7
+    ssat        r8, #8, r8, asr #7
+    ssat        r6, #8, r6, asr #7
+    ssat        r12, #8, r12, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    sub         src, src, pstep, lsl #2
+
+    pkhbt       r6, r8, r6, lsl #16
+    pkhbt       r10, r12, r10, lsl #16
+
+    sub         src, src, pstep
+    ldr         lr, c0x80808080
+
+    ldr         r11, [src]                  ; load p2
+
+    uxtb16      r6, r6
+    uxtb16      r10, r10
+
+    eor         r9, r9, lr
+    eor         r11, r11, lr
+
+    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+    qadd8       r8, r11, r10                ; s = vp9_signed_char_clamp(ps2 + u)
+    qsub8       r10, r9, r10                ; s = vp9_signed_char_clamp(qs2 - u)
+    eor         r8, r8, lr                  ; *op2 = s^0x80
+    str         r8, [src], pstep, lsl #2    ; store *op2
+    add         src, src, pstep
+    eor         r10, r10, lr                ; *oq2 = s^0x80
+    str         r10, [src], pstep, lsl #1   ; store *oq2
+
+|mbhskip_filter|
+    add         src, src, #4
+    sub         src, src, pstep, lsl #3
+    subs        count, count, #1
+
+    ldrne       r9, [src], pstep            ; p3
+    ldrne       r10, [src], pstep           ; p2
+    ldrne       r11, [src], pstep           ; p1
+
+    bne         MBHnext8
+
+    add         sp, sp, #16
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp9_mbloop_filter_horizontal_edge_armv6|
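The macroblock-edge variant first applies the +4/+3 rounding pair to the hev pixels (Filter1/Filter2 above), then feeds the remaining filter value through the three weighted taps annotated as roughly 3/7, 2/7 and 1/7 of the difference across the boundary. In scalar form (hedged sketch; sclamp as in the previous sketch):

    /* f is the masked filter value with hev pixels removed (filter & ~hev). */
    static void mbfilter_taps(signed char *ps2, signed char *ps1,
                              signed char *ps0, signed char *qs0,
                              signed char *qs1, signed char *qs2, int f) {
        int u;
        u = sclamp((63 + f * 27) >> 7);    /* ~3/7 across the edge */
        *ps0 = (signed char)sclamp(*ps0 + u);
        *qs0 = (signed char)sclamp(*qs0 - u);
        u = sclamp((63 + f * 18) >> 7);    /* ~2/7 */
        *ps1 = (signed char)sclamp(*ps1 + u);
        *qs1 = (signed char)sclamp(*qs1 - u);
        u = sclamp((63 + f * 9) >> 7);     /* ~1/7 */
        *ps2 = (signed char)sclamp(*ps2 + u);
        *qs2 = (signed char)sclamp(*qs2 - u);
    }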
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_loop_filter_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    sub         src, src, #4                ; move src pointer down by 4
+    ldr         count, [sp, #40]            ; count for 8-in-parallel
+    ldr         r12, [sp, #36]              ; load thresh address
+    sub         sp, sp, #16                 ; create temp buffer
+
+    ldr         r6, [src], pstep            ; load source data
+    ldrb        r4, [r2]                    ; blimit
+    ldr         r7, [src], pstep
+    ldrb        r2, [r3]                    ; limit
+    ldr         r8, [src], pstep
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r12]                   ; thresh
+    orr         r2, r2, r2, lsl #8
+    ldr         lr, [src], pstep
+    mov         count, count, lsl #1        ; 4-in-parallel
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
+
+|Vnext8|
+
+    ; vp9_filter_mask() function
+    ; calculate breakout conditions
+    ; transpose the source data for 4-in-parallel operation
+    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+    uqsub8      r7, r9, r10                 ; p3 - p2
+    uqsub8      r8, r10, r9                 ; p2 - p3
+    uqsub8      r9, r10, r11                ; p2 - p1
+    uqsub8      r10, r11, r10               ; p1 - p2
+    orr         r7, r7, r8                  ; abs (p3-p2)
+    orr         r10, r9, r10                ; abs (p2-p1)
+    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp9_filter_mask
+    uqsub8      r10, r10, r2                ; compare to limit
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+
+    orr         lr, lr, r10
+
+    uqsub8      r6, r11, r12                ; p1 - p0
+    uqsub8      r7, r12, r11                ; p0 - p1
+    add         src, src, #4                ; move src pointer up by 4
+    orr         r6, r6, r7                  ; abs (p1-p0)
+    str         r11, [sp, #12]              ; save p1
+    uqsub8      r10, r6, r2                 ; compare to limit
+    uqsub8      r11, r6, r3                 ; compare to thresh
+    orr         lr, lr, r10
+
+    ; transpose uses 8 regs (r6 - r12 and lr), so save the register values now
+    ; transpose the source data for 4-in-parallel operation
+    ldr         r6, [src], pstep            ; load source data
+    str         r11, [sp]                   ; push r11 to stack
+    ldr         r7, [src], pstep
+    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
+    ldr         r8, [src], pstep
+    str         lr, [sp, #8]
+    ldr         lr, [src], pstep
+
+    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+    ldr         lr, [sp, #8]                ; load back (f)limit accumulator
+
+    uqsub8      r6, r12, r11                ; q3 - q2
+    uqsub8      r7, r11, r12                ; q2 - q3
+    uqsub8      r12, r11, r10               ; q2 - q1
+    uqsub8      r11, r10, r11               ; q1 - q2
+    orr         r6, r6, r7                  ; abs (q3-q2)
+    orr         r7, r12, r11                ; abs (q2-q1)
+    uqsub8      r6, r6, r2                  ; compare to limit
+    uqsub8      r7, r7, r2                  ; compare to limit
+    ldr         r11, [sp, #4]               ; load back p0
+    ldr         r12, [sp, #12]              ; load back p1
+    orr         lr, lr, r6
+    orr         lr, lr, r7
+
+    uqsub8      r6, r11, r9                 ; p0 - q0
+    uqsub8      r7, r9, r11                 ; q0 - p0
+    uqsub8      r8, r12, r10                ; p1 - q1
+    uqsub8      r11, r10, r12               ; q1 - p1
+    orr         r6, r6, r7                  ; abs (p0-q0)
+    ldr         r7, c0x7F7F7F7F
+    orr         r8, r8, r11                 ; abs (p1-q1)
+    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
+    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
+    uqsub8      r11, r10, r9                ; q1 - q0
+    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
+    uqsub8      r12, r9, r10                ; q0 - q1
+    uqsub8      r6, r6, r4                  ; compare to flimit
+
+    orr         r9, r11, r12                ; abs (q1-q0)
+    uqsub8      r8, r9, r2                  ; compare to limit
+    uqsub8      r10, r9, r3                 ; compare to thresh
+    orr         lr, lr, r6
+    orr         lr, lr, r8
+
+    mvn         r11, #0                     ; r11 == -1
+    mov         r12, #0
+
+    usub8       lr, r12, lr
+    ldr         r9, [sp]                    ; load the compared result
+    sel         lr, r11, r12                ; filter mask: lr
+
+    cmp         lr, #0
+    beq         vskip_filter                 ; skip filtering
+
+    ;vp8_hevmask() function
+    ;calculate high edge variance
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+
+    orr         r9, r9, r10
+
+    ldrh        r7, [src, #-2]
+    ldrh        r8, [src], pstep
+
+    usub8       r9, r12, r9
+    sel         r6, r12, r11                ; hev mask: r6
+
+    ;vp9_filter() function
+    ; load source data to r6, r11, r12, lr
+    ldrh        r9, [src, #-2]
+    ldrh        r10, [src], pstep
+
+    pkhbt       r12, r7, r8, lsl #16
+
+    ldrh        r7, [src, #-2]
+    ldrh        r8, [src], pstep
+
+    pkhbt       r11, r9, r10, lsl #16
+
+    ldrh        r9, [src, #-2]
+    ldrh        r10, [src], pstep
+
+    ; Transpose needs 8 regs (r6 - r12, and lr). Save r6 and lr first
+    str         r6, [sp]
+    str         lr, [sp, #4]
+
+    pkhbt       r6, r7, r8, lsl #16
+    pkhbt       lr, r9, r10, lsl #16
+
+    ;transpose r12, r11, r6, lr to r7, r8, r9, r10
+    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
+
+    ;load back hev_mask r6 and filter_mask lr
+    ldr         r12, c0x80808080
+    ldr         r6, [sp]
+    ldr         lr, [sp, #4]
+
+    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
+    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
+    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
+    eor         r10, r10, r12               ; q1 offset to convert to a signed value
+
+    str         r9, [sp]                    ; store qs0 temporarily
+    str         r8, [sp, #4]                ; store ps0 temporarily
+    str         r10, [sp, #8]               ; store qs1 temporarily
+    str         r7, [sp, #12]               ; store ps1 temporarily
+
+    qsub8       r7, r7, r10                 ; vp9_signed_char_clamp(ps1-qs1)
+    qsub8       r8, r9, r8                  ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+
+    and         r7, r7, r6                  ;  vp9_filter (r7) &= hev (r7 : filter)
+
+    qadd8       r7, r7, r8
+    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8
+
+    qadd8       r7, r7, r8
+    ldr         r10, c0x04040404
+
+    qadd8       r7, r7, r8
+    ;mvn         r11, #0                     ; r11 == -1
+
+    and         r7, r7, lr                  ; vp9_filter &= mask
+
+    ;modify code for vp8 -- Filter1 = vp9_filter (r7)
+    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3)
+    qadd8       r7 , r7 , r10               ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4)
+
+    mov         r9, #0
+    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
+    shadd8      r7 , r7 , r9                ; vp9_filter >>= 3
+    shadd8      r8 , r8 , r9
+    shadd8      r7 , r7 , r9
+    shadd8      lr , r8 , r9                ; lr: filter2
+    shadd8      r7 , r7 , r9                ; r7: filter
+
+    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
+    ;sel            lr, r11, r9
+    ;usub8      r8, r10, r8
+    ;sel            r8, r11, r9
+    ;and            r8, r8, lr                  ; -1 for each element that equals 4 -- r8: s
+
+    ;calculate output
+    ;qadd8      lr, r8, r7                  ; u = vp9_signed_char_clamp(s + vp9_filter)
+
+    ldr         r8, [sp]                    ; load qs0
+    ldr         r9, [sp, #4]                ; load ps0
+
+    ldr         r10, c0x01010101
+
+    qsub8       r8, r8, r7                  ; u = vp9_signed_char_clamp(qs0 - vp9_filter)
+    qadd8       r9, r9, lr                  ; u = vp9_signed_char_clamp(ps0 + Filter2)
+    ;end of modification for vp8
+
+    eor         r8, r8, r12
+    eor         r9, r9, r12
+
+    mov         lr, #0
+
+    sadd8       r7, r7, r10
+    shadd8      r7, r7, lr
+
+    ldr         r10, [sp, #8]               ; load qs1
+    ldr         r11, [sp, #12]              ; load ps1
+
+    bic         r7, r7, r6                  ; r7: vp9_filter
+
+    qsub8       r10 , r10, r7               ; u = vp9_signed_char_clamp(qs1 - vp9_filter)
+    qadd8       r11, r11, r7                ; u = vp9_signed_char_clamp(ps1 + vp9_filter)
+    eor         r10, r10, r12
+    eor         r11, r11, r12
+
+    sub         src, src, pstep, lsl #2
+
+    ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
+    ;output is b0, b1, b2, b3
+    ;b0: 03 02 01 00
+    ;b1: 13 12 11 10
+    ;b2: 23 22 21 20
+    ;b3: 33 32 31 30
+    ;    p1 p0 q0 q1
+    ;   (a3 a2 a1 a0)
+    TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr
+
+    strh        r6, [src, #-2]              ; store the result
+    mov         r6, r6, lsr #16
+    strh        r6, [src], pstep
+
+    strh        r7, [src, #-2]
+    mov         r7, r7, lsr #16
+    strh        r7, [src], pstep
+
+    strh        r12, [src, #-2]
+    mov         r12, r12, lsr #16
+    strh        r12, [src], pstep
+
+    strh        lr, [src, #-2]
+    mov         lr, lr, lsr #16
+    strh        lr, [src], pstep
+
+|vskip_filter|
+    sub         src, src, #4
+    subs        count, count, #1
+
+    ldrne       r6, [src], pstep            ; load source data
+    ldrne       r7, [src], pstep
+    ldrne       r8, [src], pstep
+    ldrne       lr, [src], pstep
+
+    bne         Vnext8
+
+    add         sp, sp, #16
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp9_loop_filter_vertical_edge_armv6|
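The vertical-edge routine is the same arithmetic rotated 90 degrees: it gathers 4x4 tiles with TRANSPOSE_MATRIX, runs the identical mask/hev/filter computation on the transposed words, then transposes the four result words back for the halfword stores. Per row, the scalar picture is simply the following (a sketch reusing the hypothetical loop_filter_pixel() above; the per-pixel mask/hev computation is omitted):

    static void filter_vertical_edge_sketch(unsigned char *src, int pitch,
                                            int rows, int mask, int hev) {
        for (int r = 0; r < rows; r++, src += pitch) {
            /* p1 p0 | q0 q1 straddle the column edge; convert to signed form */
            signed char p1 = (signed char)(src[-2] ^ 0x80);
            signed char p0 = (signed char)(src[-1] ^ 0x80);
            signed char q0 = (signed char)(src[0]  ^ 0x80);
            signed char q1 = (signed char)(src[1]  ^ 0x80);
            loop_filter_pixel(&p1, &p0, &q0, &q1, mask, hev);
            src[-2] = (unsigned char)(p1 ^ 0x80);
            src[-1] = (unsigned char)(p0 ^ 0x80);
            src[0]  = (unsigned char)(q0 ^ 0x80);
            src[1]  = (unsigned char)(q1 ^ 0x80);
        }
    }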
+
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_mbloop_filter_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    sub         src, src, #4                ; move src pointer down by 4
+    ldr         count, [sp, #40]            ; count for 8-in-parallel
+    ldr         r12, [sp, #36]              ; load thresh address
+    pld         [src, #23]                  ; preload for next block
+    sub         sp, sp, #16                 ; create temp buffer
+
+    ldr         r6, [src], pstep            ; load source data
+    ldrb        r4, [r2]                    ; blimit
+    pld         [src, #23]
+    ldr         r7, [src], pstep
+    ldrb        r2, [r3]                    ; limit
+    pld         [src, #23]
+    ldr         r8, [src], pstep
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r12]                   ; thresh
+    orr         r2, r2, r2, lsl #8
+    pld         [src, #23]
+    ldr         lr, [src], pstep
+    mov         count, count, lsl #1        ; 4-in-parallel
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
+
+|MBVnext8|
+    ; vp9_filter_mask() function
+    ; calculate breakout conditions
+    ; transpose the source data for 4-in-parallel operation
+    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+    uqsub8      r7, r9, r10                 ; p3 - p2
+    uqsub8      r8, r10, r9                 ; p2 - p3
+    uqsub8      r9, r10, r11                ; p2 - p1
+    uqsub8      r10, r11, r10               ; p1 - p2
+    orr         r7, r7, r8                  ; abs (p3-p2)
+    orr         r10, r9, r10                ; abs (p2-p1)
+    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp9_filter_mask
+    uqsub8      r10, r10, r2                ; compare to limit
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+
+    orr         lr, lr, r10
+
+    uqsub8      r6, r11, r12                ; p1 - p0
+    uqsub8      r7, r12, r11                ; p0 - p1
+    add         src, src, #4                ; move src pointer up by 4
+    orr         r6, r6, r7                  ; abs (p1-p0)
+    str         r11, [sp, #12]              ; save p1
+    uqsub8      r10, r6, r2                 ; compare to limit
+    uqsub8      r11, r6, r3                 ; compare to thresh
+    orr         lr, lr, r10
+
+    ; transpose uses 8 regs (r6 - r12 and lr), so save the register values now
+    ; transpose the source data for 4-in-parallel operation
+    ldr         r6, [src], pstep            ; load source data
+    str         r11, [sp]                   ; push r11 to stack
+    ldr         r7, [src], pstep
+    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
+    ldr         r8, [src], pstep
+    str         lr, [sp, #8]
+    ldr         lr, [src], pstep
+
+
+    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+    ldr         lr, [sp, #8]                ; load back (f)limit accumulator
+
+    uqsub8      r6, r12, r11                ; q3 - q2
+    uqsub8      r7, r11, r12                ; q2 - q3
+    uqsub8      r12, r11, r10               ; q2 - q1
+    uqsub8      r11, r10, r11               ; q1 - q2
+    orr         r6, r6, r7                  ; abs (q3-q2)
+    orr         r7, r12, r11                ; abs (q2-q1)
+    uqsub8      r6, r6, r2                  ; compare to limit
+    uqsub8      r7, r7, r2                  ; compare to limit
+    ldr         r11, [sp, #4]               ; load back p0
+    ldr         r12, [sp, #12]              ; load back p1
+    orr         lr, lr, r6
+    orr         lr, lr, r7
+
+    uqsub8      r6, r11, r9                 ; p0 - q0
+    uqsub8      r7, r9, r11                 ; q0 - p0
+    uqsub8      r8, r12, r10                ; p1 - q1
+    uqsub8      r11, r10, r12               ; q1 - p1
+    orr         r6, r6, r7                  ; abs (p0-q0)
+    ldr         r7, c0x7F7F7F7F
+    orr         r8, r8, r11                 ; abs (p1-q1)
+    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
+    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
+    uqsub8      r11, r10, r9                ; q1 - q0
+    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
+    uqsub8      r12, r9, r10                ; q0 - q1
+    uqsub8      r6, r6, r4                  ; compare to flimit
+
+    orr         r9, r11, r12                ; abs (q1-q0)
+    uqsub8      r8, r9, r2                  ; compare to limit
+    uqsub8      r10, r9, r3                 ; compare to thresh
+    orr         lr, lr, r6
+    orr         lr, lr, r8
+
+    mvn         r11, #0                     ; r11 == -1
+    mov         r12, #0
+
+    usub8       lr, r12, lr
+    ldr         r9, [sp]                    ; load the compared result
+    sel         lr, r11, r12                ; filter mask: lr
+
+    cmp         lr, #0
+    beq         mbvskip_filter               ; skip filtering
+
+
+
+    ;vp8_hevmask() function
+    ;calculate high edge variance
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+
+    orr         r9, r9, r10
+
+    ldrh        r7, [src, #-2]
+    ldrh        r8, [src], pstep
+
+    usub8       r9, r12, r9
+    sel         r6, r12, r11                ; hev mask: r6
+
+
+    ; vp8_mbfilter() function
+    ; p2, q2 are only needed at the end. Don't need to load them in now.
+    ; Transpose needs 8 regs (r6 - r12, and lr). Save r6 and lr first
+    ; load source data to r6, r11, r12, lr
+    ldrh        r9, [src, #-2]
+    ldrh        r10, [src], pstep
+
+    pkhbt       r12, r7, r8, lsl #16
+
+    ldrh        r7, [src, #-2]
+    ldrh        r8, [src], pstep
+
+    pkhbt       r11, r9, r10, lsl #16
+
+    ldrh        r9, [src, #-2]
+    ldrh        r10, [src], pstep
+
+    str         r6, [sp]                    ; save r6
+    str         lr, [sp, #4]                ; save lr
+
+    pkhbt       r6, r7, r8, lsl #16
+    pkhbt       lr, r9, r10, lsl #16
+
+    ;transpose r12, r11, r6, lr to p1, p0, q0, q1
+    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
+
+    ;load back hev_mask r6 and filter_mask lr
+    ldr         r12, c0x80808080
+    ldr         r6, [sp]
+    ldr         lr, [sp, #4]
+
+    eor         r7, r7, r12                 ; ps1
+    eor         r8, r8, r12                 ; ps0
+    eor         r9, r9, r12                 ; qs0
+    eor         r10, r10, r12               ; qs1
+
+    qsub8       r12, r9, r8                 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+    str         r7, [sp, #12]               ; store ps1 temporarily
+    qsub8       r7, r7, r10                 ; vp9_signed_char_clamp(ps1-qs1)
+    str         r10, [sp, #8]               ; store qs1 temporarily
+    qadd8       r7, r7, r12
+    str         r9, [sp]                    ; store qs0 temporarily
+    qadd8       r7, r7, r12
+    str         r8, [sp, #4]                ; store ps0 temporarily
+    qadd8       r7, r7, r12                 ; vp9_filter: r7
+
+    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
+    ldr         r9, c0x04040404
+    ;mvn         r11, #0                     ; r11 == -1
+
+    and         r7, r7, lr                  ; vp9_filter &= mask (lr is free)
+
+    mov         r12, r7                     ; Filter2: r12
+    and         r12, r12, r6                ; Filter2 &= hev
+
+    ;modify code for vp8
+    ;save bottom 3 bits so that we round one side +4 and the other +3
+    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4)
+    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3)
+
+    mov         r10, #0
+    shadd8      r8 , r8 , r10               ; Filter1 >>= 3
+    shadd8      r12 , r12 , r10             ; Filter2 >>= 3
+    shadd8      r8 , r8 , r10
+    shadd8      r12 , r12 , r10
+    shadd8      r8 , r8 , r10               ; r8: Filter1
+    shadd8      r12 , r12 , r10             ; r12: Filter2
+
+    ldr         r9, [sp]                    ; load qs0
+    ldr         r11, [sp, #4]               ; load ps0
+
+    qsub8       r9 , r9, r8                 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1)
+    qadd8       r11, r11, r12               ; ps0 = vp9_signed_char_clamp(ps0 + Filter2)
+
+    ;save bottom 3 bits so that we round one side +4 and the other +3
+    ;and            r8, r12, r10                ; s = Filter2 & 7 (s: r8)
+    ;qadd8      r12 , r12 , r9              ; Filter2 = vp9_signed_char_clamp(Filter2+4)
+    ;mov            r10, #0
+    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
+    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
+    ;sel            lr, r11, r10
+    ;shadd8     r12 , r12 , r10
+    ;usub8      r8, r9, r8
+    ;sel            r8, r11, r10
+    ;ldr            r9, [sp]                    ; load qs0
+    ;ldr            r11, [sp, #4]               ; load ps0
+    ;shadd8     r12 , r12 , r10
+    ;and            r8, r8, lr                  ; -1 for each element that equals 4
+    ;qadd8      r10, r8, r12                ; u = vp9_signed_char_clamp(s + Filter2)
+    ;qsub8      r9 , r9, r12                ; qs0 = vp9_signed_char_clamp(qs0 - Filter2)
+    ;qadd8      r11, r11, r10               ; ps0 = vp9_signed_char_clamp(ps0 + u)
+
+    ;end of modification for vp8
+
+    bic         r12, r7, r6                 ;vp9_filter &= ~hev    ( r6 is free)
+    ;mov            r12, r7
+
+    ;roughly 3/7th difference across boundary
+    mov         lr, #0x1b                   ; 27
+    mov         r7, #0x3f                   ; 63
+
+    sxtb16      r6, r12
+    sxtb16      r10, r12, ror #8
+    smlabb      r8, r6, lr, r7
+    smlatb      r6, r6, lr, r7
+    smlabb      r7, r10, lr, r7
+    smultb      r10, r10, lr
+    ssat        r8, #8, r8, asr #7
+    ssat        r6, #8, r6, asr #7
+    add         r10, r10, #63
+    ssat        r7, #8, r7, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    ldr         lr, c0x80808080
+
+    pkhbt       r6, r8, r6, lsl #16
+    pkhbt       r10, r7, r10, lsl #16
+    uxtb16      r6, r6
+    uxtb16      r10, r10
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+
+    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7)
+
+    qsub8       r8, r9, r10                 ; s = vp9_signed_char_clamp(qs0 - u)
+    qadd8       r10, r11, r10               ; s = vp9_signed_char_clamp(ps0 + u)
+    eor         r8, r8, lr                  ; *oq0 = s^0x80
+    eor         r10, r10, lr                ; *op0 = s^0x80
+
+    strb        r10, [src, #-1]             ; store op0 result
+    strb        r8, [src], pstep            ; store oq0 result
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+    strb        r10, [src, #-1]
+    strb        r8, [src], pstep
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+    strb        r10, [src, #-1]
+    strb        r8, [src], pstep
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+    strb        r10, [src, #-1]
+    strb        r8, [src], pstep
+
+    ;roughly 2/7th difference across boundary
+    mov         lr, #0x12                   ; 18
+    mov         r7, #0x3f                   ; 63
+
+    sxtb16      r6, r12
+    sxtb16      r10, r12, ror #8
+    smlabb      r8, r6, lr, r7
+    smlatb      r6, r6, lr, r7
+    smlabb      r9, r10, lr, r7
+
+    smlatb      r10, r10, lr, r7
+    ssat        r8, #8, r8, asr #7
+    ssat        r6, #8, r6, asr #7
+    ssat        r9, #8, r9, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
+
+    pkhbt       r6, r8, r6, lsl #16
+    pkhbt       r10, r9, r10, lsl #16
+
+    ldr         r9, [sp, #8]                ; load qs1
+    ldr         r11, [sp, #12]              ; load ps1
+    ldr         lr, c0x80808080
+
+    uxtb16      r6, r6
+    uxtb16      r10, r10
+
+    add         src, src, #2
+
+    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7)
+
+    qsub8       r8, r9, r10                 ; s = vp9_signed_char_clamp(qs1 - u)
+    qadd8       r10, r11, r10               ; s = vp9_signed_char_clamp(ps1 + u)
+    eor         r8, r8, lr                  ; *oq1 = s^0x80
+    eor         r10, r10, lr                ; *op1 = s^0x80
+
+    ldrb        r11, [src, #-5]             ; load p2 for 1/7th difference across boundary
+    strb        r10, [src, #-4]             ; store op1
+    strb        r8, [src, #-1]              ; store oq1
+    ldrb        r9, [src], pstep            ; load q2 for 1/7th difference across boundary
+
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+
+    ldrb        r6, [src, #-5]
+    strb        r10, [src, #-4]
+    strb        r8, [src, #-1]
+    ldrb        r7, [src], pstep
+
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+    orr         r11, r11, r6, lsl #8
+    orr         r9, r9, r7, lsl #8
+
+    ldrb        r6, [src, #-5]
+    strb        r10, [src, #-4]
+    strb        r8, [src, #-1]
+    ldrb        r7, [src], pstep
+
+    mov         r10, r10, lsr #8
+    mov         r8, r8, lsr #8
+    orr         r11, r11, r6, lsl #16
+    orr         r9, r9, r7, lsl #16
+
+    ldrb        r6, [src, #-5]
+    strb        r10, [src, #-4]
+    strb        r8, [src, #-1]
+    ldrb        r7, [src], pstep
+    orr         r11, r11, r6, lsl #24
+    orr         r9, r9, r7, lsl #24
+
+    ;roughly 1/7th difference across boundary
+    eor         r9, r9, lr
+    eor         r11, r11, lr
+
+    mov         lr, #0x9                    ; 9
+    mov         r7, #0x3f                   ; 63
+
+    sxtb16      r6, r12
+    sxtb16      r10, r12, ror #8
+    smlabb      r8, r6, lr, r7
+    smlatb      r6, r6, lr, r7
+    smlabb      r12, r10, lr, r7
+    smlatb      r10, r10, lr, r7
+    ssat        r8, #8, r8, asr #7
+    ssat        r6, #8, r6, asr #7
+    ssat        r12, #8, r12, asr #7
+    ssat        r10, #8, r10, asr #7
+
+    sub         src, src, pstep, lsl #2
+
+    pkhbt       r6, r8, r6, lsl #16
+    pkhbt       r10, r12, r10, lsl #16
+
+    uxtb16      r6, r6
+    uxtb16      r10, r10
+
+    ldr         lr, c0x80808080
+
+    orr         r10, r6, r10, lsl #8        ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+    qadd8       r8, r11, r10                ; s = vp9_signed_char_clamp(ps2 + u)
+    qsub8       r10, r9, r10                ; s = vp9_signed_char_clamp(qs2 - u)
+    eor         r8, r8, lr                  ; *op2 = s^0x80
+    eor         r10, r10, lr                ; *oq2 = s^0x80
+
+    strb        r8, [src, #-5]              ; store *op2
+    strb        r10, [src], pstep           ; store *oq2
+    mov         r8, r8, lsr #8
+    mov         r10, r10, lsr #8
+    strb        r8, [src, #-5]
+    strb        r10, [src], pstep
+    mov         r8, r8, lsr #8
+    mov         r10, r10, lsr #8
+    strb        r8, [src, #-5]
+    strb        r10, [src], pstep
+    mov         r8, r8, lsr #8
+    mov         r10, r10, lsr #8
+    strb        r8, [src, #-5]
+    strb        r10, [src], pstep
+
+    ;adjust src pointer for next loop
+    sub         src, src, #2
+
+|mbvskip_filter|
+    sub         src, src, #4
+    subs        count, count, #1
+
+    pld         [src, #23]                  ; preload for next block
+    ldrne       r6, [src], pstep            ; load source data
+    pld         [src, #23]
+    ldrne       r7, [src], pstep
+    pld         [src, #23]
+    ldrne       r8, [src], pstep
+    pld         [src, #23]
+    ldrne       lr, [src], pstep
+
+    bne         MBVnext8
+
+    add         sp, sp, #16
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp9_mbloop_filter_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD     0x80808080
+c0x03030303 DCD     0x03030303
+c0x04040404 DCD     0x04040404
+c0x01010101 DCD     0x01010101
+c0x7F7F7F7F DCD     0x7F7F7F7F
+
+    END
--- /dev/null
+++ b/vp9/common/arm/armv6/recon_v6.asm
@@ -1,0 +1,281 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_recon_b_armv6|
+    EXPORT  |vp8_recon2b_armv6|
+    EXPORT  |vp8_recon4b_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+prd     RN  r0
+dif     RN  r1
+dst     RN  r2
+stride      RN  r3
+
+;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride)
+; R0 unsigned char *pred_ptr
+; R1 short *diff_ptr
+; R2 unsigned char *dst_ptr
+; R3 int stride
+
+; Description:
+; Loop through the block adding the Pred and Diff together.  Clamp and then
+; store back into the Dst.
+
+; Restrictions:
+; all buffers are expected to be 4-byte aligned coming in and
+; going out.
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+;
+;
+;
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_recon_b_armv6| PROC
+    stmdb   sp!, {r4 - r9, lr}
+
+    ;0, 1, 2, 3
+    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
+    ldr     r6, [dif, #0]           ;     1 |     0
+    ldr     r7, [dif, #4]           ;     3 |     2
+
+    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
+    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
+
+    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
+    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
+
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    add     dif, dif, #32
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst], stride
+
+    ;0, 1, 2, 3
+    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
+;;  ldr     r6, [dif, #8]           ;     1 |     0
+;;  ldr     r7, [dif, #12]          ;     3 |     2
+    ldr     r6, [dif, #0]           ;     1 |     0
+    ldr     r7, [dif, #4]           ;     3 |     2
+
+    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
+    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
+
+    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
+    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
+
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    add     dif, dif, #32
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst], stride
+
+    ;0, 1, 2, 3
+    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
+;;  ldr     r6, [dif, #16]          ;     1 |     0
+;;  ldr     r7, [dif, #20]          ;     3 |     2
+    ldr     r6, [dif, #0]           ;     1 |     0
+    ldr     r7, [dif, #4]           ;     3 |     2
+
+    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
+    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
+
+    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
+    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
+
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    add     dif, dif, #32
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst], stride
+
+    ;0, 1, 2, 3
+    ldr     r4, [prd], #16          ; 3 | 2 | 1 | 0
+;;  ldr     r6, [dif, #24]          ;     1 |     0
+;;  ldr     r7, [dif, #28]          ;     3 |     2
+    ldr     r6, [dif, #0]           ;     1 |     0
+    ldr     r7, [dif, #4]           ;     3 |     2
+
+    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
+    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
+
+    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
+    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
+
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst], stride
+
+    ldmia   sp!, {r4 - r9, pc}
+
+    ENDP    ; |recon_b|
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+;
+;
+;
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+; R0 char  *pred_ptr
+; R1 short *dif_ptr
+; R2 char  *dst_ptr
+; R3 int stride
+|vp8_recon4b_armv6| PROC
+    stmdb   sp!, {r4 - r9, lr}
+
+    mov     lr, #4
+
+recon4b_loop
+    ;0, 1, 2, 3
+    ldr     r4, [prd], #4           ; 3 | 2 | 1 | 0
+    ldr     r6, [dif, #0]           ;     1 |     0
+    ldr     r7, [dif, #4]           ;     3 |     2
+
+    pkhbt   r8, r6, r7, lsl #16     ;     2 |     0
+    pkhtb   r9, r7, r6, asr #16     ;     3 |     1
+
+    uxtab16 r8, r8, r4              ;     2 |     0  +  3 | 2 | 2 | 0
+    uxtab16 r9, r9, r4, ror #8      ;     3 |     1  +  0 | 3 | 2 | 1
+
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst]
+
+    ;4, 5, 6, 7
+    ldr     r4, [prd], #4
+;;  ldr     r6, [dif, #32]
+;;  ldr     r7, [dif, #36]
+    ldr     r6, [dif, #8]
+    ldr     r7, [dif, #12]
+
+    pkhbt   r8, r6, r7, lsl #16
+    pkhtb   r9, r7, r6, asr #16
+
+    uxtab16 r8, r8, r4
+    uxtab16 r9, r9, r4, ror #8
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst, #4]
+
+    ;8, 9, 10, 11
+    ldr     r4, [prd], #4
+;;  ldr     r6, [dif, #64]
+;;  ldr     r7, [dif, #68]
+    ldr     r6, [dif, #16]
+    ldr     r7, [dif, #20]
+
+    pkhbt   r8, r6, r7, lsl #16
+    pkhtb   r9, r7, r6, asr #16
+
+    uxtab16 r8, r8, r4
+    uxtab16 r9, r9, r4, ror #8
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst, #8]
+
+    ;12, 13, 14, 15
+    ldr     r4, [prd], #4
+;;  ldr     r6, [dif, #96]
+;;  ldr     r7, [dif, #100]
+    ldr     r6, [dif, #24]
+    ldr     r7, [dif, #28]
+
+    pkhbt   r8, r6, r7, lsl #16
+    pkhtb   r9, r7, r6, asr #16
+
+    uxtab16 r8, r8, r4
+    uxtab16 r9, r9, r4, ror #8
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst, #12]
+
+    add     dst, dst, stride
+;;  add     dif, dif, #8
+    add     dif, dif, #32
+
+    subs    lr, lr, #1
+    bne     recon4b_loop
+
+    ldmia   sp!, {r4 - r9, pc}
+
+    ENDP    ; |Recon4B|
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+;
+;
+;
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+; R0 char  *pred_ptr
+; R1 short *dif_ptr
+; R2 char  *dst_ptr
+; R3 int stride
+|vp8_recon2b_armv6| PROC
+    stmdb   sp!, {r4 - r9, lr}
+
+    mov     lr, #4
+
+recon2b_loop
+    ;0, 1, 2, 3
+    ldr     r4, [prd], #4
+    ldr     r6, [dif, #0]
+    ldr     r7, [dif, #4]
+
+    pkhbt   r8, r6, r7, lsl #16
+    pkhtb   r9, r7, r6, asr #16
+
+    uxtab16 r8, r8, r4
+    uxtab16 r9, r9, r4, ror #8
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst]
+
+    ;4, 5, 6, 7
+    ldr     r4, [prd], #4
+;;  ldr     r6, [dif, #32]
+;;  ldr     r7, [dif, #36]
+    ldr     r6, [dif, #8]
+    ldr     r7, [dif, #12]
+
+    pkhbt   r8, r6, r7, lsl #16
+    pkhtb   r9, r7, r6, asr #16
+
+    uxtab16 r8, r8, r4
+    uxtab16 r9, r9, r4, ror #8
+    usat16  r8, #8, r8
+    usat16  r9, #8, r9
+    orr     r8, r8, r9, lsl #8
+
+    str     r8, [dst, #4]
+
+    add     dst, dst, stride
+;;  add     dif, dif, #8
+    add     dif, dif, #16
+
+    subs    lr, lr, #1
+    bne     recon2b_loop
+
+    ldmia   sp!, {r4 - r9, pc}
+
+    ENDP    ; |Recon2B|
+
+    END
--- /dev/null
+++ b/vp9/common/arm/armv6/simpleloopfilter_v6.asm
@@ -1,0 +1,286 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp9_loop_filter_simple_horizontal_edge_armv6|
+    EXPORT |vp9_loop_filter_simple_vertical_edge_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+    MACRO
+    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+    ; a0: 03 02 01 00
+    ; a1: 13 12 11 10
+    ; a2: 23 22 21 20
+    ; a3: 33 32 31 30
+    ;     b3 b2 b1 b0
+
+    uxtb16      $b1, $a1                    ; xx 12 xx 10
+    uxtb16      $b0, $a0                    ; xx 02 xx 00
+    uxtb16      $b3, $a3                    ; xx 32 xx 30
+    uxtb16      $b2, $a2                    ; xx 22 xx 20
+    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
+    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20
+
+    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
+    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
+    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
+    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
+    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
+    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21
+
+    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
+    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3
+
+    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
+    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
+    MEND
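+    ; An illustrative C equivalent of the macro above (sketch only): with
+    ; a0..a3 each holding four packed bytes of one input row, and b0..b3
+    ; starting at zero,
+    ;
+    ;   for (i = 0; i < 4; i++)
+    ;     for (j = 0; j < 4; j++)
+    ;       b[i] |= ((a[j] >> (8 * i)) & 0xff) << (8 * j);
+    ;
+    ; i.e. a 4x4 byte-matrix transpose built from uxtb16/pkhbt/pkhtb packing.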
+
+
+
+src         RN  r0
+pstep       RN  r1
+
+;r0     unsigned char *src_ptr,
+;r1     int src_pixel_step,
+;r2     const char *blimit
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_loop_filter_simple_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    ldrb        r12, [r2]                   ; blimit
+    ldr         r3, [src, -pstep, lsl #1]   ; p1
+    ldr         r4, [src, -pstep]           ; p0
+    ldr         r5, [src]                   ; q0
+    ldr         r6, [src, pstep]            ; q1
+    orr         r12, r12, r12, lsl #8       ; blimit
+    ldr         r2, c0x80808080
+    orr         r12, r12, r12, lsl #16      ; blimit
+    mov         r9, #4                      ; loop count: 16 pixels, 4 at a time
+    mov         lr, #0                      ; need 0 in a couple places
+
+|simple_hnext8|
+    ; vp8_simple_filter_mask()
+
+    uqsub8      r7, r3, r6                  ; p1 - q1
+    uqsub8      r8, r6, r3                  ; q1 - p1
+    uqsub8      r10, r4, r5                 ; p0 - q0
+    uqsub8      r11, r5, r4                 ; q0 - p0
+    orr         r8, r8, r7                  ; abs(p1 - q1)
+    orr         r10, r10, r11               ; abs(p0 - q0)
+    uqadd8      r10, r10, r10               ; abs(p0 - q0) * 2
+    uhadd8      r8, r8, lr                  ; abs(p1 - q1) >> 1
+    uqadd8      r10, r10, r8                ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+    mvn         r8, #0
+    usub8       r10, r12, r10               ; compare to blimit. usub8 sets GE flags
+    sel         r10, r8, lr                 ; filter mask: F or 0
+    cmp         r10, #0
+    beq         simple_hskip_filter         ; skip filtering if all masks are 0x00
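+
+    ; Per pixel, the mask computed above is (illustrative sketch):
+    ;   mask = (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= blimit) ? 0xff : 0
+    ; usub8 sets the GE flags byte-wise and sel expands them into the mask.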
+
+    ;vp8_simple_filter()
+
+    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
+    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
+    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
+    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
+
+    qsub8       r3, r3, r6                  ; vp9_filter = p1 - q1
+    qsub8       r6, r5, r4                  ; q0 - p0
+    qadd8       r3, r3, r6                  ; += q0 - p0
+    ldr         r7, c0x04040404
+    qadd8       r3, r3, r6                  ; += q0 - p0
+    ldr         r8, c0x03030303
+    qadd8       r3, r3, r6                  ; vp9_filter = p1 - q1 + 3 * (q0 - p0)
+    ;STALL
+    and         r3, r3, r10                 ; vp9_filter &= mask
+
+    qadd8       r7 , r3 , r7                ; Filter1 = vp9_filter + 4
+    qadd8       r8 , r3 , r8                ; Filter2 = vp9_filter + 3
+
+    shadd8      r7 , r7 , lr
+    shadd8      r8 , r8 , lr
+    shadd8      r7 , r7 , lr
+    shadd8      r8 , r8 , lr
+    shadd8      r7 , r7 , lr                ; Filter1 >>= 3
+    shadd8      r8 , r8 , lr                ; Filter2 >>= 3
+
+    qsub8       r5 ,r5, r7                  ; u = q0 - Filter1
+    qadd8       r4, r4, r8                  ; u = p0 + Filter2
+    eor         r5, r5, r2                  ; *oq0 = u^0x80
+    str         r5, [src]                   ; store oq0 result
+    eor         r4, r4, r2                  ; *op0 = u^0x80
+    str         r4, [src, -pstep]           ; store op0 result
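+
+    ; Scalar sketch of the update just applied (per masked pixel, on values
+    ; biased by 0x80 so signed saturating ops apply; clamp_s8 is a
+    ; hypothetical helper saturating to -128..127):
+    ;   f   = clamp_s8(p1 - q1 + 3 * (q0 - p0));
+    ;   q0 -= (f + 4) >> 3;   /* Filter1 */
+    ;   p0 += (f + 3) >> 3;   /* Filter2 */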
+
+|simple_hskip_filter|
+    subs        r9, r9, #1
+    addne       src, src, #4                ; next row
+
+    ldrne       r3, [src, -pstep, lsl #1]   ; p1
+    ldrne       r4, [src, -pstep]           ; p0
+    ldrne       r5, [src]                   ; q0
+    ldrne       r6, [src, pstep]            ; q1
+
+    bne         simple_hnext8
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp9_loop_filter_simple_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_loop_filter_simple_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb       sp!, {r4 - r11, lr}
+
+    ldrb        r12, [r2]                   ; r12: blimit
+    ldr         r2, c0x80808080
+    orr         r12, r12, r12, lsl #8
+
+    ; load source data into r7, r8, r9, r10
+    ldrh        r3, [src, #-2]
+    pld         [src, #23]                  ; preload for next block
+    ldrh        r4, [src], pstep
+    orr         r12, r12, r12, lsl #16
+
+    ldrh        r5, [src, #-2]
+    pld         [src, #23]
+    ldrh        r6, [src], pstep
+
+    pkhbt       r7, r3, r4, lsl #16
+
+    ldrh        r3, [src, #-2]
+    pld         [src, #23]
+    ldrh        r4, [src], pstep
+
+    pkhbt       r8, r5, r6, lsl #16
+
+    ldrh        r5, [src, #-2]
+    pld         [src, #23]
+    ldrh        r6, [src], pstep
+    mov         r11, #4                     ; loop count: 16 pixels, 4 at a time
+
+|simple_vnext8|
+    ; vp8_simple_filter_mask() function
+    pkhbt       r9, r3, r4, lsl #16
+    pkhbt       r10, r5, r6, lsl #16
+
+    ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
+    TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
+
+    uqsub8      r7, r3, r6                  ; p1 - q1
+    uqsub8      r8, r6, r3                  ; q1 - p1
+    uqsub8      r9, r4, r5                  ; p0 - q0
+    uqsub8      r10, r5, r4                 ; q0 - p0
+    orr         r7, r7, r8                  ; abs(p1 - q1)
+    orr         r9, r9, r10                 ; abs(p0 - q0)
+    mov         r8, #0
+    uqadd8      r9, r9, r9                  ; abs(p0 - q0) * 2
+    uhadd8      r7, r7, r8                  ; abs(p1 - q1) / 2
+    uqadd8      r7, r7, r9                  ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+    mvn         r10, #0                     ; r10 == -1
+
+    usub8       r7, r12, r7                 ; compare to blimit
+    sel         lr, r10, r8                 ; filter mask
+
+    cmp         lr, #0
+    beq         simple_vskip_filter         ; skip filtering
+
+    ;vp8_simple_filter() function
+    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
+    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
+    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
+    eor         r5, r5, r2                  ; q0 offset to convert to a signed value
+
+    qsub8       r3, r3, r6                  ; vp9_filter = p1 - q1
+    qsub8       r6, r5, r4                  ; q0 - p0
+
+    qadd8       r3, r3, r6                  ; vp9_filter += q0 - p0
+    ldr         r9, c0x03030303             ; r9 = 3
+
+    qadd8       r3, r3, r6                  ; vp9_filter += q0 - p0
+    ldr         r7, c0x04040404
+
+    qadd8       r3, r3, r6                  ; vp9_filter = p1 - q1 + 3 * (q0 - p0)
+    ;STALL
+    and         r3, r3, lr                  ; vp9_filter &= mask
+
+    qadd8       r9 , r3 , r9                ; Filter2 = vp9_filter + 3
+    qadd8       r3 , r3 , r7                ; Filter1 = vp9_filter + 4
+
+    shadd8      r9 , r9 , r8
+    shadd8      r3 , r3 , r8
+    shadd8      r9 , r9 , r8
+    shadd8      r3 , r3 , r8
+    shadd8      r9 , r9 , r8                ; Filter2 >>= 3
+    shadd8      r3 , r3 , r8                ; Filter1 >>= 3
+
+    ;calculate output
+    sub         src, src, pstep, lsl #2
+
+    qadd8       r4, r4, r9                  ; u = p0 + Filter2
+    qsub8       r5, r5, r3                  ; u = q0 - Filter1
+    eor         r4, r4, r2                  ; *op0 = u^0x80
+    eor         r5, r5, r2                  ; *oq0 = u^0x80
+
+    strb        r4, [src, #-1]              ; store the result
+    mov         r4, r4, lsr #8
+    strb        r5, [src], pstep
+    mov         r5, r5, lsr #8
+
+    strb        r4, [src, #-1]
+    mov         r4, r4, lsr #8
+    strb        r5, [src], pstep
+    mov         r5, r5, lsr #8
+
+    strb        r4, [src, #-1]
+    mov         r4, r4, lsr #8
+    strb        r5, [src], pstep
+    mov         r5, r5, lsr #8
+
+    strb        r4, [src, #-1]
+    strb        r5, [src], pstep
+
+|simple_vskip_filter|
+    subs        r11, r11, #1
+
+    ; load source data into r7, r8, r9, r10
+    ldrneh      r3, [src, #-2]
+    pld         [src, #23]                  ; preload for next block
+    ldrneh      r4, [src], pstep
+
+    ldrneh      r5, [src, #-2]
+    pld         [src, #23]
+    ldrneh      r6, [src], pstep
+
+    pkhbt       r7, r3, r4, lsl #16
+
+    ldrneh      r3, [src, #-2]
+    pld         [src, #23]
+    ldrneh      r4, [src], pstep
+
+    pkhbt       r8, r5, r6, lsl #16
+
+    ldrneh      r5, [src, #-2]
+    pld         [src, #23]
+    ldrneh      r6, [src], pstep
+
+    bne         simple_vnext8
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp9_loop_filter_simple_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD     0x80808080
+c0x03030303 DCD     0x03030303
+c0x04040404 DCD     0x04040404
+
+    END
--- /dev/null
+++ b/vp9/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -1,0 +1,273 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sixtap_predict8x4_armv6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+;-------------------------------------
+; r0    unsigned char *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack unsigned char *dst_ptr,
+; stack int  dst_pitch
+;-------------------------------------
+;note: In the first pass, the result is stored transposed (8 lines x 9 columns) on the
+;stack. The temporary stack size is 184 bytes. The line width is 20 bytes: 9 shorts of
+;data plus 2 bytes of padding for 4-byte alignment. In the second pass, data is loaded
+;from the stack and the result is stored transposed back.
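+;Layout sketch of the temporary buffer (editorial note): the first-pass result
+;for source row r, column c lands at byte offset r*2 + c*20, so each source
+;column becomes one contiguous 20-byte-stride line of 9 shorts that the second
+;pass can walk linearly.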
+|vp8_sixtap_predict8x4_armv6| PROC
+    stmdb       sp!, {r4 - r11, lr}
+    str         r3, [sp, #-184]!            ;reserve space on stack for temporary storage, store yoffset
+
+    cmp         r2, #0                      ;skip first_pass filter if xoffset=0
+    add         lr, sp, #4                  ;point to temporary buffer
+    beq         skip_firstpass_filter
+
+;first-pass filter
+    adr         r12, filter8_coeff
+    sub         r0, r0, r1, lsl #1
+
+    add         r3, r1, #10                 ; preload next row
+    pld         [r0, r3]
+
+    add         r2, r12, r2, lsl #4         ;calculate filter location
+    add         r0, r0, #3                  ;adjust src only for loading convenience
+
+    ldr         r3, [r2]                    ; load up packed filter coefficients
+    ldr         r4, [r2, #4]
+    ldr         r5, [r2, #8]
+
+    mov         r2, #0x90000                ; height=9 is top part of counter
+
+    sub         r1, r1, #8
+
+|first_pass_hloop_v6|
+    ldrb        r6, [r0, #-5]               ; load source data
+    ldrb        r7, [r0, #-4]
+    ldrb        r8, [r0, #-3]
+    ldrb        r9, [r0, #-2]
+    ldrb        r10, [r0, #-1]
+
+    orr         r2, r2, #0x4                ; construct loop counter. width=8=4x2
+
+    pkhbt       r6, r6, r7, lsl #16         ; r7 | r6
+    pkhbt       r7, r7, r8, lsl #16         ; r8 | r7
+
+    pkhbt       r8, r8, r9, lsl #16         ; r9 | r8
+    pkhbt       r9, r9, r10, lsl #16        ; r10 | r9
+
+|first_pass_wloop_v6|
+    smuad       r11, r6, r3                 ; vp9_filter[0], vp9_filter[1]
+    smuad       r12, r7, r3
+
+    ldrb        r6, [r0], #1
+
+    smlad       r11, r8, r4, r11            ; vp9_filter[2], vp9_filter[3]
+    ldrb        r7, [r0], #1
+    smlad       r12, r9, r4, r12
+
+    pkhbt       r10, r10, r6, lsl #16       ; r10 | r9
+    pkhbt       r6, r6, r7, lsl #16         ; r11 | r10
+    smlad       r11, r10, r5, r11           ; vp9_filter[4], vp9_filter[5]
+    smlad       r12, r6, r5, r12
+
+    sub         r2, r2, #1
+
+    add         r11, r11, #0x40             ; round_shift_and_clamp
+    tst         r2, #0xff                   ; test loop counter
+    usat        r11, #8, r11, asr #7
+    add         r12, r12, #0x40
+    strh        r11, [lr], #20              ; result is transposed and stored
+    usat        r12, #8, r12, asr #7
+
+    strh        r12, [lr], #20
+
+    movne       r11, r6
+    movne       r12, r7
+
+    movne       r6, r8
+    movne       r7, r9
+    movne       r8, r10
+    movne       r9, r11
+    movne       r10, r12
+
+    bne         first_pass_wloop_v6
+
+    ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines
+    ;;IF ARCHITECTURE=6
+    ;pld        [src, ppl]
+    ;;pld       [src, r9]
+    ;;ENDIF
+
+    subs        r2, r2, #0x10000
+
+    sub         lr, lr, #158
+
+    add         r0, r0, r1                  ; move to next input line
+
+    add         r11, r1, #18                ; preload next row; add back block width (=8) subtracted earlier
+    pld         [r0, r11]
+
+    bne         first_pass_hloop_v6
+
+;second pass filter
+secondpass_filter
+    ldr         r3, [sp], #4                ; load back yoffset
+    ldr         r0, [sp, #216]              ; load dst address from stack 180+36
+    ldr         r1, [sp, #220]              ; load dst stride from stack 180+40
+
+    cmp         r3, #0
+    beq         skip_secondpass_filter
+
+    adr         r12, filter8_coeff
+    add         lr, r12, r3, lsl #4         ;calculate filter location
+
+    mov         r2, #0x00080000
+
+    ldr         r3, [lr]                    ; load up packed filter coefficients
+    ldr         r4, [lr, #4]
+    ldr         r5, [lr, #8]
+
+    pkhbt       r12, r4, r3                 ; pack the filter differently
+    pkhbt       r11, r5, r4
+
+second_pass_hloop_v6
+    ldr         r6, [sp]                    ; load the data
+    ldr         r7, [sp, #4]
+
+    orr         r2, r2, #2                  ; loop counter
+
+second_pass_wloop_v6
+    smuad       lr, r3, r6                  ; apply filter
+    smulbt      r10, r3, r6
+
+    ldr         r8, [sp, #8]
+
+    smlad       lr, r4, r7, lr
+    smladx      r10, r12, r7, r10
+
+    ldrh        r9, [sp, #12]
+
+    smlad       lr, r5, r8, lr
+    smladx      r10, r11, r8, r10
+
+    add         sp, sp, #4
+    smlatb      r10, r5, r9, r10
+
+    sub         r2, r2, #1
+
+    add         lr, lr, #0x40               ; round_shift_and_clamp
+    tst         r2, #0xff
+    usat        lr, #8, lr, asr #7
+    add         r10, r10, #0x40
+    strb        lr, [r0], r1                ; the result is transposed back and stored
+    usat        r10, #8, r10, asr #7
+
+    strb        r10, [r0],r1
+
+    movne       r6, r7
+    movne       r7, r8
+
+    bne         second_pass_wloop_v6
+
+    subs        r2, r2, #0x10000
+    add         sp, sp, #12                 ; update src pointer for next loop (20-8)
+    sub         r0, r0, r1, lsl #2
+    add         r0, r0, #1
+
+    bne         second_pass_hloop_v6
+
+    add         sp, sp, #20
+    ldmia       sp!, {r4 - r11, pc}
+
+;--------------------
+skip_firstpass_filter
+    sub         r0, r0, r1, lsl #1
+    sub         r1, r1, #8
+    mov         r2, #9
+
+skip_firstpass_hloop
+    ldrb        r4, [r0], #1                ; load data
+    subs        r2, r2, #1
+    ldrb        r5, [r0], #1
+    strh        r4, [lr], #20               ; store it to the intermediate buffer
+    ldrb        r6, [r0], #1                ; load data
+    strh        r5, [lr], #20
+    ldrb        r7, [r0], #1
+    strh        r6, [lr], #20
+    ldrb        r8, [r0], #1
+    strh        r7, [lr], #20
+    ldrb        r9, [r0], #1
+    strh        r8, [lr], #20
+    ldrb        r10, [r0], #1
+    strh        r9, [lr], #20
+    ldrb        r11, [r0], #1
+    strh        r10, [lr], #20
+    add         r0, r0, r1                  ; move to next input line
+    strh        r11, [lr], #20
+
+    sub         lr, lr, #158                ; move over to next column
+    bne         skip_firstpass_hloop
+
+    b           secondpass_filter
+
+;--------------------
+skip_secondpass_filter
+    mov         r2, #8
+    add         sp, sp, #4                  ;start from src[0] instead of src[-2]
+
+skip_secondpass_hloop
+    ldr         r6, [sp], #4
+    subs        r2, r2, #1
+    ldr         r8, [sp], #4
+
+    mov         r7, r6, lsr #16             ; unpack
+    strb        r6, [r0], r1
+    mov         r9, r8, lsr #16
+    strb        r7, [r0], r1
+    add         sp, sp, #12                 ; 20-8
+    strb        r8, [r0], r1
+    strb        r9, [r0], r1
+
+    sub         r0, r0, r1, lsl #2
+    add         r0, r0, #1
+
+    bne         skip_secondpass_hloop
+
+    add         sp, sp, #16                 ; 180 - (160 +4)
+
+    ldmia       sp!, {r4 - r11, pc}
+
+    ENDP
+
+;-----------------
+;One word is reserved for each pair of packed coefficients. The label
+;filter8_coeff can be used to access the data.
+;Data addresses: filter8_coeff, filter8_coeff+4, filter8_coeff+8, ...
+filter8_coeff
+    DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000
+    DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000
+    DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000
+    DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000
+    DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000
+    DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000
+    DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000
+    DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000
+
+    ;DCD        0,  0,  128,    0,   0,  0
+    ;DCD        0, -6,  123,   12,  -1,  0
+    ;DCD        2, -11, 108,   36,  -8,  1
+    ;DCD        0, -9,   93,   50,  -6,  0
+    ;DCD        3, -16,  77,   77, -16,  3
+    ;DCD        0, -6,   50,   93,  -9,  0
+    ;DCD        1, -8,   36,  108, -11,  2
+    ;DCD        0, -1,   12,  123,  -6,  0
+
+    END
--- /dev/null
+++ b/vp9/common/arm/bilinearfilter_arm.c
@@ -1,0 +1,108 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vp9/common/filter.h"
+#include "vp9/common/subpixel.h"
+#include "bilinearfilter_arm.h"
+
+void vp9_filter_block2d_bil_armv6
+(
+  unsigned char *src_ptr,
+  unsigned char *dst_ptr,
+  unsigned int   src_pitch,
+  unsigned int   dst_pitch,
+  const short   *HFilter,
+  const short   *VFilter,
+  int            Width,
+  int            Height
+) {
+  unsigned short FData[36 * 16]; /* Temp data buffer used in filtering */
+
+  /* First filter 1-D horizontally... */
+  vp9_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+  /* then 1-D vertically... */
+  vp9_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
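+
+/* A note on the pass sizes (editorial sketch, not project documentation):
+ * the horizontal pass produces Height + 1 intermediate rows because the
+ * vertical pass blends each output row with the row below it, roughly:
+ *
+ *   dst[r][c] = (mid[r][c] * VFilter[0] +
+ *                mid[r + 1][c] * VFilter[1] + 64) >> 7;
+ */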
+
+
+void vp9_bilinear_predict4x4_armv6
+(
+  unsigned char  *src_ptr,
+  int   src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  unsigned char *dst_ptr,
+  int dst_pitch
+) {
+  const short  *HFilter;
+  const short  *VFilter;
+
+  HFilter = vp8_bilinear_filters[xoffset];
+  VFilter = vp8_bilinear_filters[yoffset];
+
+  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+}
+
+void vp9_bilinear_predict8x8_armv6
+(
+  unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  unsigned char *dst_ptr,
+  int  dst_pitch
+) {
+  const short  *HFilter;
+  const short  *VFilter;
+
+  HFilter = vp8_bilinear_filters[xoffset];
+  VFilter = vp8_bilinear_filters[yoffset];
+
+  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+}
+
+void vp9_bilinear_predict8x4_armv6
+(
+  unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  unsigned char *dst_ptr,
+  int  dst_pitch
+) {
+  const short  *HFilter;
+  const short  *VFilter;
+
+  HFilter = vp8_bilinear_filters[xoffset];
+  VFilter = vp8_bilinear_filters[yoffset];
+
+  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+}
+
+void vp9_bilinear_predict16x16_armv6
+(
+  unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  unsigned char *dst_ptr,
+  int  dst_pitch
+) {
+  const short  *HFilter;
+  const short  *VFilter;
+
+  HFilter = vp8_bilinear_filters[xoffset];
+  VFilter = vp8_bilinear_filters[yoffset];
+
+  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+}
--- /dev/null
+++ b/vp9/common/arm/bilinearfilter_arm.h
@@ -1,0 +1,35 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BILINEARFILTER_ARM_H
+#define BILINEARFILTER_ARM_H
+
+extern void vp9_filter_block2d_bil_first_pass_armv6
+(
+  const unsigned char  *src_ptr,
+  unsigned short       *dst_ptr,
+  unsigned int          src_pitch,
+  unsigned int          height,
+  unsigned int          width,
+  const short          *vp9_filter
+);
+
+extern void vp9_filter_block2d_bil_second_pass_armv6
+(
+  const unsigned short *src_ptr,
+  unsigned char        *dst_ptr,
+  int                   dst_pitch,
+  unsigned int          height,
+  unsigned int          width,
+  const short          *vp9_filter
+);
+
+#endif /* BILINEARFILTER_ARM_H */
--- /dev/null
+++ b/vp9/common/arm/filter_arm.c
@@ -1,0 +1,198 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include <math.h>
+#include "vp9/common/filter.h"
+#include "vp9/common/subpixel.h"
+#include "vpx_ports/mem.h"
+
+extern void vp9_filter_block2d_first_pass_armv6
+(
+  unsigned char *src_ptr,
+  short         *output_ptr,
+  unsigned int src_pixels_per_line,
+  unsigned int output_width,
+  unsigned int output_height,
+  const short *vp9_filter
+);
+
+// 8x8
+extern void vp9_filter_block2d_first_pass_8x8_armv6
+(
+  unsigned char *src_ptr,
+  short         *output_ptr,
+  unsigned int src_pixels_per_line,
+  unsigned int output_width,
+  unsigned int output_height,
+  const short *vp9_filter
+);
+
+// 16x16
+extern void vp9_filter_block2d_first_pass_16x16_armv6
+(
+  unsigned char *src_ptr,
+  short         *output_ptr,
+  unsigned int src_pixels_per_line,
+  unsigned int output_width,
+  unsigned int output_height,
+  const short *vp9_filter
+);
+
+extern void vp9_filter_block2d_second_pass_armv6
+(
+  short         *src_ptr,
+  unsigned char *output_ptr,
+  unsigned int output_pitch,
+  unsigned int cnt,
+  const short *vp9_filter
+);
+
+extern void vp9_filter4_block2d_second_pass_armv6
+(
+  short         *src_ptr,
+  unsigned char *output_ptr,
+  unsigned int output_pitch,
+  unsigned int cnt,
+  const short *vp9_filter
+);
+
+extern void vp9_filter_block2d_first_pass_only_armv6
+(
+  unsigned char *src_ptr,
+  unsigned char *output_ptr,
+  unsigned int src_pixels_per_line,
+  unsigned int cnt,
+  unsigned int output_pitch,
+  const short *vp9_filter
+);
+
+
+extern void vp9_filter_block2d_second_pass_only_armv6
+(
+  unsigned char *src_ptr,
+  unsigned char *output_ptr,
+  unsigned int src_pixels_per_line,
+  unsigned int cnt,
+  unsigned int output_pitch,
+  const short *vp9_filter
+);
+
+#if HAVE_ARMV6
+void vp9_sixtap_predict_armv6
+(
+  unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  unsigned char *dst_ptr,
+  int  dst_pitch
+) {
+  const short  *HFilter;
+  const short  *VFilter;
+  DECLARE_ALIGNED_ARRAY(4, short, FData, 12 * 4); /* Temp data buffer used in filtering */
+
+
+  HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+  VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+  /* Vfilter is null. First pass only */
+  if (xoffset && !yoffset) {
+    /*vp9_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
+    vp9_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/
+
+    vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
+  }
+  /* Hfilter is null. Second pass only */
+  else if (!xoffset && yoffset) {
+    vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
+  } else {
+    /* Vfilter is a 4-tap filter */
+    if (yoffset & 0x1) {
+      vp9_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
+      vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+    }
+    /* Vfilter is a 6-tap filter */
+    else {
+      vp9_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
+      vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+    }
+  }
+}
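+
+/* Dispatch summary (editorial sketch): an offset of 0 makes the
+ * corresponding 1-D filter a no-op, so only the other pass runs, straight
+ * from src to dst. An odd yoffset selects the 4-tap vertical path, which
+ * needs only 7 source rows for a 4-row output instead of the 9 rows the
+ * full 6-tap path reads. */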
+
+void vp9_sixtap_predict8x8_armv6
+(
+  unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  unsigned char *dst_ptr,
+  int  dst_pitch
+) {
+  const short  *HFilter;
+  const short  *VFilter;
+  DECLARE_ALIGNED_ARRAY(4, short, FData, 16 * 8); /* Temp data buffer used in filtering */
+
+  HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+  VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+  if (xoffset && !yoffset) {
+    vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
+  }
+  /* Hfilter is null. Second pass only */
+  else if (!xoffset && yoffset) {
+    vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
+  } else {
+    if (yoffset & 0x1) {
+      vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
+      vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+    } else {
+      vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
+      vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+    }
+  }
+}
+
+
+void vp9_sixtap_predict16x16_armv6
+(
+  unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  unsigned char *dst_ptr,
+  int  dst_pitch
+) {
+  const short  *HFilter;
+  const short  *VFilter;
+  DECLARE_ALIGNED_ARRAY(4, short, FData, 24 * 16);  /* Temp data buffer used in filtering */
+
+  HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+  VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+  if (xoffset && !yoffset) {
+    vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
+  }
+  /* Hfilter is null. Second pass only */
+  else if (!xoffset && yoffset) {
+    vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
+  } else {
+    if (yoffset & 0x1) {
+      vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
+      vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+    } else {
+      vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
+      vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+    }
+  }
+
+}
+#endif
--- /dev/null
+++ b/vp9/common/arm/idct_arm.h
@@ -1,0 +1,65 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef IDCT_ARM_H
+#define IDCT_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_idct(vp9_short_idct4x4llm_1_v6);
+extern prototype_idct(vp9_short_idct4x4llm_v6_dual);
+extern prototype_idct_scalar_add(vp9_dc_only_idct_add_v6);
+extern prototype_second_order(vp9_short_inv_walsh4x4_1_v6);
+extern prototype_second_order(vp9_short_inv_walsh4x4_v6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_idct_idct1
+#define vp9_idct_idct1 vp9_short_idct4x4llm_1_v6
+
+#undef  vp9_idct_idct16
+#define vp9_idct_idct16 vp9_short_idct4x4llm_v6_dual
+
+#undef  vp9_idct_idct1_scalar_add
+#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_v6
+
+#undef  vp8_idct_iwalsh1
+#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_v6
+
+#undef  vp8_idct_iwalsh16
+#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_v6
+#endif
+#endif
+
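+/* With CONFIG_RUNTIME_CPU_DETECT disabled, the generic function-table
+ * names are rebound at compile time to the best available variant, so
+ * the indirection costs nothing at run time. */
+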
+#if HAVE_ARMV7
+extern prototype_idct(vp9_short_idct4x4llm_1_neon);
+extern prototype_idct(vp9_short_idct4x4llm_neon);
+extern prototype_idct_scalar_add(vp9_dc_only_idct_add_neon);
+extern prototype_second_order(vp9_short_inv_walsh4x4_1_neon);
+extern prototype_second_order(vp9_short_inv_walsh4x4_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_idct_idct1
+#define vp9_idct_idct1 vp9_short_idct4x4llm_1_neon
+
+#undef  vp9_idct_idct16
+#define vp9_idct_idct16 vp9_short_idct4x4llm_neon
+
+#undef  vp9_idct_idct1_scalar_add
+#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_neon
+
+#undef  vp8_idct_iwalsh1
+#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_neon
+
+#undef  vp8_idct_iwalsh16
+#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_neon
+#endif
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/common/arm/loopfilter_arm.c
@@ -1,0 +1,166 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp9/common/loopfilter.h"
+#include "vp9/common/onyxc_int.h"
+
+#if HAVE_ARMV6
+extern prototype_loopfilter(vp9_loop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp9_loop_filter_vertical_edge_armv6);
+extern prototype_loopfilter(vp9_mbloop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp9_mbloop_filter_vertical_edge_armv6);
+#endif
+
+#if HAVE_ARMV7
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+                               unsigned char blimit, unsigned char limit, unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+                                unsigned char blimit, unsigned char limit, unsigned char thresh,
+                                unsigned char *v);
+
+extern loopfilter_y_neon vp9_loop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp9_loop_filter_vertical_edge_y_neon;
+extern loopfilter_y_neon vp9_mbloop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp9_mbloop_filter_vertical_edge_y_neon;
+
+extern loopfilter_uv_neon vp9_loop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp9_loop_filter_vertical_edge_uv_neon;
+extern loopfilter_uv_neon vp9_mbloop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp9_mbloop_filter_vertical_edge_uv_neon;
+#endif
+
+#if HAVE_ARMV6
+/*ARMV6 loopfilter functions*/
+/* Horizontal MB filtering */
+void vp9_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                               int y_stride, int uv_stride, loop_filter_info *lfi) {
+  vp9_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
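+
+/* The final argument is the number of 8-pixel runs to filter: 2 covers the
+ * 16-wide y plane, 1 the 8-wide chroma planes (an editorial note). */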
+
+/* Vertical MB Filtering */
+void vp9_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                               int y_stride, int uv_stride, loop_filter_info *lfi) {
+  vp9_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp9_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi) {
+  vp9_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp9_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit) {
+  vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp9_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi) {
+  vp9_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp9_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit) {
+  vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
+  vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
+  vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
+}
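+
+/* The B (block-edge) filters above operate on the three interior block
+ * edges of a 16x16 macroblock, hence the fixed offsets 4, 8 and 12:
+ * row offsets for the horizontal variants, column offsets for the
+ * vertical ones (an editorial note). */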
+#endif
+
+#if HAVE_ARMV7
+/* NEON loopfilter functions */
+/* Horizontal MB filtering */
+void vp9_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi) {
+  unsigned char mblim = *lfi->mblim;
+  unsigned char lim = *lfi->lim;
+  unsigned char hev_thr = *lfi->hev_thr;
+  vp9_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
+  if (u_ptr)
+    vp9_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
+}
+
+/* Vertical MB Filtering */
+void vp9_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi) {
+  unsigned char mblim = *lfi->mblim;
+  unsigned char lim = *lfi->lim;
+  unsigned char hev_thr = *lfi->hev_thr;
+
+  vp9_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
+  if (u_ptr)
+    vp9_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
+}
+
+/* Horizontal B Filtering */
+void vp9_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi) {
+  unsigned char blim = *lfi->blim;
+  unsigned char lim = *lfi->lim;
+  unsigned char hev_thr = *lfi->hev_thr;
+
+  vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
+  vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
+  vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
+
+  if (u_ptr)
+    vp9_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
+}
+
+/* Vertical B Filtering */
+void vp9_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi) {
+  unsigned char blim = *lfi->blim;
+  unsigned char lim = *lfi->lim;
+  unsigned char hev_thr = *lfi->hev_thr;
+
+  vp9_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
+  vp9_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
+  vp9_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
+
+  if (u_ptr)
+    vp9_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
+}
+#endif
--- /dev/null
+++ b/vp9/common/arm/loopfilter_arm.h
@@ -1,0 +1,41 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef LOOPFILTER_ARM_H
+#define LOOPFILTER_ARM_H
+
+#include "vpx_config.h"
+
+#if HAVE_ARMV6
+extern prototype_loopfilter_block(vp9_loop_filter_mbv_armv6);
+extern prototype_loopfilter_block(vp9_loop_filter_bv_armv6);
+extern prototype_loopfilter_block(vp9_loop_filter_mbh_armv6);
+extern prototype_loopfilter_block(vp9_loop_filter_bh_armv6);
+extern prototype_simple_loopfilter(vp9_loop_filter_bvs_armv6);
+extern prototype_simple_loopfilter(vp9_loop_filter_bhs_armv6);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_armv6);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_armv6);
+
+#endif /* HAVE_ARMV6 */
+
+#if HAVE_ARMV7
+extern prototype_loopfilter_block(vp9_loop_filter_mbv_neon);
+extern prototype_loopfilter_block(vp9_loop_filter_bv_neon);
+extern prototype_loopfilter_block(vp9_loop_filter_mbh_neon);
+extern prototype_loopfilter_block(vp9_loop_filter_bh_neon);
+extern prototype_simple_loopfilter(vp9_loop_filter_mbvs_neon);
+extern prototype_simple_loopfilter(vp9_loop_filter_bvs_neon);
+extern prototype_simple_loopfilter(vp9_loop_filter_mbhs_neon);
+extern prototype_simple_loopfilter(vp9_loop_filter_bhs_neon);
+
+#endif /* HAVE_ARMV7 */
+
+#endif /* LOOPFILTER_ARM_H */
--- /dev/null
+++ b/vp9/common/arm/neon/bilinearpredict16x16_neon.asm
@@ -1,0 +1,357 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_bilinear_predict16x16_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(r5) int  dst_pitch
+
+|vp8_bilinear_predict16x16_neon| PROC
+    push            {r4-r5, lr}
+
+    adr             r12, bifilter16_coeff
+    ldr             r4, [sp, #12]           ;load parameters from stack
+    ldr             r5, [sp, #16]           ;load parameters from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_bfilter16x16_only
+
+    add             r2, r12, r2, lsl #3     ;calculate filter location
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+
+    vld1.s32        {d31}, [r2]             ;load first_pass filter
+
+    beq             firstpass_bfilter16x16_only
+
+    sub             sp, sp, #272            ;reserve space on stack for temporary storage
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    mov             lr, sp
+    vld1.u8         {d5, d6, d7}, [r0], r1
+
+    mov             r2, #3                  ;loop counter
+    vld1.u8         {d8, d9, d10}, [r0], r1
+
+    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    vdup.8          d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (17x16)
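+;(17 rows: 16 output rows plus one extra, since the second pass blends each
+; row with the row below it)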
+filt_blk2d_fp16x16_loop_neon
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d2, d0              ;(src_ptr[0] * vp9_filter[0])
+    vmull.u8        q8, d3, d0
+    vmull.u8        q9, d5, d0
+    vmull.u8        q10, d6, d0
+    vmull.u8        q11, d8, d0
+    vmull.u8        q12, d9, d0
+    vmull.u8        q13, d11, d0
+    vmull.u8        q14, d12, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+    vext.8          d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[0] * vp9_filter[1])
+    vmlal.u8        q9, d5, d1
+    vmlal.u8        q11, d8, d1
+    vmlal.u8        q13, d11, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+    vext.8          d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[0] * vp9_filter[1])
+    vmlal.u8        q10, d6, d1
+    vmlal.u8        q12, d9, d1
+    vmlal.u8        q14, d12, d1
+
+    subs            r2, r2, #1
+
+    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d15, q8, #7
+    vqrshrn.u16    d16, q9, #7
+    vqrshrn.u16    d17, q10, #7
+    vqrshrn.u16    d18, q11, #7
+    vqrshrn.u16    d19, q12, #7
+    vqrshrn.u16    d20, q13, #7
+
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    vqrshrn.u16    d21, q14, #7
+    vld1.u8         {d5, d6, d7}, [r0], r1
+
+    vst1.u8         {d14, d15, d16, d17}, [lr]!     ;store result
+    vld1.u8         {d8, d9, d10}, [r0], r1
+    vst1.u8         {d18, d19, d20, d21}, [lr]!
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    bne             filt_blk2d_fp16x16_loop_neon
+
+;First-pass filtering for the remaining 5 lines
+    vld1.u8         {d14, d15, d16}, [r0], r1
+
+    vmull.u8        q9, d2, d0              ;(src_ptr[0] * vp9_filter[0])
+    vmull.u8        q10, d3, d0
+    vmull.u8        q11, d5, d0
+    vmull.u8        q12, d6, d0
+    vmull.u8        q13, d8, d0
+    vmull.u8        q14, d9, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+
+    vmlal.u8        q9, d2, d1              ;(src_ptr[0] * vp9_filter[1])
+    vmlal.u8        q11, d5, d1
+    vmlal.u8        q13, d8, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+
+    vmlal.u8        q10, d3, d1             ;(src_ptr[0] * vp9_filter[1])
+    vmlal.u8        q12, d6, d1
+    vmlal.u8        q14, d9, d1
+
+    vmull.u8        q1, d11, d0
+    vmull.u8        q2, d12, d0
+    vmull.u8        q3, d14, d0
+    vmull.u8        q4, d15, d0
+
+    vext.8          d11, d11, d12, #1       ;construct src_ptr[1]
+    vext.8          d14, d14, d15, #1
+
+    vmlal.u8        q1, d11, d1             ;(src_ptr[0] * vp9_filter[1])
+    vmlal.u8        q3, d14, d1
+
+    vext.8          d12, d12, d13, #1
+    vext.8          d15, d15, d16, #1
+
+    vmlal.u8        q2, d12, d1             ;(src_ptr[0] * vp9_filter[1])
+    vmlal.u8        q4, d15, d1
+
+    vqrshrn.u16    d10, q9, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d11, q10, #7
+    vqrshrn.u16    d12, q11, #7
+    vqrshrn.u16    d13, q12, #7
+    vqrshrn.u16    d14, q13, #7
+    vqrshrn.u16    d15, q14, #7
+    vqrshrn.u16    d16, q1, #7
+    vqrshrn.u16    d17, q2, #7
+    vqrshrn.u16    d18, q3, #7
+    vqrshrn.u16    d19, q4, #7
+
+    vst1.u8         {d10, d11, d12, d13}, [lr]!         ;store result
+    vst1.u8         {d14, d15, d16, d17}, [lr]!
+    vst1.u8         {d18, d19}, [lr]!
+
+;Second pass: 16x16
+;secondpass_filter
+    add             r3, r12, r3, lsl #3
+    sub             lr, lr, #272
+
+    vld1.u32        {d31}, [r3]             ;load second_pass filter
+
+    vld1.u8         {d22, d23}, [lr]!       ;load src data
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+    mov             r12, #4                 ;loop counter
+
+filt_blk2d_sp16x16_loop_neon
+    vld1.u8         {d24, d25}, [lr]!
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp9_filter[0])
+    vld1.u8         {d26, d27}, [lr]!
+    vmull.u8        q2, d23, d0
+    vld1.u8         {d28, d29}, [lr]!
+    vmull.u8        q3, d24, d0
+    vld1.u8         {d30, d31}, [lr]!
+
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * vp9_filter[1])
+    vmlal.u8        q2, d25, d1
+    vmlal.u8        q3, d26, d1
+    vmlal.u8        q4, d27, d1
+    vmlal.u8        q5, d28, d1
+    vmlal.u8        q6, d29, d1
+    vmlal.u8        q7, d30, d1
+    vmlal.u8        q8, d31, d1
+
+    subs            r12, r12, #1
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+    vqrshrn.u16    d6, q5, #7
+    vqrshrn.u16    d7, q6, #7
+    vqrshrn.u16    d8, q7, #7
+    vqrshrn.u16    d9, q8, #7
+
+    vst1.u8         {d2, d3}, [r4], r5      ;store result
+    vst1.u8         {d4, d5}, [r4], r5
+    vst1.u8         {d6, d7}, [r4], r5
+    vmov            q11, q15
+    vst1.u8         {d8, d9}, [r4], r5
+
+    bne             filt_blk2d_sp16x16_loop_neon
+
+    add             sp, sp, #272
+
+    pop             {r4-r5,pc}
+
+;--------------------
+firstpass_bfilter16x16_only
+    mov             r2, #4                      ;loop counter
+    vdup.8          d0, d31[0]                  ;first_pass filter (d0 d1)
+    vdup.8          d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (16x16)
+filt_blk2d_fpo16x16_loop_neon
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    vld1.u8         {d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10}, [r0], r1
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d2, d0              ;(src_ptr[0] * vp9_filter[0])
+    vmull.u8        q8, d3, d0
+    vmull.u8        q9, d5, d0
+    vmull.u8        q10, d6, d0
+    vmull.u8        q11, d8, d0
+    vmull.u8        q12, d9, d0
+    vmull.u8        q13, d11, d0
+    vmull.u8        q14, d12, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+    vext.8          d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[0] * vp9_filter[1])
+    vmlal.u8        q9, d5, d1
+    vmlal.u8        q11, d8, d1
+    vmlal.u8        q13, d11, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+    vext.8          d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[0] * vp9_filter[1])
+    vmlal.u8        q10, d6, d1
+    vmlal.u8        q12, d9, d1
+    vmlal.u8        q14, d12, d1
+
+    subs            r2, r2, #1
+
+    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d15, q8, #7
+    vqrshrn.u16    d16, q9, #7
+    vqrshrn.u16    d17, q10, #7
+    vqrshrn.u16    d18, q11, #7
+    vqrshrn.u16    d19, q12, #7
+    vqrshrn.u16    d20, q13, #7
+    vst1.u8         {d14, d15}, [r4], r5        ;store result
+    vqrshrn.u16    d21, q14, #7
+
+    vst1.u8         {d16, d17}, [r4], r5
+    vst1.u8         {d18, d19}, [r4], r5
+    vst1.u8         {d20, d21}, [r4], r5
+
+    bne             filt_blk2d_fpo16x16_loop_neon
+    pop             {r4-r5,pc}
+
+;---------------------
+secondpass_bfilter16x16_only
+;Second pass: 16x16
+;secondpass_filter
+    add             r3, r12, r3, lsl #3
+    mov             r12, #4                     ;loop counter
+    vld1.u32        {d31}, [r3]                 ;load second_pass filter
+    vld1.u8         {d22, d23}, [r0], r1        ;load src data
+
+    vdup.8          d0, d31[0]                  ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+filt_blk2d_spo16x16_loop_neon
+    vld1.u8         {d24, d25}, [r0], r1
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp9_filter[0])
+    vld1.u8         {d26, d27}, [r0], r1
+    vmull.u8        q2, d23, d0
+    vld1.u8         {d28, d29}, [r0], r1
+    vmull.u8        q3, d24, d0
+    vld1.u8         {d30, d31}, [r0], r1
+
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * vp9_filter[1])
+    vmlal.u8        q2, d25, d1
+    vmlal.u8        q3, d26, d1
+    vmlal.u8        q4, d27, d1
+    vmlal.u8        q5, d28, d1
+    vmlal.u8        q6, d29, d1
+    vmlal.u8        q7, d30, d1
+    vmlal.u8        q8, d31, d1
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+    vqrshrn.u16    d6, q5, #7
+    vqrshrn.u16    d7, q6, #7
+    vqrshrn.u16    d8, q7, #7
+    vqrshrn.u16    d9, q8, #7
+
+    vst1.u8         {d2, d3}, [r4], r5      ;store result
+    subs            r12, r12, #1
+    vst1.u8         {d4, d5}, [r4], r5
+    vmov            q11, q15
+    vst1.u8         {d6, d7}, [r4], r5
+    vst1.u8         {d8, d9}, [r4], r5
+
+    bne             filt_blk2d_spo16x16_loop_neon
+    pop             {r4-r5,pc}
+
+    ENDP
+
+;-----------------
+
+bifilter16_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
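+    ; Each pair of entries is (128 - 16*k, 16*k) for k = 0..7, so the two
+    ; taps always sum to 128. In scalar form each filtered pixel is
+    ; (a sketch):
+    ;     out = (src[0] * filter[0] + src[1] * filter[1] + 64) >> 7;
+    ; where the rounding +64 and >>7 are done by vqrshrn above.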
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/bilinearpredict4x4_neon.asm
@@ -1,0 +1,130 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_bilinear_predict4x4_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(lr) int  dst_pitch
+
+|vp8_bilinear_predict4x4_neon| PROC
+    push            {r4, lr}
+
+    adr             r12, bifilter4_coeff
+    ldr             r4, [sp, #8]            ;load dst_ptr from stack
+    ldr             lr, [sp, #12]           ;load dst_pitch from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (5x4)
+    vld1.u8         {d2}, [r0], r1          ;load src data
+    add             r2, r12, r2, lsl #3     ;calculate Hfilter location (2coeffsx4bytes=8bytes)
+
+    vld1.u8         {d3}, [r0], r1
+    vld1.u32        {d31}, [r2]             ;first_pass filter
+
+    vld1.u8         {d4}, [r0], r1
+    vdup.8          d0, d31[0]              ;first_pass filter (d0-d1)
+    vld1.u8         {d5}, [r0], r1
+    vdup.8          d1, d31[4]
+    vld1.u8         {d6}, [r0], r1
+
+    vshr.u64        q4, q1, #8              ;construct src_ptr[1]
+    vshr.u64        q5, q2, #8
+    vshr.u64        d12, d6, #8
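+    ; A right shift of each 64-bit lane by 8 bits drops the lowest byte
+    ; (little endian), so q4/q5/d12 hold the same rows advanced by one
+    ; pixel, i.e. src_ptr[1].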
+
+    vzip.32         d2, d3                  ;put 2-line data in 1 register (src_ptr[0])
+    vzip.32         d4, d5
+    vzip.32         d8, d9                  ;put 2-line data in 1 register (src_ptr[1])
+    vzip.32         d10, d11
+
+    vmull.u8        q7, d2, d0              ;(src_ptr[0] * vp9_filter[0])
+    vmull.u8        q8, d4, d0
+    vmull.u8        q9, d6, d0
+
+    vmlal.u8        q7, d8, d1              ;(src_ptr[1] * vp9_filter[1])
+    vmlal.u8        q8, d10, d1
+    vmlal.u8        q9, d12, d1
+
+    vqrshrn.u16    d28, q7, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d29, q8, #7
+    vqrshrn.u16    d30, q9, #7
+
+;Second pass: 4x4
+secondpass_filter
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    beq             skip_secondpass_filter
+
+    add             r3, r12, r3, lsl #3 ;calculate Vfilter location
+    vld1.u32        {d31}, [r3]         ;load second_pass filter
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+    vmull.u8        q1, d28, d0
+    vmull.u8        q2, d29, d0
+
+    vext.8          d26, d28, d29, #4       ;construct src_ptr[pixel_step]
+    vext.8          d27, d29, d30, #4
+
+    vmlal.u8        q1, d26, d1
+    vmlal.u8        q2, d27, d1
+
+    add             r0, r4, lr
+    add             r1, r0, lr
+    add             r2, r1, lr
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+
+    vst1.32         {d2[0]}, [r4]           ;store result
+    vst1.32         {d2[1]}, [r0]
+    vst1.32         {d3[0]}, [r1]
+    vst1.32         {d3[1]}, [r2]
+
+    pop             {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+
+    vld1.32         {d28[0]}, [r0], r1      ;load src data
+    vld1.32         {d28[1]}, [r0], r1
+    vld1.32         {d29[0]}, [r0], r1
+    vld1.32         {d29[1]}, [r0], r1
+    vld1.32         {d30[0]}, [r0], r1
+
+    b               secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+    vst1.32         {d28[0]}, [r4], lr      ;store result
+    vst1.32         {d28[1]}, [r4], lr
+    vst1.32         {d29[0]}, [r4], lr
+    vst1.32         {d29[1]}, [r4], lr
+
+    pop             {r4, pc}
+
+    ENDP
+
+;-----------------
+
+bifilter4_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/bilinearpredict8x4_neon.asm
@@ -1,0 +1,135 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_bilinear_predict8x4_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(lr) int  dst_pitch
+
+|vp8_bilinear_predict8x4_neon| PROC
+    push            {r4, lr}
+
+    adr             r12, bifilter8x4_coeff
+    ldr             r4, [sp, #8]            ;load dst_ptr from stack
+    ldr             lr, [sp, #12]           ;load dst_pitch from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (5x8)
+    add             r2, r12, r2, lsl #3     ;calculate filter location
+
+    vld1.u8         {q1}, [r0], r1          ;load src data
+    vld1.u32        {d31}, [r2]             ;load first_pass filter
+    vld1.u8         {q2}, [r0], r1
+    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
+    vld1.u8         {q3}, [r0], r1
+    vdup.8          d1, d31[4]
+    vld1.u8         {q4}, [r0], r1
+
+    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp9_filter[0])
+    vld1.u8         {q5}, [r0], r1
+    vmull.u8        q7, d4, d0
+    vmull.u8        q8, d6, d0
+    vmull.u8        q9, d8, d0
+    vmull.u8        q10, d10, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d4, d5, #1
+    vext.8          d7, d6, d7, #1
+    vext.8          d9, d8, d9, #1
+    vext.8          d11, d10, d11, #1
+
+    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp9_filter[1])
+    vmlal.u8        q7, d5, d1
+    vmlal.u8        q8, d7, d1
+    vmlal.u8        q9, d9, d1
+    vmlal.u8        q10, d11, d1
+
+    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d23, q7, #7
+    vqrshrn.u16    d24, q8, #7
+    vqrshrn.u16    d25, q9, #7
+    vqrshrn.u16    d26, q10, #7
+
+;Second pass: 4x8
+secondpass_filter
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    beq             skip_secondpass_filter
+
+    add             r3, r12, r3, lsl #3
+    add             r0, r4, lr
+
+    vld1.u32        {d31}, [r3]             ;load second_pass filter
+    add             r1, r0, lr
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp9_filter[0])
+    vmull.u8        q2, d23, d0
+    vmull.u8        q3, d24, d0
+    vmull.u8        q4, d25, d0
+
+    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * vp9_filter[1])
+    vmlal.u8        q2, d24, d1
+    vmlal.u8        q3, d25, d1
+    vmlal.u8        q4, d26, d1
+
+    add             r2, r1, lr
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+
+    vst1.u8         {d2}, [r4]              ;store result
+    vst1.u8         {d3}, [r0]
+    vst1.u8         {d4}, [r1]
+    vst1.u8         {d5}, [r2]
+
+    pop             {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+    vld1.u8         {d22}, [r0], r1         ;load src data
+    vld1.u8         {d23}, [r0], r1
+    vld1.u8         {d24}, [r0], r1
+    vld1.u8         {d25}, [r0], r1
+    vld1.u8         {d26}, [r0], r1
+
+    b               secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+    vst1.u8         {d22}, [r4], lr         ;store result
+    vst1.u8         {d23}, [r4], lr
+    vst1.u8         {d24}, [r4], lr
+    vst1.u8         {d25}, [r4], lr
+
+    pop             {r4, pc}
+
+    ENDP
+
+;-----------------
+
+bifilter8x4_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/bilinearpredict8x8_neon.asm
@@ -1,0 +1,183 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_bilinear_predict8x8_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(lr) int  dst_pitch
+
+|vp8_bilinear_predict8x8_neon| PROC
+    push            {r4, lr}
+
+    adr             r12, bifilter8_coeff
+    ldr             r4, [sp, #8]            ;load dst_ptr from stack
+    ldr             lr, [sp, #12]           ;load dst_pitch from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (9x8)
+    add             r2, r12, r2, lsl #3     ;calculate filter location
+
+    vld1.u8         {q1}, [r0], r1          ;load src data
+    vld1.u32        {d31}, [r2]             ;load first_pass filter
+    vld1.u8         {q2}, [r0], r1
+    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
+    vld1.u8         {q3}, [r0], r1
+    vdup.8          d1, d31[4]
+    vld1.u8         {q4}, [r0], r1
+
+    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp9_filter[0])
+    vmull.u8        q7, d4, d0
+    vmull.u8        q8, d6, d0
+    vmull.u8        q9, d8, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d4, d5, #1
+    vext.8          d7, d6, d7, #1
+    vext.8          d9, d8, d9, #1
+
+    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp9_filter[1])
+    vmlal.u8        q7, d5, d1
+    vmlal.u8        q8, d7, d1
+    vmlal.u8        q9, d9, d1
+
+    vld1.u8         {q1}, [r0], r1          ;load src data
+    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
+    vld1.u8         {q2}, [r0], r1
+    vqrshrn.u16    d23, q7, #7
+    vld1.u8         {q3}, [r0], r1
+    vqrshrn.u16    d24, q8, #7
+    vld1.u8         {q4}, [r0], r1
+    vqrshrn.u16    d25, q9, #7
+
+    ;first_pass filtering on the remaining 5 lines of data
+    vld1.u8         {q5}, [r0], r1
+
+    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp9_filter[0])
+    vmull.u8        q7, d4, d0
+    vmull.u8        q8, d6, d0
+    vmull.u8        q9, d8, d0
+    vmull.u8        q10, d10, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d4, d5, #1
+    vext.8          d7, d6, d7, #1
+    vext.8          d9, d8, d9, #1
+    vext.8          d11, d10, d11, #1
+
+    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp9_filter[1])
+    vmlal.u8        q7, d5, d1
+    vmlal.u8        q8, d7, d1
+    vmlal.u8        q9, d9, d1
+    vmlal.u8        q10, d11, d1
+
+    vqrshrn.u16    d26, q6, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d27, q7, #7
+    vqrshrn.u16    d28, q8, #7
+    vqrshrn.u16    d29, q9, #7
+    vqrshrn.u16    d30, q10, #7
+
+;Second pass: 8x8
+secondpass_filter
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    beq             skip_secondpass_filter
+
+    add             r3, r12, r3, lsl #3
+    add             r0, r4, lr
+
+    vld1.u32        {d31}, [r3]             ;load second_pass filter
+    add             r1, r0, lr
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp9_filter[0])
+    vmull.u8        q2, d23, d0
+    vmull.u8        q3, d24, d0
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * vp9_filter[1])
+    vmlal.u8        q2, d24, d1
+    vmlal.u8        q3, d25, d1
+    vmlal.u8        q4, d26, d1
+    vmlal.u8        q5, d27, d1
+    vmlal.u8        q6, d28, d1
+    vmlal.u8        q7, d29, d1
+    vmlal.u8        q8, d30, d1
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+    vqrshrn.u16    d6, q5, #7
+    vqrshrn.u16    d7, q6, #7
+    vqrshrn.u16    d8, q7, #7
+    vqrshrn.u16    d9, q8, #7
+
+    vst1.u8         {d2}, [r4]              ;store result
+    vst1.u8         {d3}, [r0]
+    vst1.u8         {d4}, [r1], lr
+    vst1.u8         {d5}, [r1], lr
+    vst1.u8         {d6}, [r1], lr
+    vst1.u8         {d7}, [r1], lr
+    vst1.u8         {d8}, [r1], lr
+    vst1.u8         {d9}, [r1], lr
+
+    pop             {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+    vld1.u8         {d22}, [r0], r1         ;load src data
+    vld1.u8         {d23}, [r0], r1
+    vld1.u8         {d24}, [r0], r1
+    vld1.u8         {d25}, [r0], r1
+    vld1.u8         {d26}, [r0], r1
+    vld1.u8         {d27}, [r0], r1
+    vld1.u8         {d28}, [r0], r1
+    vld1.u8         {d29}, [r0], r1
+    vld1.u8         {d30}, [r0], r1
+
+    b               secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+    vst1.u8         {d22}, [r4], lr         ;store result
+    vst1.u8         {d23}, [r4], lr
+    vst1.u8         {d24}, [r4], lr
+    vst1.u8         {d25}, [r4], lr
+    vst1.u8         {d26}, [r4], lr
+    vst1.u8         {d27}, [r4], lr
+    vst1.u8         {d28}, [r4], lr
+    vst1.u8         {d29}, [r4], lr
+
+    pop             {r4, pc}
+
+    ENDP
+
+;-----------------
+
+bifilter8_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/buildintrapredictorsmby_neon.asm
@@ -1,0 +1,584 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_build_intra_predictors_mby_neon_func|
+    EXPORT  |vp8_build_intra_predictors_mby_s_neon_func|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char *y_buffer
+; r1    unsigned char *ypred_ptr
+; r2    int y_stride
+; r3    int mode
+; stack int Up
+; stack int Left
+
+|vp8_build_intra_predictors_mby_neon_func| PROC
+    push            {r4-r8, lr}
+
+    cmp             r3, #0
+    beq             case_dc_pred
+    cmp             r3, #1
+    beq             case_v_pred
+    cmp             r3, #2
+    beq             case_h_pred
+    cmp             r3, #3
+    beq             case_tm_pred
+
+case_dc_pred
+    ldr             r4, [sp, #24]       ; Up
+    ldr             r5, [sp, #28]       ; Left
+
+    ; Default the DC average to 128
+    mov             r12, #128
+    vdup.u8         q0, r12
+
+    ; Zero out running sum
+    mov             r12, #0
+
+    ; compute shift and jump
+    adds            r7, r4, r5
+    beq             skip_dc_pred_up_left
+
+    ; Load above row, if it exists
+    cmp             r4, #0
+    beq             skip_dc_pred_up
+
+    sub             r6, r0, r2
+    vld1.8          {q1}, [r6]
+    vpaddl.u8       q2, q1
+    vpaddl.u16      q3, q2
+    vpaddl.u32      q4, q3
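+    ; Successive pairwise adds widen u8 -> u16 -> u32 -> u64, leaving the
+    ; sum of the 16 above-row pixels split across d8[0] and d9[0].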
+
+    vmov.32         r4, d8[0]
+    vmov.32         r6, d9[0]
+
+    add             r12, r4, r6
+
+    ; Move back to integer registers
+
+skip_dc_pred_up
+
+    cmp             r5, #0
+    beq             skip_dc_pred_left
+
+    sub             r0, r0, #1
+
+    ; Load left row, if it exists
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0]
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+skip_dc_pred_left
+    add             r7, r7, #3          ; Shift
+    sub             r4, r7, #1
+    mov             r5, #1
+    add             r12, r12, r5, lsl r4
+    mov             r5, r12, lsr r7     ; expected_dc
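+    ; In scalar form (a sketch, with Up/Left as 0/1 availability flags):
+    ;     shift = 3 + Up + Left;   /* 16 or 32 samples */
+    ;     expected_dc = (sum + (1 << (shift - 1))) >> shift;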
+
+    vdup.u8         q0, r5
+
+skip_dc_pred_up_left
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+
+    pop             {r4-r8,pc}
+case_v_pred
+    ; Copy down above row
+    sub             r6, r0, r2
+    vld1.8          {q0}, [r6]
+
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q0}, [r1]!
+    pop             {r4-r8,pc}
+
+case_h_pred
+    ; Load 4x yleft_col
+    sub             r0, r0, #1
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q1}, [r1]!
+    vst1.u8         {q2}, [r1]!
+    vst1.u8         {q3}, [r1]!
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q1}, [r1]!
+    vst1.u8         {q2}, [r1]!
+    vst1.u8         {q3}, [r1]!
+
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q1}, [r1]!
+    vst1.u8         {q2}, [r1]!
+    vst1.u8         {q3}, [r1]!
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q1}, [r1]!
+    vst1.u8         {q2}, [r1]!
+    vst1.u8         {q3}, [r1]!
+
+    pop             {r4-r8,pc}
+
+case_tm_pred
+    ; Load yabove_row
+    sub             r3, r0, r2
+    vld1.8          {q8}, [r3]
+
+    ; Load ytop_left
+    sub             r3, r3, #1
+    ldrb            r7, [r3]
+
+    vdup.u16        q7, r7
+
+    ; Compute yabove_row - ytop_left
+    mov             r3, #1
+    vdup.u8         q0, r3
+
+    vmull.u8        q4, d16, d0
+    vmull.u8        q5, d17, d0
+
+    vsub.s16        q4, q4, q7
+    vsub.s16        q5, q5, q7
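+    ; TM prediction computes, per pixel (a scalar sketch):
+    ;     pred[r][c] = clamp(left[r] + above[c] - top_left, 0, 255);
+    ; (above - top_left) is kept as s16 in q4/q5 and added to each
+    ; left-column value below; vqshrun does the final clamp to u8.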
+
+    ; Load 4x yleft_col
+    sub             r0, r0, #1
+    mov             r12, #4
+
+case_tm_pred_loop
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u16        q0, r3
+    vdup.u16        q1, r4
+    vdup.u16        q2, r5
+    vdup.u16        q3, r6
+
+    vqadd.s16       q8, q0, q4
+    vqadd.s16       q9, q0, q5
+
+    vqadd.s16       q10, q1, q4
+    vqadd.s16       q11, q1, q5
+
+    vqadd.s16       q12, q2, q4
+    vqadd.s16       q13, q2, q5
+
+    vqadd.s16       q14, q3, q4
+    vqadd.s16       q15, q3, q5
+
+    vqshrun.s16     d0, q8, #0
+    vqshrun.s16     d1, q9, #0
+
+    vqshrun.s16     d2, q10, #0
+    vqshrun.s16     d3, q11, #0
+
+    vqshrun.s16     d4, q12, #0
+    vqshrun.s16     d5, q13, #0
+
+    vqshrun.s16     d6, q14, #0
+    vqshrun.s16     d7, q15, #0
+
+    vst1.u8         {q0}, [r1]!
+    vst1.u8         {q1}, [r1]!
+    vst1.u8         {q2}, [r1]!
+    vst1.u8         {q3}, [r1]!
+
+    subs            r12, r12, #1
+    bne             case_tm_pred_loop
+
+    pop             {r4-r8,pc}
+
+    ENDP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; r0    unsigned char *y_buffer
+; r1    unsigned char *ypred_ptr
+; r2    int y_stride
+; r3    int mode
+; stack int Up
+; stack int Left
+
+|vp8_build_intra_predictors_mby_s_neon_func| PROC
+    push            {r4-r8, lr}
+
+    mov             r1, r0      ;   unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
+
+    cmp             r3, #0
+    beq             case_dc_pred_s
+    cmp             r3, #1
+    beq             case_v_pred_s
+    cmp             r3, #2
+    beq             case_h_pred_s
+    cmp             r3, #3
+    beq             case_tm_pred_s
+
+case_dc_pred_s
+    ldr             r4, [sp, #24]       ; Up
+    ldr             r5, [sp, #28]       ; Left
+
+    ; Default the DC average to 128
+    mov             r12, #128
+    vdup.u8         q0, r12
+
+    ; Zero out running sum
+    mov             r12, #0
+
+    ; compute shift and jump
+    adds            r7, r4, r5
+    beq             skip_dc_pred_up_left_s
+
+    ; Load above row, if it exists
+    cmp             r4, #0
+    beq             skip_dc_pred_up_s
+
+    sub             r6, r0, r2
+    vld1.8          {q1}, [r6]
+    vpaddl.u8       q2, q1
+    vpaddl.u16      q3, q2
+    vpaddl.u32      q4, q3
+
+    vmov.32         r4, d8[0]
+    vmov.32         r6, d9[0]
+
+    add             r12, r4, r6
+
+    ; Move back to integer registers
+
+skip_dc_pred_up_s
+
+    cmp             r5, #0
+    beq             skip_dc_pred_left_s
+
+    sub             r0, r0, #1
+
+    ; Load left row, if it exists
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0]
+
+    add             r12, r12, r3
+    add             r12, r12, r4
+    add             r12, r12, r5
+    add             r12, r12, r6
+
+skip_dc_pred_left_s
+    add             r7, r7, #3          ; Shift
+    sub             r4, r7, #1
+    mov             r5, #1
+    add             r12, r12, r5, lsl r4
+    mov             r5, r12, lsr r7     ; expected_dc
+
+    vdup.u8         q0, r5
+
+skip_dc_pred_up_left_s
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+
+    pop             {r4-r8,pc}
+case_v_pred_s
+    ; Copy down above row
+    sub             r6, r0, r2
+    vld1.8          {q0}, [r6]
+
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q0}, [r1], r2
+    pop             {r4-r8,pc}
+
+case_h_pred_s
+    ; Load 4x yleft_col
+    sub             r0, r0, #1
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q1}, [r1], r2
+    vst1.u8         {q2}, [r1], r2
+    vst1.u8         {q3}, [r1], r2
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q1}, [r1], r2
+    vst1.u8         {q2}, [r1], r2
+    vst1.u8         {q3}, [r1], r2
+
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q1}, [r1], r2
+    vst1.u8         {q2}, [r1], r2
+    vst1.u8         {q3}, [r1], r2
+
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u8         q0, r3
+    vdup.u8         q1, r4
+    vdup.u8         q2, r5
+    vdup.u8         q3, r6
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q1}, [r1], r2
+    vst1.u8         {q2}, [r1], r2
+    vst1.u8         {q3}, [r1], r2
+
+    pop             {r4-r8,pc}
+
+case_tm_pred_s
+    ; Load yabove_row
+    sub             r3, r0, r2
+    vld1.8          {q8}, [r3]
+
+    ; Load ytop_left
+    sub             r3, r3, #1
+    ldrb            r7, [r3]
+
+    vdup.u16        q7, r7
+
+    ; Compute yabove_row - ytop_left
+    mov             r3, #1
+    vdup.u8         q0, r3
+
+    vmull.u8        q4, d16, d0
+    vmull.u8        q5, d17, d0
+
+    vsub.s16        q4, q4, q7
+    vsub.s16        q5, q5, q7
+
+    ; Load 4x yleft_col
+    sub             r0, r0, #1
+    mov             r12, #4
+
+case_tm_pred_loop_s
+    ldrb            r3, [r0], r2
+    ldrb            r4, [r0], r2
+    ldrb            r5, [r0], r2
+    ldrb            r6, [r0], r2
+    vdup.u16        q0, r3
+    vdup.u16        q1, r4
+    vdup.u16        q2, r5
+    vdup.u16        q3, r6
+
+    vqadd.s16       q8, q0, q4
+    vqadd.s16       q9, q0, q5
+
+    vqadd.s16       q10, q1, q4
+    vqadd.s16       q11, q1, q5
+
+    vqadd.s16       q12, q2, q4
+    vqadd.s16       q13, q2, q5
+
+    vqadd.s16       q14, q3, q4
+    vqadd.s16       q15, q3, q5
+
+    vqshrun.s16     d0, q8, #0
+    vqshrun.s16     d1, q9, #0
+
+    vqshrun.s16     d2, q10, #0
+    vqshrun.s16     d3, q11, #0
+
+    vqshrun.s16     d4, q12, #0
+    vqshrun.s16     d5, q13, #0
+
+    vqshrun.s16     d6, q14, #0
+    vqshrun.s16     d7, q15, #0
+
+    vst1.u8         {q0}, [r1], r2
+    vst1.u8         {q1}, [r1], r2
+    vst1.u8         {q2}, [r1], r2
+    vst1.u8         {q3}, [r1], r2
+
+    subs            r12, r12, #1
+    bne             case_tm_pred_loop_s
+
+    pop             {r4-r8,pc}
+
+    ENDP
+
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/copymem16x16_neon.asm
@@ -1,0 +1,59 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_copy_mem16x16_neon|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem16x16_neon| PROC
+
+    vld1.u8     {q0}, [r0], r1
+    vld1.u8     {q1}, [r0], r1
+    vld1.u8     {q2}, [r0], r1
+    vst1.u8     {q0}, [r2], r3
+    vld1.u8     {q3}, [r0], r1
+    vst1.u8     {q1}, [r2], r3
+    vld1.u8     {q4}, [r0], r1
+    vst1.u8     {q2}, [r2], r3
+    vld1.u8     {q5}, [r0], r1
+    vst1.u8     {q3}, [r2], r3
+    vld1.u8     {q6}, [r0], r1
+    vst1.u8     {q4}, [r2], r3
+    vld1.u8     {q7}, [r0], r1
+    vst1.u8     {q5}, [r2], r3
+    vld1.u8     {q8}, [r0], r1
+    vst1.u8     {q6}, [r2], r3
+    vld1.u8     {q9}, [r0], r1
+    vst1.u8     {q7}, [r2], r3
+    vld1.u8     {q10}, [r0], r1
+    vst1.u8     {q8}, [r2], r3
+    vld1.u8     {q11}, [r0], r1
+    vst1.u8     {q9}, [r2], r3
+    vld1.u8     {q12}, [r0], r1
+    vst1.u8     {q10}, [r2], r3
+    vld1.u8     {q13}, [r0], r1
+    vst1.u8     {q11}, [r2], r3
+    vld1.u8     {q14}, [r0], r1
+    vst1.u8     {q12}, [r2], r3
+    vld1.u8     {q15}, [r0], r1
+    vst1.u8     {q13}, [r2], r3
+    vst1.u8     {q14}, [r2], r3
+    vst1.u8     {q15}, [r2], r3
+
+    mov     pc, lr
+
+    ENDP  ; |vp9_copy_mem16x16_neon|
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/copymem8x4_neon.asm
@@ -1,0 +1,34 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_copy_mem8x4_neon|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem8x4_neon| PROC
+    vld1.u8     {d0}, [r0], r1
+    vld1.u8     {d1}, [r0], r1
+    vst1.u8     {d0}, [r2], r3
+    vld1.u8     {d2}, [r0], r1
+    vst1.u8     {d1}, [r2], r3
+    vld1.u8     {d3}, [r0], r1
+    vst1.u8     {d2}, [r2], r3
+    vst1.u8     {d3}, [r2], r3
+
+    mov     pc, lr
+
+    ENDP  ; |vp9_copy_mem8x4_neon|
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/copymem8x8_neon.asm
@@ -1,0 +1,43 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_copy_mem8x8_neon|
+    ; ARM
+    ; REQUIRE8
+    ; PRESERVE8
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp9_copy_mem8x8_neon| PROC
+
+    vld1.u8     {d0}, [r0], r1
+    vld1.u8     {d1}, [r0], r1
+    vst1.u8     {d0}, [r2], r3
+    vld1.u8     {d2}, [r0], r1
+    vst1.u8     {d1}, [r2], r3
+    vld1.u8     {d3}, [r0], r1
+    vst1.u8     {d2}, [r2], r3
+    vld1.u8     {d4}, [r0], r1
+    vst1.u8     {d3}, [r2], r3
+    vld1.u8     {d5}, [r0], r1
+    vst1.u8     {d4}, [r2], r3
+    vld1.u8     {d6}, [r0], r1
+    vst1.u8     {d5}, [r2], r3
+    vld1.u8     {d7}, [r0], r1
+    vst1.u8     {d6}, [r2], r3
+    vst1.u8     {d7}, [r2], r3
+
+    mov     pc, lr
+
+    ENDP  ; |vp9_copy_mem8x8_neon|
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/dc_only_idct_add_neon.asm
@@ -1,0 +1,49 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dc_only_idct_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
+;                               unsigned char *dst_ptr, int pitch, int stride)
+; r0  input_dc
+; r1  pred_ptr
+; r2  dst_ptr
+; r3  pitch
+; sp  stride
+|vp8_dc_only_idct_add_neon| PROC
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    ldr             r12, [sp]
+    vdup.16         q0, r0
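+    ; With only the DC coefficient present, the 4x4 IDCT collapses to
+    ; adding one rounded value to every predicted pixel (a scalar sketch):
+    ;     dc = (input_dc + 4) >> 3;
+    ;     dst[r][c] = clamp(pred[r][c] + dc, 0, 255);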
+
+    vld1.32         {d2[0]}, [r1], r3
+    vld1.32         {d2[1]}, [r1], r3
+    vld1.32         {d4[0]}, [r1], r3
+    vld1.32         {d4[1]}, [r1]
+
+    vaddw.u8        q1, q0, d2
+    vaddw.u8        q2, q0, d4
+
+    vqmovun.s16     d2, q1
+    vqmovun.s16     d4, q2
+
+    vst1.32         {d2[0]}, [r2], r12
+    vst1.32         {d2[1]}, [r2], r12
+    vst1.32         {d4[0]}, [r2], r12
+    vst1.32         {d4[1]}, [r2]
+
+    bx             lr
+
+    ENDP
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/iwalsh_neon.asm
@@ -1,0 +1,80 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+    EXPORT  |vp8_short_inv_walsh4x4_neon|
+    EXPORT  |vp8_short_inv_walsh4x4_1_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
+|vp8_short_inv_walsh4x4_neon| PROC
+
+    ; read in all four lines of values: d0->d3
+    vld1.i16 {q0-q1}, [r0@128]
+
+    ; first for loop
+    vadd.s16 d4, d0, d3 ;a = [0] + [12]
+    vadd.s16 d6, d1, d2 ;b = [4] + [8]
+    vsub.s16 d5, d0, d3 ;d = [0] - [12]
+    vsub.s16 d7, d1, d2 ;c = [4] - [8]
+
+    vadd.s16 q0, q2, q3 ; a+b d+c
+    vsub.s16 q1, q2, q3 ; a-b d-c
+
+    vtrn.32 d0, d2 ;d0:  0  1  8  9
+                   ;d2:  2  3 10 11
+    vtrn.32 d1, d3 ;d1:  4  5 12 13
+                   ;d3:  6  7 14 15
+
+    vtrn.16 d0, d1 ;d0:  0  4  8 12
+                   ;d1:  1  5  9 13
+    vtrn.16 d2, d3 ;d2:  2  6 10 14
+                   ;d3:  3  7 11 15
+
+    ; second for loop
+
+    vadd.s16 d4, d0, d3 ;a = [0] + [3]
+    vadd.s16 d6, d1, d2 ;b = [1] + [2]
+    vsub.s16 d5, d0, d3 ;d = [0] - [3]
+    vsub.s16 d7, d1, d2 ;c = [1] - [2]
+
+    vmov.i16 q8, #3
+
+    vadd.s16 q0, q2, q3 ; a+b d+c
+    vsub.s16 q1, q2, q3 ; a-b d-c
+
+    vadd.i16 q0, q0, q8 ;e/f += 3
+    vadd.i16 q1, q1, q8 ;g/h += 3
+
+    vshr.s16 q0, q0, #3 ;e/f >> 3
+    vshr.s16 q1, q1, #3 ;g/h >> 3
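+    ; Both passes are 4-point butterflies; the final rounding is
+    ; (x + 3) >> 3, matching the +3 bias added above.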
+
+    vst4.i16 {d0,d1,d2,d3}, [r1@128]
+
+    bx lr
+    ENDP    ; |vp8_short_inv_walsh4x4_neon|
+
+
+;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
+|vp8_short_inv_walsh4x4_1_neon| PROC
+    ldrsh r2, [r0]          ; load input[0]
+    add r3, r2, #3          ; add 3
+    add r2, r1, #16         ; base for last 8 output
+    asr r0, r3, #3          ; right shift 3
+    vdup.16 q0, r0          ; load and duplicate
+    vst1.16 {q0}, [r1@128]  ; write back 8
+    vst1.16 {q0}, [r2@128]  ; write back last 8
+    bx lr
+    ENDP    ; |vp8_short_inv_walsh4x4_1_neon|
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/loopfilter_neon.asm
@@ -1,0 +1,397 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_loop_filter_horizontal_edge_y_neon|
+    EXPORT  |vp9_loop_filter_horizontal_edge_uv_neon|
+    EXPORT  |vp9_loop_filter_vertical_edge_y_neon|
+    EXPORT  |vp9_loop_filter_vertical_edge_uv_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src
+; r1    int pitch
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+|vp9_loop_filter_horizontal_edge_y_neon| PROC
+    push        {lr}
+    vdup.u8     q0, r2                     ; duplicate blimit
+    vdup.u8     q1, r3                     ; duplicate limit
+    sub         r2, r0, r1, lsl #2         ; move src pointer down by 4 lines
+    ldr         r3, [sp, #4]               ; load thresh
+    add         r12, r2, r1
+    add         r1, r1, r1
+
+    vdup.u8     q2, r3                     ; duplicate thresh
+
+    vld1.u8     {q3}, [r2@128], r1              ; p3
+    vld1.u8     {q4}, [r12@128], r1             ; p2
+    vld1.u8     {q5}, [r2@128], r1              ; p1
+    vld1.u8     {q6}, [r12@128], r1             ; p0
+    vld1.u8     {q7}, [r2@128], r1              ; q0
+    vld1.u8     {q8}, [r12@128], r1             ; q1
+    vld1.u8     {q9}, [r2@128]                  ; q2
+    vld1.u8     {q10}, [r12@128]                ; q3
+
+    sub         r2, r2, r1, lsl #1
+    sub         r12, r12, r1, lsl #1
+
+    bl          vp9_loop_filter_neon
+
+    vst1.u8     {q5}, [r2@128], r1              ; store op1
+    vst1.u8     {q6}, [r12@128], r1             ; store op0
+    vst1.u8     {q7}, [r2@128], r1              ; store oq0
+    vst1.u8     {q8}, [r12@128], r1             ; store oq1
+
+    pop         {pc}
+    ENDP        ; |vp9_loop_filter_horizontal_edge_y_neon|
+
+
+; r0    unsigned char *u,
+; r1    int pitch,
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+; sp+4  unsigned char *v
+|vp9_loop_filter_horizontal_edge_uv_neon| PROC
+    push        {lr}
+    vdup.u8     q0, r2                      ; duplicate blimit
+    vdup.u8     q1, r3                      ; duplicate limit
+    ldr         r12, [sp, #4]               ; load thresh
+    ldr         r2, [sp, #8]                ; load v ptr
+    vdup.u8     q2, r12                     ; duplicate thresh
+
+    sub         r3, r0, r1, lsl #2          ; move u pointer down by 4 lines
+    sub         r12, r2, r1, lsl #2         ; move v pointer down by 4 lines
+
+    vld1.u8     {d6}, [r3@64], r1              ; p3
+    vld1.u8     {d7}, [r12@64], r1             ; p3
+    vld1.u8     {d8}, [r3@64], r1              ; p2
+    vld1.u8     {d9}, [r12@64], r1             ; p2
+    vld1.u8     {d10}, [r3@64], r1             ; p1
+    vld1.u8     {d11}, [r12@64], r1            ; p1
+    vld1.u8     {d12}, [r3@64], r1             ; p0
+    vld1.u8     {d13}, [r12@64], r1            ; p0
+    vld1.u8     {d14}, [r3@64], r1             ; q0
+    vld1.u8     {d15}, [r12@64], r1            ; q0
+    vld1.u8     {d16}, [r3@64], r1             ; q1
+    vld1.u8     {d17}, [r12@64], r1            ; q1
+    vld1.u8     {d18}, [r3@64], r1             ; q2
+    vld1.u8     {d19}, [r12@64], r1            ; q2
+    vld1.u8     {d20}, [r3@64]                 ; q3
+    vld1.u8     {d21}, [r12@64]                ; q3
+
+    bl          vp9_loop_filter_neon
+
+    sub         r0, r0, r1, lsl #1
+    sub         r2, r2, r1, lsl #1
+
+    vst1.u8     {d10}, [r0@64], r1             ; store u op1
+    vst1.u8     {d11}, [r2@64], r1             ; store v op1
+    vst1.u8     {d12}, [r0@64], r1             ; store u op0
+    vst1.u8     {d13}, [r2@64], r1             ; store v op0
+    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
+    vst1.u8     {d15}, [r2@64], r1             ; store v oq0
+    vst1.u8     {d16}, [r0@64]                 ; store u oq1
+    vst1.u8     {d17}, [r2@64]                 ; store v oq1
+
+    pop         {pc}
+    ENDP        ; |vp9_loop_filter_horizontal_edge_uv_neon|
+
+; void vp9_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
+;                                           unsigned char blimit,
+;                                           unsigned char limit,
+;                                           unsigned char thresh)
+; r0    unsigned char *src
+; r1    int pitch
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+
+|vp9_loop_filter_vertical_edge_y_neon| PROC
+    push        {lr}
+    vdup.u8     q0, r2                     ; duplicate blimit
+    vdup.u8     q1, r3                     ; duplicate limit
+    sub         r2, r0, #4                 ; src ptr down by 4 columns
+    add         r1, r1, r1
+    ldr         r3, [sp, #4]               ; load thresh
+    add         r12, r2, r1, asr #1
+
+    vld1.u8     {d6}, [r2], r1
+    vld1.u8     {d8}, [r12], r1
+    vld1.u8     {d10}, [r2], r1
+    vld1.u8     {d12}, [r12], r1
+    vld1.u8     {d14}, [r2], r1
+    vld1.u8     {d16}, [r12], r1
+    vld1.u8     {d18}, [r2], r1
+    vld1.u8     {d20}, [r12], r1
+
+    vld1.u8     {d7}, [r2], r1              ; load second 8-line src data
+    vld1.u8     {d9}, [r12], r1
+    vld1.u8     {d11}, [r2], r1
+    vld1.u8     {d13}, [r12], r1
+    vld1.u8     {d15}, [r2], r1
+    vld1.u8     {d17}, [r12], r1
+    vld1.u8     {d19}, [r2]
+    vld1.u8     {d21}, [r12]
+
+    ;transpose to 8x16 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vdup.u8     q2, r3                     ; duplicate thresh
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    bl          vp9_loop_filter_neon
+
+    vswp        d12, d11
+    vswp        d16, d13
+
+    sub         r0, r0, #2                 ; dst ptr
+
+    vswp        d14, d12
+    vswp        d16, d15
+
+    add         r12, r0, r1, asr #1
+
+    ;store op1, op0, oq0, oq1
+    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
+    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
+    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
+    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
+
+    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
+    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
+    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
+    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
+    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
+    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
+    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r0]
+    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r12]
+
+    pop         {pc}
+    ENDP        ; |vp9_loop_filter_vertical_edge_y_neon|
+
+; void vp9_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
+;                                            unsigned char blimit,
+;                                            unsigned char limit,
+;                                            unsigned char thresh,
+;                                            unsigned char *v)
+; r0    unsigned char *u,
+; r1    int pitch,
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+; sp+4  unsigned char *v
+|vp9_loop_filter_vertical_edge_uv_neon| PROC
+    push        {lr}
+    vdup.u8     q0, r2                      ; duplicate blimit
+    sub         r12, r0, #4                 ; move u pointer down by 4 columns
+    ldr         r2, [sp, #8]                ; load v ptr
+    vdup.u8     q1, r3                      ; duplicate limit
+    sub         r3, r2, #4                  ; move v pointer down by 4 columns
+
+    vld1.u8     {d6}, [r12], r1             ;load u data
+    vld1.u8     {d7}, [r3], r1              ;load v data
+    vld1.u8     {d8}, [r12], r1
+    vld1.u8     {d9}, [r3], r1
+    vld1.u8     {d10}, [r12], r1
+    vld1.u8     {d11}, [r3], r1
+    vld1.u8     {d12}, [r12], r1
+    vld1.u8     {d13}, [r3], r1
+    vld1.u8     {d14}, [r12], r1
+    vld1.u8     {d15}, [r3], r1
+    vld1.u8     {d16}, [r12], r1
+    vld1.u8     {d17}, [r3], r1
+    vld1.u8     {d18}, [r12], r1
+    vld1.u8     {d19}, [r3], r1
+    vld1.u8     {d20}, [r12]
+    vld1.u8     {d21}, [r3]
+
+    ldr        r12, [sp, #4]               ; load thresh
+
+    ;transpose to 8x16 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vdup.u8     q2, r12                     ; duplicate thresh
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    bl          vp9_loop_filter_neon
+
+    vswp        d12, d11
+    vswp        d16, d13
+    vswp        d14, d12
+    vswp        d16, d15
+
+    sub         r0, r0, #2
+    sub         r2, r2, #2
+
+    ;store op1, op0, oq0, oq1
+    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
+    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
+    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
+    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
+    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
+    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
+    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
+    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
+    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
+    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0]
+    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r2]
+
+    pop         {pc}
+    ENDP        ; |vp9_loop_filter_vertical_edge_uv_neon|
+
+; void vp9_loop_filter_neon();
+; This is a helper function for the loop filters. The individual functions do
+; the necessary load, transpose (if necessary) and store.
+
+; r0-r3 PRESERVE
+; q0    flimit
+; q1    limit
+; q2    thresh
+; q3    p3
+; q4    p2
+; q5    p1
+; q6    p0
+; q7    q0
+; q8    q1
+; q9    q2
+; q10   q3
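+;
+; In scalar terms the masks computed below are roughly (a sketch):
+;   mask = max(|p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2|) <= limit
+;          && |p0-q0|*2 + |p1-q1|/2 <= blimit;
+;   hev  = |p1-p0| > thresh || |q1-q0| > thresh;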
+|vp9_loop_filter_neon| PROC
+
+    ; vp9_filter_mask
+    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
+    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
+    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
+    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
+    vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
+    vabd.u8     q4, q10, q9                 ; abs(q3 - q2)
+
+    vmax.u8     q11, q11, q12
+    vmax.u8     q12, q13, q14
+    vmax.u8     q3, q3, q4
+    vmax.u8     q15, q11, q12
+
+    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
+
+    ; vp8_hevmask
+    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
+    vmax.u8     q15, q15, q3
+
+    vmov.u8     q10, #0x80                   ; 0x80
+
+    vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
+    vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
+
+    vcge.u8     q15, q1, q15
+
+    ; vp9_filter() function
+    ; convert to signed
+    veor        q7, q7, q10                 ; qs0
+    vshr.u8     q2, q2, #1                  ; a = a / 2
+    veor        q6, q6, q10                 ; ps0
+
+    veor        q5, q5, q10                 ; ps1
+    vqadd.u8    q9, q9, q2                  ; a = b + a
+
+    veor        q8, q8, q10                 ; qs1
+
+    vmov.u8     q10, #3                     ; #3
+
+    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
+    vsubl.s8    q11, d15, d13
+
+    vcge.u8     q9, q0, q9                  ; (a > flimit * 2 + limit) * -1
+
+    vmovl.u8    q4, d20
+
+    vqsub.s8    q1, q5, q8                  ; vp9_filter = clamp(ps1-qs1)
+    vorr        q14, q13, q14               ; vp8_hevmask
+
+    vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
+    vmul.i16    q11, q11, q4
+
+    vand        q1, q1, q14                 ; vp9_filter &= hev
+    vand        q15, q15, q9                ; vp9_filter_mask
+
+    vaddw.s8    q2, q2, d2
+    vaddw.s8    q11, q11, d3
+
+    vmov.u8     q9, #4                      ; #4
+
+    ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d2, q2
+    vqmovn.s16  d3, q11
+    vand        q1, q1, q15                 ; vp9_filter &= mask
+
+    vqadd.s8    q2, q1, q10                 ; Filter2 = clamp(vp9_filter+3)
+    vqadd.s8    q1, q1, q9                  ; Filter1 = clamp(vp9_filter+4)
+    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
+    vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
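+    ; In scalar form (a sketch, with f the clamped filter value):
+    ;     p0 += clamp(f + 3) >> 3;   /* Filter2 */
+    ;     q0 -= clamp(f + 4) >> 3;   /* Filter1 */
+    ; and for non-hev pixels the outer taps p1/q1 get (Filter1 + 1) >> 1.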
+
+
+    vqadd.s8    q11, q6, q2                 ; u = clamp(ps0 + Filter2)
+    vqsub.s8    q10, q7, q1                 ; u = clamp(qs0 - Filter1)
+
+    ; outer tap adjustments: ++vp9_filter >> 1
+    vrshr.s8    q1, q1, #1
+    vbic        q1, q1, q14                 ; vp9_filter &= ~hev
+    vmov.u8     q0, #0x80                   ; 0x80
+    vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + vp9_filter)
+    vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - vp9_filter)
+
+    veor        q6, q11, q0                 ; *op0 = u^0x80
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+    veor        q5, q13, q0                 ; *op1 = u^0x80
+    veor        q8, q12, q0                 ; *oq1 = u^0x80
+
+    bx          lr
+    ENDP        ; |vp9_loop_filter_neon|
+
+;-----------------
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -1,0 +1,117 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    ;EXPORT  |vp9_loop_filter_simple_horizontal_edge_neon|
+    EXPORT  |vp9_loop_filter_bhs_neon|
+    EXPORT  |vp9_loop_filter_mbhs_neon|
+    ARM
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *s, PRESERVE
+; r1    int p, PRESERVE
+; q1    limit, PRESERVE
+
+|vp9_loop_filter_simple_horizontal_edge_neon| PROC
+
+    sub         r3, r0, r1, lsl #1          ; move src pointer down by 2 lines
+
+    vld1.u8     {q7}, [r0@128], r1          ; q0
+    vld1.u8     {q5}, [r3@128], r1          ; p1
+    vld1.u8     {q8}, [r0@128]              ; q1
+    vld1.u8     {q6}, [r3@128]              ; p0
+
+    vabd.u8     q15, q6, q7                 ; abs(p0 - q0)
+    vabd.u8     q14, q5, q8                 ; abs(p1 - q1)
+
+    vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
+    vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
+    vmov.u8     q0, #0x80                   ; 0x80
+    vmov.s16    q13, #3
+    vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+    veor        q7, q7, q0                  ; qs0: q0 offset to convert to a signed value
+    veor        q6, q6, q0                  ; ps0: p0 offset to convert to a signed value
+    veor        q5, q5, q0                  ; ps1: p1 offset to convert to a signed value
+    veor        q8, q8, q0                  ; qs1: q1 offset to convert to a signed value
+
+    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 <= limit)*-1
+
+    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
+    vsubl.s8    q3, d15, d13
+
+    vqsub.s8    q4, q5, q8                  ; q4: vp9_filter = vp9_signed_char_clamp(ps1-qs1)
+
+    vmul.s16    q2, q2, q13                 ;  3 * ( qs0 - ps0)
+    vmul.s16    q3, q3, q13
+
+    vmov.u8     q10, #0x03                  ; 0x03
+    vmov.u8     q9, #0x04                   ; 0x04
+
+    vaddw.s8    q2, q2, d8                  ; vp9_filter + 3 * ( qs0 - ps0)
+    vaddw.s8    q3, q3, d9
+
+    vqmovn.s16  d8, q2                      ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d9, q3
+
+    vand        q14, q4, q15                ; vp9_filter &= mask
+
+    vqadd.s8    q2, q14, q10                ; Filter2 = vp9_signed_char_clamp(vp9_filter+3)
+    vqadd.s8    q3, q14, q9                 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4)
+    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
+    vshr.s8     q4, q3, #3                  ; Filter1 >>= 3
+
+    sub         r0, r0, r1
+
+    ;calculate output
+    vqadd.s8    q11, q6, q2                 ; u = vp9_signed_char_clamp(ps0 + Filter2)
+    vqsub.s8    q10, q7, q4                 ; u = vp9_signed_char_clamp(qs0 - Filter1)
+
+    veor        q6, q11, q0                 ; *op0 = u^0x80
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+
+    vst1.u8     {q6}, [r3@128]              ; store op0
+    vst1.u8     {q7}, [r0@128]              ; store oq0
+
+    bx          lr
+    ENDP        ; |vp9_loop_filter_simple_horizontal_edge_neon|
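+
+; Scalar sketch of the simple filter above (illustrative only; c8() is a
+; hypothetical signed-char saturating clamp):
+;
+;   mask = (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= blimit) ? -1 : 0;
+;   f    = c8(c8(ps1 - qs1) + 3 * (qs0 - ps0)) & mask;
+;   *op0 = c8(ps0 + (c8(f + 3) >> 3)) ^ 0x80;   /* Filter2 */
+;   *oq0 = c8(qs0 - (c8(f + 4) >> 3)) ^ 0x80;   /* Filter1 */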
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp9_loop_filter_bhs_neon| PROC
+    push        {r4, lr}
+    ldrb        r3, [r2]                    ; load blim from mem
+    vdup.s8     q1, r3                      ; duplicate blim
+
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 4 * y_stride
+    bl          vp9_loop_filter_simple_horizontal_edge_neon
+    ; vp9_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 8 * y_stride
+    bl          vp9_loop_filter_simple_horizontal_edge_neon
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 12 * y_stride
+    pop         {r4, lr}
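+    ; tail call: lr was restored by the pop above, so the filter call below
+    ; returns directly to this function's caller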
+    b           vp9_loop_filter_simple_horizontal_edge_neon
+    ENDP        ;|vp9_loop_filter_bhs_neon|
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp9_loop_filter_mbhs_neon| PROC
+    ldrb        r3, [r2]                   ; load mblim from mem
+    vdup.s8     q1, r3                     ; duplicate mblim
+    b           vp9_loop_filter_simple_horizontal_edge_neon
+    ENDP        ;|vp9_loop_filter_mbhs_neon|
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -1,0 +1,154 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    ;EXPORT  |vp9_loop_filter_simple_vertical_edge_neon|
+    EXPORT |vp9_loop_filter_bvs_neon|
+    EXPORT |vp9_loop_filter_mbvs_neon|
+    ARM
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *s, PRESERVE
+; r1    int p, PRESERVE
+; q1    limit, PRESERVE
+
+|vp9_loop_filter_simple_vertical_edge_neon| PROC
+    sub         r0, r0, #2                  ; move src pointer down by 2 columns
+    add         r12, r1, r1
+    add         r3, r0, r1
+
+    vld4.8      {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
+    vld4.8      {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
+    vld4.8      {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
+    vld4.8      {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
+    vld4.8      {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
+    vld4.8      {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
+    vld4.8      {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
+    vld4.8      {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
+
+    vld4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
+    vld4.8      {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
+    vld4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
+    vld4.8      {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
+    vld4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
+    vld4.8      {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
+    vld4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
+    vld4.8      {d10[7], d11[7], d12[7], d13[7]}, [r3]
+
+    vswp        d7, d10
+    vswp        d12, d9
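+    ; the vld4 lane loads above gathered four adjacent columns from 16 rows;
+    ; after the two vswp, q3/q5/q4/q6 hold p1/p0/q0/q1 for all 16 rows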
+
+    ;vp9_filter_mask() function (the simple filter computes no hev mask)
+    sub         r0, r0, r1, lsl #4
+    vabd.u8     q15, q5, q4                 ; abs(p0 - q0)
+    vabd.u8     q14, q3, q6                 ; abs(p1 - q1)
+
+    vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
+    vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
+    vmov.u8     q0, #0x80                   ; 0x80
+    vmov.s16    q11, #3
+    vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+    veor        q4, q4, q0                  ; qs0: q0 offset to convert to a signed value
+    veor        q5, q5, q0                  ; ps0: p0 offset to convert to a signed value
+    veor        q3, q3, q0                  ; ps1: p1 offset to convert to a signed value
+    veor        q6, q6, q0                  ; qs1: q1 offset to convert to a signed value
+
+    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 <= flimit*2 + limit)*-1
+
+    vsubl.s8    q2, d8, d10                 ; ( qs0 - ps0)
+    vsubl.s8    q13, d9, d11
+
+    vqsub.s8    q14, q3, q6                 ; vp9_filter = vp9_signed_char_clamp(ps1-qs1)
+
+    vmul.s16    q2, q2, q11                 ;  3 * ( qs0 - ps0)
+    vmul.s16    q13, q13, q11
+
+    vmov.u8     q11, #0x03                  ; 0x03
+    vmov.u8     q12, #0x04                  ; 0x04
+
+    vaddw.s8    q2, q2, d28                 ; vp9_filter + 3 * ( qs0 - ps0)
+    vaddw.s8    q13, q13, d29
+
+    vqmovn.s16  d28, q2                     ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d29, q13
+
+    add         r0, r0, #1
+    add         r3, r0, r1
+
+    vand        q14, q14, q15               ; vp9_filter &= mask
+
+    vqadd.s8    q2, q14, q11                ; Filter2 = vp9_signed_char_clamp(vp9_filter+3)
+    vqadd.s8    q3, q14, q12                ; Filter1 = vp9_signed_char_clamp(vp9_filter+4)
+    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
+    vshr.s8     q14, q3, #3                 ; Filter1 >>= 3
+
+    ;calculate output
+    vqadd.s8    q11, q5, q2                 ; u = vp9_signed_char_clamp(ps0 + Filter2)
+    vqsub.s8    q10, q4, q14                ; u = vp9_signed_char_clamp(qs0 - Filter1)
+
+    veor        q6, q11, q0                 ; *op0 = u^0x80
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+    add         r12, r1, r1
+    vswp        d13, d14
+
+    ;store op0, oq0
+    vst2.8      {d12[0], d13[0]}, [r0], r12
+    vst2.8      {d12[1], d13[1]}, [r3], r12
+    vst2.8      {d12[2], d13[2]}, [r0], r12
+    vst2.8      {d12[3], d13[3]}, [r3], r12
+    vst2.8      {d12[4], d13[4]}, [r0], r12
+    vst2.8      {d12[5], d13[5]}, [r3], r12
+    vst2.8      {d12[6], d13[6]}, [r0], r12
+    vst2.8      {d12[7], d13[7]}, [r3], r12
+    vst2.8      {d14[0], d15[0]}, [r0], r12
+    vst2.8      {d14[1], d15[1]}, [r3], r12
+    vst2.8      {d14[2], d15[2]}, [r0], r12
+    vst2.8      {d14[3], d15[3]}, [r3], r12
+    vst2.8      {d14[4], d15[4]}, [r0], r12
+    vst2.8      {d14[5], d15[5]}, [r3], r12
+    vst2.8      {d14[6], d15[6]}, [r0], r12
+    vst2.8      {d14[7], d15[7]}, [r3]
+
+    bx          lr
+    ENDP        ; |vp9_loop_filter_simple_vertical_edge_neon|
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp9_loop_filter_bvs_neon| PROC
+    push        {r4, lr}
+    ldrb        r3, [r2]                   ; load blim from mem
+    mov         r4, r0
+    add         r0, r0, #4
+    vdup.s8     q1, r3                     ; duplicate blim
+    bl          vp9_loop_filter_simple_vertical_edge_neon
+    ; vp9_loop_filter_simple_vertical_edge_neon preserves r1 and q1
+    add         r0, r4, #8
+    bl          vp9_loop_filter_simple_vertical_edge_neon
+    add         r0, r4, #12
+    pop         {r4, lr}
+    b           vp9_loop_filter_simple_vertical_edge_neon
+    ENDP        ;|vp9_loop_filter_bvs_neon|
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp9_loop_filter_mbvs_neon| PROC
+    ldrb        r3, [r2]                   ; load mblim from mem
+    vdup.s8     q1, r3                     ; duplicate mblim
+    b           vp9_loop_filter_simple_vertical_edge_neon
+    ENDP        ;|vp9_loop_filter_mbvs_neon|
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/mbloopfilter_neon.asm
@@ -1,0 +1,469 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_mbloop_filter_horizontal_edge_y_neon|
+    EXPORT  |vp8_mbloop_filter_horizontal_edge_uv_neon|
+    EXPORT  |vp8_mbloop_filter_vertical_edge_y_neon|
+    EXPORT  |vp8_mbloop_filter_vertical_edge_uv_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
+;                                               const unsigned char *blimit,
+;                                               const unsigned char *limit,
+;                                               const unsigned char *thresh)
+; r0    unsigned char *src,
+; r1    int pitch,
+; r2    const unsigned char *blimit
+; r3    const unsigned char *limit
+; sp    const unsigned char *thresh,
+|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
+    push        {lr}
+    add         r1, r1, r1                  ; double stride
+    ldr         r12, [sp, #4]               ; load thresh
+    sub         r0, r0, r1, lsl #1          ; move src pointer down by 4 lines
+    vdup.u8     q2, r12                     ; thresh
+    add         r12, r0, r1,  lsr #1        ; move src pointer up by 1 line
+
+    vld1.u8     {q3}, [r0@128], r1              ; p3
+    vld1.u8     {q4}, [r12@128], r1             ; p2
+    vld1.u8     {q5}, [r0@128], r1              ; p1
+    vld1.u8     {q6}, [r12@128], r1             ; p0
+    vld1.u8     {q7}, [r0@128], r1              ; q0
+    vld1.u8     {q8}, [r12@128], r1             ; q1
+    vld1.u8     {q9}, [r0@128], r1              ; q2
+    vld1.u8     {q10}, [r12@128], r1            ; q3
+
+    bl          vp8_mbloop_filter_neon
+
+    sub         r12, r12, r1, lsl #2
+    add         r0, r12, r1, lsr #1
+
+    vst1.u8     {q4}, [r12@128], r1        ; store op2
+    vst1.u8     {q5}, [r0@128], r1         ; store op1
+    vst1.u8     {q6}, [r12@128], r1        ; store op0
+    vst1.u8     {q7}, [r0@128], r1         ; store oq0
+    vst1.u8     {q8}, [r12@128]            ; store oq1
+    vst1.u8     {q9}, [r0@128]             ; store oq2
+
+    pop         {pc}
+    ENDP        ; |vp8_mbloop_filter_horizontal_edge_y_neon|
+
+; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
+;                                                const unsigned char *blimit,
+;                                                const unsigned char *limit,
+;                                                const unsigned char *thresh,
+;                                                unsigned char *v)
+; r0    unsigned char *u,
+; r1    int pitch,
+; r2    const unsigned char *blimit
+; r3    const unsigned char *limit
+; sp    const unsigned char *thresh,
+; sp+4  unsigned char *v
+
+|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
+    push        {lr}
+    ldr         r12, [sp, #4]                 ; load thresh
+    sub         r0, r0, r1, lsl #2            ; move u pointer down by 4 lines
+    vdup.u8     q2, r12                       ; thresh
+    ldr         r12, [sp, #8]                 ; load v ptr
+    sub         r12, r12, r1, lsl #2          ; move v pointer down by 4 lines
+
+    vld1.u8     {d6}, [r0@64], r1              ; p3
+    vld1.u8     {d7}, [r12@64], r1              ; p3
+    vld1.u8     {d8}, [r0@64], r1              ; p2
+    vld1.u8     {d9}, [r12@64], r1              ; p2
+    vld1.u8     {d10}, [r0@64], r1             ; p1
+    vld1.u8     {d11}, [r12@64], r1             ; p1
+    vld1.u8     {d12}, [r0@64], r1             ; p0
+    vld1.u8     {d13}, [r12@64], r1             ; p0
+    vld1.u8     {d14}, [r0@64], r1             ; q0
+    vld1.u8     {d15}, [r12@64], r1             ; q0
+    vld1.u8     {d16}, [r0@64], r1             ; q1
+    vld1.u8     {d17}, [r12@64], r1             ; q1
+    vld1.u8     {d18}, [r0@64], r1             ; q2
+    vld1.u8     {d19}, [r12@64], r1             ; q2
+    vld1.u8     {d20}, [r0@64], r1             ; q3
+    vld1.u8     {d21}, [r12@64], r1             ; q3
+
+    bl          vp8_mbloop_filter_neon
+
+    sub         r0, r0, r1, lsl #3
+    sub         r12, r12, r1, lsl #3
+
+    add         r0, r0, r1
+    add         r12, r12, r1
+
+    vst1.u8     {d8}, [r0@64], r1              ; store u op2
+    vst1.u8     {d9}, [r12@64], r1              ; store v op2
+    vst1.u8     {d10}, [r0@64], r1             ; store u op1
+    vst1.u8     {d11}, [r12@64], r1             ; store v op1
+    vst1.u8     {d12}, [r0@64], r1             ; store u op0
+    vst1.u8     {d13}, [r12@64], r1             ; store v op0
+    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
+    vst1.u8     {d15}, [r12@64], r1             ; store v oq0
+    vst1.u8     {d16}, [r0@64], r1             ; store u oq1
+    vst1.u8     {d17}, [r12@64], r1             ; store v oq1
+    vst1.u8     {d18}, [r0@64], r1             ; store u oq2
+    vst1.u8     {d19}, [r12@64], r1             ; store v oq2
+
+    pop         {pc}
+    ENDP        ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
+
+; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
+;                                             const unsigned char *blimit,
+;                                             const unsigned char *limit,
+;                                             const unsigned char *thresh)
+; r0    unsigned char *src,
+; r1    int pitch,
+; r2    const unsigned char *blimit
+; r3    const unsigned char *limit
+; sp    const unsigned char *thresh,
+|vp8_mbloop_filter_vertical_edge_y_neon| PROC
+    push        {lr}
+    ldr         r12, [sp, #4]               ; load thresh
+    sub         r0, r0, #4                  ; move src pointer down by 4 columns
+    vdup.s8     q2, r12                     ; thresh
+    add         r12, r0, r1, lsl #3         ; move src pointer down by 8 lines
+
+    vld1.u8     {d6}, [r0], r1              ; load first 8-line src data
+    vld1.u8     {d7}, [r12], r1             ; load second 8-line src data
+    vld1.u8     {d8}, [r0], r1
+    vld1.u8     {d9}, [r12], r1
+    vld1.u8     {d10}, [r0], r1
+    vld1.u8     {d11}, [r12], r1
+    vld1.u8     {d12}, [r0], r1
+    vld1.u8     {d13}, [r12], r1
+    vld1.u8     {d14}, [r0], r1
+    vld1.u8     {d15}, [r12], r1
+    vld1.u8     {d16}, [r0], r1
+    vld1.u8     {d17}, [r12], r1
+    vld1.u8     {d18}, [r0], r1
+    vld1.u8     {d19}, [r12], r1
+    vld1.u8     {d20}, [r0], r1
+    vld1.u8     {d21}, [r12], r1
+
+    ;transpose to 8x16 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    sub         r0, r0, r1, lsl #3
+
+    bl          vp8_mbloop_filter_neon
+
+    sub         r12, r12, r1, lsl #3
+
+    ;transpose to 16x8 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    ;store op2, op1, op0, oq0, oq1, oq2
+    vst1.8      {d6}, [r0], r1
+    vst1.8      {d7}, [r12], r1
+    vst1.8      {d8}, [r0], r1
+    vst1.8      {d9}, [r12], r1
+    vst1.8      {d10}, [r0], r1
+    vst1.8      {d11}, [r12], r1
+    vst1.8      {d12}, [r0], r1
+    vst1.8      {d13}, [r12], r1
+    vst1.8      {d14}, [r0], r1
+    vst1.8      {d15}, [r12], r1
+    vst1.8      {d16}, [r0], r1
+    vst1.8      {d17}, [r12], r1
+    vst1.8      {d18}, [r0], r1
+    vst1.8      {d19}, [r12], r1
+    vst1.8      {d20}, [r0]
+    vst1.8      {d21}, [r12]
+
+    pop         {pc}
+    ENDP        ; |vp8_mbloop_filter_vertical_edge_y_neon|
+
+; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
+;                                              const unsigned char *blimit,
+;                                              const unsigned char *limit,
+;                                              const unsigned char *thresh,
+;                                              unsigned char *v)
+; r0    unsigned char *u,
+; r1    int pitch,
+; r2    const unsigned char *blimit,
+; r3    const unsigned char *limit,
+; sp    const unsigned char *thresh,
+; sp+4  unsigned char *v
+|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
+    push        {lr}
+    ldr         r12, [sp, #4]               ; load thresh
+    sub         r0, r0, #4                  ; move u pointer down by 4 columns
+    vdup.u8     q2, r12                     ; thresh
+    ldr         r12, [sp, #8]               ; load v ptr
+    sub         r12, r12, #4                ; move v pointer down by 4 columns
+
+    vld1.u8     {d6}, [r0], r1              ;load u data
+    vld1.u8     {d7}, [r12], r1             ;load v data
+    vld1.u8     {d8}, [r0], r1
+    vld1.u8     {d9}, [r12], r1
+    vld1.u8     {d10}, [r0], r1
+    vld1.u8     {d11}, [r12], r1
+    vld1.u8     {d12}, [r0], r1
+    vld1.u8     {d13}, [r12], r1
+    vld1.u8     {d14}, [r0], r1
+    vld1.u8     {d15}, [r12], r1
+    vld1.u8     {d16}, [r0], r1
+    vld1.u8     {d17}, [r12], r1
+    vld1.u8     {d18}, [r0], r1
+    vld1.u8     {d19}, [r12], r1
+    vld1.u8     {d20}, [r0], r1
+    vld1.u8     {d21}, [r12], r1
+
+    ;transpose to 8x16 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    sub         r0, r0, r1, lsl #3
+
+    bl          vp8_mbloop_filter_neon
+
+    sub         r12, r12, r1, lsl #3
+
+    ;transpose to 16x8 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    ;store op2, op1, op0, oq0, oq1, oq2
+    vst1.8      {d6}, [r0], r1
+    vst1.8      {d7}, [r12], r1
+    vst1.8      {d8}, [r0], r1
+    vst1.8      {d9}, [r12], r1
+    vst1.8      {d10}, [r0], r1
+    vst1.8      {d11}, [r12], r1
+    vst1.8      {d12}, [r0], r1
+    vst1.8      {d13}, [r12], r1
+    vst1.8      {d14}, [r0], r1
+    vst1.8      {d15}, [r12], r1
+    vst1.8      {d16}, [r0], r1
+    vst1.8      {d17}, [r12], r1
+    vst1.8      {d18}, [r0], r1
+    vst1.8      {d19}, [r12], r1
+    vst1.8      {d20}, [r0]
+    vst1.8      {d21}, [r12]
+
+    pop         {pc}
+    ENDP        ; |vp8_mbloop_filter_vertical_edge_uv_neon|
+
+; void vp8_mbloop_filter_neon()
+; This is a helper function for the macroblock loopfilters. The individual
+; functions do the necessary load, transpose (if necessary), preserve (if
+; necessary) and store.
+
+; r0,r1 PRESERVE
+; r2    mblimit
+; r3    limit
+
+; q2    thresh
+; q3    p3 PRESERVE
+; q4    p2
+; q5    p1
+; q6    p0
+; q7    q0
+; q8    q1
+; q9    q2
+; q10   q3 PRESERVE
+
+|vp8_mbloop_filter_neon| PROC
+
+    ; vp9_filter_mask
+    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
+    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
+    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
+    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
+    vabd.u8     q1, q9, q8                  ; abs(q2 - q1)
+    vabd.u8     q0, q10, q9                 ; abs(q3 - q2)
+
+    vmax.u8     q11, q11, q12
+    vmax.u8     q12, q13, q14
+    vmax.u8     q1, q1, q0
+    vmax.u8     q15, q11, q12
+
+    vabd.u8     q12, q6, q7                 ; abs(p0 - q0)
+
+    ; vp8_hevmask
+    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh) * -1
+    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh) * -1
+    vmax.u8     q15, q15, q1
+
+    vdup.u8     q1, r3                      ; limit
+    vdup.u8     q2, r2                      ; mblimit
+
+    vmov.u8     q0, #0x80                   ; 0x80
+
+    vcge.u8     q15, q1, q15
+
+    vabd.u8     q1, q5, q8                  ; a = abs(p1 - q1)
+    vqadd.u8    q12, q12, q12               ; b = abs(p0 - q0) * 2
+    vmov.u16    q11, #3                     ; #3
+
+    ; vp9_filter
+    ; convert to signed
+    veor        q7, q7, q0                  ; qs0
+    vshr.u8     q1, q1, #1                  ; a = a / 2
+    veor        q6, q6, q0                  ; ps0
+    veor        q5, q5, q0                  ; ps1
+
+    vqadd.u8    q12, q12, q1                ; a = b + a
+
+    veor        q8, q8, q0                  ; qs1
+    veor        q4, q4, q0                  ; ps2
+    veor        q9, q9, q0                  ; qs2
+
+    vorr        q14, q13, q14               ; vp8_hevmask
+
+    vcge.u8     q12, q2, q12                ; (a <= flimit * 2 + limit) * -1
+
+    vsubl.s8    q2, d14, d12                ; qs0 - ps0
+    vsubl.s8    q13, d15, d13
+
+    vqsub.s8    q1, q5, q8                  ; vp9_filter = clamp(ps1-qs1)
+
+    vmul.i16    q2, q2, q11                 ; 3 * ( qs0 - ps0)
+
+    vand        q15, q15, q12               ; vp9_filter_mask
+
+    vmul.i16    q13, q13, q11
+
+    vmov.u8     q12, #3                     ; #3
+
+    vaddw.s8    q2, q2, d2                  ; vp9_filter + 3 * ( qs0 - ps0)
+    vaddw.s8    q13, q13, d3
+
+    vmov.u8     q11, #4                     ; #4
+
+    ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d2, q2
+    vqmovn.s16  d3, q13
+
+    vand        q1, q1, q15                 ; vp9_filter &= mask
+
+    vmov.u16    q15, #63                    ; #63
+
+    vand        q13, q1, q14                ; Filter2 &= hev
+
+    vqadd.s8    q2, q13, q11                ; Filter1 = clamp(Filter2+4)
+    vqadd.s8    q13, q13, q12               ; Filter2 = clamp(Filter2+3)
+
+    vmov        q0, q15
+
+    vshr.s8     q2, q2, #3                  ; Filter1 >>= 3
+    vshr.s8     q13, q13, #3                ; Filter2 >>= 3
+
+    vmov        q11, q15
+    vmov        q12, q15
+
+    vqsub.s8    q7, q7, q2                  ; qs0 = clamp(qs0 - Filter1)
+
+    vqadd.s8    q6, q6, q13                 ; ps0 = clamp(ps0 + Filter2)
+
+    vbic        q1, q1, q14                 ; vp9_filter &= ~hev
+
+    ; roughly 1/7th difference across boundary
+    ; roughly 2/7th difference across boundary
+    ; roughly 3/7th difference across boundary
+
+    vmov.u8     d5, #9                      ; #9
+    vmov.u8     d4, #18                     ; #18
+
+    vmov        q13, q15
+    vmov        q14, q15
+
+    vmlal.s8    q0, d2, d5                  ; 63 + Filter2 * 9
+    vmlal.s8    q11, d3, d5
+    vmov.u8     d5, #27                     ; #27
+    vmlal.s8    q12, d2, d4                 ; 63 + Filter2 * 18
+    vmlal.s8    q13, d3, d4
+    vmlal.s8    q14, d2, d5                 ; 63 + Filter2 * 27
+    vmlal.s8    q15, d3, d5
+
+    vqshrn.s16  d0, q0, #7                  ; u = clamp((63 + Filter2 * 9)>>7)
+    vqshrn.s16  d1, q11, #7
+    vqshrn.s16  d24, q12, #7                ; u = clamp((63 + Filter2 * 18)>>7)
+    vqshrn.s16  d25, q13, #7
+    vqshrn.s16  d28, q14, #7                ; u = clamp((63 + Filter2 * 27)>>7)
+    vqshrn.s16  d29, q15, #7
+
+    vmov.u8     q1, #0x80                   ; 0x80
+
+    vqsub.s8    q11, q9, q0                 ; s = clamp(qs2 - u)
+    vqadd.s8    q0, q4, q0                  ; s = clamp(ps2 + u)
+    vqsub.s8    q13, q8, q12                ; s = clamp(qs1 - u)
+    vqadd.s8    q12, q5, q12                ; s = clamp(ps1 + u)
+    vqsub.s8    q15, q7, q14                ; s = clamp(qs0 - u)
+    vqadd.s8    q14, q6, q14                ; s = clamp(ps0 + u)
+
+    veor        q9, q11, q1                 ; *oq2 = s^0x80
+    veor        q4, q0, q1                  ; *op2 = s^0x80
+    veor        q8, q13, q1                 ; *oq1 = s^0x80
+    veor        q5, q12, q1                 ; *op1 = s^0x80
+    veor        q7, q15, q1                 ; *oq0 = s^0x80
+    veor        q6, q14, q1                 ; *op0 = s^0x80
+
+    bx          lr
+    ENDP        ; |vp8_mbloop_filter_neon|
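+
+; Scalar sketch of the wide-filter taps above (illustrative only; c8() is a
+; hypothetical signed-char saturating clamp; ps0/qs0 already include the
+; hev-masked Filter1/Filter2 step):
+;
+;   u27 = c8((63 + 27 * f) >> 7);            /* ~3/7 of f, for p0/q0 */
+;   u18 = c8((63 + 18 * f) >> 7);            /* ~2/7 of f, for p1/q1 */
+;   u9  = c8((63 +  9 * f) >> 7);            /* ~1/7 of f, for p2/q2 */
+;   *op0 = c8(ps0 + u27) ^ 0x80;   *oq0 = c8(qs0 - u27) ^ 0x80;
+;   *op1 = c8(ps1 + u18) ^ 0x80;   *oq1 = c8(qs1 - u18) ^ 0x80;
+;   *op2 = c8(ps2 + u9)  ^ 0x80;   *oq2 = c8(qs2 - u9)  ^ 0x80;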
+
+;-----------------
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/recon16x16mb_neon.asm
@@ -1,0 +1,131 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_recon16x16mb_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char  *pred_ptr,
+; r1    short *diff_ptr,
+; r2    unsigned char *dst_ptr,
+; r3    int ystride,
+; stack unsigned char *udst_ptr,
+; stack unsigned char *vdst_ptr
+
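+; Per pixel this computes dst = clamp_u8(pred + diff). A scalar sketch of the
+; same operation (illustrative only; clamp_u8() is a hypothetical helper
+; matching vqmovun.s16):
+;
+;   static unsigned char clamp_u8(int v) {
+;     return v < 0 ? 0 : v > 255 ? 255 : (unsigned char)v;
+;   }
+;   dst[r * ystride + c] = clamp_u8(pred[r * 16 + c] + diff[r * 16 + c]);
+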
+|vp8_recon16x16mb_neon| PROC
+    mov             r12, #4             ;loop counter for Y loop
+
+recon16x16mb_loop_y
+    vld1.u8         {q12, q13}, [r0]!   ;load data from pred_ptr
+    vld1.16         {q8, q9}, [r1]!     ;load data from diff_ptr
+    vld1.u8         {q14, q15}, [r0]!
+    vld1.16         {q10, q11}, [r1]!
+
+    vmovl.u8        q0, d24             ;widen Pred data from 8 bits to 16 bits
+    vmovl.u8        q1, d25
+    vmovl.u8        q2, d26
+    vmovl.u8        q3, d27
+    vmovl.u8        q4, d28
+    vmovl.u8        q5, d29
+    vmovl.u8        q6, d30
+    vld1.16         {q12, q13}, [r1]!
+    vmovl.u8        q7, d31
+    vld1.16         {q14, q15}, [r1]!
+
+    pld             [r0]
+    pld             [r1]
+    pld             [r1, #64]
+
+    vadd.s16        q0, q0, q8          ;add Diff data and Pred data together
+    vadd.s16        q1, q1, q9
+    vadd.s16        q2, q2, q10
+    vadd.s16        q3, q3, q11
+    vadd.s16        q4, q4, q12
+    vadd.s16        q5, q5, q13
+    vadd.s16        q6, q6, q14
+    vadd.s16        q7, q7, q15
+
+    vqmovun.s16     d0, q0              ;CLAMP() saturation
+    vqmovun.s16     d1, q1
+    vqmovun.s16     d2, q2
+    vqmovun.s16     d3, q3
+    vqmovun.s16     d4, q4
+    vqmovun.s16     d5, q5
+    vst1.u8         {q0}, [r2], r3      ;store result
+    vqmovun.s16     d6, q6
+    vst1.u8         {q1}, [r2], r3
+    vqmovun.s16     d7, q7
+    vst1.u8         {q2}, [r2], r3
+    subs            r12, r12, #1
+
+    moveq           r12, #2             ;loop counter for UV loop
+
+    vst1.u8         {q3}, [r2], r3
+    bne             recon16x16mb_loop_y
+
+    mov             r3, r3, lsr #1      ;uv_stride = ystride>>1
+    ldr             r2, [sp]            ;load udst_ptr
+
+recon16x16mb_loop_uv
+    vld1.u8         {q12, q13}, [r0]!   ;load data from pred_ptr
+    vld1.16         {q8, q9}, [r1]!     ;load data from diff_ptr
+    vld1.u8         {q14, q15}, [r0]!
+    vld1.16         {q10, q11}, [r1]!
+
+    vmovl.u8        q0, d24             ;widen Pred data from 8 bits to 16 bits
+    vmovl.u8        q1, d25
+    vmovl.u8        q2, d26
+    vmovl.u8        q3, d27
+    vmovl.u8        q4, d28
+    vmovl.u8        q5, d29
+    vmovl.u8        q6, d30
+    vld1.16         {q12, q13}, [r1]!
+    vmovl.u8        q7, d31
+    vld1.16         {q14, q15}, [r1]!
+
+    vadd.s16        q0, q0, q8          ;add Diff data and Pred data together
+    vadd.s16        q1, q1, q9
+    vadd.s16        q2, q2, q10
+    vadd.s16        q3, q3, q11
+    vadd.s16        q4, q4, q12
+    vadd.s16        q5, q5, q13
+    vadd.s16        q6, q6, q14
+
+    vqmovun.s16     d0, q0              ;CLAMP() saturation
+    vadd.s16        q7, q7, q15
+    vqmovun.s16     d1, q1
+    vqmovun.s16     d2, q2
+    vqmovun.s16     d3, q3
+    vst1.u8         {d0}, [r2], r3      ;store result
+    vqmovun.s16     d4, q4
+    vst1.u8         {d1}, [r2], r3
+    vqmovun.s16     d5, q5
+    vst1.u8         {d2}, [r2], r3
+    vqmovun.s16     d6, q6
+    vst1.u8         {d3}, [r2], r3
+    vqmovun.s16     d7, q7
+    vst1.u8         {d4}, [r2], r3
+    subs            r12, r12, #1
+
+    vst1.u8         {d5}, [r2], r3
+    vst1.u8         {d6}, [r2], r3
+    vst1.u8         {d7}, [r2], r3
+
+    ldrne           r2, [sp, #4]        ;load vdst_ptr
+    bne             recon16x16mb_loop_uv
+
+    bx             lr
+
+    ENDP
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/recon2b_neon.asm
@@ -1,0 +1,54 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_recon2b_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char  *pred_ptr,
+; r1    short *diff_ptr,
+; r2    unsigned char *dst_ptr,
+; r3    int stride
+
+|vp8_recon2b_neon| PROC
+    vld1.u8         {q8, q9}, [r0]      ;load data from pred_ptr
+    vld1.16         {q4, q5}, [r1]!     ;load data from diff_ptr
+
+    vmovl.u8        q0, d16             ;widen Pred data from 8 bits to 16 bits
+    vld1.16         {q6, q7}, [r1]!
+    vmovl.u8        q1, d17
+    vmovl.u8        q2, d18
+    vmovl.u8        q3, d19
+
+    vadd.s16        q0, q0, q4          ;add Diff data and Pred data together
+    vadd.s16        q1, q1, q5
+    vadd.s16        q2, q2, q6
+    vadd.s16        q3, q3, q7
+
+    vqmovun.s16     d0, q0              ;CLAMP() saturation
+    vqmovun.s16     d1, q1
+    vqmovun.s16     d2, q2
+    vqmovun.s16     d3, q3
+    add             r0, r2, r3
+
+    vst1.u8         {d0}, [r2]          ;store result
+    vst1.u8         {d1}, [r0], r3
+    add             r2, r0, r3
+    vst1.u8         {d2}, [r0]
+    vst1.u8         {d3}, [r2], r3
+
+    bx             lr
+
+    ENDP
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/recon4b_neon.asm
@@ -1,0 +1,69 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_recon4b_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char  *pred_ptr,
+; r1    short *diff_ptr,
+; r2    unsigned char *dst_ptr,
+; r3    int stride
+
+|vp8_recon4b_neon| PROC
+    vld1.u8         {q12, q13}, [r0]!   ;load data from pred_ptr
+    vld1.16         {q8, q9}, [r1]!     ;load data from diff_ptr
+    vld1.u8         {q14, q15}, [r0]
+    vld1.16         {q10, q11}, [r1]!
+
+    vmovl.u8        q0, d24             ;widen Pred data from 8 bits to 16 bits
+    vmovl.u8        q1, d25
+    vmovl.u8        q2, d26
+    vmovl.u8        q3, d27
+    vmovl.u8        q4, d28
+    vmovl.u8        q5, d29
+    vmovl.u8        q6, d30
+    vld1.16         {q12, q13}, [r1]!
+    vmovl.u8        q7, d31
+    vld1.16         {q14, q15}, [r1]
+
+    vadd.s16        q0, q0, q8          ;add Diff data and Pred data together
+    vadd.s16        q1, q1, q9
+    vadd.s16        q2, q2, q10
+    vadd.s16        q3, q3, q11
+    vadd.s16        q4, q4, q12
+    vadd.s16        q5, q5, q13
+    vadd.s16        q6, q6, q14
+    vadd.s16        q7, q7, q15
+
+    vqmovun.s16     d0, q0              ;CLAMP() saturation
+    vqmovun.s16     d1, q1
+    vqmovun.s16     d2, q2
+    vqmovun.s16     d3, q3
+    vqmovun.s16     d4, q4
+    vqmovun.s16     d5, q5
+    vqmovun.s16     d6, q6
+    vqmovun.s16     d7, q7
+    add             r0, r2, r3
+
+    vst1.u8         {q0}, [r2]          ;store result
+    vst1.u8         {q1}, [r0], r3
+    add             r2, r0, r3
+    vst1.u8         {q2}, [r0]
+    vst1.u8         {q3}, [r2], r3
+
+    bx             lr
+
+    ENDP
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/recon_neon.c
@@ -1,0 +1,29 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vp9/common/recon.h"
+#include "vp9/common/blockd.h"
+
+extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr,
+                                  unsigned char *dst_ptr, int ystride,
+                                  unsigned char *udst_ptr,
+                                  unsigned char *vdst_ptr);
+
+void vp8_recon_mb_neon(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd) {
+  unsigned char *pred_ptr = &xd->predictor[0];
+  short *diff_ptr = &xd->diff[0];
+  unsigned char *dst_ptr = xd->dst.y_buffer;
+  unsigned char *udst_ptr = xd->dst.u_buffer;
+  unsigned char *vdst_ptr = xd->dst.v_buffer;
+  int ystride = xd->dst.y_stride;
+  /*int uv_stride = xd->dst.uv_stride;*/
+
+  vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride,
+                        udst_ptr, vdst_ptr);
+}
--- /dev/null
+++ b/vp9/common/arm/neon/reconb_neon.asm
@@ -1,0 +1,61 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_recon_b_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char  *pred_ptr,
+; r1    short *diff_ptr,
+; r2    unsigned char *dst_ptr,
+; r3    int stride
+
+|vp8_recon_b_neon| PROC
+    mov             r12, #16
+
+    vld1.u8         {d28}, [r0], r12    ;load 4 pixels/line from pred_ptr (pitch 16)
+    vld1.16         {q10, q11}, [r1]!   ;load data from diff_ptr
+    vld1.u8         {d29}, [r0], r12
+    vld1.16         {q11, q12}, [r1]!
+    vld1.u8         {d30}, [r0], r12
+    vld1.16         {q12, q13}, [r1]!
+    vld1.u8         {d31}, [r0], r12
+    vld1.16         {q13}, [r1]
+
+    vmovl.u8        q0, d28             ;widen Pred data from 8 bits to 16 bits
+    vmovl.u8        q1, d29             ;Pred data in d0, d2, d4, d6
+    vmovl.u8        q2, d30
+    vmovl.u8        q3, d31
+
+    vadd.s16        d0, d0, d20         ;add Diff data and Pred data together
+    vadd.s16        d2, d2, d22
+    vadd.s16        d4, d4, d24
+    vadd.s16        d6, d6, d26
+
+    vqmovun.s16     d0, q0              ;CLAMP() saturation
+    vqmovun.s16     d1, q1
+    vqmovun.s16     d2, q2
+    vqmovun.s16     d3, q3
+    add             r1, r2, r3
+
+    vst1.32         {d0[0]}, [r2]       ;store result
+    vst1.32         {d1[0]}, [r1], r3
+    add             r2, r1, r3
+    vst1.32         {d2[0]}, [r1]
+    vst1.32         {d3[0]}, [r2], r3
+
+    bx             lr
+
+    ENDP
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/save_neon_reg.asm
@@ -1,0 +1,36 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_push_neon|
+    EXPORT  |vp9_pop_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vp9_push_neon| PROC
+    vst1.i64            {d8, d9, d10, d11}, [r0]!
+    vst1.i64            {d12, d13, d14, d15}, [r0]!
+    bx              lr
+
+    ENDP
+
+|vp9_pop_neon| PROC
+    vld1.i64            {d8, d9, d10, d11}, [r0]!
+    vld1.i64            {d12, d13, d14, d15}, [r0]!
+    bx              lr
+
+    ENDP
+
+    END
+
--- /dev/null
+++ b/vp9/common/arm/neon/shortidct4x4llm_1_neon.asm
@@ -1,0 +1,67 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_short_idct4x4llm_1_neon|
+    EXPORT  |vp8_dc_only_idct_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+; r0    short *input;
+; r1    short *output;
+; r2    int pitch;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp8_short_idct4x4llm_1_neon| PROC
+    vld1.16         {d0[]}, [r0]            ;load input[0]
+
+    add             r3, r1, r2
+    add             r12, r3, r2
+
+    vrshr.s16       d0, d0, #3
+
+    add             r0, r12, r2
+
+    vst1.16         {d0}, [r1]
+    vst1.16         {d0}, [r3]
+    vst1.16         {d0}, [r12]
+    vst1.16         {d0}, [r0]
+
+    bx             lr
+    ENDP
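+
+; the 4x4 block is filled with the rounded DC value (input[0] + 4) >> 3;
+; vrshr.s16 #3 performs the rounding shift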
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
+; r0    short input_dc;
+; r1    short *output;
+; r2    int pitch;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+|vp8_dc_only_idct_neon| PROC
+    vdup.16         d0, r0
+
+    add             r3, r1, r2
+    add             r12, r3, r2
+
+    vrshr.s16       d0, d0, #3
+
+    add             r0, r12, r2
+
+    vst1.16         {d0}, [r1]
+    vst1.16         {d0}, [r3]
+    vst1.16         {d0}, [r12]
+    vst1.16         {d0}, [r0]
+
+    bx             lr
+
+    ENDP
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/shortidct4x4llm_neon.asm
@@ -1,0 +1,122 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_short_idct4x4llm_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;*************************************************************
+;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
+;r0 short * input
+;r1 short * output
+;r2 int pitch
+;*************************************************************
+;static const int cospi8sqrt2minus1=20091;
+;static const int sinpi8sqrt2      =35468;
+;static const int rounding = 0;
+;Optimization note: The data resulting from dequantization are signed 13-bit values
+;in the range [-4096, 4095]. This allows the NEON "vqdmulh" instruction to be used,
+;since the product cannot overflow (13+16+1 = 30 bits < 32 bits). vqdmulh returns the
+;high half of the doubled product, which is exactly what the IDCT needs here.
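+;
+;In scalar terms (illustrative only), the vqdmulh/vshr/vqadd sequences below
+;compute, for each input x:
+;   temp1 = x + ((x * 20091) >> 16);   /* x * sqrt(2)*cos(pi/8)          */
+;   temp2 = (x * 35468) >> 16;         /* x * sqrt(2)*sin(pi/8); 35468   */
+;                                      /* is stored as s16 -30068, and   */
+;                                      /* adding x back compensates      */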
+
+|vp8_short_idct4x4llm_neon| PROC
+    adr             r12, idct_coeff
+    vld1.16         {q1, q2}, [r0]
+    vld1.16         {d0}, [r12]
+
+    vswp            d3, d4                  ;q2(vp[4] vp[12])
+
+    vqdmulh.s16     q3, q2, d0[2]
+    vqdmulh.s16     q4, q2, d0[0]
+
+    vqadd.s16       d12, d2, d3             ;a1
+    vqsub.s16       d13, d2, d3             ;b1
+
+    vshr.s16        q3, q3, #1
+    vshr.s16        q4, q4, #1
+
+    vqadd.s16       q3, q3, q2              ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+    vqadd.s16       q4, q4, q2
+
+    ;d6 - c1:temp1
+    ;d7 - d1:temp2
+    ;d8 - d1:temp1
+    ;d9 - c1:temp2
+
+    vqsub.s16       d10, d6, d9             ;c1
+    vqadd.s16       d11, d7, d8             ;d1
+
+    vqadd.s16       d2, d12, d11
+    vqadd.s16       d3, d13, d10
+    vqsub.s16       d4, d13, d10
+    vqsub.s16       d5, d12, d11
+
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vswp            d3, d4
+
+    vqdmulh.s16     q3, q2, d0[2]
+    vqdmulh.s16     q4, q2, d0[0]
+
+    vqadd.s16       d12, d2, d3             ;a1
+    vqsub.s16       d13, d2, d3             ;b1
+
+    vshr.s16        q3, q3, #1
+    vshr.s16        q4, q4, #1
+
+    vqadd.s16       q3, q3, q2              ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+    vqadd.s16       q4, q4, q2
+
+    vqsub.s16       d10, d6, d9             ;c1
+    vqadd.s16       d11, d7, d8             ;d1
+
+    vqadd.s16       d2, d12, d11
+    vqadd.s16       d3, d13, d10
+    vqsub.s16       d4, d13, d10
+    vqsub.s16       d5, d12, d11
+
+    vrshr.s16       d2, d2, #3
+    vrshr.s16       d3, d3, #3
+    vrshr.s16       d4, d4, #3
+    vrshr.s16       d5, d5, #3
+
+    add             r3, r1, r2
+    add             r12, r3, r2
+    add             r0, r12, r2
+
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vst1.16         {d2}, [r1]
+    vst1.16         {d3}, [r3]
+    vst1.16         {d4}, [r12]
+    vst1.16         {d5}, [r0]
+
+    bx             lr
+
+    ENDP
+
+;-----------------
+
+idct_coeff
+    DCD     0x4e7b4e7b, 0x8a8c8a8c
+
+;20091, 20091, 35468, 35468
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/sixtappredict16x16_neon.asm
@@ -1,0 +1,490 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sixtap_predict16x16_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter16_coeff
+    DCD     0,  0,  128,    0,   0,  0,   0,  0
+    DCD     0, -6,  123,   12,  -1,  0,   0,  0
+    DCD     2, -11, 108,   36,  -8,  1,   0,  0
+    DCD     0, -9,   93,   50,  -6,  0,   0,  0
+    DCD     3, -16,  77,   77, -16,  3,   0,  0
+    DCD     0, -6,   50,   93,  -9,  0,   0,  0
+    DCD     1, -8,   36,  108, -11,  2,   0,  0
+    DCD     0, -1,   12,  123,  -6,   0,  0,  0
+
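+; each row above is one subpel filter (8 words x 4 bytes = 32 bytes), so the
+; coefficients for a given offset live at filter16_coeff + (offset << 5)
+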
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; r4    unsigned char *dst_ptr,
+; stack(r5) int  dst_pitch
+
+;Note: To take advantage of the 8-bit multiplication instructions in NEON, first apply
+; abs() to the filter coeffs to make them u8, then use vmlsl for the negative coeffs.
+; After multiplication the result can be negative, so it is treated as s16. The result
+; could also be a large positive number (> 2^15-1), which would be misread as a
+; negative s16 value. To avoid that error, apply the filter coeffs in the order
+; 0, 1, 4, 5, 2, which keeps the intermediate sum within s16 range; finally, the
+; src_ptr[1] term (the 3rd filter coeff) is added with saturation (vqadd.s16).
+; The same applies to the other filter functions.
+
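+; A scalar sketch of that ordering (illustrative only; sat16() and clamp_u8()
+; are hypothetical saturation helpers):
+;
+;   acc  = src[-2] * abs(f[0]);                 /* vmull.u8        */
+;   acc -= src[-1] * abs(f[1]);                 /* vmlsl.u8        */
+;   acc -= src[ 2] * abs(f[4]);                 /* vmlsl.u8        */
+;   acc += src[ 3] * abs(f[5]);                 /* vmlal.u8        */
+;   acc += src[ 0] * abs(f[2]);                 /* vmlal.u8        */
+;   acc  = sat16(acc + src[ 1] * abs(f[3]));    /* vqadd.s16       */
+;   dst  = clamp_u8((acc + 64) >> 7);           /* vqrshrun.s16 #7 */
+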
+|vp8_sixtap_predict16x16_neon| PROC
+    push            {r4-r5, lr}
+
+    adr             r12, filter16_coeff
+    ldr             r4, [sp, #12]           ;load dst_ptr from stack
+    ldr             r5, [sp, #16]           ;load dst_pitch from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_filter16x16_only
+
+    add             r2, r12, r2, lsl #5     ;calculate filter location
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+
+    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
+
+    beq             firstpass_filter16x16_only
+
+    sub             sp, sp, #336            ;reserve space on stack for temporary storage
+    mov             lr, sp
+
+    vabs.s32        q12, q14
+    vabs.s32        q13, q15
+
+    mov             r2, #7                  ;loop counter
+    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
+    sub             r0, r0, r1, lsl #1
+
+    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
+    vdup.8          d1, d24[4]
+    vdup.8          d2, d25[0]
+    vdup.8          d3, d25[4]
+    vdup.8          d4, d26[0]
+    vdup.8          d5, d26[4]
+
+;First Pass: output_height lines x output_width columns (21x16)
+filt_blk2d_fp16x16_loop_neon
+    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data
+    vld1.u8         {d9, d10, d11}, [r0], r1
+    vld1.u8         {d12, d13, d14}, [r0], r1
+
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q9, d7, d0
+    vmull.u8        q10, d9, d0
+    vmull.u8        q11, d10, d0
+    vmull.u8        q12, d12, d0
+    vmull.u8        q13, d13, d0
+
+    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
+    vext.8          d29, d9, d10, #1
+    vext.8          d30, d12, d13, #1
+
+    vmlsl.u8        q8, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q10, d29, d1
+    vmlsl.u8        q12, d30, d1
+
+    vext.8          d28, d7, d8, #1
+    vext.8          d29, d10, d11, #1
+    vext.8          d30, d13, d14, #1
+
+    vmlsl.u8        q9, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q11, d29, d1
+    vmlsl.u8        q13, d30, d1
+
+    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
+    vext.8          d29, d9, d10, #4
+    vext.8          d30, d12, d13, #4
+
+    vmlsl.u8        q8, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q10, d29, d4
+    vmlsl.u8        q12, d30, d4
+
+    vext.8          d28, d7, d8, #4
+    vext.8          d29, d10, d11, #4
+    vext.8          d30, d13, d14, #4
+
+    vmlsl.u8        q9, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q11, d29, d4
+    vmlsl.u8        q13, d30, d4
+
+    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d29, d9, d10, #5
+    vext.8          d30, d12, d13, #5
+
+    vmlal.u8        q8, d28, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q10, d29, d5
+    vmlal.u8        q12, d30, d5
+
+    vext.8          d28, d7, d8, #5
+    vext.8          d29, d10, d11, #5
+    vext.8          d30, d13, d14, #5
+
+    vmlal.u8        q9, d28, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q11, d29, d5
+    vmlal.u8        q13, d30, d5
+
+    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
+    vext.8          d29, d9, d10, #2
+    vext.8          d30, d12, d13, #2
+
+    vmlal.u8        q8, d28, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q10, d29, d2
+    vmlal.u8        q12, d30, d2
+
+    vext.8          d28, d7, d8, #2
+    vext.8          d29, d10, d11, #2
+    vext.8          d30, d13, d14, #2
+
+    vmlal.u8        q9, d28, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q11, d29, d2
+    vmlal.u8        q13, d30, d2
+
+    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
+    vext.8          d29, d9, d10, #3
+    vext.8          d30, d12, d13, #3
+
+    vext.8          d15, d7, d8, #3
+    vext.8          d31, d10, d11, #3
+    vext.8          d6, d13, d14, #3
+
+    vmull.u8        q4, d28, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q5, d29, d3
+    vmull.u8        q6, d30, d3
+
+    vqadd.s16       q8, q4                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q10, q5
+    vqadd.s16       q12, q6
+
+    vmull.u8        q6, d15, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q7, d31, d3
+    vmull.u8        q3, d6, d3
+
+    subs            r2, r2, #1
+
+    vqadd.s16       q9, q6
+    vqadd.s16       q11, q7
+    vqadd.s16       q13, q3
+
+    vqrshrun.s16    d6, q8, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q9, #7
+    vqrshrun.s16    d8, q10, #7
+    vqrshrun.s16    d9, q11, #7
+    vqrshrun.s16    d10, q12, #7
+    vqrshrun.s16    d11, q13, #7
+
+    vst1.u8         {d6, d7, d8}, [lr]!     ;store result
+    vst1.u8         {d9, d10, d11}, [lr]!
+
+    bne             filt_blk2d_fp16x16_loop_neon
+
+;Second pass: 16x16
+;secondpass_filter - do first 8-columns and then second 8-columns
+    add             r3, r12, r3, lsl #5
+    sub             lr, lr, #336
+
+    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
+    mov             r3, #2                  ;loop counter
+
+    vabs.s32        q7, q5
+    vabs.s32        q8, q6
+
+    mov             r2, #16
+
+    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
+    vdup.8          d1, d14[4]
+    vdup.8          d2, d15[0]
+    vdup.8          d3, d15[4]
+    vdup.8          d4, d16[0]
+    vdup.8          d5, d16[4]
+
+filt_blk2d_sp16x16_outloop_neon
+    vld1.u8         {d18}, [lr], r2         ;load src data
+    vld1.u8         {d19}, [lr], r2
+    vld1.u8         {d20}, [lr], r2
+    vld1.u8         {d21}, [lr], r2
+    mov             r12, #4                 ;loop counter
+    vld1.u8         {d22}, [lr], r2
+
+secondpass_inner_loop_neon
+    vld1.u8         {d23}, [lr], r2         ;load src data
+    vld1.u8         {d24}, [lr], r2
+    vld1.u8         {d25}, [lr], r2
+    vld1.u8         {d26}, [lr], r2
+
+    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q4, d19, d0
+    vmull.u8        q5, d20, d0
+    vmull.u8        q6, d21, d0
+
+    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q4, d20, d1
+    vmlsl.u8        q5, d21, d1
+    vmlsl.u8        q6, d22, d1
+
+    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q4, d23, d4
+    vmlsl.u8        q5, d24, d4
+    vmlsl.u8        q6, d25, d4
+
+    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q4, d21, d2
+    vmlal.u8        q5, d22, d2
+    vmlal.u8        q6, d23, d2
+
+    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q4, d24, d5
+    vmlal.u8        q5, d25, d5
+    vmlal.u8        q6, d26, d5
+
+    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q8, d22, d3
+    vmull.u8        q9, d23, d3
+    vmull.u8        q10, d24, d3
+
+    subs            r12, r12, #1
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q8, #7
+    vqrshrun.s16    d8, q9, #7
+    vqrshrun.s16    d9, q10, #7
+
+    vst1.u8         {d6}, [r4], r5          ;store result
+    vmov            q9, q11
+    vst1.u8         {d7}, [r4], r5
+    vmov            q10, q12
+    vst1.u8         {d8}, [r4], r5
+    vmov            d22, d26
+    vst1.u8         {d9}, [r4], r5
+
+    bne             secondpass_inner_loop_neon
+
+    subs            r3, r3, #1
+    sub             lr, lr, #336
+    add             lr, lr, #8
+
+    sub             r4, r4, r5, lsl #4
+    add             r4, r4, #8
+
+    bne filt_blk2d_sp16x16_outloop_neon
+
+    add             sp, sp, #336
+    pop             {r4-r5,pc}
+
+;--------------------
+firstpass_filter16x16_only
+    vabs.s32        q12, q14
+    vabs.s32        q13, q15
+
+    mov             r2, #8                  ;loop counter
+    sub             r0, r0, #2              ;move srcptr back to (column-2)
+
+    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
+    vdup.8          d1, d24[4]
+    vdup.8          d2, d25[0]
+    vdup.8          d3, d25[4]
+    vdup.8          d4, d26[0]
+    vdup.8          d5, d26[4]
+
+;First Pass: output_height lines x output_width columns (16x16)
+filt_blk2d_fpo16x16_loop_neon
+    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data
+    vld1.u8         {d9, d10, d11}, [r0], r1
+
+    pld             [r0]
+    pld             [r0, r1]
+
+    vmull.u8        q6, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q7, d7, d0
+    vmull.u8        q8, d9, d0
+    vmull.u8        q9, d10, d0
+
+    vext.8          d20, d6, d7, #1         ;construct src_ptr[-1]
+    vext.8          d21, d9, d10, #1
+    vext.8          d22, d7, d8, #1
+    vext.8          d23, d10, d11, #1
+    vext.8          d24, d6, d7, #4         ;construct src_ptr[2]
+    vext.8          d25, d9, d10, #4
+    vext.8          d26, d7, d8, #4
+    vext.8          d27, d10, d11, #4
+    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d29, d9, d10, #5
+
+    vmlsl.u8        q6, d20, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q8, d21, d1
+    vmlsl.u8        q7, d22, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q9, d23, d1
+    vmlsl.u8        q6, d24, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q8, d25, d4
+    vmlsl.u8        q7, d26, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q9, d27, d4
+    vmlal.u8        q6, d28, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q8, d29, d5
+
+    vext.8          d20, d7, d8, #5
+    vext.8          d21, d10, d11, #5
+    vext.8          d22, d6, d7, #2         ;construct src_ptr[0]
+    vext.8          d23, d9, d10, #2
+    vext.8          d24, d7, d8, #2
+    vext.8          d25, d10, d11, #2
+
+    vext.8          d26, d6, d7, #3         ;construct src_ptr[1]
+    vext.8          d27, d9, d10, #3
+    vext.8          d28, d7, d8, #3
+    vext.8          d29, d10, d11, #3
+
+    vmlal.u8        q7, d20, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q9, d21, d5
+    vmlal.u8        q6, d22, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q8, d23, d2
+    vmlal.u8        q7, d24, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q9, d25, d2
+
+    vmull.u8        q10, d26, d3            ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q11, d27, d3
+    vmull.u8        q12, d28, d3            ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q15, d29, d3
+
+    vqadd.s16       q6, q10                 ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q11
+    vqadd.s16       q7, q12
+    vqadd.s16       q9, q15
+
+    subs            r2, r2, #1
+
+    vqrshrun.s16    d6, q6, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q7, #7
+    vqrshrun.s16    d8, q8, #7
+    vqrshrun.s16    d9, q9, #7
+
+    vst1.u8         {q3}, [r4], r5              ;store result
+    vst1.u8         {q4}, [r4], r5
+
+    bne             filt_blk2d_fpo16x16_loop_neon
+
+    pop             {r4-r5,pc}
+
+;--------------------
+secondpass_filter16x16_only
+;Second pass: 16x16
+    add             r3, r12, r3, lsl #5
+    sub             r0, r0, r1, lsl #1
+
+    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
+    mov             r3, #2                  ;loop counter
+
+    vabs.s32        q7, q5
+    vabs.s32        q8, q6
+
+    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
+    vdup.8          d1, d14[4]
+    vdup.8          d2, d15[0]
+    vdup.8          d3, d15[4]
+    vdup.8          d4, d16[0]
+    vdup.8          d5, d16[4]
+
+filt_blk2d_spo16x16_outloop_neon
+    vld1.u8         {d18}, [r0], r1         ;load src data
+    vld1.u8         {d19}, [r0], r1
+    vld1.u8         {d20}, [r0], r1
+    vld1.u8         {d21}, [r0], r1
+    mov             r12, #4                 ;loop counter
+    vld1.u8         {d22}, [r0], r1
+
+secondpass_only_inner_loop_neon
+    vld1.u8         {d23}, [r0], r1         ;load src data
+    vld1.u8         {d24}, [r0], r1
+    vld1.u8         {d25}, [r0], r1
+    vld1.u8         {d26}, [r0], r1
+
+    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q4, d19, d0
+    vmull.u8        q5, d20, d0
+    vmull.u8        q6, d21, d0
+
+    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q4, d20, d1
+    vmlsl.u8        q5, d21, d1
+    vmlsl.u8        q6, d22, d1
+
+    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q4, d23, d4
+    vmlsl.u8        q5, d24, d4
+    vmlsl.u8        q6, d25, d4
+
+    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q4, d21, d2
+    vmlal.u8        q5, d22, d2
+    vmlal.u8        q6, d23, d2
+
+    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q4, d24, d5
+    vmlal.u8        q5, d25, d5
+    vmlal.u8        q6, d26, d5
+
+    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q8, d22, d3
+    vmull.u8        q9, d23, d3
+    vmull.u8        q10, d24, d3
+
+    subs            r12, r12, #1
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q8, #7
+    vqrshrun.s16    d8, q9, #7
+    vqrshrun.s16    d9, q10, #7
+
+    vst1.u8         {d6}, [r4], r5          ;store result
+    vmov            q9, q11
+    vst1.u8         {d7}, [r4], r5
+    vmov            q10, q12
+    vst1.u8         {d8}, [r4], r5
+    vmov            d22, d26
+    vst1.u8         {d9}, [r4], r5
+
+    bne             secondpass_only_inner_loop_neon
+
+    subs            r3, r3, #1
+    sub             r0, r0, r1, lsl #4
+    sub             r0, r0, r1, lsl #2
+    sub             r0, r0, r1
+    add             r0, r0, #8
+
+    sub             r4, r4, r5, lsl #4
+    add             r4, r4, #8
+
+    bne filt_blk2d_spo16x16_outloop_neon
+
+    pop             {r4-r5,pc}
+
+    ENDP
+
+;-----------------
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/sixtappredict4x4_neon.asm
@@ -1,0 +1,422 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sixtap_predict_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter4_coeff
+    DCD     0,  0,  128,    0,   0,  0,   0,  0
+    DCD     0, -6,  123,   12,  -1,  0,   0,  0
+    DCD     2, -11, 108,   36,  -8,  1,   0,  0
+    DCD     0, -9,   93,   50,  -6,  0,   0,  0
+    DCD     3, -16,  77,   77, -16,  3,   0,  0
+    DCD     0, -6,   50,   93,  -9,  0,   0,  0
+    DCD     1, -8,   36,  108, -11,  2,   0,  0
+    DCD     0, -1,   12,  123,  -6,  0,   0,  0
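+
+; Each row above is one sub-pixel phase of the 6-tap filter; the signed taps
+; in every row sum to 128.  As an illustrative C-style sketch (names here are
+; not part of this file), one filtered pixel is:
+;
+;   sum = f[0]*src[-2] + f[1]*src[-1] + f[2]*src[0]
+;       + f[3]*src[1]  + f[4]*src[2]  + f[5]*src[3];
+;   dst = clamp((sum + 64) >> 7, 0, 255);
+;
+; The code takes vabs of the chosen row, uses vmlal/vmlsl for the positive
+; and negative taps, keeps the src_ptr[1]*f[3] products in a separate
+; accumulator joined by a saturating vqadd, and lets vqrshrun.s16 #7 do the
+; final round/shift/saturate in a single instruction.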
+
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(lr) int  dst_pitch
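+;
+; xoffset/yoffset pick one of the eight sub-pixel phases in the table above
+; (0 means full-pel in that direction, hence the two early-out paths below);
+; each table row is eight words, so row n lives at byte offset n << 5.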
+
+|vp8_sixtap_predict_neon| PROC
+    push            {r4, lr}
+
+    adr             r12, filter4_coeff
+    ldr             r4, [sp, #8]            ;load parameters from stack
+    ldr             lr, [sp, #12]           ;load parameters from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_filter4x4_only
+
+    add             r2, r12, r2, lsl #5     ;calculate filter location
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
+
+    beq             firstpass_filter4x4_only
+
+    vabs.s32        q12, q14                ;get abs(filter_parameters)
+    vabs.s32        q13, q15
+
+    sub             r0, r0, #2              ;go back 2 columns of src data
+    sub             r0, r0, r1, lsl #1      ;go back 2 lines of src data
+
+;First pass: output_height lines x output_width columns (9x4)
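+;(with only 4 output columns, two source rows share each q register: vzip.32
+; packs the row pairs and vshr.u64 then plays the role of vext to build the
+; shifted taps, halving the number of multiplies)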
+    vld1.u8         {q3}, [r0], r1          ;load first 4-line src data
+    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
+    vld1.u8         {q4}, [r0], r1
+    vdup.8          d1, d24[4]
+    vld1.u8         {q5}, [r0], r1
+    vdup.8          d2, d25[0]
+    vld1.u8         {q6}, [r0], r1
+    vdup.8          d3, d25[4]
+    vdup.8          d4, d26[0]
+    vdup.8          d5, d26[4]
+
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d19, d8, d9, #5
+    vext.8          d20, d10, d11, #5
+    vext.8          d21, d12, d13, #5
+
+    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
+    vswp            d11, d12
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
+    vzip.32         d20, d21
+    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmull.u8        q8, d20, d5
+
+    vmov            q4, q3                  ;keep original src data in q4 q6
+    vmov            q6, q5
+
+    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
+    vzip.32         d10, d11
+    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
+    vshr.u64        q10, q6, #8
+    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp9_filter[0])
+    vmlal.u8        q8, d10, d0
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
+    vzip.32         d20, d21
+    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
+    vshr.u64        q5, q6, #32
+    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q8, d20, d1
+
+    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
+    vzip.32         d10, d11
+    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
+    vshr.u64        q10, q6, #16
+    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q8, d10, d4
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
+    vzip.32         d20, d21
+    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
+    vshr.u64        q5, q6, #24
+    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q8, d20, d2
+
+    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
+    vzip.32         d10, d11
+    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q10, d10, d3
+
+    vld1.u8         {q3}, [r0], r1          ;load the remaining 5 lines of src data
+    vld1.u8         {q4}, [r0], r1
+
+    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q10
+
+    vld1.u8         {q5}, [r0], r1
+    vld1.u8         {q6}, [r0], r1
+
+    vqrshrun.s16    d27, q7, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d28, q8, #7
+
+    ;First pass on the remaining 5 lines of data
+    vld1.u8         {q11}, [r0], r1
+
+    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d19, d8, d9, #5
+    vext.8          d20, d10, d11, #5
+    vext.8          d21, d12, d13, #5
+
+    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
+    vswp            d11, d12
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
+    vzip.32         d20, d21
+    vext.8          d31, d22, d23, #5       ;construct src_ptr[3]
+    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmull.u8        q8, d20, d5
+    vmull.u8        q12, d31, d5            ;(src_ptr[3] * vp9_filter[5])
+
+    vmov            q4, q3                  ;keep original src data in q4 q6
+    vmov            q6, q5
+
+    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
+    vzip.32         d10, d11
+    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
+    vshr.u64        q10, q6, #8
+
+    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp9_filter[0])
+    vmlal.u8        q8, d10, d0
+    vmlal.u8        q12, d22, d0            ;(src_ptr[-2] * vp9_filter[0])
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
+    vzip.32         d20, d21
+    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
+    vshr.u64        q5, q6, #32
+    vext.8          d31, d22, d23, #1       ;construct src_ptr[-1]
+
+    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q8, d20, d1
+    vmlsl.u8        q12, d31, d1            ;-(src_ptr[-1] * vp9_filter[1])
+
+    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
+    vzip.32         d10, d11
+    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
+    vshr.u64        q10, q6, #16
+    vext.8          d31, d22, d23, #4       ;construct src_ptr[2]
+
+    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q8, d10, d4
+    vmlsl.u8        q12, d31, d4            ;-(src_ptr[2] * vp9_filter[4])
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
+    vzip.32         d20, d21
+    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
+    vshr.u64        q5, q6, #24
+    vext.8          d31, d22, d23, #2       ;construct src_ptr[0]
+
+    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q8, d20, d2
+    vmlal.u8        q12, d31, d2            ;(src_ptr[0] * vp9_filter[2])
+
+    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
+    vzip.32         d10, d11
+    vext.8          d31, d22, d23, #3       ;construct src_ptr[1]
+    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q10, d10, d3
+    vmull.u8        q11, d31, d3            ;(src_ptr[1] * vp9_filter[3])
+
+    add             r3, r12, r3, lsl #5
+
+    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q10
+    vqadd.s16       q12, q11
+
+    vext.8          d23, d27, d28, #4
+    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
+
+    vqrshrun.s16    d29, q7, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d30, q8, #7
+    vqrshrun.s16    d31, q12, #7
+
+;Second pass: 4x4
+    vabs.s32        q7, q5
+    vabs.s32        q8, q6
+
+    vext.8          d24, d28, d29, #4
+    vext.8          d25, d29, d30, #4
+    vext.8          d26, d30, d31, #4
+
+    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
+    vdup.8          d1, d14[4]
+    vdup.8          d2, d15[0]
+    vdup.8          d3, d15[4]
+    vdup.8          d4, d16[0]
+    vdup.8          d5, d16[4]
+
+    vmull.u8        q3, d27, d0             ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q4, d28, d0
+
+    vmull.u8        q5, d25, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmull.u8        q6, d26, d5
+
+    vmlsl.u8        q3, d29, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q4, d30, d4
+
+    vmlsl.u8        q5, d23, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q6, d24, d1
+
+    vmlal.u8        q3, d28, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q4, d29, d2
+
+    vmlal.u8        q5, d24, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmlal.u8        q6, d25, d3
+
+    add             r0, r4, lr
+    add             r1, r0, lr
+    add             r2, r1, lr
+
+    vqadd.s16       q5, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q6, q4
+
+    vqrshrun.s16    d3, q5, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d4, q6, #7
+
+    vst1.32         {d3[0]}, [r4]           ;store result
+    vst1.32         {d3[1]}, [r0]
+    vst1.32         {d4[0]}, [r1]
+    vst1.32         {d4[1]}, [r2]
+
+    pop             {r4, pc}
+
+
+;---------------------
+firstpass_filter4x4_only
+    vabs.s32        q12, q14                ;get abs(filter_parameters)
+    vabs.s32        q13, q15
+
+    sub             r0, r0, #2              ;go back 2 columns of src data
+
+;First pass: output_height lines x output_width columns (4x4)
+    vld1.u8         {q3}, [r0], r1          ;load first 4-line src data
+    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
+    vld1.u8         {q4}, [r0], r1
+    vdup.8          d1, d24[4]
+    vld1.u8         {q5}, [r0], r1
+    vdup.8          d2, d25[0]
+    vld1.u8         {q6}, [r0], r1
+
+    vdup.8          d3, d25[4]
+    vdup.8          d4, d26[0]
+    vdup.8          d5, d26[4]
+
+    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d19, d8, d9, #5
+    vext.8          d20, d10, d11, #5
+    vext.8          d21, d12, d13, #5
+
+    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
+    vswp            d11, d12
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
+    vzip.32         d20, d21
+    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmull.u8        q8, d20, d5
+
+    vmov            q4, q3                  ;keep original src data in q4 q6
+    vmov            q6, q5
+
+    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
+    vzip.32         d10, d11
+    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
+    vshr.u64        q10, q6, #8
+    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp9_filter[0])
+    vmlal.u8        q8, d10, d0
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
+    vzip.32         d20, d21
+    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
+    vshr.u64        q5, q6, #32
+    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q8, d20, d1
+
+    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
+    vzip.32         d10, d11
+    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
+    vshr.u64        q10, q6, #16
+    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q8, d10, d4
+
+    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
+    vzip.32         d20, d21
+    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
+    vshr.u64        q5, q6, #24
+    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q8, d20, d2
+
+    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
+    vzip.32         d10, d11
+    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q10, d10, d3
+
+    add             r0, r4, lr
+    add             r1, r0, lr
+    add             r2, r1, lr
+
+    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q10
+
+    vqrshrun.s16    d27, q7, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d28, q8, #7
+
+    vst1.32         {d27[0]}, [r4]          ;store result
+    vst1.32         {d27[1]}, [r0]
+    vst1.32         {d28[0]}, [r1]
+    vst1.32         {d28[1]}, [r2]
+
+    pop             {r4, pc}
+
+
+;---------------------
+secondpass_filter4x4_only
+    sub             r0, r0, r1, lsl #1
+    add             r3, r12, r3, lsl #5
+
+    vld1.32         {d27[0]}, [r0], r1      ;load src data
+    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
+    vld1.32         {d27[1]}, [r0], r1
+    vabs.s32        q7, q5
+    vld1.32         {d28[0]}, [r0], r1
+    vabs.s32        q8, q6
+    vld1.32         {d28[1]}, [r0], r1
+    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
+    vld1.32         {d29[0]}, [r0], r1
+    vdup.8          d1, d14[4]
+    vld1.32         {d29[1]}, [r0], r1
+    vdup.8          d2, d15[0]
+    vld1.32         {d30[0]}, [r0], r1
+    vdup.8          d3, d15[4]
+    vld1.32         {d30[1]}, [r0], r1
+    vdup.8          d4, d16[0]
+    vld1.32         {d31[0]}, [r0], r1
+    vdup.8          d5, d16[4]
+
+    vext.8          d23, d27, d28, #4
+    vext.8          d24, d28, d29, #4
+    vext.8          d25, d29, d30, #4
+    vext.8          d26, d30, d31, #4
+
+    vmull.u8        q3, d27, d0             ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q4, d28, d0
+
+    vmull.u8        q5, d25, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmull.u8        q6, d26, d5
+
+    vmlsl.u8        q3, d29, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q4, d30, d4
+
+    vmlsl.u8        q5, d23, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q6, d24, d1
+
+    vmlal.u8        q3, d28, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q4, d29, d2
+
+    vmlal.u8        q5, d24, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmlal.u8        q6, d25, d3
+
+    add             r0, r4, lr
+    add             r1, r0, lr
+    add             r2, r1, lr
+
+    vqadd.s16       q5, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q6, q4
+
+    vqrshrun.s16    d3, q5, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d4, q6, #7
+
+    vst1.32         {d3[0]}, [r4]           ;store result
+    vst1.32         {d3[1]}, [r0]
+    vst1.32         {d4[0]}, [r1]
+    vst1.32         {d4[1]}, [r2]
+
+    pop             {r4, pc}
+
+    ENDP
+
+;-----------------
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/sixtappredict8x4_neon.asm
@@ -1,0 +1,473 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sixtap_predict8x4_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter8_coeff
+    DCD     0,  0,  128,    0,   0,  0,   0,  0
+    DCD     0, -6,  123,   12,  -1,  0,   0,  0
+    DCD     2, -11, 108,   36,  -8,  1,   0,  0
+    DCD     0, -9,   93,   50,  -6,  0,   0,  0
+    DCD     3, -16,  77,   77, -16,  3,   0,  0
+    DCD     0, -6,   50,   93,  -9,  0,   0,  0
+    DCD     1, -8,   36,  108, -11,  2,   0,  0
+    DCD     0, -1,   12,  123,  -6,  0,   0,  0
+
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int  dst_pitch
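+;
+; With both offsets non-zero this runs two passes: a horizontal 6-tap over
+; height+5 source rows into a stack buffer, then a vertical 6-tap from that
+; buffer into dst.  Roughly, as an illustrative C sketch (names are not part
+; of this file):
+;
+;   for (r = -2; r < h + 3; r++)
+;       hfilter(src + r * stride, xfilter, tmp + (r + 2) * 8);
+;   for (r = 0; r < h; r++)
+;       vfilter(tmp + r * 8, yfilter, dst + r * dst_pitch);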
+
+|vp8_sixtap_predict8x4_neon| PROC
+    push            {r4-r5, lr}
+
+    adr             r12, filter8_coeff
+    ldr             r4, [sp, #12]           ;load parameters from stack
+    ldr             r5, [sp, #16]           ;load parameters from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_filter8x4_only
+
+    add             r2, r12, r2, lsl #5     ;calculate filter location
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+
+    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
+
+    beq             firstpass_filter8x4_only
+
+    sub             sp, sp, #32             ;reserve space on stack for temporary storage
+    vabs.s32        q12, q14
+    vabs.s32        q13, q15
+
+    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
+    mov             lr, sp
+    sub             r0, r0, r1, lsl #1
+
+    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
+    vdup.8          d1, d24[4]
+    vdup.8          d2, d25[0]
+
+;First pass: output_height lines x output_width columns (9x8)
+    vld1.u8         {q3}, [r0], r1          ;load src data
+    vdup.8          d3, d25[4]
+    vld1.u8         {q4}, [r0], r1
+    vdup.8          d4, d26[0]
+    vld1.u8         {q5}, [r0], r1
+    vdup.8          d5, d26[4]
+    vld1.u8         {q6}, [r0], r1
+
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q8, d8, d0
+    vmull.u8        q9, d10, d0
+    vmull.u8        q10, d12, d0
+
+    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
+    vext.8          d29, d8, d9, #1
+    vext.8          d30, d10, d11, #1
+    vext.8          d31, d12, d13, #1
+
+    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q8, d29, d1
+    vmlsl.u8        q9, d30, d1
+    vmlsl.u8        q10, d31, d1
+
+    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
+    vext.8          d29, d8, d9, #4
+    vext.8          d30, d10, d11, #4
+    vext.8          d31, d12, d13, #4
+
+    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q8, d29, d4
+    vmlsl.u8        q9, d30, d4
+    vmlsl.u8        q10, d31, d4
+
+    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
+    vext.8          d29, d8, d9, #2
+    vext.8          d30, d10, d11, #2
+    vext.8          d31, d12, d13, #2
+
+    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q8, d29, d2
+    vmlal.u8        q9, d30, d2
+    vmlal.u8        q10, d31, d2
+
+    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d29, d8, d9, #5
+    vext.8          d30, d10, d11, #5
+    vext.8          d31, d12, d13, #5
+
+    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q8, d29, d5
+    vmlal.u8        q9, d30, d5
+    vmlal.u8        q10, d31, d5
+
+    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
+    vext.8          d29, d8, d9, #3
+    vext.8          d30, d10, d11, #3
+    vext.8          d31, d12, d13, #3
+
+    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q4, d29, d3
+    vmull.u8        q5, d30, d3
+    vmull.u8        q6, d31, d3
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vld1.u8         {q3}, [r0], r1          ;load src data
+
+    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d23, q8, #7
+    vqrshrun.s16    d24, q9, #7
+    vqrshrun.s16    d25, q10, #7
+
+    vld1.u8         {q4}, [r0], r1
+    vst1.u8         {d22}, [lr]!            ;store result
+    vld1.u8         {q5}, [r0], r1
+    vst1.u8         {d23}, [lr]!
+    vld1.u8         {q6}, [r0], r1
+    vst1.u8         {d24}, [lr]!
+    vld1.u8         {q7}, [r0], r1
+    vst1.u8         {d25}, [lr]!
+
+    ;first_pass filtering on the remaining 5 lines of data
+    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q9, d8, d0
+    vmull.u8        q10, d10, d0
+    vmull.u8        q11, d12, d0
+    vmull.u8        q12, d14, d0
+
+    vext.8          d27, d6, d7, #1         ;construct src_ptr[-1]
+    vext.8          d28, d8, d9, #1
+    vext.8          d29, d10, d11, #1
+    vext.8          d30, d12, d13, #1
+    vext.8          d31, d14, d15, #1
+
+    vmlsl.u8        q8, d27, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q9, d28, d1
+    vmlsl.u8        q10, d29, d1
+    vmlsl.u8        q11, d30, d1
+    vmlsl.u8        q12, d31, d1
+
+    vext.8          d27, d6, d7, #4         ;construct src_ptr[2]
+    vext.8          d28, d8, d9, #4
+    vext.8          d29, d10, d11, #4
+    vext.8          d30, d12, d13, #4
+    vext.8          d31, d14, d15, #4
+
+    vmlsl.u8        q8, d27, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q9, d28, d4
+    vmlsl.u8        q10, d29, d4
+    vmlsl.u8        q11, d30, d4
+    vmlsl.u8        q12, d31, d4
+
+    vext.8          d27, d6, d7, #2         ;construct src_ptr[0]
+    vext.8          d28, d8, d9, #2
+    vext.8          d29, d10, d11, #2
+    vext.8          d30, d12, d13, #2
+    vext.8          d31, d14, d15, #2
+
+    vmlal.u8        q8, d27, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q9, d28, d2
+    vmlal.u8        q10, d29, d2
+    vmlal.u8        q11, d30, d2
+    vmlal.u8        q12, d31, d2
+
+    vext.8          d27, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d28, d8, d9, #5
+    vext.8          d29, d10, d11, #5
+    vext.8          d30, d12, d13, #5
+    vext.8          d31, d14, d15, #5
+
+    vmlal.u8        q8, d27, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q9, d28, d5
+    vmlal.u8        q10, d29, d5
+    vmlal.u8        q11, d30, d5
+    vmlal.u8        q12, d31, d5
+
+    vext.8          d27, d6, d7, #3         ;construct src_ptr[1]
+    vext.8          d28, d8, d9, #3
+    vext.8          d29, d10, d11, #3
+    vext.8          d30, d12, d13, #3
+    vext.8          d31, d14, d15, #3
+
+    vmull.u8        q3, d27, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q4, d28, d3
+    vmull.u8        q5, d29, d3
+    vmull.u8        q6, d30, d3
+    vmull.u8        q7, d31, d3
+
+    vqadd.s16       q8, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q9, q4
+    vqadd.s16       q10, q5
+    vqadd.s16       q11, q6
+    vqadd.s16       q12, q7
+
+    vqrshrun.s16    d26, q8, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d27, q9, #7
+    vqrshrun.s16    d28, q10, #7
+    vqrshrun.s16    d29, q11, #7
+    vqrshrun.s16    d30, q12, #7
+
+;Second pass: 8x4
+;secondpass_filter
+    add             r3, r12, r3, lsl #5
+    sub             lr, lr, #32
+
+    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
+    vld1.u8         {q11}, [lr]!
+
+    vabs.s32        q7, q5
+    vabs.s32        q8, q6
+
+    vld1.u8         {q12}, [lr]!
+
+    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
+    vdup.8          d1, d14[4]
+    vdup.8          d2, d15[0]
+    vdup.8          d3, d15[4]
+    vdup.8          d4, d16[0]
+    vdup.8          d5, d16[4]
+
+    vmull.u8        q3, d22, d0             ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q4, d23, d0
+    vmull.u8        q5, d24, d0
+    vmull.u8        q6, d25, d0
+
+    vmlsl.u8        q3, d23, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q4, d24, d1
+    vmlsl.u8        q5, d25, d1
+    vmlsl.u8        q6, d26, d1
+
+    vmlsl.u8        q3, d26, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q4, d27, d4
+    vmlsl.u8        q5, d28, d4
+    vmlsl.u8        q6, d29, d4
+
+    vmlal.u8        q3, d24, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q4, d25, d2
+    vmlal.u8        q5, d26, d2
+    vmlal.u8        q6, d27, d2
+
+    vmlal.u8        q3, d27, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q4, d28, d5
+    vmlal.u8        q5, d29, d5
+    vmlal.u8        q6, d30, d5
+
+    vmull.u8        q7, d25, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q8, d26, d3
+    vmull.u8        q9, d27, d3
+    vmull.u8        q10, d28, d3
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q8, #7
+    vqrshrun.s16    d8, q9, #7
+    vqrshrun.s16    d9, q10, #7
+
+    vst1.u8         {d6}, [r4], r5          ;store result
+    vst1.u8         {d7}, [r4], r5
+    vst1.u8         {d8}, [r4], r5
+    vst1.u8         {d9}, [r4], r5
+
+    add             sp, sp, #32
+    pop             {r4-r5,pc}
+
+;--------------------
+firstpass_filter8x4_only
+    vabs.s32        q12, q14
+    vabs.s32        q13, q15
+
+    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
+    vld1.u8         {q3}, [r0], r1          ;load src data
+
+    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
+    vld1.u8         {q4}, [r0], r1
+    vdup.8          d1, d24[4]
+    vld1.u8         {q5}, [r0], r1
+    vdup.8          d2, d25[0]
+    vld1.u8         {q6}, [r0], r1
+    vdup.8          d3, d25[4]
+    vdup.8          d4, d26[0]
+    vdup.8          d5, d26[4]
+
+;First pass: output_height lines x output_width columns (4x8)
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q8, d8, d0
+    vmull.u8        q9, d10, d0
+    vmull.u8        q10, d12, d0
+
+    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
+    vext.8          d29, d8, d9, #1
+    vext.8          d30, d10, d11, #1
+    vext.8          d31, d12, d13, #1
+
+    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q8, d29, d1
+    vmlsl.u8        q9, d30, d1
+    vmlsl.u8        q10, d31, d1
+
+    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
+    vext.8          d29, d8, d9, #4
+    vext.8          d30, d10, d11, #4
+    vext.8          d31, d12, d13, #4
+
+    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q8, d29, d4
+    vmlsl.u8        q9, d30, d4
+    vmlsl.u8        q10, d31, d4
+
+    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
+    vext.8          d29, d8, d9, #2
+    vext.8          d30, d10, d11, #2
+    vext.8          d31, d12, d13, #2
+
+    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q8, d29, d2
+    vmlal.u8        q9, d30, d2
+    vmlal.u8        q10, d31, d2
+
+    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d29, d8, d9, #5
+    vext.8          d30, d10, d11, #5
+    vext.8          d31, d12, d13, #5
+
+    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q8, d29, d5
+    vmlal.u8        q9, d30, d5
+    vmlal.u8        q10, d31, d5
+
+    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
+    vext.8          d29, d8, d9, #3
+    vext.8          d30, d10, d11, #3
+    vext.8          d31, d12, d13, #3
+
+    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q4, d29, d3
+    vmull.u8        q5, d30, d3
+    vmull.u8        q6, d31, d3
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d23, q8, #7
+    vqrshrun.s16    d24, q9, #7
+    vqrshrun.s16    d25, q10, #7
+
+    vst1.u8         {d22}, [r4], r5         ;store result
+    vst1.u8         {d23}, [r4], r5
+    vst1.u8         {d24}, [r4], r5
+    vst1.u8         {d25}, [r4], r5
+
+    pop             {r4-r5,pc}
+
+;---------------------
+secondpass_filter8x4_only
+;Second pass: 8x4
+    add             r3, r12, r3, lsl #5
+    sub             r0, r0, r1, lsl #1
+    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
+    vabs.s32        q7, q5
+    vabs.s32        q8, q6
+
+    vld1.u8         {d22}, [r0], r1
+    vld1.u8         {d23}, [r0], r1
+    vld1.u8         {d24}, [r0], r1
+    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
+    vld1.u8         {d25}, [r0], r1
+    vdup.8          d1, d14[4]
+    vld1.u8         {d26}, [r0], r1
+    vdup.8          d2, d15[0]
+    vld1.u8         {d27}, [r0], r1
+    vdup.8          d3, d15[4]
+    vld1.u8         {d28}, [r0], r1
+    vdup.8          d4, d16[0]
+    vld1.u8         {d29}, [r0], r1
+    vdup.8          d5, d16[4]
+    vld1.u8         {d30}, [r0], r1
+
+    vmull.u8        q3, d22, d0             ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q4, d23, d0
+    vmull.u8        q5, d24, d0
+    vmull.u8        q6, d25, d0
+
+    vmlsl.u8        q3, d23, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q4, d24, d1
+    vmlsl.u8        q5, d25, d1
+    vmlsl.u8        q6, d26, d1
+
+    vmlsl.u8        q3, d26, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q4, d27, d4
+    vmlsl.u8        q5, d28, d4
+    vmlsl.u8        q6, d29, d4
+
+    vmlal.u8        q3, d24, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q4, d25, d2
+    vmlal.u8        q5, d26, d2
+    vmlal.u8        q6, d27, d2
+
+    vmlal.u8        q3, d27, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q4, d28, d5
+    vmlal.u8        q5, d29, d5
+    vmlal.u8        q6, d30, d5
+
+    vmull.u8        q7, d25, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q8, d26, d3
+    vmull.u8        q9, d27, d3
+    vmull.u8        q10, d28, d3
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q8, #7
+    vqrshrun.s16    d8, q9, #7
+    vqrshrun.s16    d9, q10, #7
+
+    vst1.u8         {d6}, [r4], r5          ;store result
+    vst1.u8         {d7}, [r4], r5
+    vst1.u8         {d8}, [r4], r5
+    vst1.u8         {d9}, [r4], r5
+
+    pop             {r4-r5,pc}
+
+    ENDP
+
+;-----------------
+
+    END
--- /dev/null
+++ b/vp9/common/arm/neon/sixtappredict8x8_neon.asm
@@ -1,0 +1,524 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sixtap_predict8x8_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter8_coeff
+    DCD     0,  0,  128,    0,   0,  0,   0,  0
+    DCD     0, -6,  123,   12,  -1,  0,   0,  0
+    DCD     2, -11, 108,   36,  -8,  1,   0,  0
+    DCD     0, -9,   93,   50,  -6,  0,   0,  0
+    DCD     3, -16,  77,   77, -16,  3,   0,  0
+    DCD     0, -6,   50,   93,  -9,  0,   0,  0
+    DCD     1, -8,   36,  108, -11,  2,   0,  0
+    DCD     0, -1,   12,  123,  -6,  0,   0,  0
+
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int  dst_pitch
+
+|vp8_sixtap_predict8x8_neon| PROC
+    push            {r4-r5, lr}
+
+    adr             r12, filter8_coeff
+
+    ldr             r4, [sp, #12]           ;load parameters from stack
+    ldr             r5, [sp, #16]           ;load parameters from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_filter8x8_only
+
+    add             r2, r12, r2, lsl #5     ;calculate filter location
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+
+    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
+
+    beq             firstpass_filter8x8_only
+
+    sub             sp, sp, #64             ;reserve space on stack for temporary storage
+    mov             lr, sp
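+    ;64 bytes holds the first 8 of the 13 intermediate rows; the last 5
+    ;first-pass rows stay in registers for the second pass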
+
+    vabs.s32        q12, q14
+    vabs.s32        q13, q15
+
+    mov             r2, #2                  ;loop counter
+    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
+    sub             r0, r0, r1, lsl #1
+
+    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
+    vdup.8          d1, d24[4]
+    vdup.8          d2, d25[0]
+
+;First pass: output_height lines x output_width columns (13x8)
+    vld1.u8         {q3}, [r0], r1          ;load src data
+    vdup.8          d3, d25[4]
+    vld1.u8         {q4}, [r0], r1
+    vdup.8          d4, d26[0]
+    vld1.u8         {q5}, [r0], r1
+    vdup.8          d5, d26[4]
+    vld1.u8         {q6}, [r0], r1
+
+filt_blk2d_fp8x8_loop_neon
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q8, d8, d0
+    vmull.u8        q9, d10, d0
+    vmull.u8        q10, d12, d0
+
+    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
+    vext.8          d29, d8, d9, #1
+    vext.8          d30, d10, d11, #1
+    vext.8          d31, d12, d13, #1
+
+    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q8, d29, d1
+    vmlsl.u8        q9, d30, d1
+    vmlsl.u8        q10, d31, d1
+
+    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
+    vext.8          d29, d8, d9, #4
+    vext.8          d30, d10, d11, #4
+    vext.8          d31, d12, d13, #4
+
+    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q8, d29, d4
+    vmlsl.u8        q9, d30, d4
+    vmlsl.u8        q10, d31, d4
+
+    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
+    vext.8          d29, d8, d9, #2
+    vext.8          d30, d10, d11, #2
+    vext.8          d31, d12, d13, #2
+
+    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q8, d29, d2
+    vmlal.u8        q9, d30, d2
+    vmlal.u8        q10, d31, d2
+
+    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d29, d8, d9, #5
+    vext.8          d30, d10, d11, #5
+    vext.8          d31, d12, d13, #5
+
+    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q8, d29, d5
+    vmlal.u8        q9, d30, d5
+    vmlal.u8        q10, d31, d5
+
+    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
+    vext.8          d29, d8, d9, #3
+    vext.8          d30, d10, d11, #3
+    vext.8          d31, d12, d13, #3
+
+    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q4, d29, d3
+    vmull.u8        q5, d30, d3
+    vmull.u8        q6, d31, d3
+
+    subs            r2, r2, #1
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vld1.u8         {q3}, [r0], r1          ;load src data
+
+    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d23, q8, #7
+    vqrshrun.s16    d24, q9, #7
+    vqrshrun.s16    d25, q10, #7
+
+    vst1.u8         {d22}, [lr]!            ;store result
+    vld1.u8         {q4}, [r0], r1
+    vst1.u8         {d23}, [lr]!
+    vld1.u8         {q5}, [r0], r1
+    vst1.u8         {d24}, [lr]!
+    vld1.u8         {q6}, [r0], r1
+    vst1.u8         {d25}, [lr]!
+
+    bne             filt_blk2d_fp8x8_loop_neon
+
+    ;first_pass filtering on the remaining 5 lines of data
+    ;vld1.u8            {q3}, [r0], r1          ;load src data
+    ;vld1.u8            {q4}, [r0], r1
+    ;vld1.u8            {q5}, [r0], r1
+    ;vld1.u8            {q6}, [r0], r1
+    vld1.u8         {q7}, [r0], r1
+
+    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q9, d8, d0
+    vmull.u8        q10, d10, d0
+    vmull.u8        q11, d12, d0
+    vmull.u8        q12, d14, d0
+
+    vext.8          d27, d6, d7, #1         ;construct src_ptr[-1]
+    vext.8          d28, d8, d9, #1
+    vext.8          d29, d10, d11, #1
+    vext.8          d30, d12, d13, #1
+    vext.8          d31, d14, d15, #1
+
+    vmlsl.u8        q8, d27, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q9, d28, d1
+    vmlsl.u8        q10, d29, d1
+    vmlsl.u8        q11, d30, d1
+    vmlsl.u8        q12, d31, d1
+
+    vext.8          d27, d6, d7, #4         ;construct src_ptr[2]
+    vext.8          d28, d8, d9, #4
+    vext.8          d29, d10, d11, #4
+    vext.8          d30, d12, d13, #4
+    vext.8          d31, d14, d15, #4
+
+    vmlsl.u8        q8, d27, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q9, d28, d4
+    vmlsl.u8        q10, d29, d4
+    vmlsl.u8        q11, d30, d4
+    vmlsl.u8        q12, d31, d4
+
+    vext.8          d27, d6, d7, #2         ;construct src_ptr[0]
+    vext.8          d28, d8, d9, #2
+    vext.8          d29, d10, d11, #2
+    vext.8          d30, d12, d13, #2
+    vext.8          d31, d14, d15, #2
+
+    vmlal.u8        q8, d27, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q9, d28, d2
+    vmlal.u8        q10, d29, d2
+    vmlal.u8        q11, d30, d2
+    vmlal.u8        q12, d31, d2
+
+    vext.8          d27, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d28, d8, d9, #5
+    vext.8          d29, d10, d11, #5
+    vext.8          d30, d12, d13, #5
+    vext.8          d31, d14, d15, #5
+
+    vmlal.u8        q8, d27, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q9, d28, d5
+    vmlal.u8        q10, d29, d5
+    vmlal.u8        q11, d30, d5
+    vmlal.u8        q12, d31, d5
+
+    vext.8          d27, d6, d7, #3         ;construct src_ptr[1]
+    vext.8          d28, d8, d9, #3
+    vext.8          d29, d10, d11, #3
+    vext.8          d30, d12, d13, #3
+    vext.8          d31, d14, d15, #3
+
+    vmull.u8        q3, d27, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q4, d28, d3
+    vmull.u8        q5, d29, d3
+    vmull.u8        q6, d30, d3
+    vmull.u8        q7, d31, d3
+
+    vqadd.s16       q8, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q9, q4
+    vqadd.s16       q10, q5
+    vqadd.s16       q11, q6
+    vqadd.s16       q12, q7
+
+    add             r3, r12, r3, lsl #5
+
+    vqrshrun.s16    d26, q8, #7             ;shift/round/saturate to u8
+    sub             lr, lr, #64
+    vqrshrun.s16    d27, q9, #7
+    vld1.u8         {q9}, [lr]!             ;load intermediate data from stack
+    vqrshrun.s16    d28, q10, #7
+    vld1.u8         {q10}, [lr]!
+
+    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
+
+    vqrshrun.s16    d29, q11, #7
+    vld1.u8         {q11}, [lr]!
+
+    vabs.s32        q7, q5
+    vabs.s32        q8, q6
+
+    vqrshrun.s16    d30, q12, #7
+    vld1.u8         {q12}, [lr]!
+
+;Second pass: 8x8
+    mov             r3, #2                  ;loop counter
+
+    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
+    vdup.8          d1, d14[4]
+    vdup.8          d2, d15[0]
+    vdup.8          d3, d15[4]
+    vdup.8          d4, d16[0]
+    vdup.8          d5, d16[4]
+
+filt_blk2d_sp8x8_loop_neon
+    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q4, d19, d0
+    vmull.u8        q5, d20, d0
+    vmull.u8        q6, d21, d0
+
+    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q4, d20, d1
+    vmlsl.u8        q5, d21, d1
+    vmlsl.u8        q6, d22, d1
+
+    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q4, d23, d4
+    vmlsl.u8        q5, d24, d4
+    vmlsl.u8        q6, d25, d4
+
+    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q4, d21, d2
+    vmlal.u8        q5, d22, d2
+    vmlal.u8        q6, d23, d2
+
+    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q4, d24, d5
+    vmlal.u8        q5, d25, d5
+    vmlal.u8        q6, d26, d5
+
+    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q8, d22, d3
+    vmull.u8        q9, d23, d3
+    vmull.u8        q10, d24, d3
+
+    subs            r3, r3, #1
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q8, #7
+    vqrshrun.s16    d8, q9, #7
+    vqrshrun.s16    d9, q10, #7
+
+    vmov            q9, q11
+    vst1.u8         {d6}, [r4], r5          ;store result
+    vmov            q10, q12
+    vst1.u8         {d7}, [r4], r5
+    vmov            q11, q13
+    vst1.u8         {d8}, [r4], r5
+    vmov            q12, q14
+    vst1.u8         {d9}, [r4], r5
+    vmov            d26, d30
+
+    bne filt_blk2d_sp8x8_loop_neon
+
+    add             sp, sp, #64
+    pop             {r4-r5,pc}
+
+;---------------------
+firstpass_filter8x8_only
+    ;add                r2, r12, r2, lsl #5     ;calculate filter location
+    ;vld1.s32       {q14, q15}, [r2]        ;load first_pass filter
+    vabs.s32        q12, q14
+    vabs.s32        q13, q15
+
+    mov             r2, #2                  ;loop counter
+    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
+
+    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
+    vdup.8          d1, d24[4]
+    vdup.8          d2, d25[0]
+    vdup.8          d3, d25[4]
+    vdup.8          d4, d26[0]
+    vdup.8          d5, d26[4]
+
+;First pass: output_height lines x output_width columns (8x8)
+filt_blk2d_fpo8x8_loop_neon
+    vld1.u8         {q3}, [r0], r1          ;load src data
+    vld1.u8         {q4}, [r0], r1
+    vld1.u8         {q5}, [r0], r1
+    vld1.u8         {q6}, [r0], r1
+
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q8, d8, d0
+    vmull.u8        q9, d10, d0
+    vmull.u8        q10, d12, d0
+
+    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
+    vext.8          d29, d8, d9, #1
+    vext.8          d30, d10, d11, #1
+    vext.8          d31, d12, d13, #1
+
+    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q8, d29, d1
+    vmlsl.u8        q9, d30, d1
+    vmlsl.u8        q10, d31, d1
+
+    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
+    vext.8          d29, d8, d9, #4
+    vext.8          d30, d10, d11, #4
+    vext.8          d31, d12, d13, #4
+
+    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q8, d29, d4
+    vmlsl.u8        q9, d30, d4
+    vmlsl.u8        q10, d31, d4
+
+    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
+    vext.8          d29, d8, d9, #2
+    vext.8          d30, d10, d11, #2
+    vext.8          d31, d12, d13, #2
+
+    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q8, d29, d2
+    vmlal.u8        q9, d30, d2
+    vmlal.u8        q10, d31, d2
+
+    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
+    vext.8          d29, d8, d9, #5
+    vext.8          d30, d10, d11, #5
+    vext.8          d31, d12, d13, #5
+
+    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q8, d29, d5
+    vmlal.u8        q9, d30, d5
+    vmlal.u8        q10, d31, d5
+
+    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
+    vext.8          d29, d8, d9, #3
+    vext.8          d30, d10, d11, #3
+    vext.8          d31, d12, d13, #3
+
+    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q4, d29, d3
+    vmull.u8        q5, d30, d3
+    vmull.u8        q6, d31, d3
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    subs            r2, r2, #1
+
+    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
+    vqrshrun.s16    d23, q8, #7
+    vqrshrun.s16    d24, q9, #7
+    vqrshrun.s16    d25, q10, #7
+
+    vst1.u8         {d22}, [r4], r5         ;store result
+    vst1.u8         {d23}, [r4], r5
+    vst1.u8         {d24}, [r4], r5
+    vst1.u8         {d25}, [r4], r5
+
+    bne             filt_blk2d_fpo8x8_loop_neon
+
+    pop             {r4-r5,pc}
+
+;---------------------
+secondpass_filter8x8_only
+    sub             r0, r0, r1, lsl #1
+    add             r3, r12, r3, lsl #5
+
+    vld1.u8         {d18}, [r0], r1         ;load src data
+    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
+    vld1.u8         {d19}, [r0], r1
+    vabs.s32        q7, q5
+    vld1.u8         {d20}, [r0], r1
+    vabs.s32        q8, q6
+    vld1.u8         {d21}, [r0], r1
+    mov             r3, #2                  ;loop counter
+    vld1.u8         {d22}, [r0], r1
+    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
+    vld1.u8         {d23}, [r0], r1
+    vdup.8          d1, d14[4]
+    vld1.u8         {d24}, [r0], r1
+    vdup.8          d2, d15[0]
+    vld1.u8         {d25}, [r0], r1
+    vdup.8          d3, d15[4]
+    vld1.u8         {d26}, [r0], r1
+    vdup.8          d4, d16[0]
+    vld1.u8         {d27}, [r0], r1
+    vdup.8          d5, d16[4]
+    vld1.u8         {d28}, [r0], r1
+    vld1.u8         {d29}, [r0], r1
+    vld1.u8         {d30}, [r0], r1
+
+;Second pass: 8x8
+filt_blk2d_spo8x8_loop_neon
+    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp9_filter[0])
+    vmull.u8        q4, d19, d0
+    vmull.u8        q5, d20, d0
+    vmull.u8        q6, d21, d0
+
+    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp9_filter[1])
+    vmlsl.u8        q4, d20, d1
+    vmlsl.u8        q5, d21, d1
+    vmlsl.u8        q6, d22, d1
+
+    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp9_filter[4])
+    vmlsl.u8        q4, d23, d4
+    vmlsl.u8        q5, d24, d4
+    vmlsl.u8        q6, d25, d4
+
+    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp9_filter[2])
+    vmlal.u8        q4, d21, d2
+    vmlal.u8        q5, d22, d2
+    vmlal.u8        q6, d23, d2
+
+    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp9_filter[5])
+    vmlal.u8        q4, d24, d5
+    vmlal.u8        q5, d25, d5
+    vmlal.u8        q6, d26, d5
+
+    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp9_filter[3])
+    vmull.u8        q8, d22, d3
+    vmull.u8        q9, d23, d3
+    vmull.u8        q10, d24, d3
+
+    subs            r3, r3, #1
+
+    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
+    vqadd.s16       q8, q4
+    vqadd.s16       q9, q5
+    vqadd.s16       q10, q6
+
+    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
+    vqrshrun.s16    d7, q8, #7
+    vqrshrun.s16    d8, q9, #7
+    vqrshrun.s16    d9, q10, #7
+
+    vmov            q9, q11
+    vst1.u8         {d6}, [r4], r5          ;store result
+    vmov            q10, q12
+    vst1.u8         {d7}, [r4], r5
+    vmov            q11, q13
+    vst1.u8         {d8}, [r4], r5
+    vmov            q12, q14
+    vst1.u8         {d9}, [r4], r5
+    vmov            d26, d30
+
+    bne filt_blk2d_spo8x8_loop_neon
+
+    pop             {r4-r5,pc}
+
+    ENDP
+
+;-----------------
+
+    END
--- /dev/null
+++ b/vp9/common/arm/recon_arm.h
@@ -1,0 +1,90 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef RECON_ARM_H
+#define RECON_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_recon_block(vp9_recon_b_armv6);
+extern prototype_recon_block(vp9_recon2b_armv6);
+extern prototype_recon_block(vp9_recon4b_armv6);
+
+extern prototype_copy_block(vp9_copy_mem8x8_v6);
+extern prototype_copy_block(vp9_copy_mem8x4_v6);
+extern prototype_copy_block(vp9_copy_mem16x16_v6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
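+/* Without runtime CPU detection the dispatch macros are rebound at compile
+ * time, so calls through the generic vp8_recon_* hooks resolve directly to
+ * the ARMv6 implementations with no function-pointer indirection. */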
+#undef  vp8_recon_recon
+#define vp8_recon_recon vp9_recon_b_armv6
+
+#undef  vp8_recon_recon2
+#define vp8_recon_recon2 vp9_recon2b_armv6
+
+#undef  vp8_recon_recon4
+#define vp8_recon_recon4 vp9_recon4b_armv6
+
+#undef  vp8_recon_copy8x8
+#define vp8_recon_copy8x8 vp9_copy_mem8x8_v6
+
+#undef  vp8_recon_copy8x4
+#define vp8_recon_copy8x4 vp9_copy_mem8x4_v6
+
+#undef  vp8_recon_copy16x16
+#define vp8_recon_copy16x16 vp9_copy_mem16x16_v6
+#endif
+#endif
+
+#if HAVE_ARMV7
+extern prototype_recon_block(vp9_recon_b_neon);
+extern prototype_recon_block(vp9_recon2b_neon);
+extern prototype_recon_block(vp9_recon4b_neon);
+
+extern prototype_copy_block(vp9_copy_mem8x8_neon);
+extern prototype_copy_block(vp9_copy_mem8x4_neon);
+extern prototype_copy_block(vp9_copy_mem16x16_neon);
+
+extern prototype_recon_macroblock(vp9_recon_mb_neon);
+
+extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_neon);
+extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_s_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_recon_recon
+#define vp8_recon_recon vp9_recon_b_neon
+
+#undef  vp8_recon_recon2
+#define vp8_recon_recon2 vp9_recon2b_neon
+
+#undef  vp8_recon_recon4
+#define vp8_recon_recon4 vp9_recon4b_neon
+
+#undef  vp8_recon_copy8x8
+#define vp8_recon_copy8x8 vp9_copy_mem8x8_neon
+
+#undef  vp8_recon_copy8x4
+#define vp8_recon_copy8x4 vp9_copy_mem8x4_neon
+
+#undef  vp8_recon_copy16x16
+#define vp8_recon_copy16x16 vp9_copy_mem16x16_neon
+
+#undef  vp8_recon_recon_mb
+#define vp8_recon_recon_mb vp9_recon_mb_neon
+
+#undef  vp9_recon_build_intra_predictors_mby
+#define vp9_recon_build_intra_predictors_mby vp9_build_intra_predictors_mby_neon
+
+#undef  vp9_recon_build_intra_predictors_mby_s
+#define vp9_recon_build_intra_predictors_mby_s vp9_build_intra_predictors_mby_s_neon
+
+#endif
+#endif
+
+#endif
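The #undef/#define pairs above implement compile-time dispatch: when runtime CPU
detection is disabled, the generic function-table slot names (still carrying the
vp8_ prefix at this point in the rename) are re-pointed directly at the best
available ARM implementation, so no function pointers are needed. A minimal
sketch of the pattern (all names hypothetical):

/* Default binding, as a generic RTCD-style header might set it up. */
#define recon_fn recon_c

/* Static override when there is no runtime detection to do. */
#if !CONFIG_RUNTIME_CPU_DETECT && HAVE_ARMV7
#undef  recon_fn
#define recon_fn recon_neon
#endif

/* Call sites always write recon_fn(...); the preprocessor picks the target. */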
--- /dev/null
+++ b/vp9/common/arm/reconintra_arm.c
@@ -1,0 +1,62 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vp9/common/blockd.h"
+#include "vp9/common/reconintra.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/recon.h"
+
+#if HAVE_ARMV7
+extern void vp9_build_intra_predictors_mby_neon_func(
+  unsigned char *y_buffer,
+  unsigned char *ypred_ptr,
+  int y_stride,
+  int mode,
+  int Up,
+  int Left);
+
+void vp9_build_intra_predictors_mby_neon(MACROBLOCKD *xd) {
+  unsigned char *y_buffer = xd->dst.y_buffer;
+  unsigned char *ypred_ptr = xd->predictor;
+  int y_stride = xd->dst.y_stride;
+  int mode = xd->mode_info_context->mbmi.mode;
+  int Up = xd->up_available;
+  int Left = xd->left_available;
+
+  vp9_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr,
+                                           y_stride, mode, Up, Left);
+}
+#endif
+
+
+#if HAVE_ARMV7
+extern void vp9_build_intra_predictors_mby_s_neon_func(
+  unsigned char *y_buffer,
+  unsigned char *ypred_ptr,
+  int y_stride,
+  int mode,
+  int Up,
+  int Left);
+
+void vp9_build_intra_predictors_mby_s_neon(MACROBLOCKD *xd) {
+  unsigned char *y_buffer = xd->dst.y_buffer;
+  unsigned char *ypred_ptr = xd->predictor;
+  int y_stride = xd->dst.y_stride;
+  int mode = xd->mode_info_context->mbmi.mode;
+  int Up = xd->up_available;
+  int Left = xd->left_available;
+
+  vp9_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr,
+                                             y_stride, mode, Up, Left);
+}
+
+#endif
--- /dev/null
+++ b/vp9/common/arm/subpixel_arm.h
@@ -1,0 +1,89 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef SUBPIXEL_ARM_H
+#define SUBPIXEL_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_subpixel_predict(vp9_sixtap_predict16x16_armv6);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x8_armv6);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x4_armv6);
+extern prototype_subpixel_predict(vp9_sixtap_predict_armv6);
+extern prototype_subpixel_predict(vp9_bilinear_predict16x16_armv6);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_armv6);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x4_armv6);
+extern prototype_subpixel_predict(vp9_bilinear_predict4x4_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_armv6
+
+#undef  vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_armv6
+
+#undef  vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_armv6
+
+#undef  vp9_subpix_sixtap4x4
+#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_armv6
+
+#undef  vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_armv6
+
+#undef  vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_armv6
+
+#undef  vp9_subpix_bilinear8x4
+#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_armv6
+
+#undef  vp9_subpix_bilinear4x4
+#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_armv6
+#endif
+#endif
+
+#if HAVE_ARMV7
+extern prototype_subpixel_predict(vp9_sixtap_predict16x16_neon);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x8_neon);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x4_neon);
+extern prototype_subpixel_predict(vp9_sixtap_predict_neon);
+extern prototype_subpixel_predict(vp9_bilinear_predict16x16_neon);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_neon);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x4_neon);
+extern prototype_subpixel_predict(vp9_bilinear_predict4x4_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_neon
+
+#undef  vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_neon
+
+#undef  vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_neon
+
+#undef  vp9_subpix_sixtap4x4
+#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_neon
+
+#undef  vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_neon
+
+#undef  vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_neon
+
+#undef  vp9_subpix_bilinear8x4
+#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_neon
+
+#undef  vp9_subpix_bilinear4x4
+#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_neon
+#endif
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/common/asm_com_offsets.c
@@ -1,0 +1,40 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/asm_offsets.h"
+#include "vpx_scale/yv12config.h"
+
+BEGIN
+
+/* vpx_scale */
+DEFINE(yv12_buffer_config_y_width,              offsetof(YV12_BUFFER_CONFIG, y_width));
+DEFINE(yv12_buffer_config_y_height,             offsetof(YV12_BUFFER_CONFIG, y_height));
+DEFINE(yv12_buffer_config_y_stride,             offsetof(YV12_BUFFER_CONFIG, y_stride));
+DEFINE(yv12_buffer_config_uv_width,             offsetof(YV12_BUFFER_CONFIG, uv_width));
+DEFINE(yv12_buffer_config_uv_height,            offsetof(YV12_BUFFER_CONFIG, uv_height));
+DEFINE(yv12_buffer_config_uv_stride,            offsetof(YV12_BUFFER_CONFIG, uv_stride));
+DEFINE(yv12_buffer_config_y_buffer,             offsetof(YV12_BUFFER_CONFIG, y_buffer));
+DEFINE(yv12_buffer_config_u_buffer,             offsetof(YV12_BUFFER_CONFIG, u_buffer));
+DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_buffer));
+DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));
+DEFINE(VP8BORDERINPIXELS_VAL,                   VP8BORDERINPIXELS);
+
+END
+
+/* add asserts for any offset that is not supported by assembly code */
+/* add asserts for any size that is not supported by assembly code */
+
+#if HAVE_ARMV7
+/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
+ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
+#endif
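The DEFINE() entries above are evaluated at build time and emitted in a form the
assembler can include, so hand-written assembly (such as the NEON border
extension referenced above) can address YV12_BUFFER_CONFIG fields by symbolic
offset instead of hard-coded numbers; ct_assert() then fails the build if an
assumption baked into the assembly no longer holds. One common way such a
compile-time assert is implemented (a sketch, not necessarily the definition in
vpx_ports/asm_offsets.h):

/* A negative array size is a compile error, so compilation stops if cond is
 * false; the name parameter keeps the dummy symbols distinct. Some compilers
 * may warn about the unused dummy array. */
#define ct_assert(name, cond) \
  static char assert_##name[(cond) ? 1 : -1];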
--- /dev/null
+++ b/vp9/common/blockd.c
@@ -1,0 +1,29 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+
+const unsigned char vp9_block2left[25] = {
+  0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+const unsigned char vp9_block2above[25] = {
+  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
+};
+
+const unsigned char vp9_block2left_8x8[25] = {
+  0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8
+};
+const unsigned char vp9_block2above_8x8[25] = {
+  0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8
+};
+
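The four tables above map the 25 per-macroblock block indices (16 Y, 4 U, 4 V,
1 Y2) to slots inside ENTROPY_CONTEXT_PLANES (y1[4], u[2], v[2], y2): for
example, Y block 5 uses left and above context 1, and block 24 maps to slot 8,
the Y2 entry, in both tables. The _8x8 variants collapse each 8x8 transform onto
the context slot of its top-left 4x4 block. A sketch of how such tables are
typically consumed (function name illustrative; types as declared in blockd.h):

/* Look up the above/left entropy contexts for block ib. */
static void get_block_contexts(MACROBLOCKD *xd, int ib,
                               ENTROPY_CONTEXT **a, ENTROPY_CONTEXT **l) {
  *a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above[ib];
  *l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left[ib];
}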
--- /dev/null
+++ b/vp9/common/blockd.h
@@ -1,0 +1,518 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BLOCKD_H
+#define __INC_BLOCKD_H
+
+void vpx_log(const char *format, ...);
+
+#include "vpx_ports/config.h"
+#include "vpx_scale/yv12config.h"
+#include "mv.h"
+#include "treecoder.h"
+#include "subpixel.h"
+#include "vpx_ports/mem.h"
+#include "common.h"
+
+#define TRUE    1
+#define FALSE   0
+
+// #define MODE_STATS
+
+/*#define DCPRED 1*/
+#define DCPREDSIMTHRESH 0
+#define DCPREDCNTTHRESH 3
+
+#define MB_FEATURE_TREE_PROBS   3
+#define PREDICTION_PROBS 3
+
+#define MBSKIP_CONTEXTS 3
+
+#define MAX_MB_SEGMENTS         4
+
+#define MAX_REF_LF_DELTAS       4
+#define MAX_MODE_LF_DELTAS      4
+
+/* Segment Feature Masks */
+#define SEGMENT_DELTADATA   0
+#define SEGMENT_ABSDATA     1
+#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
+#define MAX_MV_REFS 19
+#endif
+
+typedef struct {
+  int r, c;
+} POS;
+
+typedef enum PlaneType {
+  PLANE_TYPE_Y_NO_DC = 0,
+  PLANE_TYPE_Y2,
+  PLANE_TYPE_UV,
+  PLANE_TYPE_Y_WITH_DC,
+} PLANE_TYPE;
+
+typedef char ENTROPY_CONTEXT;
+typedef struct {
+  ENTROPY_CONTEXT y1[4];
+  ENTROPY_CONTEXT u[2];
+  ENTROPY_CONTEXT v[2];
+  ENTROPY_CONTEXT y2;
+} ENTROPY_CONTEXT_PLANES;
+
+extern const unsigned char vp9_block2left[25];
+extern const unsigned char vp9_block2above[25];
+extern const unsigned char vp9_block2left_8x8[25];
+extern const unsigned char vp9_block2above_8x8[25];
+
+#define VP9_COMBINEENTROPYCONTEXTS( Dest, A, B) \
+  Dest = ((A)!=0) + ((B)!=0);
+
+typedef enum {
+  KEY_FRAME = 0,
+  INTER_FRAME = 1
+} FRAME_TYPE;
+
+typedef enum {
+  SIXTAP   = 0,
+  BILINEAR = 1,
+  EIGHTTAP = 2,
+  EIGHTTAP_SHARP = 3,
+  SWITCHABLE  /* should be the last one */
+} INTERPOLATIONFILTERTYPE;
+
+typedef enum {
+  DC_PRED,            /* average of above and left pixels */
+  V_PRED,             /* vertical prediction */
+  H_PRED,             /* horizontal prediction */
+  D45_PRED,           /* Directional 45 deg prediction  [anti-clockwise from 0 deg hor] */
+  D135_PRED,          /* Directional 135 deg prediction [anti-clockwise from 0 deg hor] */
+  D117_PRED,          /* Directional 117 deg prediction [anti-clockwise from 0 deg hor] */
+  D153_PRED,          /* Directional 153 deg prediction [anti-clockwise from 0 deg hor] */
+  D27_PRED,           /* Directional 27 deg prediction  [anti-clockwise from 0 deg hor] */
+  D63_PRED,           /* Directional 63 deg prediction  [anti-clockwise from 0 deg hor] */
+  TM_PRED,            /* Truemotion prediction */
+  I8X8_PRED,          /* 8x8 based prediction, each 8x8 has its own prediction mode */
+  B_PRED,             /* block based prediction, each block has its own prediction mode */
+
+  NEARESTMV,
+  NEARMV,
+  ZEROMV,
+  NEWMV,
+  SPLITMV,
+
+  MB_MODE_COUNT
+} MB_PREDICTION_MODE;
+
+// Segment level features.
+typedef enum {
+  SEG_LVL_ALT_Q = 0,               // Use alternate Quantizer ....
+  SEG_LVL_ALT_LF = 1,              // Use alternate loop filter value...
+  SEG_LVL_REF_FRAME = 2,           // Optional Segment reference frame
+  SEG_LVL_MODE = 3,                // Optional Segment mode
+  SEG_LVL_EOB = 4,                 // EOB end stop marker.
+  SEG_LVL_TRANSFORM = 5,           // Block transform size.
+  SEG_LVL_MAX = 6                  // Number of MB level features supported
+
+} SEG_LVL_FEATURES;
+
+// Segment level features.
+typedef enum {
+  TX_4X4,                      // 4x4 dct transform
+  TX_8X8,                      // 8x8 dct transform
+  TX_16X16,                    // 16x16 dct transform
+  TX_SIZE_MAX                  // Number of different transforms available
+} TX_SIZE;
+
+typedef enum {
+  DCT_DCT   = 0,                      // DCT  in both horizontal and vertical
+  ADST_DCT  = 1,                      // ADST in vertical, DCT in horizontal
+  DCT_ADST  = 2,                      // DCT  in vertical, ADST in horizontal
+  ADST_ADST = 3                       // ADST in both directions
+} TX_TYPE;
+
+#define VP9_YMODES  (B_PRED + 1)
+#define VP9_UV_MODES (TM_PRED + 1)
+#define VP9_I8X8_MODES (TM_PRED + 1)
+#define VP9_I32X32_MODES (TM_PRED + 1)
+
+#define VP9_MVREFS (1 + SPLITMV - NEARESTMV)
+
+typedef enum {
+  B_DC_PRED,          /* average of above and left pixels */
+  B_TM_PRED,
+
+  B_VE_PRED,           /* vertical prediction */
+  B_HE_PRED,           /* horizontal prediction */
+
+  B_LD_PRED,
+  B_RD_PRED,
+
+  B_VR_PRED,
+  B_VL_PRED,
+  B_HD_PRED,
+  B_HU_PRED,
+
+  LEFT4X4,
+  ABOVE4X4,
+  ZERO4X4,
+  NEW4X4,
+
+  B_MODE_COUNT
+} B_PREDICTION_MODE;
+
+#define VP9_BINTRAMODES (B_HU_PRED + 1)  /* 10 */
+#define VP9_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
+
+typedef enum {
+  PARTITIONING_16X8 = 0,
+  PARTITIONING_8X16,
+  PARTITIONING_8X8,
+  PARTITIONING_4X4,
+  NB_PARTITIONINGS,
+} SPLITMV_PARTITIONING_TYPE;
+
+/* For keyframes, intra block modes are predicted by the (already decoded)
+   modes for the Y blocks to the left and above us; for interframes, there
+   is a single probability table. */
+
+union b_mode_info {
+  struct {
+    B_PREDICTION_MODE first;
+    TX_TYPE           tx_type;
+
+#if CONFIG_COMP_INTRA_PRED
+    B_PREDICTION_MODE second;
+#endif
+  } as_mode;
+  struct {
+    int_mv first;
+    int_mv second;
+  } as_mv;
+};
+
+typedef enum {
+  INTRA_FRAME = 0,
+  LAST_FRAME = 1,
+  GOLDEN_FRAME = 2,
+  ALTREF_FRAME = 3,
+  MAX_REF_FRAMES = 4
+} MV_REFERENCE_FRAME;
+
+typedef struct {
+  MB_PREDICTION_MODE mode, uv_mode;
+#if CONFIG_COMP_INTRA_PRED
+  MB_PREDICTION_MODE second_mode, second_uv_mode;
+#endif
+  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+  TX_SIZE txfm_size;
+  int_mv mv[2]; // for each reference frame used
+#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
+  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
+#endif
+
+  SPLITMV_PARTITIONING_TYPE partitioning;
+  unsigned char mb_skip_coeff;                                /* does this mb have coefficients at all; 1=no coefficients, 0=need to decode tokens */
+  unsigned char need_to_clamp_mvs;
+  unsigned char need_to_clamp_secondmv;
+  unsigned char segment_id;                  /* Which set of segmentation parameters should be used for this MB */
+
+  // Flags used for prediction status of various bitstream signals
+  unsigned char seg_id_predicted;
+  unsigned char ref_predicted;
+
+  // Indicates if the mb is part of the image (1) vs border (0)
+  // This can be useful in determining whether the MB provides
+  // a valid predictor
+  unsigned char mb_in_image;
+
+#if CONFIG_PRED_FILTER
+  // Flag to turn the prediction signal filter on (1) / off (0) at the MB level
+  unsigned int pred_filter_enabled;
+#endif
+  INTERPOLATIONFILTERTYPE interp_filter;
+
+#if CONFIG_SUPERBLOCKS
+  // FIXME: need an SB array of 4 MB_MODE_INFOs that
+  // only needs one encoded_as_sb.
+  unsigned char encoded_as_sb;
+#endif
+} MB_MODE_INFO;
+
+typedef struct {
+  MB_MODE_INFO mbmi;
+  union b_mode_info bmi[16];
+} MODE_INFO;
+
+typedef struct blockd {
+  short *qcoeff;
+  short *dqcoeff;
+  unsigned char  *predictor;
+  short *diff;
+  short *dequant;
+
+  /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
+  unsigned char **base_pre;
+  unsigned char **base_second_pre;
+  int pre;
+  int pre_stride;
+
+  unsigned char **base_dst;
+  int dst;
+  int dst_stride;
+
+  int eob;
+
+  union b_mode_info bmi;
+} BLOCKD;
+
+typedef struct macroblockd {
+  DECLARE_ALIGNED(16, short, diff[400]);      /* from idct diff */
+  DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
+  DECLARE_ALIGNED(16, short, qcoeff[400]);
+  DECLARE_ALIGNED(16, short, dqcoeff[400]);
+  DECLARE_ALIGNED(16, char,  eobs[25]);
+
+  /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
+  BLOCKD block[25];
+  int fullpixel_mask;
+
+  YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
+  struct {
+    uint8_t *y_buffer, *u_buffer, *v_buffer;
+  } second_pre;
+  YV12_BUFFER_CONFIG dst;
+
+  MODE_INFO *prev_mode_info_context;
+  MODE_INFO *mode_info_context;
+  int mode_info_stride;
+
+  FRAME_TYPE frame_type;
+
+  int up_available;
+  int left_available;
+
+  /* Y,U,V,Y2 */
+  ENTROPY_CONTEXT_PLANES *above_context;
+  ENTROPY_CONTEXT_PLANES *left_context;
+
+  /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
+  unsigned char segmentation_enabled;
+
+  /* 0 (do not update) 1 (update) the macroblock segmentation map. */
+  unsigned char update_mb_segmentation_map;
+
+  /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
+  unsigned char update_mb_segmentation_data;
+
+  /* 0 = segment feature data is delta coded, 1 = absolute values (see SEGMENT_DELTADATA / SEGMENT_ABSDATA). */
+  unsigned char mb_segment_abs_delta;
+
+  /* Per-frame flags that define which MB-level features (such as quantizer or
+     loop filter level) are enabled and, when enabled, the probabilities used
+     to decode the per-MB flags in MB_MODE_INFO */
+
+  // Probability Tree used to code Segment number
+  vp9_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];
+
+#if CONFIG_NEW_MVREF
+  vp9_prob mb_mv_ref_id_probs[MAX_REF_FRAMES][3];
+#endif
+
+  // Segment features
+  signed char segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX];
+  unsigned int segment_feature_mask[MAX_MB_SEGMENTS];
+
+  /* mode_based Loop filter adjustment */
+  unsigned char mode_ref_lf_delta_enabled;
+  unsigned char mode_ref_lf_delta_update;
+
+  /* Delta values have the range +/- MAX_LOOP_FILTER */
+  signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];                /* 0 = Intra, Last, GF, ARF */
+  signed char ref_lf_deltas[MAX_REF_LF_DELTAS];                     /* 0 = Intra, Last, GF, ARF */
+  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];              /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+  signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];                   /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+
+  /* Distance of MB away from frame edges */
+  int mb_to_left_edge;
+  int mb_to_right_edge;
+  int mb_to_top_edge;
+  int mb_to_bottom_edge;
+
+  unsigned int frames_since_golden;
+  unsigned int frames_till_alt_ref_frame;
+  vp9_subpix_fn_t  subpixel_predict;
+  vp9_subpix_fn_t  subpixel_predict8x4;
+  vp9_subpix_fn_t  subpixel_predict8x8;
+  vp9_subpix_fn_t  subpixel_predict16x16;
+  vp9_subpix_fn_t  subpixel_predict_avg;
+  vp9_subpix_fn_t  subpixel_predict_avg8x4;
+  vp9_subpix_fn_t  subpixel_predict_avg8x8;
+  vp9_subpix_fn_t  subpixel_predict_avg16x16;
+  int allow_high_precision_mv;
+
+  int corrupted;
+
+#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
+  /* This is an intermediate buffer currently used in sub-pixel motion search
+   * to keep a copy of the reference area. This buffer can be used for other
+   * purposes.
+   */
+  DECLARE_ALIGNED(32, unsigned char, y_buf[22 * 32]);
+#endif
+
+#if CONFIG_RUNTIME_CPU_DETECT
+  struct VP9_COMMON_RTCD  *rtcd;
+#endif
+
+  int mb_index;   // Index of the MB in the SB (0..3)
+  int q_index;
+
+} MACROBLOCKD;
+
+#define ACTIVE_HT 110                // quantization stepsize threshold
+
+#define ACTIVE_HT8 300
+
+#define ACTIVE_HT16 300
+
+// convert MB_PREDICTION_MODE to B_PREDICTION_MODE
+static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {
+  B_PREDICTION_MODE b_mode = B_DC_PRED;  /* safe default if asserts are compiled out */
+  switch (mode) {
+    case DC_PRED:
+      b_mode = B_DC_PRED;
+      break;
+    case V_PRED:
+      b_mode = B_VE_PRED;
+      break;
+    case H_PRED:
+      b_mode = B_HE_PRED;
+      break;
+    case TM_PRED:
+      b_mode = B_TM_PRED;
+      break;
+    case D45_PRED:
+      b_mode = B_LD_PRED;
+      break;
+    case D135_PRED:
+      b_mode = B_RD_PRED;
+      break;
+    case D117_PRED:
+      b_mode = B_VR_PRED;
+      break;
+    case D153_PRED:
+      b_mode = B_HD_PRED;
+      break;
+    case D27_PRED:
+      b_mode = B_HU_PRED;
+      break;
+    case D63_PRED:
+      b_mode = B_VL_PRED;
+      break;
+    default :
+      // for debugging purposes; to be removed after full testing
+      assert(0);
+      break;
+  }
+  return b_mode;
+}
+
+// transform mapping
+static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) {
+  // map transform type
+  TX_TYPE tx_type;
+  switch (bmode) {
+    case B_TM_PRED :
+    case B_RD_PRED :
+      tx_type = ADST_ADST;
+      break;
+
+    case B_VE_PRED :
+    case B_VR_PRED :
+      tx_type = ADST_DCT;
+      break;
+
+    case B_HE_PRED :
+    case B_HD_PRED :
+    case B_HU_PRED :
+      tx_type = DCT_ADST;
+      break;
+
+    default :
+      tx_type = DCT_DCT;
+      break;
+  }
+  return tx_type;
+}
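+
+/* Illustrative note (not part of the original patch): the mapping above
+ * follows the hybrid-transform rationale that intra residuals tend to grow
+ * with distance from the predicted edge, which the ADST models better than
+ * the DCT: modes predicting from above take ADST vertically (ADST_DCT),
+ * modes predicting from the left take it horizontally (DCT_ADST), and
+ * TM/down-right modes take it both ways. A self-check sketch, behind a
+ * hypothetical guard: */
+#ifdef VP9_TXFM_MAP_SELFTEST
+static void txfm_map_selftest(void) {
+  assert(txfm_map(B_VE_PRED) == ADST_DCT);   /* predicted from above */
+  assert(txfm_map(B_HE_PRED) == DCT_ADST);   /* predicted from the left */
+  assert(txfm_map(B_TM_PRED) == ADST_ADST);  /* predicted from both edges */
+  assert(txfm_map(B_LD_PRED) == DCT_DCT);    /* remaining modes use the DCT */
+}
+#endif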
+
+static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) {
+  TX_TYPE tx_type = DCT_DCT;
+  if (xd->mode_info_context->mbmi.mode == B_PRED &&
+      xd->q_index < ACTIVE_HT) {
+    tx_type = txfm_map(b->bmi.as_mode.first);
+  }
+  return tx_type;
+}
+
+static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) {
+  TX_TYPE tx_type = DCT_DCT;
+  if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
+      xd->q_index < ACTIVE_HT8) {
+    tx_type = txfm_map(pred_mode_conv(b->bmi.as_mode.first));
+  }
+  return tx_type;
+}
+
+static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) {
+  TX_TYPE tx_type = DCT_DCT;
+  if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
+      xd->q_index < ACTIVE_HT16) {
+    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
+  }
+  return tx_type;
+}
+
+static TX_TYPE get_tx_type(const MACROBLOCKD *xd, const BLOCKD *b) {
+  TX_TYPE tx_type = DCT_DCT;
+  int ib = (b - xd->block);
+  if (ib >= 16)
+    return tx_type;
+  if (xd->mode_info_context->mbmi.txfm_size == TX_16X16) {
+    tx_type = get_tx_type_16x16(xd, b);
+  }
+  if (xd->mode_info_context->mbmi.txfm_size  == TX_8X8) {
+    ib = (ib & 8) + ((ib & 4) >> 1);
+    tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
+  }
+  if (xd->mode_info_context->mbmi.txfm_size  == TX_4X4) {
+    tx_type = get_tx_type_4x4(xd, b);
+  }
+  return tx_type;
+}
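+
+/* Illustrative note (not part of the original patch): for TX_8X8, each 8x8
+ * transform's coefficients span four consecutive 4x4 block slots, and the
+ * remap (ib & 8) + ((ib & 4) >> 1) above collapses each run of four onto
+ * the slot that stores that 8x8 block's prediction mode:
+ *   0..3 -> 0,   4..7 -> 2,   8..11 -> 8,   12..15 -> 10.
+ * A self-check sketch, behind a hypothetical guard: */
+#ifdef VP9_TX8X8_REMAP_SELFTEST
+static void tx8x8_remap_selftest(void) {
+  static const int rep[4] = { 0, 2, 8, 10 };
+  int ib;
+  for (ib = 0; ib < 16; ib++)
+    assert(((ib & 8) + ((ib & 4) >> 1)) == rep[ib >> 2]);
+}
+#endif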
+
+extern void vp9_build_block_doffsets(MACROBLOCKD *xd);
+extern void vp9_setup_block_dptrs(MACROBLOCKD *xd);
+
+static void update_blockd_bmi(MACROBLOCKD *xd) {
+  int i;
+  int is_4x4;
+  is_4x4 = (xd->mode_info_context->mbmi.mode == SPLITMV) ||
+           (xd->mode_info_context->mbmi.mode == I8X8_PRED) ||
+           (xd->mode_info_context->mbmi.mode == B_PRED);
+
+  if (is_4x4) {
+    for (i = 0; i < 16; i++) {
+      xd->block[i].bmi = xd->mode_info_context->bmi[i];
+    }
+  }
+}
+#endif  /* __INC_BLOCKD_H */
--- /dev/null
+++ b/vp9/common/coefupdateprobs.h
@@ -1,0 +1,16 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/* Update probabilities for the nodes in the token entropy tree.
+   Generated file included by entropy.c */
+#define COEF_UPDATE_PROB 252
+#define COEF_UPDATE_PROB_8X8 252
+#define COEF_UPDATE_PROB_16X16 252
--- /dev/null
+++ b/vp9/common/common.h
@@ -1,0 +1,41 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef common_h
+#define common_h 1
+
+#include <assert.h>
+#include "vpx_config.h"
+/* Interface header for common constant data structures and lookup tables */
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "common_types.h"
+
+/* Only needed for fixed-size arrays; for structs, just assign. */
+
+#define vp9_copy( Dest, Src) { \
+    assert( sizeof( Dest) == sizeof( Src)); \
+    vpx_memcpy( Dest, Src, sizeof( Src)); \
+  }
+
+/* Use this for variably-sized arrays. */
+
+#define vp9_copy_array( Dest, Src, N) { \
+    assert( sizeof( *Dest) == sizeof( *Src)); \
+    vpx_memcpy( Dest, Src, N * sizeof( *Src)); \
+  }
+
+#define vp9_zero( Dest)  vpx_memset( &Dest, 0, sizeof( Dest));
+
+#define vp9_zero_array( Dest, N)  vpx_memset( Dest, 0, N * sizeof( *Dest));
+
+#endif  /* common_h */
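Because vp9_copy and vp9_zero take sizeof of the destination expression, they
only work when the argument is a genuine array (or struct object); passed a
pointer, the assert in vp9_copy fires because sizeof(pointer) differs from
sizeof(array). vp9_copy_array exists for exactly that case and checks element
sizes instead. A short usage sketch (names hypothetical):

static void copy_macro_examples(void) {
  int dst[16], src[16] = { 0 };
  int *p = dst;

  vp9_copy(dst, src);          /* whole-array copy; total sizes asserted   */
  vp9_zero(dst);               /* zero the entire array                    */
  vp9_copy_array(p, src, 16);  /* pointer is fine here: element size check */
}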
--- /dev/null
+++ b/vp9/common/common_types.h
@@ -1,0 +1,18 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_COMMON_TYPES
+#define __INC_COMMON_TYPES
+
+#define TRUE    1
+#define FALSE   0
+
+#endif
--- /dev/null
+++ b/vp9/common/context.c
@@ -1,0 +1,397 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropy.h"
+
+/* *** GENERATED FILE: DO NOT EDIT *** */
+
+#if 0
+int Contexts[vp8_coef_counter_dimen];
+
+const int default_contexts[vp8_coef_counter_dimen] = {
+  {
+    // Block Type ( 0 )
+    {
+      // Coeff Band ( 0 )
+      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+    },
+    {
+      // Coeff Band ( 1 )
+      {30190, 26544, 225,  24,   4,   0,   0,   0,   0,   0,   0, 4171593},
+      {26846, 25157, 1241, 130,  26,   6,   1,   0,   0,   0,   0, 149987},
+      {10484, 9538, 1006, 160,  36,  18,   0,   0,   0,   0,   0, 15104},
+    },
+    {
+      // Coeff Band ( 2 )
+      {25842, 40456, 1126,  83,  11,   2,   0,   0,   0,   0,   0,   0},
+      {9338, 8010, 512,  73,   7,   3,   2,   0,   0,   0,   0, 43294},
+      {1047, 751, 149,  31,  13,   6,   1,   0,   0,   0,   0, 879},
+    },
+    {
+      // Coeff Band ( 3 )
+      {26136, 9826, 252,  13,   0,   0,   0,   0,   0,   0,   0,   0},
+      {8134, 5574, 191,  14,   2,   0,   0,   0,   0,   0,   0, 35302},
+      { 605, 677, 116,   9,   1,   0,   0,   0,   0,   0,   0, 611},
+    },
+    {
+      // Coeff Band ( 4 )
+      {10263, 15463, 283,  17,   0,   0,   0,   0,   0,   0,   0,   0},
+      {2773, 2191, 128,   9,   2,   2,   0,   0,   0,   0,   0, 10073},
+      { 134, 125,  32,   4,   0,   2,   0,   0,   0,   0,   0,  50},
+    },
+    {
+      // Coeff Band ( 5 )
+      {10483, 2663,  23,   1,   0,   0,   0,   0,   0,   0,   0,   0},
+      {2137, 1251,  27,   1,   1,   0,   0,   0,   0,   0,   0, 14362},
+      { 116, 156,  14,   2,   1,   0,   0,   0,   0,   0,   0, 190},
+    },
+    {
+      // Coeff Band ( 6 )
+      {40977, 27614, 412,  28,   0,   0,   0,   0,   0,   0,   0,   0},
+      {6113, 5213, 261,  22,   3,   0,   0,   0,   0,   0,   0, 26164},
+      { 382, 312,  50,  14,   2,   0,   0,   0,   0,   0,   0, 345},
+    },
+    {
+      // Coeff Band ( 7 )
+      {   0,  26,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {   0,  13,   0,   0,   0,   0,   0,   0,   0,   0,   0, 319},
+      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   8},
+    },
+  },
+  {
+    // Block Type ( 1 )
+    {
+      // Coeff Band ( 0 )
+      {3268, 19382, 1043, 250,  93,  82,  49,  26,  17,   8,  25, 82289},
+      {8758, 32110, 5436, 1832, 827, 668, 420, 153,  24,   0,   3, 52914},
+      {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399,  59,   0,   0, 18620},
+    },
+    {
+      // Coeff Band ( 1 )
+      {12419, 8420, 452,  62,   9,   1,   0,   0,   0,   0,   0,   0},
+      {11715, 8705, 693,  92,  15,   7,   2,   0,   0,   0,   0, 53988},
+      {7603, 8585, 2306, 778, 270, 145,  39,   5,   0,   0,   0, 9136},
+    },
+    {
+      // Coeff Band ( 2 )
+      {15938, 14335, 1207, 184,  55,  13,   4,   1,   0,   0,   0,   0},
+      {7415, 6829, 1138, 244,  71,  26,   7,   0,   0,   0,   0, 9980},
+      {1580, 1824, 655, 241,  89,  46,  10,   2,   0,   0,   0, 429},
+    },
+    {
+      // Coeff Band ( 3 )
+      {19453, 5260, 201,  19,   0,   0,   0,   0,   0,   0,   0,   0},
+      {9173, 3758, 213,  22,   1,   1,   0,   0,   0,   0,   0, 9820},
+      {1689, 1277, 276,  51,  17,   4,   0,   0,   0,   0,   0, 679},
+    },
+    {
+      // Coeff Band ( 4 )
+      {12076, 10667, 620,  85,  19,   9,   5,   0,   0,   0,   0,   0},
+      {4665, 3625, 423,  55,  19,   9,   0,   0,   0,   0,   0, 5127},
+      { 415, 440, 143,  34,  20,   7,   2,   0,   0,   0,   0, 101},
+    },
+    {
+      // Coeff Band ( 5 )
+      {12183, 4846, 115,  11,   1,   0,   0,   0,   0,   0,   0,   0},
+      {4226, 3149, 177,  21,   2,   0,   0,   0,   0,   0,   0, 7157},
+      { 375, 621, 189,  51,  11,   4,   1,   0,   0,   0,   0, 198},
+    },
+    {
+      // Coeff Band ( 6 )
+      {61658, 37743, 1203,  94,  10,   3,   0,   0,   0,   0,   0,   0},
+      {15514, 11563, 903, 111,  14,   5,   0,   0,   0,   0,   0, 25195},
+      { 929, 1077, 291,  78,  14,   7,   1,   0,   0,   0,   0, 507},
+    },
+    {
+      // Coeff Band ( 7 )
+      {   0, 990,  15,   3,   0,   0,   0,   0,   0,   0,   0,   0},
+      {   0, 412,  13,   0,   0,   0,   0,   0,   0,   0,   0, 1641},
+      {   0,  18,   7,   1,   0,   0,   0,   0,   0,   0,   0,  30},
+    },
+  },
+  {
+    // Block Type ( 2 )
+    {
+      // Coeff Band ( 0 )
+      { 953, 24519, 628, 120,  28,  12,   4,   0,   0,   0,   0, 2248798},
+      {1525, 25654, 2647, 617, 239, 143,  42,   5,   0,   0,   0, 66837},
+      {1180, 11011, 3001, 1237, 532, 448, 239,  54,   5,   0,   0, 7122},
+    },
+    {
+      // Coeff Band ( 1 )
+      {1356, 2220,  67,  10,   4,   1,   0,   0,   0,   0,   0,   0},
+      {1450, 2544, 102,  18,   4,   3,   0,   0,   0,   0,   0, 57063},
+      {1182, 2110, 470, 130,  41,  21,   0,   0,   0,   0,   0, 6047},
+    },
+    {
+      // Coeff Band ( 2 )
+      { 370, 3378, 200,  30,   5,   4,   1,   0,   0,   0,   0,   0},
+      { 293, 1006, 131,  29,  11,   0,   0,   0,   0,   0,   0, 5404},
+      { 114, 387,  98,  23,   4,   8,   1,   0,   0,   0,   0, 236},
+    },
+    {
+      // Coeff Band ( 3 )
+      { 579, 194,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      { 395, 213,   5,   1,   0,   0,   0,   0,   0,   0,   0, 4157},
+      { 119, 122,   4,   0,   0,   0,   0,   0,   0,   0,   0, 300},
+    },
+    {
+      // Coeff Band ( 4 )
+      {  38, 557,  19,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {  21, 114,  12,   1,   0,   0,   0,   0,   0,   0,   0, 427},
+      {   0,   5,   0,   0,   0,   0,   0,   0,   0,   0,   0,   7},
+    },
+    {
+      // Coeff Band ( 5 )
+      {  52,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {  18,   6,   0,   0,   0,   0,   0,   0,   0,   0,   0, 652},
+      {   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  30},
+    },
+    {
+      // Coeff Band ( 6 )
+      { 640, 569,  10,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {  25,  77,   2,   0,   0,   0,   0,   0,   0,   0,   0, 517},
+      {   4,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3},
+    },
+    {
+      // Coeff Band ( 7 )
+      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+    },
+  },
+  {
+    // Block Type ( 3 )
+    {
+      // Coeff Band ( 0 )
+      {2506, 20161, 2707, 767, 261, 178, 107,  30,  14,   3,   0, 100694},
+      {8806, 36478, 8817, 3268, 1280, 850, 401, 114,  42,   0,   0, 58572},
+      {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175,  32,   0,   0, 19284},
+    },
+    {
+      // Coeff Band ( 1 )
+      {9738, 11313, 959, 205,  70,  18,  11,   1,   0,   0,   0,   0},
+      {12628, 15085, 1507, 273,  52,  19,   9,   0,   0,   0,   0, 54280},
+      {10701, 15846, 5561, 1926, 813, 570, 249,  36,   0,   0,   0, 6460},
+    },
+    {
+      // Coeff Band ( 2 )
+      {6781, 22539, 2784, 634, 182, 123,  20,   4,   0,   0,   0,   0},
+      {6263, 11544, 2649, 790, 259, 168,  27,   5,   0,   0,   0, 20539},
+      {3109, 4075, 2031, 896, 457, 386, 158,  29,   0,   0,   0, 1138},
+    },
+    {
+      // Coeff Band ( 3 )
+      {11515, 4079, 465,  73,   5,  14,   2,   0,   0,   0,   0,   0},
+      {9361, 5834, 650,  96,  24,   8,   4,   0,   0,   0,   0, 22181},
+      {4343, 3974, 1360, 415, 132,  96,  14,   1,   0,   0,   0, 1267},
+    },
+    {
+      // Coeff Band ( 4 )
+      {4787, 9297, 823, 168,  44,  12,   4,   0,   0,   0,   0,   0},
+      {3619, 4472, 719, 198,  60,  31,   3,   0,   0,   0,   0, 8401},
+      {1157, 1175, 483, 182,  88,  31,   8,   0,   0,   0,   0, 268},
+    },
+    {
+      // Coeff Band ( 5 )
+      {8299, 1226,  32,   5,   1,   0,   0,   0,   0,   0,   0,   0},
+      {3502, 1568,  57,   4,   1,   1,   0,   0,   0,   0,   0, 9811},
+      {1055, 1070, 166,  29,   6,   1,   0,   0,   0,   0,   0, 527},
+    },
+    {
+      // Coeff Band ( 6 )
+      {27414, 27927, 1989, 347,  69,  26,   0,   0,   0,   0,   0,   0},
+      {5876, 10074, 1574, 341,  91,  24,   4,   0,   0,   0,   0, 21954},
+      {1571, 2171, 778, 324, 124,  65,  16,   0,   0,   0,   0, 979},
+    },
+    {
+      // Coeff Band ( 7 )
+      {   0,  29,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
+      {   0,  23,   0,   0,   0,   0,   0,   0,   0,   0,   0, 459},
+      {   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  13},
+    },
+  },
+};
+
+// Update probabilities for the nodes in the token entropy tree.
+const vp9_prob tree_update_probs[vp9_coef_tree_dimen] = {
+  {
+    {
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
+      {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
+      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+  },
+  {
+    {
+      {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
+      {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
+    },
+    {
+      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+  },
+  {
+    {
+      {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
+      {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
+    },
+    {
+      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+  },
+  {
+    {
+      {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
+      {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+    {
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+    },
+  },
+};
+#endif
--- /dev/null
+++ b/vp9/common/debugmodes.c
@@ -1,0 +1,146 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include "blockd.h"
+
+void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
+                                        int frame) {
+  int mb_row;
+  int mb_col;
+  int mb_index = 0;
+  FILE *mvs = fopen("mvs.stt", "a");
+
+  /* print out the macroblock Y modes */
+  mb_index = 0;
+  fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+
+  for (mb_row = 0; mb_row < rows; mb_row++) {
+    for (mb_col = 0; mb_col < cols; mb_col++) {
+
+      fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
+
+      mb_index++;
+    }
+
+    fprintf(mvs, "\n");
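+    /* mode_info rows are (cols + 1) entries wide; skip the border entry */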
+    mb_index++;
+  }
+
+  fprintf(mvs, "\n");
+
+  mb_index = 0;
+  fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
+
+  for (mb_row = 0; mb_row < rows; mb_row++) {
+    for (mb_col = 0; mb_col < cols; mb_col++) {
+
+      fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
+
+      mb_index++;
+    }
+
+    fprintf(mvs, "\n");
+    mb_index++;
+  }
+
+  fprintf(mvs, "\n");
+
+  /* print out the macroblock UV modes */
+  mb_index = 0;
+  fprintf(mvs, "UV Modes for Frame %d\n", frame);
+
+  for (mb_row = 0; mb_row < rows; mb_row++) {
+    for (mb_col = 0; mb_col < cols; mb_col++) {
+
+      fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
+
+      mb_index++;
+    }
+
+    mb_index++;
+    fprintf(mvs, "\n");
+  }
+
+  fprintf(mvs, "\n");
+
+  /* print out the block modes */
+  mb_index = 0;
+  fprintf(mvs, "Mbs for Frame %d\n", frame);
+  {
+    int b_row;
+
+    for (b_row = 0; b_row < 4 * rows; b_row++) {
+      int b_col;
+      int bindex;
+
+      for (b_col = 0; b_col < 4 * cols; b_col++) {
+        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+        bindex = (b_row & 3) * 4 + (b_col & 3);
+
+        if (mi[mb_index].mbmi.mode == B_PRED) {
+          fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.first);
+#if CONFIG_COMP_INTRA_PRED
+          fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.second);
+#endif
+        } else
+          fprintf(mvs, "xx ");
+
+      }
+
+      fprintf(mvs, "\n");
+    }
+  }
+  fprintf(mvs, "\n");
+
+  /* print out the macroblock mvs */
+  mb_index = 0;
+  fprintf(mvs, "MVs for Frame %d\n", frame);
+
+  for (mb_row = 0; mb_row < rows; mb_row++) {
+    for (mb_col = 0; mb_col < cols; mb_col++) {
+      fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv[0].as_mv.row / 2,
+          mi[mb_index].mbmi.mv[0].as_mv.col / 2);
+
+      mb_index++;
+    }
+
+    mb_index++;
+    fprintf(mvs, "\n");
+  }
+
+  fprintf(mvs, "\n");
+
+  /* print out the block mvs */
+  mb_index = 0;
+  fprintf(mvs, "MVs for Frame %d\n", frame);
+  {
+    int b_row;
+
+    for (b_row = 0; b_row < 4 * rows; b_row++) {
+      int b_col;
+      int bindex;
+
+      for (b_col = 0; b_col < 4 * cols; b_col++) {
+        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+        bindex = (b_row & 3) * 4 + (b_col & 3);
+        fprintf(mvs, "%3d:%-3d ",
+                mi[mb_index].bmi[bindex].as_mv.first.as_mv.row,
+                mi[mb_index].bmi[bindex].as_mv.first.as_mv.col);
+
+      }
+
+      fprintf(mvs, "\n");
+    }
+  }
+  fprintf(mvs, "\n");
+
+  fclose(mvs);
+}
--- /dev/null
+++ b/vp9/common/default_coef_probs.h
@@ -1,0 +1,1377 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+*/
+
+
+/*Generated file, included by entropy.c*/
+
+
+static const vp9_prob default_coef_probs [BLOCK_TYPES]
+                                         [COEF_BANDS]
+                                         [PREV_COEF_CONTEXTS]
+                                         [ENTROPY_NODES] = {
+  {
+    /* Block Type ( 0 ) */
+    {
+      /* Coeff Band ( 0 )*/
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 1 )*/
+      { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+      { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+      { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
+      { 90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 2 )*/
+      {   1,  98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+      { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+      {  78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+      {  64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 3 )*/
+      {   1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+      { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+      {  77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+      {  64, 100, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 4 )*/
+      {   1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+      { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+      {  37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
+      {  28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 5 )*/
+      {   1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+      { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+      { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
+      { 90, 90, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 6 )*/
+      {   1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+      { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+      {  80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
+      {  64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 7 )*/
+      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 246,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+    }
+  },
+  {
+    /* Block Type ( 1 ) */
+    {
+      /* Coeff Band ( 0 )*/
+      { 198,  35, 237, 223, 193, 187, 162, 160, 145, 155,  62 },
+      { 131,  45, 198, 221, 172, 176, 220, 157, 252, 221,   1 },
+      {  68,  47, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
+      {  48,  32, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
+    },
+    {
+      /* Coeff Band ( 1 )*/
+      {   1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+      { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+      {  81,  99, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
+      {  66,  90, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 2 )*/
+      {   1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+      {  99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+      {  23,  91, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
+      {  18,  80, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 3 )*/
+      {   1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+      { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+      {  44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
+      {  36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 4 )*/
+      {   1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+      {  94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+      {  22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
+      {  18, 90, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 5 )*/
+      {   1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+      { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+      {  35,  77, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
+      {  28,  70, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 6 )*/
+      {   1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+      { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+      {  45,  99, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
+      {  40,  90, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 7 )*/
+      {   1,   1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+      { 203,   1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
+      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
+    }
+  },
+  {
+    /* Block Type ( 2 ) */
+    {
+      /* Coeff Band ( 0 )*/
+      { 253,   9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+      { 175,  13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+      {  73,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
+      {  64,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
+    },
+    {
+      /* Coeff Band ( 1 )*/
+      {   1,  95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+      { 239,  90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+      { 155,  77, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
+      { 140,  70, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 2 )*/
+      {   1,  24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+      { 201,  51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+      {  69,  46, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
+      {  60,  40, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 3 )*/
+      {   1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+      { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 4 )*/
+      {   1,  16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 190,  36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 5 )*/
+      {   1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 6 )*/
+      {   1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 213,  62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+      {  55,  93, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      {  48,  85, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 7 )*/
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+    }
+  },
+  {
+    /* Block Type ( 3 ) */
+    {
+      /* Coeff Band ( 0 )*/
+      { 202,  24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+      { 126,  38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+      {  63,  48, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
+      {  54,  40, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
+    },
+    {
+      /* Coeff Band ( 1 )*/
+      {   1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+      { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+      {  44,  84, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
+      {  32,  70, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 2 )*/
+      {   1,  52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+      { 124,  74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
+      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 3 )*/
+      {   1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+      { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+      {  28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
+      {  26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 4 )*/
+      {   1,  81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+      { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
+      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 5 )*/
+      {   1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+      { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
+      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 6 )*/
+      {   1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+      { 141,  84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
+      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 7 )*/
+      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 244,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+    }
+  }
+};
+
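+/* Layout note: the coefficient-probability tables in this file are indexed
+ * as [block type][coefficient band][previous-token context][entropy node].
+ * Entries are 8-bit branch probabilities (1..255) for the ENTROPY_NODES
+ * internal nodes of vp9_coef_tree; 128 (an even split) pads node positions
+ * that are effectively unused in a given context. */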
+static const vp9_prob default_hybrid_coef_probs [BLOCK_TYPES]
+                                                [COEF_BANDS]
+                                                [PREV_COEF_CONTEXTS]
+                                                [ENTROPY_NODES] = {
+  {
+    /* Block Type ( 0 ) */
+    {
+      /* Coeff Band ( 0 )*/
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 1 )*/
+      { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+      { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+      { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
+      {  90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 2 )*/
+      {   1,  98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+      { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+      {  78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+      {  64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 3 )*/
+      {   1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+      { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+      {  77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+      {  64, 100, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 4 )*/
+      {   1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+      { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+      {  37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
+      {  28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 5 )*/
+      {   1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+      { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+      { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
+      {  90,  90, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 6 )*/
+      {   1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+      { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+      {  80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
+      {  64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 7 )*/
+      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 246,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+    }
+  },
+  {
+    /* Block Type ( 1 ) */
+    {
+      /* Coeff Band ( 0 )*/
+      { 198,  35, 237, 223, 193, 187, 162, 160, 145, 155,  62 },
+      { 131,  45, 198, 221, 172, 176, 220, 157, 252, 221,   1 },
+      {  68,  47, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
+      {  48,  32, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
+    },
+    {
+      /* Coeff Band ( 1 )*/
+      {   1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+      { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+      {  81,  99, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
+      {  66,  90, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 2 )*/
+      {   1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+      {  99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+      {  23,  91, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
+      {  18,  80, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 3 )*/
+      {   1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+      { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+      {  44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
+      {  36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 4 )*/
+      {   1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+      {  94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+      {  22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
+      {  18,  90, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 5 )*/
+      {   1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+      { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+      {  35,  77, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
+      {  28,  70, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 6 )*/
+      {   1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+      { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+      {  45,  99, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
+      {  40,  90, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 7 )*/
+      {   1,   1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+      { 203,   1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
+      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
+    }
+  },
+  {
+    /* Block Type ( 2 ) */
+    {
+      /* Coeff Band ( 0 )*/
+      { 253,   9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+      { 175,  13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+      {  73,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
+      {  64,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
+    },
+    {
+      /* Coeff Band ( 1 )*/
+      {   1,  95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+      { 239,  90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+      { 155,  77, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
+      { 140,  70, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 2 )*/
+      {   1,  24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+      { 201,  51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+      {  69,  46, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
+      {  60,  40, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 3 )*/
+      {   1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+      { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 4 )*/
+      {   1,  16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 190,  36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 5 )*/
+      {   1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 6 )*/
+      {   1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 213,  62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+      {  55,  93, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      {  48,  85, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 7 )*/
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+    }
+  },
+  {
+    /* Block Type ( 3 ) */
+    {
+      /* Coeff Band ( 0 )*/
+      { 202,  24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+      { 126,  38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+      {  63,  48, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
+      {  54,  40, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
+    },
+    {
+      /* Coeff Band ( 1 )*/
+      {   1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+      { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+      {  44,  84, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
+      {  32,  70, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 2 )*/
+      {   1,  52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+      { 124,  74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
+      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 3 )*/
+      {   1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+      { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+      {  28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
+      {  26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
+    },
+    {
+      /* Coeff Band ( 4 )*/
+      {   1,  81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+      { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
+      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 5 )*/
+      {   1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+      { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
+      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 6 )*/
+      {   1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+      { 141,  84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
+      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
+    },
+    {
+      /* Coeff Band ( 7 )*/
+      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 244,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+    }
+  }
+};
+
+static const vp9_prob
+default_coef_probs_8x8[BLOCK_TYPES_8X8]
+[COEF_BANDS]
+[PREV_COEF_CONTEXTS]
+[ENTROPY_NODES] = {
+  {
+    /* block Type 0 */
+    {
+      /* Coeff Band 0 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 1 */
+      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
+      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 2 */
+      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
+      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 3 */
+      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
+      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 4 */
+      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
+      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 5 */
+      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
+      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 6 */
+      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
+      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 7 */
+      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
+      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
+    }
+  },
+  {
+    /* block Type 1 */
+    {
+      /* Coeff Band 0 */
+      { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128},
+      { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128},
+      { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128},
+      /* The source lists only three rows for this band; the last row is
+         repeated (the pattern used elsewhere in these tables) so that all
+         PREV_COEF_CONTEXTS entries are explicitly initialized. */
+      { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 1 */
+      { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128},
+      { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128},
+      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128},
+      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 2 */
+      { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128},
+      { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128},
+      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128},
+      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 3 */
+      { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128},
+      { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128},
+      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128},
+      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    }
+  },
+  {
+    /* block Type 2 */
+    {
+      /* Coeff Band 0 */
+      { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128},
+      { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128},
+      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128},
+      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 1 */
+      { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128},
+      { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128},
+      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128},
+      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 2 */
+      { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128},
+      { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128},
+      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128},
+      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 3 */
+      { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128},
+      { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128},
+      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128},
+      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 4 */
+      { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128},
+      { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128},
+      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128},
+      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 5 */
+      { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128},
+      { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128},
+      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128},
+      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 6 */
+      { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128},
+      { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128},
+      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128},
+      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 7 */
+      { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128},
+      { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128},
+      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128},
+      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}
+    }
+  },
+  { /* block Type 3 */
+    { /* Coeff Band 0 */
+      { 192, 18, 155, 172, 145, 164, 192, 135, 246, 223, 255},
+      { 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255},
+      { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    { /* Coeff Band 1 */
+      { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128},
+      { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128},
+      { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128},
+      { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128}
+    },
+    { /* Coeff Band 2 */
+      { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128},
+      { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128},
+      { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128},
+      { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128}
+    },
+    { /* Coeff Band 3 */
+      { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128},
+      { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128},
+      { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128},
+      { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128}
+    },
+    { /* Coeff Band 4 */
+      { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128},
+      { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128},
+      { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128},
+      { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128}
+    },
+    { /* Coeff Band 5 */
+      { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128},
+      { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128},
+      { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128},
+      { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128}
+    },
+    { /* Coeff Band 6 */
+      { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128},
+      { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128},
+      { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128},
+      { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128}
+    },
+    { /* Coeff Band 7 */
+      { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128},
+      { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128},
+      { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    }
+  }
+};
+
+static const vp9_prob
+default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]
+                             [COEF_BANDS]
+                             [PREV_COEF_CONTEXTS]
+                             [ENTROPY_NODES] = {
+  {
+    /* block Type 0 */
+    {
+      /* Coeff Band 0 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 1 */
+      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
+      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 2 */
+      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
+      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 3 */
+      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
+      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 4 */
+      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
+      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 5 */
+      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
+      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 6 */
+      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
+      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 7 */
+      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
+      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
+    }
+  },
+  {
+    /* block Type 1 */
+    {
+      /* Coeff Band 0 */
+      { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128},
+      { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128},
+      { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128},
+      /* As in the non-hybrid 8x8 table above, the source lists only three
+         rows here; the last row is repeated so that all PREV_COEF_CONTEXTS
+         entries are explicitly initialized. */
+      { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 1 */
+      { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128},
+      { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128},
+      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128},
+      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 2 */
+      { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128},
+      { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128},
+      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128},
+      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 3 */
+      { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128},
+      { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128},
+      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128},
+      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    }
+  },
+  {
+    /* block Type 2 */
+    {
+      /* Coeff Band 0 */
+      { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128},
+      { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128},
+      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128},
+      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 1 */
+      { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128},
+      { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128},
+      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128},
+      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 2 */
+      { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128},
+      { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128},
+      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128},
+      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 3 */
+      { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128},
+      { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128},
+      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128},
+      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 4 */
+      { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128},
+      { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128},
+      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128},
+      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}
+    },
+    {
+      /* Coeff Band 5 */
+      { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128},
+      { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128},
+      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128},
+      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 6 */
+      { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128},
+      { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128},
+      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128},
+      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}
+    },
+    {
+      /* Coeff Band 7 */
+      { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128},
+      { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128},
+      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128},
+      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}
+    }
+  },
+  { /* block Type 3 */
+    { /* Coeff Band 0 */
+      { 192, 18, 155, 172, 145, 164, 192, 135, 246, 223, 255},
+      { 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255},
+      { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    { /* Coeff Band 1 */
+      { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128},
+      { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128},
+      { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128},
+      { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128}
+    },
+    { /* Coeff Band 2 */
+      { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128},
+      { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128},
+      { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128},
+      { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128}
+    },
+    { /* Coeff Band 3 */
+      { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128},
+      { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128},
+      { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128},
+      { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128}
+    },
+    { /* Coeff Band 4 */
+      { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128},
+      { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128},
+      { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128},
+      { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128}
+    },
+    { /* Coeff Band 5 */
+      { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128},
+      { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128},
+      { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128},
+      { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128}
+    },
+    { /* Coeff Band 6 */
+      { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128},
+      { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128},
+      { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128},
+      { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128}
+    },
+    { /* Coeff Band 7 */
+      { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128},
+      { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128},
+      { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    }
+  }
+};
+
+static const vp9_prob
+  default_coef_probs_16x16[BLOCK_TYPES_16X16]
+                          [COEF_BANDS]
+                          [PREV_COEF_CONTEXTS]
+                          [ENTROPY_NODES] = {
+  { /* block Type 0 */
+    { /* Coeff Band 0 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    { /* Coeff Band 1 */
+      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
+      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
+    },
+    { /* Coeff Band 2 */
+      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
+      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
+    },
+    { /* Coeff Band 3 */
+      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
+      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
+    },
+    { /* Coeff Band 4 */
+      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
+      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
+    },
+    { /* Coeff Band 5 */
+      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
+      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+    },
+    { /* Coeff Band 6 */
+      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
+      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+    },
+    { /* Coeff Band 7 */
+      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
+      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
+    }
+  },
+  { /* block Type 1 */
+      { /* Coeff Band 0 */
+        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
+        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
+        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
+        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+      },
+      { /* Coeff Band 1 */
+        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
+        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
+        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
+        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
+      },
+      { /* Coeff Band 2 */
+        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
+        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
+        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
+        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
+      },
+      { /* Coeff Band 3 */
+        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
+        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
+        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
+        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
+      },
+      { /* Coeff Band 4 */
+        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
+        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
+        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
+        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
+      },
+      { /* Coeff Band 5 */
+        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
+        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
+        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
+        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
+      },
+      { /* Coeff Band 6 */
+        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
+        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
+        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
+        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
+      },
+      { /* Coeff Band 7 */
+        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
+        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
+        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
+        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
+      }
+  },
+  { /* block Type 2 */
+      { /* Coeff Band 0 */
+        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
+        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
+        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
+        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+      },
+      { /* Coeff Band 1 */
+        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
+        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
+        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
+        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
+      },
+      { /* Coeff Band 2 */
+        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
+        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
+        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
+        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
+      },
+      { /* Coeff Band 3 */
+        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
+        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
+        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
+        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
+      },
+      { /* Coeff Band 4 */
+        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
+        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
+        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
+        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
+      },
+      { /* Coeff Band 5 */
+        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
+        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
+        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
+        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
+      },
+      { /* Coeff Band 6 */
+        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
+        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
+        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
+        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
+      },
+      { /* Coeff Band 7 */
+        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
+        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
+        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
+        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
+      }
+  },
+  { /* block Type 3 */
+    { /* Coeff Band 0 */
+      { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184},
+      { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200},
+      { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    { /* Coeff Band 1 */
+      { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128},
+      { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128},
+      { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255},
+      { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255}
+    },
+    { /* Coeff Band 2 */
+      { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128},
+      { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255},
+      { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255},
+      { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255}
+    },
+    { /* Coeff Band 3 */
+      { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128},
+      { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128},
+      { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128},
+      { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255}
+    },
+    { /* Coeff Band 4 */
+      { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128},
+      { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128},
+      { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128},
+      { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128}
+    },
+    { /* Coeff Band 5 */
+      { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128},
+      { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128},
+      { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128},
+      { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128}
+    },
+    { /* Coeff Band 6 */
+      { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128},
+      { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128},
+      { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128},
+      { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255}
+    },
+    { /* Coeff Band 7 */
+      { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128},
+      { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128},
+      { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128},
+      { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128}
+    }
+  }
+};
+
+static const vp9_prob
+  default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]
+                                 [COEF_BANDS]
+                                 [PREV_COEF_CONTEXTS]
+                                 [ENTROPY_NODES] = {
+  { /* block Type 0 */
+    { /* Coeff Band 0 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    { /* Coeff Band 1 */
+      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
+      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
+    },
+    { /* Coeff Band 2 */
+      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
+      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
+    },
+    { /* Coeff Band 3 */
+      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
+      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
+    },
+    { /* Coeff Band 4 */
+      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
+      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
+    },
+    { /* Coeff Band 5 */
+      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
+      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+    },
+    { /* Coeff Band 6 */
+      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
+      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
+    },
+    { /* Coeff Band 7 */
+      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
+      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
+    }
+  },
+  { /* block Type 1 */
+      { /* Coeff Band 0 */
+        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
+        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
+        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
+        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+      },
+      { /* Coeff Band 1 */
+        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
+        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
+        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
+        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
+      },
+      { /* Coeff Band 2 */
+        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
+        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
+        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
+        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
+      },
+      { /* Coeff Band 3 */
+        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
+        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
+        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
+        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
+      },
+      { /* Coeff Band 4 */
+        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
+        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
+        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
+        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
+      },
+      { /* Coeff Band 5 */
+        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
+        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
+        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
+        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
+      },
+      { /* Coeff Band 6 */
+        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
+        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
+        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
+        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
+      },
+      { /* Coeff Band 7 */
+        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
+        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
+        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
+        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
+      }
+  },
+  { /* block Type 2 */
+      { /* Coeff Band 0 */
+        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
+        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
+        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
+        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+      },
+      { /* Coeff Band 1 */
+        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
+        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
+        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
+        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
+      },
+      { /* Coeff Band 2 */
+        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
+        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
+        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
+        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
+      },
+      { /* Coeff Band 3 */
+        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
+        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
+        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
+        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
+      },
+      { /* Coeff Band 4 */
+        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
+        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
+        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
+        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
+      },
+      { /* Coeff Band 5 */
+        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
+        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
+        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
+        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
+      },
+      { /* Coeff Band 6 */
+        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
+        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
+        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
+        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
+      },
+      { /* Coeff Band 7 */
+        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
+        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
+        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
+        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
+      }
+  },
+  { /* block Type 3 */
+    { /* Coeff Band 0 */
+      { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184},
+      { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200},
+      { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247},
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+    },
+    { /* Coeff Band 1 */
+      { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128},
+      { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128},
+      { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255},
+      { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255}
+    },
+    { /* Coeff Band 2 */
+      { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128},
+      { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255},
+      { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255},
+      { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255}
+    },
+    { /* Coeff Band 3 */
+      { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128},
+      { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128},
+      { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128},
+      { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255}
+    },
+    { /* Coeff Band 4 */
+      { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128},
+      { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128},
+      { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128},
+      { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128}
+    },
+    { /* Coeff Band 5 */
+      { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128},
+      { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128},
+      { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128},
+      { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128}
+    },
+    { /* Coeff Band 6 */
+      { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128},
+      { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128},
+      { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128},
+      { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255}
+    },
+    { /* Coeff Band 7 */
+      { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128},
+      { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128},
+      { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128},
+      { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128}
+    }
+  }
+};
--- /dev/null
+++ b/vp9/common/entropy.c
@@ -1,0 +1,447 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+
+#include "entropy.h"
+#include "string.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+#include "entropymode.h"
+#include "vpx_mem/vpx_mem.h"
+
+#define uchar unsigned char     /* typedefs can clash */
+#define uint  unsigned int
+
+typedef const uchar cuchar;
+typedef const uint cuint;
+
+typedef vp9_prob Prob;
+
+#include "coefupdateprobs.h"
+
+const int vp9_i8x8_block[4] = {0, 2, 8, 10};
+
+DECLARE_ALIGNED(16, const unsigned char, vp9_norm[256]) = {
+  0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
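+/* vp9_norm[x] is the left shift that brings x (the bool coder's 8-bit
+ * range) back into [128, 255] during renormalization, i.e. the number of
+ * leading zero bits of x; e.g. vp9_norm[1] == 7 since 1 << 7 == 128. */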
+
+DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]) = {
+  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7
+};
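+/* Maps each of the 16 scan positions of a 4x4 block to one of the
+ * COEF_BANDS probability bands, so coefficients early in the scan get
+ * their own statistics while the tail shares a band. */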
+
+DECLARE_ALIGNED(16, cuchar, vp9_prev_token_class[MAX_ENTROPY_TOKENS]) = {
+  0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0
+};
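+/* Maps the previously decoded token to one of the PREV_COEF_CONTEXTS
+ * contexts (0..3): ZERO -> 0, ONE -> 1, TWO/THREE -> 2, FOUR and the
+ * category tokens -> 3, EOB -> 0. */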
+
+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]) = {
+  0,  1,  4,  8,
+  5,  2,  3,  6,
+  9, 12, 13, 10,
+  7, 11, 14, 15,
+};
+
+DECLARE_ALIGNED(16, const int, vp9_col_scan[16]) = {
+  0, 4,  8, 12,
+  1, 5,  9, 13,
+  2, 6, 10, 14,
+  3, 7, 11, 15
+};
+DECLARE_ALIGNED(16, const int, vp9_row_scan[16]) = {
+  0,   1,  2,  3,
+  4,   5,  6,  7,
+  8,   9, 10, 11,
+  12, 13, 14, 15
+};
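+/* Scan orders for 4x4 blocks: the default zig-zag is used with the 2-D
+ * DCT, while the column and row scans serve the hybrid (ADST) transforms,
+ * whose energy concentrates along one dimension. */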
+
+
+DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]) = {
+  0, 1, 2, 3, 5, 4, 4, 5,
+  5, 3, 6, 3, 5, 4, 6, 6,
+  6, 5, 5, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7
+};
+DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = {
+  0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
+  12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
+  35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+  58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
+};
+
+// Table can be optimized.
+DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]) = {
+    0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
+    6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+};
+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {
+      0,   1,  16,  32,  17,   2,   3,  18,  33,  48,  64,  49,  34,  19,   4,   5,
+     20,  35,  50,  65,  80,  96,  81,  66,  51,  36,  21,   6,   7,  22,  37,  52,
+     67,  82,  97, 112, 128, 113,  98,  83,  68,  53,  38,  23,   8,   9,  24,  39,
+     54,  69,  84,  99, 114, 129, 144, 160, 145, 130, 115, 100,  85,  70,  55,  40,
+     25,  10,  11,  26,  41,  56,  71,  86, 101, 116, 131, 146, 161, 176, 192, 177,
+    162, 147, 132, 117, 102,  87,  72,  57,  42,  27,  12,  13,  28,  43,  58,  73,
+     88, 103, 118, 133, 148, 163, 178, 193, 208, 224, 209, 194, 179, 164, 149, 134,
+    119, 104,  89,  74,  59,  44,  29,  14,  15,  30,  45,  60,  75,  90, 105, 120,
+    135, 150, 165, 180, 195, 210, 225, 240, 241, 226, 211, 196, 181, 166, 151, 136,
+    121, 106,  91,  76,  61,  46,  31,  47,  62,  77,  92, 107, 122, 137, 152, 167,
+    182, 197, 212, 227, 242, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108,  93,
+     78,  63,  79,  94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230,
+    215, 200, 185, 170, 155, 140, 125, 110,  95, 111, 126, 141, 156, 171, 186, 201,
+    216, 231, 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188,
+    203, 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
+    250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255,
+};
+
+
+/* Array indices are identical to previously-existing CONTEXT_NODE indices */
+
+const vp9_tree_index vp9_coef_tree[22] = {    /* corresponding _CONTEXT_NODEs */
+  -DCT_EOB_TOKEN, 2,                          /* 0 = EOB */
+  -ZERO_TOKEN, 4,                             /* 1 = ZERO */
+  -ONE_TOKEN, 6,                              /* 2 = ONE */
+  8, 12,                                      /* 3 = LOW_VAL */
+  -TWO_TOKEN, 10,                             /* 4 = TWO */
+  -THREE_TOKEN, -FOUR_TOKEN,                  /* 5 = THREE */
+  14, 16,                                     /* 6 = HIGH_LOW */
+  -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,     /* 7 = CAT_ONE */
+  18, 20,                                     /* 8 = CAT_THREEFOUR */
+  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,     /* 9 = CAT_THREE */
+  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6      /* 10 = CAT_FIVE */
+};
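+/* Reading a token walks the tree two entries at a time: a non-negative
+ * entry is the index of the next node pair, a negative entry is the
+ * negated token.  A minimal decode sketch (illustrative only; the reader
+ * type and vp9_read() are assumed names, not defined in this file):
+ *
+ *   int read_token(vp9_reader *r, const vp9_prob p[ENTROPY_NODES]) {
+ *     vp9_tree_index i = 0;
+ *     while ((i = vp9_coef_tree[i + vp9_read(r, p[i >> 1])]) > 0)
+ *       continue;
+ *     return -i;   // leaves store the negated token value
+ *   }
+ */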
+
+struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS];
+
+/* Trees for extra bits.  Probabilities are constant and
+   do not depend on previously encoded bits */
+
+static const Prob Pcat1[] = { 159};
+static const Prob Pcat2[] = { 165, 145};
+static const Prob Pcat3[] = { 173, 148, 140};
+static const Prob Pcat4[] = { 176, 155, 140, 135};
+static const Prob Pcat5[] = { 180, 157, 141, 134, 130};
+static const Prob Pcat6[] =
+{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129};
+
+static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[26];
+
+static void init_bit_tree(vp9_tree_index *p, int n) {
+  int i = 0;
+
+  while (++i < n) {
+    p[0] = p[1] = i << 1;
+    p += 2;
+  }
+
+  p[0] = p[1] = 0;
+}
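+/* init_bit_tree() builds a degenerate tree of n levels in which both
+ * children of every node lead to the next level, so exactly n
+ * probability-coded bits are consumed regardless of their values.
+ * For example, init_bit_tree(cat2, 2) produces { 2, 2, 0, 0 }. */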
+
+static void init_bit_trees(void) {
+  init_bit_tree(cat1, 1);
+  init_bit_tree(cat2, 2);
+  init_bit_tree(cat3, 3);
+  init_bit_tree(cat4, 4);
+  init_bit_tree(cat5, 5);
+  init_bit_tree(cat6, 13);
+}
+
+vp9_extra_bit_struct vp9_extra_bits[12] = {
+  { 0, 0, 0, 0},
+  { 0, 0, 0, 1},
+  { 0, 0, 0, 2},
+  { 0, 0, 0, 3},
+  { 0, 0, 0, 4},
+  { cat1, Pcat1, 1, 5},
+  { cat2, Pcat2, 2, 7},
+  { cat3, Pcat3, 3, 11},
+  { cat4, Pcat4, 4, 19},
+  { cat5, Pcat5, 5, 35},
+  { cat6, Pcat6, 13, 67},
+  { 0, 0, 0, 0}
+};
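+
+/* A token plus its extra bits determines the coefficient magnitude:
+   base_val plus the Len extra bits read with the Pcat probabilities,
+   followed by a sign bit.  For example, DCT_VAL_CATEGORY3 has base_val
+   11 and 3 extra bits, covering the magnitudes 11..18 noted for that
+   token in entropy.h. */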
+
+#include "default_coef_probs.h"
+
+void vp9_default_coef_probs(VP9_COMMON *pc) {
+  vpx_memcpy(pc->fc.coef_probs, default_coef_probs,
+             sizeof(pc->fc.coef_probs));
+  vpx_memcpy(pc->fc.hybrid_coef_probs, default_hybrid_coef_probs,
+             sizeof(pc->fc.hybrid_coef_probs));
+
+  vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8,
+             sizeof(pc->fc.coef_probs_8x8));
+  vpx_memcpy(pc->fc.hybrid_coef_probs_8x8, default_hybrid_coef_probs_8x8,
+             sizeof(pc->fc.hybrid_coef_probs_8x8));
+
+  vpx_memcpy(pc->fc.coef_probs_16x16, default_coef_probs_16x16,
+             sizeof(pc->fc.coef_probs_16x16));
+  vpx_memcpy(pc->fc.hybrid_coef_probs_16x16,
+             default_hybrid_coef_probs_16x16,
+             sizeof(pc->fc.hybrid_coef_probs_16x16));
+}
+
+void vp9_coef_tree_initialize(void) {
+  init_bit_trees();
+  vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
+}
+
+// #define COEF_COUNT_TESTING
+
+#define COEF_COUNT_SAT 24
+#define COEF_MAX_UPDATE_FACTOR 112
+#define COEF_COUNT_SAT_KEY 24
+#define COEF_MAX_UPDATE_FACTOR_KEY 112
+#define COEF_COUNT_SAT_AFTER_KEY 24
+#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
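+
+/* The adaptation below blends the previous probability with the one
+   implied by this frame's counts, weighted by how well-observed the
+   branch is: factor = update_factor * min(count, count_sat) / count_sat
+   and prob = (pre_prob * (256 - factor) + new_prob * factor + 128) >> 8,
+   clamped to [1, 255].  For example, pre_prob 128, new_prob 192,
+   count 12, count_sat 24 and update_factor 112 give factor 56 and an
+   adapted probability of (128 * 200 + 192 * 56 + 128) >> 8 = 142. */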
+
+void vp9_adapt_coef_probs(VP9_COMMON *cm) {
+  int t, i, j, k, count;
+  unsigned int branch_ct[ENTROPY_NODES][2];
+  vp9_prob coef_probs[ENTROPY_NODES];
+  int update_factor; /* denominator 256 */
+  int factor;
+  int count_sat;
+
+  // printf("Frame type: %d\n", cm->frame_type);
+  if (cm->frame_type == KEY_FRAME) {
+    update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
+    count_sat = COEF_COUNT_SAT_KEY;
+  } else if (cm->last_frame_type == KEY_FRAME) {
+    update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY;  /* adapt quickly */
+    count_sat = COEF_COUNT_SAT_AFTER_KEY;
+  } else {
+    update_factor = COEF_MAX_UPDATE_FACTOR;
+    count_sat = COEF_COUNT_SAT;
+  }
+
+#ifdef COEF_COUNT_TESTING
+  {
+    printf("static const unsigned int\ncoef_counts"
+           "[BLOCK_TYPES] [COEF_BANDS]"
+           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
+    for (i = 0; i < BLOCK_TYPES; ++i) {
+      printf("  {\n");
+      for (j = 0; j < COEF_BANDS; ++j) {
+        printf("    {\n");
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+          printf("      {");
+          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+            printf("%d, ", cm->fc.coef_counts[i][j][k][t]);
+          printf("},\n");
+        }
+        printf("    },\n");
+      }
+      printf("  },\n");
+    }
+    printf("};\n");
+    printf("static const unsigned int\ncoef_counts_8x8"
+           "[BLOCK_TYPES_8X8] [COEF_BANDS]"
+           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
+    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
+      printf("  {\n");
+      for (j = 0; j < COEF_BANDS; ++j) {
+        printf("    {\n");
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+          printf("      {");
+          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+            printf("%d, ", cm->fc.coef_counts_8x8[i][j][k][t]);
+          printf("},\n");
+        }
+        printf("    },\n");
+      }
+      printf("  },\n");
+    }
+    printf("};\n");
+    printf("static const unsigned int\nhybrid_coef_counts"
+           "[BLOCK_TYPES] [COEF_BANDS]"
+           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");
+    for (i = 0; i < BLOCK_TYPES; ++i) {
+      printf("  {\n");
+      for (j = 0; j < COEF_BANDS; ++j) {
+        printf("    {\n");
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+          printf("      {");
+          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+            printf("%d, ", cm->fc.hybrid_coef_counts[i][j][k][t]);
+          printf("},\n");
+        }
+        printf("    },\n");
+      }
+      printf("  },\n");
+    }
+    printf("};\n");
+  }
+#endif
+
+  for (i = 0; i < BLOCK_TYPES; ++i)
+    for (j = 0; j < COEF_BANDS; ++j)
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        /* Contexts 3 and above cannot occur in the first coded band,
+           where the context is the 0-2 count of nonzero neighboring
+           blocks (see entropy.h); the same check recurs below. */
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, cm->fc.coef_counts [i][j][k],
+          256, 1);
+        for (t = 0; t < ENTROPY_NODES; ++t) {
+          int prob;
+          count = branch_ct[t][0] + branch_ct[t][1];
+          count = count > count_sat ? count_sat : count;
+          factor = (update_factor * count / count_sat);
+          prob = ((int)cm->fc.pre_coef_probs[i][j][k][t] * (256 - factor) +
+                  (int)coef_probs[t] * factor + 128) >> 8;
+          if (prob <= 0) cm->fc.coef_probs[i][j][k][t] = 1;
+          else if (prob > 255) cm->fc.coef_probs[i][j][k][t] = 255;
+          else cm->fc.coef_probs[i][j][k][t] = prob;
+        }
+      }
+
+  for (i = 0; i < BLOCK_TYPES; ++i)
+    for (j = 0; j < COEF_BANDS; ++j)
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, cm->fc.hybrid_coef_counts [i][j][k],
+          256, 1);
+        for (t = 0; t < ENTROPY_NODES; ++t) {
+          int prob;
+          count = branch_ct[t][0] + branch_ct[t][1];
+          count = count > count_sat ? count_sat : count;
+          factor = (update_factor * count / count_sat);
+          prob = ((int)cm->fc.pre_hybrid_coef_probs[i][j][k][t] * (256 - factor) +
+                  (int)coef_probs[t] * factor + 128) >> 8;
+          if (prob <= 0) cm->fc.hybrid_coef_probs[i][j][k][t] = 1;
+          else if (prob > 255) cm->fc.hybrid_coef_probs[i][j][k][t] = 255;
+          else cm->fc.hybrid_coef_probs[i][j][k][t] = prob;
+        }
+      }
+
+  for (i = 0; i < BLOCK_TYPES_8X8; ++i)
+    for (j = 0; j < COEF_BANDS; ++j)
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, cm->fc.coef_counts_8x8 [i][j][k],
+          256, 1);
+        for (t = 0; t < ENTROPY_NODES; ++t) {
+          int prob;
+          count = branch_ct[t][0] + branch_ct[t][1];
+          count = count > count_sat ? count_sat : count;
+          factor = (update_factor * count / count_sat);
+          prob = ((int)cm->fc.pre_coef_probs_8x8[i][j][k][t] * (256 - factor) +
+                  (int)coef_probs[t] * factor + 128) >> 8;
+          if (prob <= 0) cm->fc.coef_probs_8x8[i][j][k][t] = 1;
+          else if (prob > 255) cm->fc.coef_probs_8x8[i][j][k][t] = 255;
+          else cm->fc.coef_probs_8x8[i][j][k][t] = prob;
+        }
+      }
+
+  for (i = 0; i < BLOCK_TYPES_8X8; ++i)
+    for (j = 0; j < COEF_BANDS; ++j)
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, cm->fc.hybrid_coef_counts_8x8 [i][j][k],
+          256, 1);
+        for (t = 0; t < ENTROPY_NODES; ++t) {
+          int prob;
+          count = branch_ct[t][0] + branch_ct[t][1];
+          count = count > count_sat ? count_sat : count;
+          factor = (update_factor * count / count_sat);
+          prob = ((int)cm->fc.pre_hybrid_coef_probs_8x8[i][j][k][t] *
+                  (256 - factor) +
+                  (int)coef_probs[t] * factor + 128) >> 8;
+          if (prob <= 0) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 1;
+          else if (prob > 255) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 255;
+          else cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = prob;
+        }
+      }
+
+  for (i = 0; i < BLOCK_TYPES_16X16; ++i)
+    for (j = 0; j < COEF_BANDS; ++j)
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, cm->fc.coef_counts_16x16[i][j][k], 256, 1);
+        for (t = 0; t < ENTROPY_NODES; ++t) {
+          int prob;
+          count = branch_ct[t][0] + branch_ct[t][1];
+          count = count > count_sat ? count_sat : count;
+          factor = (update_factor * count / count_sat);
+          prob = ((int)cm->fc.pre_coef_probs_16x16[i][j][k][t] *
+                  (256 - factor) +
+                  (int)coef_probs[t] * factor + 128) >> 8;
+          if (prob <= 0) cm->fc.coef_probs_16x16[i][j][k][t] = 1;
+          else if (prob > 255) cm->fc.coef_probs_16x16[i][j][k][t] = 255;
+          else cm->fc.coef_probs_16x16[i][j][k][t] = prob;
+        }
+      }
+
+  for (i = 0; i < BLOCK_TYPES_16X16; ++i)
+    for (j = 0; j < COEF_BANDS; ++j)
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, cm->fc.hybrid_coef_counts_16x16[i][j][k], 256, 1);
+        for (t = 0; t < ENTROPY_NODES; ++t) {
+          int prob;
+          count = branch_ct[t][0] + branch_ct[t][1];
+          count = count > count_sat ? count_sat : count;
+          factor = (update_factor * count / count_sat);
+          prob = ((int)cm->fc.pre_hybrid_coef_probs_16x16[i][j][k][t] * (256 - factor) +
+                  (int)coef_probs[t] * factor + 128) >> 8;
+          if (prob <= 0) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 1;
+          else if (prob > 255) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 255;
+          else cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = prob;
+        }
+      }
+}
--- /dev/null
+++ b/vp9/common/entropy.h
@@ -1,0 +1,112 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPY_H
+#define __INC_ENTROPY_H
+
+#include "treecoder.h"
+#include "blockd.h"
+#include "common.h"
+#include "coefupdateprobs.h"
+
+extern const int vp9_i8x8_block[4];
+
+/* Coefficient token alphabet */
+
+#define ZERO_TOKEN              0       /* 0         Extra Bits 0+0 */
+#define ONE_TOKEN               1       /* 1         Extra Bits 0+1 */
+#define TWO_TOKEN               2       /* 2         Extra Bits 0+1 */
+#define THREE_TOKEN             3       /* 3         Extra Bits 0+1 */
+#define FOUR_TOKEN              4       /* 4         Extra Bits 0+1 */
+#define DCT_VAL_CATEGORY1       5       /* 5-6       Extra Bits 1+1 */
+#define DCT_VAL_CATEGORY2       6       /* 7-10      Extra Bits 2+1 */
+#define DCT_VAL_CATEGORY3       7       /* 11-18     Extra Bits 3+1 */
+#define DCT_VAL_CATEGORY4       8       /* 19-34     Extra Bits 4+1 */
+#define DCT_VAL_CATEGORY5       9       /* 35-66     Extra Bits 5+1 */
+#define DCT_VAL_CATEGORY6       10      /* 67+       Extra Bits 13+1 */
+#define DCT_EOB_TOKEN           11      /* EOB       Extra Bits 0+0 */
+#define MAX_ENTROPY_TOKENS 12
+#define ENTROPY_NODES 11
+#define EOSB_TOKEN              127     /* Not signalled, encoder only */
+
+extern const vp9_tree_index vp9_coef_tree[];
+
+extern struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS];
+
+typedef struct {
+  vp9_tree_p tree;
+  const vp9_prob *prob;
+  int Len;
+  int base_val;
+} vp9_extra_bit_struct;
+
+extern vp9_extra_bit_struct vp9_extra_bits[12];    /* indexed by token value */
+
+#define PROB_UPDATE_BASELINE_COST   7
+
+#define MAX_PROB                255
+#define DCT_MAX_VALUE           8192
+
+/* Coefficients are predicted via a 3-dimensional probability table. */
+
+/* Outside dimension.  0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
+#define BLOCK_TYPES 4
+
+#define BLOCK_TYPES_8X8 4
+
+#define BLOCK_TYPES_16X16 4
+
+/* Middle dimension is a coarsening of the coefficient's
+   position within the 4x4 DCT. */
+
+#define COEF_BANDS 8
+extern DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]);
+extern DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]);
+extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]);
+
+/* Inside dimension is a 3-valued measure of nearby complexity, that is,
+   the extent to which nearby coefficients are nonzero.  For the first
+   coefficient (DC, unless block type is 0), we look at the (already
+   encoded) blocks above and to the left of the current block.  The
+   context index is then the number (0, 1, or 2) of these blocks having
+   nonzero coefficients.  After decoding a coefficient, the measure is
+   roughly the magnitude of the most recently decoded coefficient
+   (0 for 0, 1 for 1, 2 for >1).  Note that the intuitive meaning of
+   this measure changes as coefficients are decoded: prior to the first
+   token, a zero means the neighboring blocks are empty, while after the
+   first token, because of the use of end-of-block, a zero means we just
+   decoded a zero and hence guarantees that a nonzero coefficient will
+   appear later in this block.  This shift in meaning is harmless,
+   because the context also depends on the coefficient band, and zigzag
+   positions 0, 1, and 2 fall in distinct bands. */
+
+/*# define DC_TOKEN_CONTEXTS        3*/ /* 00, 0!0, !0!0 */
+#define PREV_COEF_CONTEXTS       4
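+
+/* For example, if the blocks above and to the left of the current block
+   both contain nonzero coefficients, the first token is decoded with
+   context 2; after a coefficient of magnitude 3 is decoded, the next
+   token again uses context 2, since any magnitude above one maps to 2. */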
+
+#define SUBEXP_PARAM                4   /* Subexponential code parameter */
+#define MODULUS_PARAM               13  /* Modulus parameter */
+
+extern DECLARE_ALIGNED(16, const unsigned char, vp9_prev_token_class[MAX_ENTROPY_TOKENS]);
+
+struct VP9Common;
+void vp9_default_coef_probs(struct VP9Common *);
+extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]);
+
+extern DECLARE_ALIGNED(16, const int, vp9_col_scan[16]);
+extern DECLARE_ALIGNED(16, const int, vp9_row_scan[16]);
+
+extern DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]);
+void vp9_coef_tree_initialize(void);
+
+extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]);
+void vp9_adapt_coef_probs(struct VP9Common *);
+
+#endif
--- /dev/null
+++ b/vp9/common/entropymode.c
@@ -1,0 +1,614 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyxc_int.h"
+#include "modecont.h"
+#include "vpx_mem/vpx_mem.h"
+
+
+static const unsigned int kf_y_mode_cts[8][VP9_YMODES] = {
+  /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 BPRED */
+  {12,  6,  5,  5,  5,  5,  5,  5,  5,  2, 22, 200},
+  {25, 13, 13,  7,  7,  7,  7,  7,  7,  6, 27, 160},
+  {31, 17, 18,  8,  8,  8,  8,  8,  8,  9, 26, 139},
+  {40, 22, 23,  8,  8,  8,  8,  8,  8, 12, 27, 116},
+  {53, 26, 28,  8,  8,  8,  8,  8,  8, 13, 26,  94},
+  {68, 33, 35,  8,  8,  8,  8,  8,  8, 17, 20,  68},
+  {78, 38, 38,  8,  8,  8,  8,  8,  8, 19, 16,  52},
+  {89, 42, 42,  8,  8,  8,  8,  8,  8, 21, 12,  34},
+};
+
+static const unsigned int y_mode_cts  [VP9_YMODES] = {
+  /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 BPRED */
+  98, 19, 15, 14, 14, 14, 14, 12, 12, 13, 16, 70
+};
+
+static const unsigned int uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
+  /* DC   V   H  D45 135 117 153 D27 D63 TM */
+  { 200, 15, 15, 10, 10, 10, 10, 10, 10,  6}, /* DC */
+  { 130, 75, 10, 10, 10, 10, 10, 10, 10,  6}, /* V */
+  { 130, 10, 75, 10, 10, 10, 10, 10, 10,  6}, /* H */
+  { 130, 15, 10, 75, 10, 10, 10, 10, 10,  6}, /* D45 */
+  { 150, 15, 10, 10, 75, 10, 10, 10, 10,  6}, /* D135 */
+  { 150, 15, 10, 10, 10, 75, 10, 10, 10,  6}, /* D117 */
+  { 150, 15, 10, 10, 10, 10, 75, 10, 10,  6}, /* D153 */
+  { 150, 15, 10, 10, 10, 10, 10, 75, 10,  6}, /* D27 */
+  { 150, 15, 10, 10, 10, 10, 10, 10, 75,  6}, /* D63 */
+  { 160, 30, 30, 10, 10, 10, 10, 10, 10, 16}, /* TM */
+  { 132, 46, 40, 10, 10, 10, 10, 10, 10, 18}, /* i8x8 - never used */
+  { 150, 35, 41, 10, 10, 10, 10, 10, 10, 10}, /* BPRED */
+};
+
+static const unsigned int i8x8_mode_cts  [VP9_I8X8_MODES] = {
+  /* DC V   H D45 135 117 153 D27 D63  TM */
+  73, 49, 61, 30, 30, 30, 30, 30, 30, 13
+};
+
+static const unsigned int kf_uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
+  // DC   V   H  D45 135 117 153 D27 D63 TM
+  { 160, 24, 24, 20, 20, 20, 20, 20, 20,  8}, /* DC */
+  { 102, 64, 30, 20, 20, 20, 20, 20, 20, 10}, /* V */
+  { 102, 30, 64, 20, 20, 20, 20, 20, 20, 10}, /* H */
+  { 102, 33, 20, 64, 20, 20, 20, 20, 20, 14}, /* D45 */
+  { 102, 33, 20, 20, 64, 20, 20, 20, 20, 14}, /* D135 */
+  { 122, 33, 20, 20, 20, 64, 20, 20, 20, 14}, /* D117 */
+  { 102, 33, 20, 20, 20, 20, 64, 20, 20, 14}, /* D153 */
+  { 102, 33, 20, 20, 20, 20, 20, 64, 20, 14}, /* D27 */
+  { 102, 33, 20, 20, 20, 20, 20, 20, 64, 14}, /* D63 */
+  { 132, 36, 30, 20, 20, 20, 20, 20, 20, 18}, /* TM */
+  { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* i8x8 - never used */
+  { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* BPRED */
+};
+
+static const unsigned int bmode_cts[VP9_BINTRAMODES] = {
+  /* DC    TM     VE     HE   LD    RD    VR    VL    HD    HU */
+  43891, 17694, 10036, 3920, 3363, 2546, 5119, 3221, 2471, 1723
+};
+
+typedef enum {
+  SUBMVREF_NORMAL,
+  SUBMVREF_LEFT_ZED,
+  SUBMVREF_ABOVE_ZED,
+  SUBMVREF_LEFT_ABOVE_SAME,
+  SUBMVREF_LEFT_ABOVE_ZED
+} submvref_t;
+
+int vp9_mv_cont(const int_mv *l, const int_mv *a) {
+  int lez = (l->as_int == 0);
+  int aez = (a->as_int == 0);
+  int lea = (l->as_int == a->as_int);
+
+  if (lea && lez)
+    return SUBMVREF_LEFT_ABOVE_ZED;
+
+  if (lea)
+    return SUBMVREF_LEFT_ABOVE_SAME;
+
+  if (aez)
+    return SUBMVREF_ABOVE_ZED;
+
+  if (lez)
+    return SUBMVREF_LEFT_ZED;
+
+  return SUBMVREF_NORMAL;
+}
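+
+/* Example: a zero left mv and a nonzero above mv classify as
+   SUBMVREF_LEFT_ZED, while identical nonzero vectors classify as
+   SUBMVREF_LEFT_ABOVE_SAME.  The earlier checks take priority, so two
+   zero vectors yield SUBMVREF_LEFT_ABOVE_ZED rather than either
+   single-sided ZED case. */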
+
+const vp9_prob vp9_sub_mv_ref_prob [VP9_SUBMVREFS - 1] = { 180, 162, 25};
+
+const vp9_prob vp9_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP9_SUBMVREFS - 1] = {
+  { 147, 136, 18 },
+  { 106, 145, 1  },
+  { 179, 121, 1  },
+  { 223, 1, 34 },
+  { 208, 1, 1  }
+};
+
+vp9_mbsplit vp9_mbsplits [VP9_NUMMBSPLITS] = {
+  {
+    0,  0,  0,  0,
+    0,  0,  0,  0,
+    1,  1,  1,  1,
+    1,  1,  1,  1,
+  }, {
+    0,  0,  1,  1,
+    0,  0,  1,  1,
+    0,  0,  1,  1,
+    0,  0,  1,  1,
+  }, {
+    0,  0,  1,  1,
+    0,  0,  1,  1,
+    2,  2,  3,  3,
+    2,  2,  3,  3,
+  }, {
+    0,  1,  2,  3,
+    4,  5,  6,  7,
+    8,  9,  10, 11,
+    12, 13, 14, 15,
+  },
+};
+
+const int vp9_mbsplit_count [VP9_NUMMBSPLITS] = { 2, 2, 4, 16};
+
+const vp9_prob vp9_mbsplit_probs [VP9_NUMMBSPLITS - 1] = { 110, 111, 150};
+
+/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
+
+const vp9_tree_index vp9_bmode_tree[VP9_BINTRAMODES * 2 - 2] = {  /* INTRAMODECONTEXTNODE value */
+  -B_DC_PRED, 2,                          /* 0 = DC_NODE */
+  -B_TM_PRED, 4,                          /* 1 = TM_NODE */
+  -B_VE_PRED, 6,                          /* 2 = VE_NODE */
+  8, 12,                                  /* 3 = COM_NODE */
+  -B_HE_PRED, 10,                         /* 4 = HE_NODE */
+  -B_RD_PRED, -B_VR_PRED,                 /* 5 = RD_NODE */
+  -B_LD_PRED, 14,                         /* 6 = LD_NODE */
+  -B_VL_PRED, 16,                         /* 7 = VL_NODE */
+  -B_HD_PRED, -B_HU_PRED                  /* 8 = HD_NODE */
+};
+
+/* Again, these trees use the same probability indices as their
+   explicitly-programmed predecessors. */
+const vp9_tree_index vp9_ymode_tree[VP9_YMODES * 2 - 2] = {
+  2, 14,
+  -DC_PRED, 4,
+  6, 8,
+  -D45_PRED, -D135_PRED,
+  10, 12,
+  -D117_PRED, -D153_PRED,
+  -D27_PRED, -D63_PRED,
+  16, 18,
+  -V_PRED, -H_PRED,
+  -TM_PRED, 20,
+  -B_PRED, -I8X8_PRED
+};
+
+const vp9_tree_index vp9_kf_ymode_tree[VP9_YMODES * 2 - 2] = {
+  2, 14,
+  -DC_PRED, 4,
+  6, 8,
+  -D45_PRED, -D135_PRED,
+  10, 12,
+  -D117_PRED, -D153_PRED,
+  -D27_PRED, -D63_PRED,
+  16, 18,
+  -V_PRED, -H_PRED,
+  -TM_PRED, 20,
+  -B_PRED, -I8X8_PRED
+};
+
+const vp9_tree_index vp9_i8x8_mode_tree[VP9_I8X8_MODES * 2 - 2] = {
+  2, 14,
+  -DC_PRED, 4,
+  6, 8,
+  -D45_PRED, -D135_PRED,
+  10, 12,
+  -D117_PRED, -D153_PRED,
+  -D27_PRED, -D63_PRED,
+  -V_PRED, 16,
+  -H_PRED, -TM_PRED
+};
+
+const vp9_tree_index vp9_uv_mode_tree[VP9_UV_MODES * 2 - 2] = {
+  2, 14,
+  -DC_PRED, 4,
+  6, 8,
+  -D45_PRED, -D135_PRED,
+  10, 12,
+  -D117_PRED, -D153_PRED,
+  -D27_PRED, -D63_PRED,
+  -V_PRED, 16,
+  -H_PRED, -TM_PRED
+};
+
+const vp9_tree_index vp9_mbsplit_tree[6] = {
+  -PARTITIONING_4X4,   2,
+  -PARTITIONING_8X8,   4,
+  -PARTITIONING_16X8, -PARTITIONING_8X16,
+};
+
+const vp9_tree_index vp9_mv_ref_tree[8] = {
+  -ZEROMV, 2,
+  -NEARESTMV, 4,
+  -NEARMV, 6,
+  -NEWMV, -SPLITMV
+};
+
+#if CONFIG_SUPERBLOCKS
+const vp9_tree_index vp9_sb_mv_ref_tree[6] = {
+  -ZEROMV, 2,
+  -NEARESTMV, 4,
+  -NEARMV, -NEWMV
+};
+#endif
+
+const vp9_tree_index vp9_sub_mv_ref_tree[6] = {
+  -LEFT4X4, 2,
+  -ABOVE4X4, 4,
+  -ZERO4X4, -NEW4X4
+};
+
+struct vp9_token_struct vp9_bmode_encodings   [VP9_BINTRAMODES];
+struct vp9_token_struct vp9_ymode_encodings   [VP9_YMODES];
+#if CONFIG_SUPERBLOCKS
+struct vp9_token_struct vp9_sb_kf_ymode_encodings [VP9_I32X32_MODES];
+#endif
+struct vp9_token_struct vp9_kf_ymode_encodings [VP9_YMODES];
+struct vp9_token_struct vp9_uv_mode_encodings  [VP9_UV_MODES];
+struct vp9_token_struct vp9_i8x8_mode_encodings  [VP9_I8X8_MODES];
+struct vp9_token_struct vp9_mbsplit_encodings [VP9_NUMMBSPLITS];
+
+struct vp9_token_struct vp9_mv_ref_encoding_array    [VP9_MVREFS];
+#if CONFIG_SUPERBLOCKS
+struct vp9_token_struct vp9_sb_mv_ref_encoding_array  [VP9_MVREFS];
+#endif
+struct vp9_token_struct vp9_sub_mv_ref_encoding_array [VP9_SUBMVREFS];
+
+void vp9_init_mbmode_probs(VP9_COMMON *x) {
+  unsigned int bct[VP9_YMODES][2];  /* sized for Y modes, the larger alphabet */
+
+  vp9_tree_probs_from_distribution(VP9_YMODES, vp9_ymode_encodings,
+                                   vp9_ymode_tree, x->fc.ymode_prob,
+                                   bct, y_mode_cts, 256, 1);
+  {
+    int i;
+    for (i = 0; i < 8; i++) {
+      vp9_tree_probs_from_distribution(VP9_YMODES, vp9_kf_ymode_encodings,
+                                       vp9_kf_ymode_tree, x->kf_ymode_prob[i],
+                                       bct, kf_y_mode_cts[i], 256, 1);
+#if CONFIG_SUPERBLOCKS
+      vp9_tree_probs_from_distribution(VP9_I32X32_MODES,
+                                       vp9_sb_kf_ymode_encodings,
+                                       vp9_sb_ymode_tree,
+                                       x->sb_kf_ymode_prob[i], bct,
+                                       kf_y_mode_cts[i], 256, 1);
+#endif
+    }
+  }
+  {
+    int i;
+    for (i = 0; i < VP9_YMODES; i++) {
+      vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
+                                       vp9_uv_mode_tree, x->kf_uv_mode_prob[i],
+                                       bct, kf_uv_mode_cts[i], 256, 1);
+      vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
+                                       vp9_uv_mode_tree, x->fc.uv_mode_prob[i],
+                                       bct, uv_mode_cts[i], 256, 1);
+    }
+  }
+
+  vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
+                                   vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob,
+                                   bct, i8x8_mode_cts, 256, 1);
+
+  vpx_memcpy(x->fc.sub_mv_ref_prob, vp9_sub_mv_ref_prob2,
+             sizeof(vp9_sub_mv_ref_prob2));
+  vpx_memcpy(x->fc.mbsplit_prob, vp9_mbsplit_probs, sizeof(vp9_mbsplit_probs));
+  vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob,
+             sizeof(vp9_switchable_interp_prob));
+}
+
+
+static void intra_bmode_probs_from_distribution(
+  vp9_prob p [VP9_BINTRAMODES - 1],
+  unsigned int branch_ct [VP9_BINTRAMODES - 1] [2],
+  const unsigned int events [VP9_BINTRAMODES]) {
+  vp9_tree_probs_from_distribution(VP9_BINTRAMODES, vp9_bmode_encodings,
+                                   vp9_bmode_tree, p, branch_ct,
+                                   events, 256, 1);
+}
+
+void vp9_default_bmode_probs(vp9_prob p [VP9_BINTRAMODES - 1]) {
+  unsigned int branch_ct [VP9_BINTRAMODES - 1] [2];
+  intra_bmode_probs_from_distribution(p, branch_ct, bmode_cts);
+}
+
+void vp9_kf_default_bmode_probs(vp9_prob p[VP9_BINTRAMODES][VP9_BINTRAMODES]
+                                          [VP9_BINTRAMODES - 1]) {
+  unsigned int branch_ct[VP9_BINTRAMODES - 1][2];
+  int i, j;
+
+  for (i = 0; i < VP9_BINTRAMODES; i++) {
+    for (j = 0; j < VP9_BINTRAMODES; j++) {
+      intra_bmode_probs_from_distribution(
+        p[i][j], branch_ct, vp9_kf_default_bmode_counts[i][j]);
+    }
+  }
+}
+
+#if VP9_SWITCHABLE_FILTERS == 3
+const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
+  -0, 2,
+  -1, -2
+};
+struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
+const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
+  EIGHTTAP, SIXTAP, EIGHTTAP_SHARP};
+const int vp9_switchable_interp_map[SWITCHABLE + 1] = {1, -1, 0, 2, -1};
+const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
+                                          [VP9_SWITCHABLE_FILTERS-1] = {
+  {248, 192}, { 32, 248}, { 32,  32}, {192, 160}
+};
+#elif VP9_SWITCHABLE_FILTERS == 2
+const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
+  -0, -1,
+};
+struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
+const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
+                                          [VP9_SWITCHABLE_FILTERS-1] = {
+  {248},
+  { 64},
+  {192},
+};
+const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
+  EIGHTTAP, EIGHTTAP_SHARP};
+const int vp9_switchable_interp_map[SWITCHABLE + 1] = {-1, -1, 0, 1, -1};  /* EIGHTTAP, EIGHTTAP_SHARP */
+#endif
+
+void vp9_entropy_mode_init(void) {
+  vp9_tokens_from_tree(vp9_bmode_encodings,   vp9_bmode_tree);
+  vp9_tokens_from_tree(vp9_ymode_encodings,   vp9_ymode_tree);
+  vp9_tokens_from_tree(vp9_kf_ymode_encodings, vp9_kf_ymode_tree);
+#if CONFIG_SUPERBLOCKS
+  vp9_tokens_from_tree(vp9_sb_kf_ymode_encodings, vp9_sb_ymode_tree);
+#endif
+  vp9_tokens_from_tree(vp9_uv_mode_encodings,  vp9_uv_mode_tree);
+  vp9_tokens_from_tree(vp9_i8x8_mode_encodings,  vp9_i8x8_mode_tree);
+  vp9_tokens_from_tree(vp9_mbsplit_encodings, vp9_mbsplit_tree);
+  vp9_tokens_from_tree(vp9_switchable_interp_encodings,
+                       vp9_switchable_interp_tree);
+
+  vp9_tokens_from_tree_offset(vp9_mv_ref_encoding_array,
+                              vp9_mv_ref_tree, NEARESTMV);
+#if CONFIG_SUPERBLOCKS
+  vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array,
+                              vp9_sb_mv_ref_tree, NEARESTMV);
+#endif
+  vp9_tokens_from_tree_offset(vp9_sub_mv_ref_encoding_array,
+                              vp9_sub_mv_ref_tree, LEFT4X4);
+}
+
+void vp9_init_mode_contexts(VP9_COMMON *pc) {
+  vpx_memset(pc->fc.mv_ref_ct, 0, sizeof(pc->fc.mv_ref_ct));
+  vpx_memset(pc->fc.mv_ref_ct_a, 0, sizeof(pc->fc.mv_ref_ct_a));
+
+  vpx_memcpy(pc->fc.mode_context,
+             vp9_default_mode_contexts,
+             sizeof(pc->fc.mode_context));
+  vpx_memcpy(pc->fc.mode_context_a,
+             vp9_default_mode_contexts_a,
+             sizeof(pc->fc.mode_context_a));
+}
+
+void vp9_accum_mv_refs(VP9_COMMON *pc,
+                       MB_PREDICTION_MODE m,
+                       const int ct[4]) {
+  int (*mv_ref_ct)[4][2];
+
+  if (pc->refresh_alt_ref_frame)
+    mv_ref_ct = pc->fc.mv_ref_ct_a;
+  else
+    mv_ref_ct = pc->fc.mv_ref_ct;
+
+  if (m == ZEROMV) {
+    ++mv_ref_ct [ct[0]] [0] [0];
+  } else {
+    ++mv_ref_ct [ct[0]] [0] [1];
+    if (m == NEARESTMV) {
+      ++mv_ref_ct [ct[1]] [1] [0];
+    } else {
+      ++mv_ref_ct [ct[1]] [1] [1];
+      if (m == NEARMV) {
+        ++mv_ref_ct [ct[2]] [2] [0];
+      } else {
+        ++mv_ref_ct [ct[2]] [2] [1];
+        if (m == NEWMV) {
+          ++mv_ref_ct [ct[3]] [3] [0];
+        } else {
+          ++mv_ref_ct [ct[3]] [3] [1];
+        }
+      }
+    }
+  }
+}
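+
+/* Each branch of the mv_ref tree is counted under its own context ct[n];
+   for example, NEARMV increments the "not zero" bin at node 0, the "not
+   nearest" bin at node 1 and the "near" bin at node 2. */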
+
+#define MVREF_COUNT_SAT 20
+#define MVREF_MAX_UPDATE_FACTOR 144
+void vp9_update_mode_context(VP9_COMMON *pc) {
+  int i, j;
+  int (*mv_ref_ct)[4][2];
+  int (*mode_context)[4];
+
+  if (pc->refresh_alt_ref_frame) {
+    mv_ref_ct = pc->fc.mv_ref_ct_a;
+    mode_context = pc->fc.mode_context_a;
+  } else {
+    mv_ref_ct = pc->fc.mv_ref_ct;
+    mode_context = pc->fc.mode_context;
+  }
+
+  for (j = 0; j < 6; j++) {
+    for (i = 0; i < 4; i++) {
+      int count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
+      int this_prob = count > 0 ? 256 * mv_ref_ct[j][i][0] / count : 128;
+      int factor;
+
+      count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count;
+      factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT);
+      this_prob = (pc->fc.vp8_mode_contexts[j][i] * (256 - factor) +
+                   this_prob * factor + 128) >> 8;
+      this_prob = this_prob ? (this_prob < 255 ? this_prob : 255) : 1;
+      mode_context[j][i] = this_prob;
+    }
+  }
+}
+
+#ifdef MODE_STATS
+#include "vp9/common/modecont.h"
+void print_mode_contexts(VP9_COMMON *pc) {
+  int j, i;
+  printf("\n====================\n");
+  for (j = 0; j < 6; j++) {
+    for (i = 0; i < 4; i++) {
+      printf("%4d ", pc->fc.mode_context[j][i]);
+    }
+    printf("\n");
+  }
+  printf("====================\n");
+  for (j = 0; j < 6; j++) {
+    for (i = 0; i < 4; i++) {
+      printf("%4d ", pc->fc.mode_context_a[j][i]);
+    }
+    printf("\n");
+  }
+}
+#endif
+
+// #define MODE_COUNT_TESTING
+#define MODE_COUNT_SAT 20
+#define MODE_MAX_UPDATE_FACTOR 144
+void vp9_adapt_mode_probs(VP9_COMMON *cm) {
+  int i, t, count, factor;
+  unsigned int branch_ct[32][2];
+  vp9_prob ymode_probs[VP9_YMODES - 1];
+  vp9_prob uvmode_probs[VP9_UV_MODES - 1];
+  vp9_prob bmode_probs[VP9_BINTRAMODES - 1];
+  vp9_prob i8x8_mode_probs[VP9_I8X8_MODES - 1];
+  vp9_prob sub_mv_ref_probs[VP9_SUBMVREFS - 1];
+  vp9_prob mbsplit_probs[VP9_NUMMBSPLITS - 1];
+#ifdef MODE_COUNT_TESTING
+  printf("static const unsigned int\nymode_counts"
+         "[VP9_YMODES] = {\n");
+  for (t = 0; t < VP9_YMODES; ++t) printf("%d, ", cm->fc.ymode_counts[t]);
+  printf("};\n");
+  printf("static const unsigned int\nuv_mode_counts"
+         "[VP9_YMODES] [VP9_UV_MODES] = {\n");
+  for (i = 0; i < VP9_YMODES; ++i) {
+    printf("  {");
+    for (t = 0; t < VP9_UV_MODES; ++t) printf("%d, ", cm->fc.uv_mode_counts[i][t]);
+    printf("},\n");
+  }
+  printf("};\n");
+  printf("static const unsigned int\nbmode_counts"
+         "[VP9_BINTRAMODES] = {\n");
+  for (t = 0; t < VP9_BINTRAMODES; ++t) printf("%d, ", cm->fc.bmode_counts[t]);
+  printf("};\n");
+  printf("static const unsigned int\ni8x8_mode_counts"
+         "[VP9_I8X8_MODES] = {\n");
+  for (t = 0; t < VP9_I8X8_MODES; ++t) printf("%d, ", cm->fc.i8x8_mode_counts[t]);
+  printf("};\n");
+  printf("static const unsigned int\nsub_mv_ref_counts"
+         "[SUBMVREF_COUNT] [VP9_SUBMVREFS] = {\n");
+  for (i = 0; i < SUBMVREF_COUNT; ++i) {
+    printf("  {");
+    for (t = 0; t < VP9_SUBMVREFS; ++t) printf("%d, ", cm->fc.sub_mv_ref_counts[i][t]);
+    printf("},\n");
+  }
+  printf("};\n");
+  printf("static const unsigned int\nmbsplit_counts"
+         "[VP9_NUMMBSPLITS] = {\n");
+  for (t = 0; t < VP9_NUMMBSPLITS; ++t) printf("%d, ", cm->fc.mbsplit_counts[t]);
+  printf("};\n");
+#endif
+  vp9_tree_probs_from_distribution(
+    VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
+    ymode_probs, branch_ct, cm->fc.ymode_counts,
+    256, 1);
+  for (t = 0; t < VP9_YMODES - 1; ++t) {
+    int prob;
+    count = branch_ct[t][0] + branch_ct[t][1];
+    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+    prob = ((int)cm->fc.pre_ymode_prob[t] * (256 - factor) +
+            (int)ymode_probs[t] * factor + 128) >> 8;
+    if (prob <= 0) cm->fc.ymode_prob[t] = 1;
+    else if (prob > 255) cm->fc.ymode_prob[t] = 255;
+    else cm->fc.ymode_prob[t] = prob;
+  }
+  for (i = 0; i < VP9_YMODES; ++i) {
+    vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
+                                     vp9_uv_mode_tree, uvmode_probs, branch_ct,
+                                     cm->fc.uv_mode_counts[i], 256, 1);
+    for (t = 0; t < VP9_UV_MODES - 1; ++t) {
+      int prob;
+      count = branch_ct[t][0] + branch_ct[t][1];
+      count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+      factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+      prob = ((int)cm->fc.pre_uv_mode_prob[i][t] * (256 - factor) +
+              (int)uvmode_probs[t] * factor + 128) >> 8;
+      if (prob <= 0) cm->fc.uv_mode_prob[i][t] = 1;
+      else if (prob > 255) cm->fc.uv_mode_prob[i][t] = 255;
+      else cm->fc.uv_mode_prob[i][t] = prob;
+    }
+  }
+  vp9_tree_probs_from_distribution(VP9_BINTRAMODES, vp9_bmode_encodings,
+                                   vp9_bmode_tree, bmode_probs, branch_ct,
+                                   cm->fc.bmode_counts, 256, 1);
+  for (t = 0; t < VP9_BINTRAMODES - 1; ++t) {
+    int prob;
+    count = branch_ct[t][0] + branch_ct[t][1];
+    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+    prob = ((int)cm->fc.pre_bmode_prob[t] * (256 - factor) +
+            (int)bmode_probs[t] * factor + 128) >> 8;
+    if (prob <= 0) cm->fc.bmode_prob[t] = 1;
+    else if (prob > 255) cm->fc.bmode_prob[t] = 255;
+    else cm->fc.bmode_prob[t] = prob;
+  }
+  vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
+                                   vp9_i8x8_mode_tree, i8x8_mode_probs,
+                                   branch_ct, cm->fc.i8x8_mode_counts, 256, 1);
+  for (t = 0; t < VP9_I8X8_MODES - 1; ++t) {
+    int prob;
+    count = branch_ct[t][0] + branch_ct[t][1];
+    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+    prob = ((int)cm->fc.pre_i8x8_mode_prob[t] * (256 - factor) +
+            (int)i8x8_mode_probs[t] * factor + 128) >> 8;
+    if (prob <= 0) cm->fc.i8x8_mode_prob[t] = 1;
+    else if (prob > 255) cm->fc.i8x8_mode_prob[t] = 255;
+    else cm->fc.i8x8_mode_prob[t] = prob;
+  }
+  for (i = 0; i < SUBMVREF_COUNT; ++i) {
+    vp9_tree_probs_from_distribution(VP9_SUBMVREFS,
+                                     vp9_sub_mv_ref_encoding_array,
+                                     vp9_sub_mv_ref_tree, sub_mv_ref_probs,
+                                     branch_ct, cm->fc.sub_mv_ref_counts[i],
+                                     256, 1);
+    for (t = 0; t < VP9_SUBMVREFS - 1; ++t) {
+      int prob;
+      count = branch_ct[t][0] + branch_ct[t][1];
+      count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+      factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+      prob = ((int)cm->fc.pre_sub_mv_ref_prob[i][t] * (256 - factor) +
+              (int)sub_mv_ref_probs[t] * factor + 128) >> 8;
+      if (prob <= 0) cm->fc.sub_mv_ref_prob[i][t] = 1;
+      else if (prob > 255) cm->fc.sub_mv_ref_prob[i][t] = 255;
+      else cm->fc.sub_mv_ref_prob[i][t] = prob;
+    }
+  }
+  vp9_tree_probs_from_distribution(VP9_NUMMBSPLITS, vp9_mbsplit_encodings,
+                                   vp9_mbsplit_tree, mbsplit_probs, branch_ct,
+                                   cm->fc.mbsplit_counts, 256, 1);
+  for (t = 0; t < VP9_NUMMBSPLITS - 1; ++t) {
+    int prob;
+    count = branch_ct[t][0] + branch_ct[t][1];
+    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+    prob = ((int)cm->fc.pre_mbsplit_prob[t] * (256 - factor) +
+            (int)mbsplit_probs[t] * factor + 128) >> 8;
+    if (prob <= 0) cm->fc.mbsplit_prob[t] = 1;
+    else if (prob > 255) cm->fc.mbsplit_prob[t] = 255;
+    else cm->fc.mbsplit_prob[t] = prob;
+  }
+}
--- /dev/null
+++ b/vp9/common/entropymode.h
@@ -1,0 +1,102 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPYMODE_H
+#define __INC_ENTROPYMODE_H
+
+#include "blockd.h"
+#include "treecoder.h"
+
+#define SUBMVREF_COUNT 5
+#define VP9_NUMMBSPLITS 4
+
+typedef const int vp9_mbsplit[16];
+
+extern vp9_mbsplit vp9_mbsplits[VP9_NUMMBSPLITS];
+
+extern const int vp9_mbsplit_count[VP9_NUMMBSPLITS];    /* # of subsets */
+
+extern const vp9_prob vp9_mbsplit_probs[VP9_NUMMBSPLITS - 1];
+
+extern int vp9_mv_cont(const int_mv *l, const int_mv *a);
+
+extern const vp9_prob vp9_sub_mv_ref_prob[VP9_SUBMVREFS - 1];
+
+extern const vp9_prob vp9_sub_mv_ref_prob2[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
+
+extern const unsigned int vp9_kf_default_bmode_counts[VP9_BINTRAMODES]
+                                                     [VP9_BINTRAMODES]
+                                                     [VP9_BINTRAMODES];
+
+extern const vp9_tree_index vp9_bmode_tree[];
+
+extern const vp9_tree_index  vp9_ymode_tree[];
+extern const vp9_tree_index  vp9_kf_ymode_tree[];
+extern const vp9_tree_index  vp9_uv_mode_tree[];
+#define vp9_sb_ymode_tree vp9_uv_mode_tree
+extern const vp9_tree_index  vp9_i8x8_mode_tree[];
+extern const vp9_tree_index  vp9_mbsplit_tree[];
+extern const vp9_tree_index  vp9_mv_ref_tree[];
+extern const vp9_tree_index  vp9_sb_mv_ref_tree[];
+extern const vp9_tree_index  vp9_sub_mv_ref_tree[];
+
+extern struct vp9_token_struct vp9_bmode_encodings[VP9_BINTRAMODES];
+extern struct vp9_token_struct vp9_ymode_encodings[VP9_YMODES];
+extern struct vp9_token_struct vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
+extern struct vp9_token_struct vp9_kf_ymode_encodings[VP9_YMODES];
+extern struct vp9_token_struct vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
+extern struct vp9_token_struct vp9_uv_mode_encodings[VP9_UV_MODES];
+extern struct vp9_token_struct vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
+
+/* Inter mode values do not start at zero */
+
+extern struct vp9_token_struct vp9_mv_ref_encoding_array[VP9_MVREFS];
+extern struct vp9_token_struct vp9_sb_mv_ref_encoding_array[VP9_MVREFS];
+extern struct vp9_token_struct vp9_sub_mv_ref_encoding_array[VP9_SUBMVREFS];
+
+void vp9_entropy_mode_init(void);
+
+struct VP9Common;
+
+void vp9_init_mbmode_probs(struct VP9Common *x);
+
+extern void vp9_init_mode_contexts(struct VP9Common *pc);
+
+extern void vp9_update_mode_context(struct VP9Common *pc);
+
+extern void vp9_accum_mv_refs(struct VP9Common *pc,
+                              MB_PREDICTION_MODE m,
+                              const int ct[4]);
+
+void vp9_default_bmode_probs(vp9_prob dest[VP9_BINTRAMODES - 1]);
+
+void vp9_kf_default_bmode_probs(vp9_prob dest[VP9_BINTRAMODES][VP9_BINTRAMODES]
+                                             [VP9_BINTRAMODES - 1]);
+
+void vp9_adapt_mode_probs(struct VP9Common *);
+
+#define VP9_SWITCHABLE_FILTERS 2 /* number of switchable filters */
+
+extern const  INTERPOLATIONFILTERTYPE vp9_switchable_interp
+                  [VP9_SWITCHABLE_FILTERS];
+
+extern const  int vp9_switchable_interp_map[SWITCHABLE + 1];
+
+extern const  vp9_tree_index vp9_switchable_interp_tree
+                  [2 * (VP9_SWITCHABLE_FILTERS - 1)];
+
+extern struct vp9_token_struct vp9_switchable_interp_encodings
+                  [VP9_SWITCHABLE_FILTERS];
+
+extern const  vp9_prob vp9_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
+                                                 [VP9_SWITCHABLE_FILTERS - 1];
+
+#endif
--- /dev/null
+++ b/vp9/common/entropymv.c
@@ -1,0 +1,465 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyxc_int.h"
+#include "entropymv.h"
+
+//#define MV_COUNT_TESTING
+
+#define MV_COUNT_SAT 16
+#define MV_MAX_UPDATE_FACTOR 160
+
+#if CONFIG_NEW_MVREF
+/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
+#define COMPANDED_MVREF_THRESH    1000000
+#else
+/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
+#define COMPANDED_MVREF_THRESH    8
+#endif
+
+/* Smooth or bias the mv-counts before prob computation */
+/* #define SMOOTH_MV_COUNTS */
+
+const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = {
+  -MV_JOINT_ZERO, 2,
+  -MV_JOINT_HNZVZ, 4,
+  -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
+};
+struct vp9_token_struct vp9_mv_joint_encodings[MV_JOINTS];
+
+const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
+  -MV_CLASS_0, 2,
+  -MV_CLASS_1, 4,
+  6, 8,
+  -MV_CLASS_2, -MV_CLASS_3,
+  10, 12,
+  -MV_CLASS_4, -MV_CLASS_5,
+  -MV_CLASS_6, -MV_CLASS_7,
+};
+struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES];
+
+const vp9_tree_index vp9_mv_class0_tree [2 * CLASS0_SIZE - 2] = {
+  -0, -1,
+};
+struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE];
+
+const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = {
+  -0, 2,
+  -1, 4,
+  -2, -3
+};
+struct vp9_token_struct vp9_mv_fp_encodings[4];
+
+const nmv_context vp9_default_nmv_context = {
+  {32, 64, 96},
+  {
+    { /* vert component */
+      128,                                             /* sign */
+      {224, 144, 192, 168, 192, 176, 192},             /* class */
+      {216},                                           /* class0 */
+      {136, 140, 148, 160, 176, 192, 224},             /* bits */
+      {{128, 128, 64}, {96, 112, 64}},                 /* class0_fp */
+      {64, 96, 64},                                    /* fp */
+      160,                                             /* class0_hp bit */
+      128,                                             /* hp */
+    },
+    { /* hor component */
+      128,                                             /* sign */
+      {216, 128, 176, 160, 176, 176, 192},             /* class */
+      {208},                                           /* class0 */
+      {136, 140, 148, 160, 176, 192, 224},             /* bits */
+      {{128, 128, 64}, {96, 112, 64}},                 /* class0_fp */
+      {64, 96, 64},                                    /* fp */
+      160,                                             /* class0_hp bit */
+      128,                                             /* hp */
+    }
+  },
+};
+
+MV_JOINT_TYPE vp9_get_mv_joint(MV mv) {
+  if (mv.row == 0 && mv.col == 0) return MV_JOINT_ZERO;
+  else if (mv.row == 0 && mv.col != 0) return MV_JOINT_HNZVZ;
+  else if (mv.row != 0 && mv.col == 0) return MV_JOINT_HZVNZ;
+  else return MV_JOINT_HNZVNZ;
+}
+
+#define mv_class_base(c) ((c) ? (CLASS0_SIZE << ((c) + 2)) : 0)
+
+MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
+  MV_CLASS_TYPE c;
+  if      (z < CLASS0_SIZE * 8)    c = MV_CLASS_0;
+  else if (z < CLASS0_SIZE * 16)   c = MV_CLASS_1;
+  else if (z < CLASS0_SIZE * 32)   c = MV_CLASS_2;
+  else if (z < CLASS0_SIZE * 64)   c = MV_CLASS_3;
+  else if (z < CLASS0_SIZE * 128)  c = MV_CLASS_4;
+  else if (z < CLASS0_SIZE * 256)  c = MV_CLASS_5;
+  else if (z < CLASS0_SIZE * 512)  c = MV_CLASS_6;
+  else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7;
+  else assert(0);
+  if (offset)
+    *offset = z - mv_class_base(c);
+  return c;
+}
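+
+/* Example: z = 100 is below CLASS0_SIZE * 64 == 128 but none of the
+   earlier bounds, so it falls in MV_CLASS_3; mv_class_base(MV_CLASS_3)
+   is 64, so the returned offset is 36. */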
+
+int vp9_use_nmv_hp(const MV *ref) {
+  return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH &&
+         (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH;
+}
+
+int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
+  return mv_class_base(c) + offset;
+}
+
+static void increment_nmv_component_count(int v,
+                                          nmv_component_counts *mvcomp,
+                                          int incr,
+                                          int usehp) {
+  assert(v != 0);  /* should not be zero */
+  mvcomp->mvcount[MV_MAX + v] += incr;
+}
+
+static void increment_nmv_component(int v,
+                                    nmv_component_counts *mvcomp,
+                                    int incr,
+                                    int usehp) {
+  int s, z, c, o, d, e, f;
+  assert(v != 0);  /* should not be zero */
+  s = v < 0;
+  mvcomp->sign[s] += incr;
+  z = (s ? -v : v) - 1;       /* magnitude - 1 */
+
+  c = vp9_get_mv_class(z, &o);
+  mvcomp->classes[c] += incr;
+
+  d = (o >> 3);               /* int mv data */
+  f = (o >> 1) & 3;           /* fractional pel mv data */
+  e = (o & 1);                /* high precision mv data */
+  if (c == MV_CLASS_0) {
+    mvcomp->class0[d] += incr;
+  } else {
+    int i, b;
+    b = c + CLASS0_BITS - 1;  /* number of bits */
+    for (i = 0; i < b; ++i)
+      mvcomp->bits[i][((d >> i) & 1)] += incr;
+  }
+
+  /* Code the fractional pel bits */
+  if (c == MV_CLASS_0) {
+    mvcomp->class0_fp[d][f] += incr;
+  } else {
+    mvcomp->fp[f] += incr;
+  }
+
+  /* Code the high precision bit */
+  if (usehp) {
+    if (c == MV_CLASS_0) {
+      mvcomp->class0_hp[e] += incr;
+    } else {
+      mvcomp->hp[e] += incr;
+    }
+  }
+}
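+
+/* The class offset o is split into an integer-pel part d (o >> 3), a
+   fractional quarter-pel part f ((o >> 1) & 3) and a high-precision
+   eighth-pel bit e (o & 1); for example, o == 36 gives d = 4, f = 2,
+   e = 0. */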
+
+#ifdef SMOOTH_MV_COUNTS
+static void smooth_counts(nmv_component_counts *mvcomp) {
+  static const int flen = 3;  // (filter_length + 1) / 2
+  static const int fval[] = {8, 3, 1};
+  static const int fvalbits = 4;
+  int i;
+  unsigned int smvcount[MV_VALS];
+  vpx_memcpy(smvcount, mvcomp->mvcount, sizeof(smvcount));
+  smvcount[MV_MAX] = (smvcount[MV_MAX - 1] + smvcount[MV_MAX + 1]) >> 1;
+  for (i = flen - 1; i <= MV_VALS - flen; ++i) {
+    int j, s = smvcount[i] * fval[0];
+    for (j = 1; j < flen; ++j)
+      s += (smvcount[i - j] + smvcount[i + j]) * fval[j];
+    mvcomp->mvcount[i] = (s + (1 << (fvalbits - 1))) >> fvalbits;
+  }
+}
+#endif
+
+static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
+  int v;
+  /* Zero every counter except the raw mvcount histogram; this relies on
+     the struct layout keeping all other counters contiguous from sign. */
+  vpx_memset(mvcomp->sign, 0,
+             sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount));
+  for (v = 1; v <= MV_MAX; v++) {
+    increment_nmv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
+    increment_nmv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
+  }
+}
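+
+/* During encoding only the raw magnitude histogram mvcount is bumped
+   (see increment_nmv_component_count() above); counts_to_context() then
+   expands that histogram into the structured sign/class/bit counts in a
+   single pass before the probabilities are adapted. */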
+
+void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
+                       int usehp) {
+  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
+  mvctx->joints[j]++;
+  usehp = usehp && vp9_use_nmv_hp(ref);
+  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
+    increment_nmv_component_count(mv->row, &mvctx->comps[0], 1, usehp);
+  }
+  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
+    increment_nmv_component_count(mv->col, &mvctx->comps[1], 1, usehp);
+  }
+}
+
+static void adapt_prob(vp9_prob *dest, vp9_prob prep, vp9_prob newp,
+                       unsigned int ct[2]) {
+  int factor;
+  int prob;
+  int count = ct[0] + ct[1];
+  if (count) {
+    count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
+    factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
+    prob = ((int)prep * (256 - factor) + (int)(newp) * factor + 128) >> 8;
+    prob += !prob;
+    prob = (prob > 255 ? 255 : prob);
+    *dest = prob;
+  }
+}
+
+void vp9_counts_to_nmv_context(
+    nmv_context_counts *NMVcount,
+    nmv_context *prob,
+    int usehp,
+    unsigned int (*branch_ct_joint)[2],
+    unsigned int (*branch_ct_sign)[2],
+    unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
+    unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
+    unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
+    unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
+    unsigned int (*branch_ct_fp)[4 - 1][2],
+    unsigned int (*branch_ct_class0_hp)[2],
+    unsigned int (*branch_ct_hp)[2]) {
+  int i, j, k;
+  counts_to_context(&NMVcount->comps[0], usehp);
+  counts_to_context(&NMVcount->comps[1], usehp);
+  vp9_tree_probs_from_distribution(MV_JOINTS,
+                                   vp9_mv_joint_encodings,
+                                   vp9_mv_joint_tree,
+                                   prob->joints,
+                                   branch_ct_joint,
+                                   NMVcount->joints,
+                                   256, 1);
+  for (i = 0; i < 2; ++i) {
+    prob->comps[i].sign =
+        vp9_bin_prob_from_distribution(NMVcount->comps[i].sign);
+    branch_ct_sign[i][0] = NMVcount->comps[i].sign[0];
+    branch_ct_sign[i][1] = NMVcount->comps[i].sign[1];
+    vp9_tree_probs_from_distribution(MV_CLASSES,
+                                     vp9_mv_class_encodings,
+                                     vp9_mv_class_tree,
+                                     prob->comps[i].classes,
+                                     branch_ct_classes[i],
+                                     NMVcount->comps[i].classes,
+                                     256, 1);
+    vp9_tree_probs_from_distribution(CLASS0_SIZE,
+                                     vp9_mv_class0_encodings,
+                                     vp9_mv_class0_tree,
+                                     prob->comps[i].class0,
+                                     branch_ct_class0[i],
+                                     NMVcount->comps[i].class0,
+                                     256, 1);
+    for (j = 0; j < MV_OFFSET_BITS; ++j) {
+      prob->comps[i].bits[j] = vp9_bin_prob_from_distribution(
+          NMVcount->comps[i].bits[j]);
+      branch_ct_bits[i][j][0] = NMVcount->comps[i].bits[j][0];
+      branch_ct_bits[i][j][1] = NMVcount->comps[i].bits[j][1];
+    }
+  }
+  for (i = 0; i < 2; ++i) {
+    for (k = 0; k < CLASS0_SIZE; ++k) {
+      vp9_tree_probs_from_distribution(4,
+                                       vp9_mv_fp_encodings,
+                                       vp9_mv_fp_tree,
+                                       prob->comps[i].class0_fp[k],
+                                       branch_ct_class0_fp[i][k],
+                                       NMVcount->comps[i].class0_fp[k],
+                                       256, 1);
+    }
+    vp9_tree_probs_from_distribution(4,
+                                     vp9_mv_fp_encodings,
+                                     vp9_mv_fp_tree,
+                                     prob->comps[i].fp,
+                                     branch_ct_fp[i],
+                                     NMVcount->comps[i].fp,
+                                     256, 1);
+  }
+  if (usehp) {
+    for (i = 0; i < 2; ++i) {
+      prob->comps[i].class0_hp = vp9_bin_prob_from_distribution(
+          NMVcount->comps[i].class0_hp);
+      branch_ct_class0_hp[i][0] = NMVcount->comps[i].class0_hp[0];
+      branch_ct_class0_hp[i][1] = NMVcount->comps[i].class0_hp[1];
+
+      prob->comps[i].hp =
+          vp9_bin_prob_from_distribution(NMVcount->comps[i].hp);
+      branch_ct_hp[i][0] = NMVcount->comps[i].hp[0];
+      branch_ct_hp[i][1] = NMVcount->comps[i].hp[1];
+    }
+  }
+}
+
+void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) {
+  int i, j, k;
+  nmv_context prob;
+  unsigned int branch_ct_joint[MV_JOINTS - 1][2];
+  unsigned int branch_ct_sign[2][2];
+  unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
+  unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
+  unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
+  unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
+  unsigned int branch_ct_fp[2][4 - 1][2];
+  unsigned int branch_ct_class0_hp[2][2];
+  unsigned int branch_ct_hp[2][2];
+#ifdef MV_COUNT_TESTING
+  printf("joints count: ");
+  for (j = 0; j < MV_JOINTS; ++j) printf("%d ", cm->fc.NMVcount.joints[j]);
+  printf("\n"); fflush(stdout);
+  printf("signs count:\n");
+  for (i = 0; i < 2; ++i)
+    printf("%d/%d ", cm->fc.NMVcount.comps[i].sign[0], cm->fc.NMVcount.comps[i].sign[1]);
+  printf("\n"); fflush(stdout);
+  printf("classes count:\n");
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < MV_CLASSES; ++j)
+      printf("%d ", cm->fc.NMVcount.comps[i].classes[j]);
+    printf("\n"); fflush(stdout);
+  }
+  printf("class0 count:\n");
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < CLASS0_SIZE; ++j)
+      printf("%d ", cm->fc.NMVcount.comps[i].class0[j]);
+    printf("\n"); fflush(stdout);
+  }
+  printf("bits count:\n");
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < MV_OFFSET_BITS; ++j)
+      printf("%d/%d ", cm->fc.NMVcount.comps[i].bits[j][0],
+                       cm->fc.NMVcount.comps[i].bits[j][1]);
+    printf("\n"); fflush(stdout);
+  }
+  printf("class0_fp count:\n");
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      printf("{");
+      for (k = 0; k < 4; ++k)
+        printf("%d ", cm->fc.NMVcount.comps[i].class0_fp[j][k]);
+      printf("}, ");
+    }
+    printf("\n"); fflush(stdout);
+  }
+  printf("fp count:\n");
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < 4; ++j)
+      printf("%d ", cm->fc.NMVcount.comps[i].fp[j]);
+    printf("\n"); fflush(stdout);
+  }
+  if (usehp) {
+    printf("class0_hp count:\n");
+    for (i = 0; i < 2; ++i)
+      printf("%d/%d ", cm->fc.NMVcount.comps[i].class0_hp[0],
+                       cm->fc.NMVcount.comps[i].class0_hp[1]);
+    printf("\n"); fflush(stdout);
+    printf("hp count:\n");
+    for (i = 0; i < 2; ++i)
+      printf("%d/%d ", cm->fc.NMVcount.comps[i].hp[0],
+                       cm->fc.NMVcount.comps[i].hp[1]);
+    printf("\n"); fflush(stdout);
+  }
+#endif
+#ifdef SMOOTH_MV_COUNTS
+  smooth_counts(&cm->fc.NMVcount.comps[0]);
+  smooth_counts(&cm->fc.NMVcount.comps[1]);
+#endif
+  vp9_counts_to_nmv_context(&cm->fc.NMVcount,
+                            &prob,
+                            usehp,
+                            branch_ct_joint,
+                            branch_ct_sign,
+                            branch_ct_classes,
+                            branch_ct_class0,
+                            branch_ct_bits,
+                            branch_ct_class0_fp,
+                            branch_ct_fp,
+                            branch_ct_class0_hp,
+                            branch_ct_hp);
+
+  for (j = 0; j < MV_JOINTS - 1; ++j) {
+    adapt_prob(&cm->fc.nmvc.joints[j],
+               cm->fc.pre_nmvc.joints[j],
+               prob.joints[j],
+               branch_ct_joint[j]);
+  }
+  for (i = 0; i < 2; ++i) {
+    adapt_prob(&cm->fc.nmvc.comps[i].sign,
+               cm->fc.pre_nmvc.comps[i].sign,
+               prob.comps[i].sign,
+               branch_ct_sign[i]);
+    for (j = 0; j < MV_CLASSES - 1; ++j) {
+      adapt_prob(&cm->fc.nmvc.comps[i].classes[j],
+                 cm->fc.pre_nmvc.comps[i].classes[j],
+                 prob.comps[i].classes[j],
+                 branch_ct_classes[i][j]);
+    }
+    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
+      adapt_prob(&cm->fc.nmvc.comps[i].class0[j],
+                 cm->fc.pre_nmvc.comps[i].class0[j],
+                 prob.comps[i].class0[j],
+                 branch_ct_class0[i][j]);
+    }
+    for (j = 0; j < MV_OFFSET_BITS; ++j) {
+      adapt_prob(&cm->fc.nmvc.comps[i].bits[j],
+                 cm->fc.pre_nmvc.comps[i].bits[j],
+                 prob.comps[i].bits[j],
+                 branch_ct_bits[i][j]);
+    }
+  }
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      for (k = 0; k < 3; ++k) {
+        adapt_prob(&cm->fc.nmvc.comps[i].class0_fp[j][k],
+                   cm->fc.pre_nmvc.comps[i].class0_fp[j][k],
+                   prob.comps[i].class0_fp[j][k],
+                   branch_ct_class0_fp[i][j][k]);
+      }
+    }
+    for (j = 0; j < 3; ++j) {
+      adapt_prob(&cm->fc.nmvc.comps[i].fp[j],
+                 cm->fc.pre_nmvc.comps[i].fp[j],
+                 prob.comps[i].fp[j],
+                 branch_ct_fp[i][j]);
+    }
+  }
+  if (usehp) {
+    for (i = 0; i < 2; ++i) {
+      adapt_prob(&cm->fc.nmvc.comps[i].class0_hp,
+                 cm->fc.pre_nmvc.comps[i].class0_hp,
+                 prob.comps[i].class0_hp,
+                 branch_ct_class0_hp[i]);
+      adapt_prob(&cm->fc.nmvc.comps[i].hp,
+                 cm->fc.pre_nmvc.comps[i].hp,
+                 prob.comps[i].hp,
+                 branch_ct_hp[i]);
+    }
+  }
+}
+
+void vp9_entropy_mv_init(void) {
+  vp9_tokens_from_tree(vp9_mv_joint_encodings, vp9_mv_joint_tree);
+  vp9_tokens_from_tree(vp9_mv_class_encodings, vp9_mv_class_tree);
+  vp9_tokens_from_tree(vp9_mv_class0_encodings, vp9_mv_class0_tree);
+  vp9_tokens_from_tree(vp9_mv_fp_encodings, vp9_mv_fp_tree);
+}
+
+void vp9_init_mv_probs(VP9_COMMON *cm) {
+  vpx_memcpy(&cm->fc.nmvc, &vp9_default_nmv_context, sizeof(nmv_context));
+}
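
Note: the count-weighted update that adapt_prob() applies above can be sketched as follows. This is an illustration only: the weighting cap and rounding are assumptions, and the real routine derives its new estimate from the branch counts produced by vp9_counts_to_nmv_context().

typedef unsigned char vp9_prob;

/* Illustrative sketch (assumed constants; not this patch's adapt_prob()):
 * blend a prior probability with a new estimate derived from branch
 * counts, trusting the estimate more when more samples back it. */
static vp9_prob blend_prob(vp9_prob pre, vp9_prob est,
                           unsigned int ct0, unsigned int ct1) {
  const unsigned int n = ct0 + ct1;
  const unsigned int w = n > 256 ? 256 : n;  /* weight on the new estimate */
  unsigned int p = (pre * (256 - w) + est * w + 128) >> 8;
  if (p < 1) p = 1;     /* keep within the coder's valid (0, 256) range */
  if (p > 255) p = 255;
  return (vp9_prob)p;
}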
--- /dev/null
+++ b/vp9/common/entropymv.h
@@ -1,0 +1,129 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPYMV_H
+#define __INC_ENTROPYMV_H
+
+#include "treecoder.h"
+#include "vpx_config.h"
+#include "blockd.h"
+
+struct VP9Common;
+
+void vp9_entropy_mv_init(void);
+void vp9_init_mv_probs(struct VP9Common *cm);
+
+void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp);
+int vp9_use_nmv_hp(const MV *ref);
+
+#define VP9_NMV_UPDATE_PROB  255
+//#define MV_GROUP_UPDATE
+
+#define LOW_PRECISION_MV_UPDATE  /* Use 7 bit forward update */
+
+/* Symbols for coding which components are zero jointly */
+#define MV_JOINTS     4
+typedef enum {
+  MV_JOINT_ZERO = 0,             /* Zero vector */
+  MV_JOINT_HNZVZ = 1,            /* Vert zero, hor nonzero */
+  MV_JOINT_HZVNZ = 2,            /* Hor zero, vert nonzero */
+  MV_JOINT_HNZVNZ = 3,           /* Both components nonzero */
+} MV_JOINT_TYPE;
+
+extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2];
+extern struct vp9_token_struct vp9_mv_joint_encodings[MV_JOINTS];
+
+/* Symbols for coding magnitude class of nonzero components */
+#define MV_CLASSES     8
+typedef enum {
+  MV_CLASS_0 = 0,      /* (0, 2]     integer pel */
+  MV_CLASS_1 = 1,      /* (2, 4]     integer pel */
+  MV_CLASS_2 = 2,      /* (4, 8]     integer pel */
+  MV_CLASS_3 = 3,      /* (8, 16]    integer pel */
+  MV_CLASS_4 = 4,      /* (16, 32]   integer pel */
+  MV_CLASS_5 = 5,      /* (32, 64]   integer pel */
+  MV_CLASS_6 = 6,      /* (64, 128]  integer pel */
+  MV_CLASS_7 = 7,      /* (128, 256] integer pel */
+} MV_CLASS_TYPE;
+
+extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2];
+extern struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES];
+
+#define CLASS0_BITS    1  /* bits at integer precision for class 0 */
+#define CLASS0_SIZE    (1 << CLASS0_BITS)
+#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
+
+#define MV_MAX_BITS    (MV_CLASSES + CLASS0_BITS + 2)
+#define MV_MAX         ((1 << MV_MAX_BITS) - 1)
+#define MV_VALS        ((MV_MAX << 1) + 1)
+
+extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2];
+extern struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE];
+
+extern const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2];
+extern struct vp9_token_struct vp9_mv_fp_encodings[4];
+
+typedef struct {
+  vp9_prob sign;
+  vp9_prob classes[MV_CLASSES - 1];
+  vp9_prob class0[CLASS0_SIZE - 1];
+  vp9_prob bits[MV_OFFSET_BITS];
+  vp9_prob class0_fp[CLASS0_SIZE][4 - 1];
+  vp9_prob fp[4 - 1];
+  vp9_prob class0_hp;
+  vp9_prob hp;
+} nmv_component;
+
+typedef struct {
+  vp9_prob joints[MV_JOINTS - 1];
+  nmv_component comps[2];
+} nmv_context;
+
+MV_JOINT_TYPE vp9_get_mv_joint(MV mv);
+MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset);
+int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset);
+
+
+typedef struct {
+  unsigned int mvcount[MV_VALS];
+  unsigned int sign[2];
+  unsigned int classes[MV_CLASSES];
+  unsigned int class0[CLASS0_SIZE];
+  unsigned int bits[MV_OFFSET_BITS][2];
+  unsigned int class0_fp[CLASS0_SIZE][4];
+  unsigned int fp[4];
+  unsigned int class0_hp[2];
+  unsigned int hp[2];
+} nmv_component_counts;
+
+typedef struct {
+  unsigned int joints[MV_JOINTS];
+  nmv_component_counts comps[2];
+} nmv_context_counts;
+
+void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
+                       int usehp);
+extern const nmv_context vp9_default_nmv_context;
+void vp9_counts_to_nmv_context(
+    nmv_context_counts *NMVcount,
+    nmv_context *prob,
+    int usehp,
+    unsigned int (*branch_ct_joint)[2],
+    unsigned int (*branch_ct_sign)[2],
+    unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
+    unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
+    unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
+    unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
+    unsigned int (*branch_ct_fp)[4 - 1][2],
+    unsigned int (*branch_ct_class0_hp)[2],
+    unsigned int (*branch_ct_hp)[2]);
+
+#endif  // __INC_ENTROPYMV_H
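
Note: the MV_CLASS_TYPE ranges above imply a simple magnitude-to-class mapping; a minimal sketch follows. It only illustrates the integer-pel ranges documented in the enum: the real vp9_get_mv_class() declared in this header operates on the codec's internal sub-pel units and also returns the within-class offset.

/* Sketch: class c >= 1 covers (2^c, 2^(c+1)] integer pel and class 0
 * covers (0, 2]. Assumes the MV_CLASS_TYPE enum from this header. */
static MV_CLASS_TYPE sketch_mv_class(int z) {
  MV_CLASS_TYPE c = MV_CLASS_0;
  while (c < MV_CLASS_7 && z > (2 << c))  /* 2 << c == 2^(c + 1) */
    c = (MV_CLASS_TYPE)(c + 1);
  return c;  /* e.g. z == 37 lands in (32, 64], i.e. MV_CLASS_5 */
}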
--- /dev/null
+++ b/vp9/common/extend.c
@@ -1,0 +1,169 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "extend.h"
+#include "vpx_mem/vpx_mem.h"
+
+static void copy_and_extend_plane(unsigned char *s, /* source */
+                                  int sp,           /* source pitch */
+                                  unsigned char *d, /* destination */
+                                  int dp,           /* destination pitch */
+                                  int h,            /* height */
+                                  int w,            /* width */
+                                  int et,           /* extend top border */
+                                  int el,           /* extend left border */
+                                  int eb,           /* extend bottom border */
+                                  int er) {         /* extend right border */
+  int i;
+  unsigned char *src_ptr1, *src_ptr2;
+  unsigned char *dest_ptr1, *dest_ptr2;
+  int linesize;
+
+  /* copy the left and right most columns out */
+  src_ptr1 = s;
+  src_ptr2 = s + w - 1;
+  dest_ptr1 = d - el;
+  dest_ptr2 = d + w;
+
+  for (i = 0; i < h; i++) {
+    vpx_memset(dest_ptr1, src_ptr1[0], el);
+    vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
+    vpx_memset(dest_ptr2, src_ptr2[0], er);
+    src_ptr1  += sp;
+    src_ptr2  += sp;
+    dest_ptr1 += dp;
+    dest_ptr2 += dp;
+  }
+
+  /* Now copy the top and bottom lines into each line of the respective
+   * borders
+   */
+  src_ptr1 = d - el;
+  src_ptr2 = d + dp * (h - 1) - el;
+  dest_ptr1 = d + dp * (-et) - el;
+  dest_ptr2 = d + dp * (h) - el;
+  linesize = el + er + w;
+
+  for (i = 0; i < et; i++) {
+    vpx_memcpy(dest_ptr1, src_ptr1, linesize);
+    dest_ptr1 += dp;
+  }
+
+  for (i = 0; i < eb; i++) {
+    vpx_memcpy(dest_ptr2, src_ptr2, linesize);
+    dest_ptr2 += dp;
+  }
+}
+
+void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst) {
+  int et = dst->border;
+  int el = dst->border;
+  int eb = dst->border + dst->y_height - src->y_height;
+  int er = dst->border + dst->y_width - src->y_width;
+
+  copy_and_extend_plane(src->y_buffer, src->y_stride,
+                        dst->y_buffer, dst->y_stride,
+                        src->y_height, src->y_width,
+                        et, el, eb, er);
+
+  et = dst->border >> 1;
+  el = dst->border >> 1;
+  eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
+  er = (dst->border >> 1) + dst->uv_width - src->uv_width;
+
+  copy_and_extend_plane(src->u_buffer, src->uv_stride,
+                        dst->u_buffer, dst->uv_stride,
+                        src->uv_height, src->uv_width,
+                        et, el, eb, er);
+
+  copy_and_extend_plane(src->v_buffer, src->uv_stride,
+                        dst->v_buffer, dst->uv_stride,
+                        src->uv_height, src->uv_width,
+                        et, el, eb, er);
+}
+
+void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst,
+                                         int srcy, int srcx,
+                                         int srch, int srcw) {
+  int et = dst->border;
+  int el = dst->border;
+  int eb = dst->border + dst->y_height - src->y_height;
+  int er = dst->border + dst->y_width - src->y_width;
+  int src_y_offset = srcy * src->y_stride + srcx;
+  int dst_y_offset = srcy * dst->y_stride + srcx;
+  int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+  int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+
+  // If a side does not touch the frame border, don't extend it.
+  if (srcy)
+    et = 0;
+  if (srcx)
+    el = 0;
+  if (srcy + srch != src->y_height)
+    eb = 0;
+  if (srcx + srcw != src->y_width)
+    er = 0;
+
+  copy_and_extend_plane(src->y_buffer + src_y_offset,
+                        src->y_stride,
+                        dst->y_buffer + dst_y_offset,
+                        dst->y_stride,
+                        srch, srcw,
+                        et, el, eb, er);
+
+  et = (et + 1) >> 1;
+  el = (el + 1) >> 1;
+  eb = (eb + 1) >> 1;
+  er = (er + 1) >> 1;
+  srch = (srch + 1) >> 1;
+  srcw = (srcw + 1) >> 1;
+
+  copy_and_extend_plane(src->u_buffer + src_uv_offset,
+                        src->uv_stride,
+                        dst->u_buffer + dst_uv_offset,
+                        dst->uv_stride,
+                        srch, srcw,
+                        et, el, eb, er);
+
+  copy_and_extend_plane(src->v_buffer + src_uv_offset,
+                        src->uv_stride,
+                        dst->v_buffer + dst_uv_offset,
+                        dst->uv_stride,
+                        srch, srcw,
+                        et, el, eb, er);
+}
+
+/* Note: the extension is only for the last row, for intra-prediction purposes. */
+void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
+                       unsigned char *UPtr, unsigned char *VPtr) {
+  int i;
+
+  YPtr += ybf->y_stride * 14;
+  UPtr += ybf->uv_stride * 6;
+  VPtr += ybf->uv_stride * 6;
+
+  for (i = 0; i < 4; i++) {
+    YPtr[i] = YPtr[-1];
+    UPtr[i] = UPtr[-1];
+    VPtr[i] = VPtr[-1];
+  }
+
+  YPtr += ybf->y_stride;
+  UPtr += ybf->uv_stride;
+  VPtr += ybf->uv_stride;
+
+  for (i = 0; i < 4; i++) {
+    YPtr[i] = YPtr[-1];
+    UPtr[i] = UPtr[-1];
+    VPtr[i] = VPtr[-1];
+  }
+}
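
Note: per line, the left/right extension in copy_and_extend_plane() reduces to two memsets around a copied row. A self-contained sketch of that inner step (the helper name is hypothetical):

#include <string.h>

/* Replicate the edge pixels of one w-pixel line into `el` columns on the
 * left and `er` columns on the right. `row` must have el writable bytes
 * before it and er after it, as the bordered YV12 planes do. */
static void extend_row(unsigned char *row, int w, int el, int er) {
  memset(row - el, row[0], el);      /* left border  */
  memset(row + w, row[w - 1], er);   /* right border */
}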
--- /dev/null
+++ b/vp9/common/extend.h
@@ -1,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_EXTEND_H
+#define __INC_EXTEND_H
+
+#include "vpx_scale/yv12config.h"
+
+void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
+                       unsigned char *UPtr, unsigned char *VPtr);
+
+void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst);
+
+void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst,
+                                         int srcy, int srcx,
+                                         int srch, int srcw);
+
+#endif  // __INC_EXTEND_H
--- /dev/null
+++ b/vp9/common/filter.c
@@ -1,0 +1,1159 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "filter.h"
+#include "vpx_ports/mem.h"
+#include "vpx_rtcd.h"
+
+DECLARE_ALIGNED(16, const short, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = {
+  { 128,   0 },
+  { 120,   8 },
+  { 112,  16 },
+  { 104,  24 },
+  {  96,  32 },
+  {  88,  40 },
+  {  80,  48 },
+  {  72,  56 },
+  {  64,  64 },
+  {  56,  72 },
+  {  48,  80 },
+  {  40,  88 },
+  {  32,  96 },
+  {  24, 104 },
+  {  16, 112 },
+  {   8, 120 }
+};
+
+#define FILTER_ALPHA       0
+#define FILTER_ALPHA_SHARP 1
+DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
+#if FILTER_ALPHA == 0
+  /* Lagrangian interpolation filter */
+  { 0,   0,   0, 128,   0,   0,   0,  0},
+  { 0,   1,  -5, 126,   8,  -3,   1,  0},
+  { -1,   3, -10, 122,  18,  -6,   2,  0},
+  { -1,   4, -13, 118,  27,  -9,   3, -1},
+  { -1,   4, -16, 112,  37, -11,   4, -1},
+  { -1,   5, -18, 105,  48, -14,   4, -1},
+  { -1,   5, -19,  97,  58, -16,   5, -1},
+  { -1,   6, -19,  88,  68, -18,   5, -1},
+  { -1,   6, -19,  78,  78, -19,   6, -1},
+  { -1,   5, -18,  68,  88, -19,   6, -1},
+  { -1,   5, -16,  58,  97, -19,   5, -1},
+  { -1,   4, -14,  48, 105, -18,   5, -1},
+  { -1,   4, -11,  37, 112, -16,   4, -1},
+  { -1,   3,  -9,  27, 118, -13,   4, -1},
+  { 0,   2,  -6,  18, 122, -10,   3, -1},
+  { 0,   1,  -3,   8, 126,  -5,   1,  0}
+#elif FILTER_ALPHA == 50
+  /* Generated using MATLAB:
+   * alpha = 0.5;
+   * b=intfilt(8,4,alpha);
+   * bi=round(128*b);
+   * ba=flipud(reshape([bi 0], 8, 8));
+   * disp(num2str(ba, '%d,'))
+   */
+  { 0,   0,   0, 128,   0,   0,   0,  0},
+  { 0,   1,  -5, 126,   8,  -3,   1,  0},
+  { 0,   2, -10, 122,  18,  -6,   2,  0},
+  { -1,   3, -13, 118,  27,  -9,   3,  0},
+  { -1,   4, -16, 112,  37, -11,   3,  0},
+  { -1,   5, -17, 104,  48, -14,   4, -1},
+  { -1,   5, -18,  96,  58, -16,   5, -1},
+  { -1,   5, -19,  88,  68, -17,   5, -1},
+  { -1,   5, -18,  78,  78, -18,   5, -1},
+  { -1,   5, -17,  68,  88, -19,   5, -1},
+  { -1,   5, -16,  58,  96, -18,   5, -1},
+  { -1,   4, -14,  48, 104, -17,   5, -1},
+  { 0,   3, -11,  37, 112, -16,   4, -1},
+  { 0,   3,  -9,  27, 118, -13,   3, -1},
+  { 0,   2,  -6,  18, 122, -10,   2,  0},
+  { 0,   1,  -3,   8, 126,  -5,   1,  0}
+#endif  /* FILTER_ALPHA */
+};
+
+DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = {
+#if FILTER_ALPHA_SHARP == 1
+  /* dct based filter */
+  {0,   0,   0, 128,   0,   0,   0, 0},
+  {-1,   3,  -7, 127,   8,  -3,   1, 0},
+  {-2,   5, -13, 125,  17,  -6,   3, -1},
+  {-3,   7, -17, 121,  27, -10,   5, -2},
+  {-4,   9, -20, 115,  37, -13,   6, -2},
+  {-4,  10, -23, 108,  48, -16,   8, -3},
+  {-4,  10, -24, 100,  59, -19,   9, -3},
+  {-4,  11, -24,  90,  70, -21,  10, -4},
+  {-4,  11, -23,  80,  80, -23,  11, -4},
+  {-4,  10, -21,  70,  90, -24,  11, -4},
+  {-3,   9, -19,  59, 100, -24,  10, -4},
+  {-3,   8, -16,  48, 108, -23,  10, -4},
+  {-2,   6, -13,  37, 115, -20,   9, -4},
+  {-2,   5, -10,  27, 121, -17,   7, -3},
+  {-1,   3,  -6,  17, 125, -13,   5, -2},
+  {0,   1,  -3,   8, 127,  -7,   3, -1}
+#elif FILTER_ALPHA_SHARP == 75
+  /* alpha = 0.75 */
+  {0,   0,   0, 128,   0,   0,   0, 0},
+  {-1,   2,  -6, 126,   9,  -3,   2, -1},
+  {-1,   4, -11, 123,  18,  -7,   3, -1},
+  {-2,   6, -16, 119,  28, -10,   5, -2},
+  {-2,   7, -19, 113,  38, -13,   6, -2},
+  {-3,   8, -21, 106,  49, -16,   7, -2},
+  {-3,   9, -22,  99,  59, -19,   8, -3},
+  {-3,   9, -23,  90,  70, -21,   9, -3},
+  {-3,   9, -22,  80,  80, -22,   9, -3},
+  {-3,   9, -21,  70,  90, -23,   9, -3},
+  {-3,   8, -19,  59,  99, -22,   9, -3},
+  {-2,   7, -16,  49, 106, -21,   8, -3},
+  {-2,   6, -13,  38, 113, -19,   7, -2},
+  {-2,   5, -10,  28, 119, -16,   6, -2},
+  {-1,   3,  -7,  18, 123, -11,   4, -1},
+  {-1,   2,  -3,   9, 126,  -6,   2, -1}
+#endif  /* FILTER_ALPHA_SHARP */
+};
+
+DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = {
+  {0,   0, 128,   0,   0, 0},
+  {1,  -5, 125,   8,  -2, 1},
+  {1,  -8, 122,  17,  -5, 1},
+  {2, -11, 116,  27,  -8, 2},
+  {3, -14, 110,  37, -10, 2},
+  {3, -15, 103,  47, -12, 2},
+  {3, -16,  95,  57, -14, 3},
+  {3, -16,  86,  67, -15, 3},
+  {3, -16,  77,  77, -16, 3},
+  {3, -15,  67,  86, -16, 3},
+  {3, -14,  57,  95, -16, 3},
+  {2, -12,  47, 103, -15, 3},
+  {2, -10,  37, 110, -14, 3},
+  {2,  -8,  27, 116, -11, 2},
+  {1,  -5,  17, 122,  -8, 1},
+  {1,  -2,   8, 125,  -5, 1}
+};
+
+static void filter_block2d_first_pass_6(unsigned char *src_ptr,
+                                        int *output_ptr,
+                                        unsigned int src_pixels_per_line,
+                                        unsigned int pixel_step,
+                                        unsigned int output_height,
+                                        unsigned int output_width,
+                                        const short *vp9_filter) {
+  unsigned int i, j;
+  int  Temp;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
+             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
+             ((int)src_ptr[0]                    * vp9_filter[2]) +
+             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
+             ((int)src_ptr[2 * pixel_step]       * vp9_filter[4]) +
+             ((int)src_ptr[3 * pixel_step]       * vp9_filter[5]) +
+             (VP9_FILTER_WEIGHT >> 1);      /* Rounding */
+
+      /* Normalize back to 0-255 */
+      Temp = Temp >> VP9_FILTER_SHIFT;
+
+      if (Temp < 0)
+        Temp = 0;
+      else if (Temp > 255)
+        Temp = 255;
+
+      output_ptr[j] = Temp;
+      src_ptr++;
+    }
+
+    /* Next row... */
+    src_ptr    += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
+static void filter_block2d_second_pass_6(int *src_ptr,
+                                         unsigned char *output_ptr,
+                                         int output_pitch,
+                                         unsigned int src_pixels_per_line,
+                                         unsigned int pixel_step,
+                                         unsigned int output_height,
+                                         unsigned int output_width,
+                                         const short *vp9_filter) {
+  unsigned int i, j;
+  int  Temp;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      /* Apply filter */
+      Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
+             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
+             ((int)src_ptr[0]                    * vp9_filter[2]) +
+             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
+             ((int)src_ptr[2 * pixel_step]         * vp9_filter[4]) +
+             ((int)src_ptr[3 * pixel_step]         * vp9_filter[5]) +
+             (VP9_FILTER_WEIGHT >> 1);   /* Rounding */
+
+      /* Normalize back to 0-255 */
+      Temp = Temp >> VP9_FILTER_SHIFT;
+
+      if (Temp < 0)
+        Temp = 0;
+      else if (Temp > 255)
+        Temp = 255;
+
+      output_ptr[j] = (unsigned char)Temp;
+      src_ptr++;
+    }
+
+    /* Start next row */
+    src_ptr    += src_pixels_per_line - output_width;
+    output_ptr += output_pitch;
+  }
+}
+
+/*
+ * The only functional difference between filter_block2d_second_pass_6()
+ * and this function is that filter_block2d_second_pass_6() applies a
+ * six-tap filter to the input and stores the result in the output. This
+ * function (filter_block2d_second_pass_avg_6()) applies the same six-tap
+ * filter, then averages the result with the content already present in
+ * the output ((filter_result + dest + 1) >> 1) and stores that in the
+ * output.
+ */
+static void filter_block2d_second_pass_avg_6(int *src_ptr,
+                                             unsigned char *output_ptr,
+                                             int output_pitch,
+                                             unsigned int src_pixels_per_line,
+                                             unsigned int pixel_step,
+                                             unsigned int output_height,
+                                             unsigned int output_width,
+                                             const short *vp9_filter) {
+  unsigned int i, j;
+  int  Temp;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      /* Apply filter */
+      Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
+             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
+             ((int)src_ptr[0]                    * vp9_filter[2]) +
+             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
+             ((int)src_ptr[2 * pixel_step]         * vp9_filter[4]) +
+             ((int)src_ptr[3 * pixel_step]         * vp9_filter[5]) +
+             (VP9_FILTER_WEIGHT >> 1);   /* Rounding */
+
+      /* Normalize back to 0-255 */
+      Temp = Temp >> VP9_FILTER_SHIFT;
+
+      if (Temp < 0)
+        Temp = 0;
+      else if (Temp > 255)
+        Temp = 255;
+
+      output_ptr[j] = (unsigned char)((output_ptr[j] + Temp + 1) >> 1);
+      src_ptr++;
+    }
+
+    /* Start next row */
+    src_ptr    += src_pixels_per_line - output_width;
+    output_ptr += output_pitch;
+  }
+}
+
+#define Interp_Extend 3
+static void filter_block2d_6(unsigned char  *src_ptr,
+                             unsigned char  *output_ptr,
+                             unsigned int src_pixels_per_line,
+                             int output_pitch,
+                             const short  *HFilter,
+                             const short  *VFilter) {
+  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
+                              3 + Interp_Extend * 2, 4, HFilter);
+
+  /* then filter vertically... */
+  filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
+}
+
+
+void vp9_sixtap_predict_c(unsigned char  *src_ptr,
+                          int   src_pixels_per_line,
+                          int  xoffset,
+                          int  yoffset,
+                          unsigned char *dst_ptr,
+                          int dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+
+  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
+  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
+
+  filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
+}
+
+/*
+ * The difference between filter_block2d_6() and filter_block2d_avg_6() is
+ * that filter_block2d_6() does a 6-tap filter and stores the result in the
+ * output buffer, whereas filter_block2d_avg_6() does the same 6-tap filter
+ * and then averages the result with the content already present in the
+ * output ((filter_result + dest + 1) >> 1) before storing it.
+ */
+static void filter_block2d_avg_6(unsigned char  *src_ptr,
+                                 unsigned char  *output_ptr,
+                                 unsigned int src_pixels_per_line,
+                                 int output_pitch,
+                                 const short  *HFilter,
+                                 const short  *VFilter) {
+  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line),
+                              FData, src_pixels_per_line, 1,
+                              3 + Interp_Extend * 2, 4, HFilter);
+
+  /* then filter vertically... */
+  filter_block2d_second_pass_avg_6(FData + 4 * (Interp_Extend - 1), output_ptr,
+                                   output_pitch, 4, 4, 4, 4, VFilter);
+}
+
+void vp9_sixtap_predict_avg_c(unsigned char  *src_ptr,
+                              int   src_pixels_per_line,
+                              int  xoffset,
+                              int  yoffset,
+                              unsigned char *dst_ptr,
+                              int dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+
+  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
+  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
+
+  filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line,
+                       dst_pitch, HFilter, VFilter);
+}
+
+void vp9_sixtap_predict8x8_c(unsigned char  *src_ptr,
+                             int  src_pixels_per_line,
+                             int  xoffset,
+                             int  yoffset,
+                             unsigned char *dst_ptr,
+                             int  dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+  // int FData[(7+Interp_Extend*2)*16];   /* Temp data buffer used in filtering */
+  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
+
+  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
+  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
+                              7 + Interp_Extend * 2, 8, HFilter);
+
+  /* then filter vertically... */
+  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+}
+
+void vp9_sixtap_predict_avg8x8_c(unsigned char  *src_ptr,
+                                 int  src_pixels_per_line,
+                                 int  xoffset,
+                                 int  yoffset,
+                                 unsigned char *dst_ptr,
+                                 int  dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+  // int FData[(7+Interp_Extend*2)*16];   /* Temp data buffer used in filtering */
+  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
+
+  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
+  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
+                              7 + Interp_Extend * 2, 8, HFilter);
+
+  /* then filter vertically... */
+  filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+}
+
+void vp9_sixtap_predict8x4_c(unsigned char  *src_ptr,
+                             int  src_pixels_per_line,
+                             int  xoffset,
+                             int  yoffset,
+                             unsigned char *dst_ptr,
+                             int  dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+  // int FData[(7+Interp_Extend*2)*16];   /* Temp data buffer used in filtering */
+  int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
+
+  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
+  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
+                              3 + Interp_Extend * 2, 8, HFilter);
+
+  /* then filter vertically... */
+  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
+}
+
+void vp9_sixtap_predict16x16_c(unsigned char  *src_ptr,
+                               int  src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int  dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+  // int FData[(15+Interp_Extend*2)*24];   /* Temp data buffer used in filtering */
+  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */
+
+  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
+  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
+                              15 + Interp_Extend * 2, 16, HFilter);
+
+  /* then filter vertically... */
+  filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
+}
+
+void vp9_sixtap_predict_avg16x16_c(unsigned char  *src_ptr,
+                                   int  src_pixels_per_line,
+                                   int  xoffset,
+                                   int  yoffset,
+                                   unsigned char *dst_ptr,
+                                   int  dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+  // int FData[(15+Interp_Extend*2)*24];   /* Temp data buffer used in filtering */
+  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */
+
+  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
+  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
+                              src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);
+
+  /* then filter vertically... */
+  filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch,
+                                   16, 16, 16, 16, VFilter);
+}
+
+typedef enum {
+  VPX_FILTER_4x4 = 0,
+  VPX_FILTER_8x8 = 1,
+  VPX_FILTER_8x4 = 2,
+  VPX_FILTER_16x16 = 3,
+} filter_size_t;
+
+static const unsigned int filter_size_to_wh[][2] = {
+  {4, 4},
+  {8, 8},
+  {8, 4},
+  {16, 16},
+};
+
+static const unsigned int filter_max_height = 16;
+static const unsigned int filter_max_width = 16;
+
+static void filter_block2d_8_c(const unsigned char *src_ptr,
+                               const unsigned int   src_stride,
+                               const short *HFilter,
+                               const short *VFilter,
+                               const filter_size_t filter_size,
+                               unsigned char *dst_ptr,
+                               unsigned int   dst_stride) {
+  const unsigned int output_width = filter_size_to_wh[filter_size][0];
+  const unsigned int output_height = filter_size_to_wh[filter_size][1];
+
+  // Between passes, we use an intermediate buffer whose height is extended to
+  // have enough horizontally filtered values as input for the vertical pass.
+  // This buffer is allocated to be big enough for the largest block type we
+  // support.
+  const int kInterp_Extend = 4;
+  const unsigned int intermediate_height =
+    (kInterp_Extend - 1) +     output_height + kInterp_Extend;
+  const unsigned int max_intermediate_height =
+    (kInterp_Extend - 1) + filter_max_height + kInterp_Extend;
+#ifdef _MSC_VER
+  // MSVC does not support C99 variable-length arrays; use a fixed bound
+  unsigned char intermediate_buffer[23 * 16];
+#else
+  unsigned char intermediate_buffer[max_intermediate_height * filter_max_width];
+#endif
+  const int intermediate_next_stride = 1 - intermediate_height * output_width;
+
+  // Horizontal pass (src -> transposed intermediate).
+  {
+    unsigned char *output_ptr = intermediate_buffer;
+    const int src_next_row_stride = src_stride - output_width;
+    unsigned int i, j;
+    src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+    for (i = 0; i < intermediate_height; i++) {
+      for (j = 0; j < output_width; j++) {
+        // Apply filter...
+        int temp = ((int)src_ptr[0] * HFilter[0]) +
+                   ((int)src_ptr[1] * HFilter[1]) +
+                   ((int)src_ptr[2] * HFilter[2]) +
+                   ((int)src_ptr[3] * HFilter[3]) +
+                   ((int)src_ptr[4] * HFilter[4]) +
+                   ((int)src_ptr[5] * HFilter[5]) +
+                   ((int)src_ptr[6] * HFilter[6]) +
+                   ((int)src_ptr[7] * HFilter[7]) +
+                   (VP9_FILTER_WEIGHT >> 1); // Rounding
+
+        // Normalize back to 0-255...
+        temp >>= VP9_FILTER_SHIFT;
+        if (temp < 0) {
+          temp = 0;
+        } else if (temp > 255) {
+          temp = 255;
+        }
+        src_ptr++;
+        *output_ptr = temp;
+        output_ptr += intermediate_height;
+      }
+      src_ptr += src_next_row_stride;
+      output_ptr += intermediate_next_stride;
+    }
+  }
+
+  // Vertical pass (transposed intermediate -> dst).
+  {
+    unsigned char *src_ptr = intermediate_buffer;
+    const int dst_next_row_stride = dst_stride - output_width;
+    unsigned int i, j;
+    for (i = 0; i < output_height; i++) {
+      for (j = 0; j < output_width; j++) {
+        // Apply filter...
+        int temp = ((int)src_ptr[0] * VFilter[0]) +
+                   ((int)src_ptr[1] * VFilter[1]) +
+                   ((int)src_ptr[2] * VFilter[2]) +
+                   ((int)src_ptr[3] * VFilter[3]) +
+                   ((int)src_ptr[4] * VFilter[4]) +
+                   ((int)src_ptr[5] * VFilter[5]) +
+                   ((int)src_ptr[6] * VFilter[6]) +
+                   ((int)src_ptr[7] * VFilter[7]) +
+                   (VP9_FILTER_WEIGHT >> 1); // Rounding
+
+        // Normalize back to 0-255...
+        temp >>= VP9_FILTER_SHIFT;
+        if (temp < 0) {
+          temp = 0;
+        } else if (temp > 255) {
+          temp = 255;
+        }
+
+        src_ptr += intermediate_height;
+        *dst_ptr++ = (unsigned char)temp;
+      }
+      src_ptr += intermediate_next_stride;
+      dst_ptr += dst_next_row_stride;
+    }
+  }
+}
+
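+/* Sizing note (illustrative): with kInterp_Extend = 4, an 8x8 block needs
+ * (4 - 1) + 8 + 4 = 15 intermediate rows and the largest 16x16 block needs
+ * (4 - 1) + 16 + 4 = 23 rows, which matches the fixed 23 * 16 fallback
+ * buffer declared above for MSVC. */
+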
+void vp9_filter_block2d_4x4_8_c(const unsigned char *src_ptr,
+                                const unsigned int src_stride,
+                                const short *HFilter_aligned16,
+                                const short *VFilter_aligned16,
+                                unsigned char *dst_ptr,
+                                unsigned int dst_stride) {
+  filter_block2d_8_c(src_ptr, src_stride,
+                     HFilter_aligned16, VFilter_aligned16,
+                     VPX_FILTER_4x4, dst_ptr, dst_stride);
+}
+
+void vp9_filter_block2d_8x4_8_c(const unsigned char *src_ptr,
+                                const unsigned int src_stride,
+                                const short *HFilter_aligned16,
+                                const short *VFilter_aligned16,
+                                unsigned char *dst_ptr,
+                                unsigned int dst_stride) {
+  filter_block2d_8_c(src_ptr, src_stride,
+                     HFilter_aligned16, VFilter_aligned16,
+                     VPX_FILTER_8x4, dst_ptr, dst_stride);
+}
+
+void vp9_filter_block2d_8x8_8_c(const unsigned char *src_ptr,
+                                const unsigned int src_stride,
+                                const short *HFilter_aligned16,
+                                const short *VFilter_aligned16,
+                                unsigned char *dst_ptr,
+                                unsigned int dst_stride) {
+  filter_block2d_8_c(src_ptr, src_stride,
+                     HFilter_aligned16, VFilter_aligned16,
+                     VPX_FILTER_8x8, dst_ptr, dst_stride);
+}
+
+void vp9_filter_block2d_16x16_8_c(const unsigned char *src_ptr,
+                                  const unsigned int src_stride,
+                                  const short *HFilter_aligned16,
+                                  const short *VFilter_aligned16,
+                                  unsigned char *dst_ptr,
+                                  unsigned int dst_stride) {
+  filter_block2d_8_c(src_ptr, src_stride,
+                     HFilter_aligned16, VFilter_aligned16,
+                     VPX_FILTER_16x16, dst_ptr, dst_stride);
+}
+
+static void block2d_average_c(unsigned char *src,
+                              unsigned int   src_stride,
+                              unsigned char *output_ptr,
+                              unsigned int output_stride,
+                              const filter_size_t filter_size) {
+  const unsigned int output_width = filter_size_to_wh[filter_size][0];
+  const unsigned int output_height = filter_size_to_wh[filter_size][1];
+
+  unsigned int i, j;
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;
+    }
+    output_ptr += output_stride;
+  }
+}
+
+#define block2d_average block2d_average_c
+
+void vp9_eighttap_predict_c(unsigned char  *src_ptr,
+                            int   src_pixels_per_line,
+                            int  xoffset,
+                            int  yoffset,
+                            unsigned char *dst_ptr,
+                            int dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+
+  HFilter = vp9_sub_pel_filters_8[xoffset];
+  VFilter = vp9_sub_pel_filters_8[yoffset];
+
+  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict_avg4x4_c(unsigned char  *src_ptr,
+                                   int   src_pixels_per_line,
+                                   int  xoffset,
+                                   int  yoffset,
+                                   unsigned char *dst_ptr,
+                                   int dst_pitch) {
+  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
+  unsigned char tmp[4 * 4];
+
+  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           tmp, 4);
+  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
+}
+
+void vp9_eighttap_predict_sharp_c(unsigned char  *src_ptr,
+                                  int   src_pixels_per_line,
+                                  int  xoffset,
+                                  int  yoffset,
+                                  unsigned char *dst_ptr,
+                                  int dst_pitch) {
+  const short  *HFilter;
+  const short  *VFilter;
+
+  HFilter = vp9_sub_pel_filters_8s[xoffset];
+  VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict_avg4x4_sharp_c(unsigned char  *src_ptr,
+                                         int   src_pixels_per_line,
+                                         int  xoffset,
+                                         int  yoffset,
+                                         unsigned char *dst_ptr,
+                                         int dst_pitch) {
+  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
+  unsigned char tmp[4 * 4];
+
+  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           tmp, 4);
+  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
+}
+
+void vp9_eighttap_predict8x8_c(unsigned char  *src_ptr,
+                               int  src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int  dst_pitch) {
+  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict8x8_sharp_c(unsigned char  *src_ptr,
+                                     int  src_pixels_per_line,
+                                     int  xoffset,
+                                     int  yoffset,
+                                     unsigned char *dst_ptr,
+                                     int  dst_pitch) {
+  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict_avg8x8_c(unsigned char  *src_ptr,
+                                   int  src_pixels_per_line,
+                                   int  xoffset,
+                                   int  yoffset,
+                                   unsigned char *dst_ptr,
+                                   int  dst_pitch) {
+  unsigned char tmp[8 * 8];
+  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           tmp, 8);
+  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
+}
+
+void vp9_eighttap_predict_avg8x8_sharp_c(unsigned char  *src_ptr,
+                                         int  src_pixels_per_line,
+                                         int  xoffset,
+                                         int  yoffset,
+                                         unsigned char *dst_ptr,
+                                         int  dst_pitch) {
+  unsigned char tmp[8 * 8];
+  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           tmp, 8);
+  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
+}
+
+void vp9_eighttap_predict8x4_c(unsigned char  *src_ptr,
+                               int  src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int  dst_pitch) {
+  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict8x4_sharp_c(unsigned char  *src_ptr,
+                                     int  src_pixels_per_line,
+                                     int  xoffset,
+                                     int  yoffset,
+                                     unsigned char *dst_ptr,
+                                     int  dst_pitch) {
+  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line,
+                           HFilter, VFilter,
+                           dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict16x16_c(unsigned char  *src_ptr,
+                                 int  src_pixels_per_line,
+                                 int  xoffset,
+                                 int  yoffset,
+                                 unsigned char *dst_ptr,
+                                 int  dst_pitch) {
+  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
+                             HFilter, VFilter,
+                             dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict16x16_sharp_c(unsigned char  *src_ptr,
+                                       int  src_pixels_per_line,
+                                       int  xoffset,
+                                       int  yoffset,
+                                       unsigned char *dst_ptr,
+                                       int  dst_pitch) {
+  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
+                             HFilter, VFilter,
+                             dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict_avg16x16_c(unsigned char  *src_ptr,
+                                     int  src_pixels_per_line,
+                                     int  xoffset,
+                                     int  yoffset,
+                                     unsigned char *dst_ptr,
+                                     int  dst_pitch) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16);
+  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
+                             HFilter, VFilter,
+                             tmp, 16);
+  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
+}
+
+void vp9_eighttap_predict_avg16x16_sharp_c(unsigned char  *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           unsigned char *dst_ptr,
+                                           int  dst_pitch) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16);
+  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
+
+  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
+                             HFilter, VFilter,
+                             tmp, 16);
+  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil_first_pass
+ *
+ *  INPUTS        : UINT8  *src_ptr    : Pointer to source block.
+ *                  UINT32  src_stride : Stride of source block.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT16  *vp9_filter : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT16 *dst_ptr    : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the horizontal direction to produce the filtered output
+ *                  block. Used to implement the first pass of a 2-D separable
+ *                  filter.
+ *
+ *  SPECIAL NOTES : Produces UINT16 output to retain precision for the next
+ *                  pass. The two filter taps should sum to VP9_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_first_pass(unsigned char  *src_ptr,
+                                          unsigned short *dst_ptr,
+                                          unsigned int    src_stride,
+                                          unsigned int    height,
+                                          unsigned int    width,
+                                          const short    *vp9_filter) {
+  unsigned int i, j;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      /* Apply bilinear filter */
+      dst_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) +
+                    ((int)src_ptr[1] * vp9_filter[1]) +
+                    (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
+      src_ptr++;
+    }
+
+    /* Next row... */
+    src_ptr += src_stride - width;
+    dst_ptr += width;
+  }
+}
+
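+/* Worked example (illustrative): with vp9_bilinear_filters[2] = {112, 16}
+ * and neighboring source pixels 100 and 104, one first-pass output is
+ *   (100 * 112 + 104 * 16 + VP9_FILTER_WEIGHT / 2) >> VP9_FILTER_SHIFT
+ *     = (11200 + 1664 + 64) >> 7 = 12928 >> 7 = 101,
+ * leaning toward the left pixel, as expected for a small sub-pel offset. */
+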
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil_second_pass
+ *
+ *  INPUTS        : UINT16 *src_ptr    : Pointer to source block.
+ *                  UINT32  dst_pitch  : Destination block pitch.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT16  *vp9_filter : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT8  *dst_ptr    : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the vertical direction to produce the filtered output
+ *                  block. Used to implement the second pass of a 2-D separable
+ *                  filter.
+ *
+ *  SPECIAL NOTES : Requires 16-bit input as produced by
+ *                  filter_block2d_bil_first_pass. The two filter taps should
+ *                  sum to VP9_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_second_pass(unsigned short *src_ptr,
+                                           unsigned char  *dst_ptr,
+                                           int             dst_pitch,
+                                           unsigned int    height,
+                                           unsigned int    width,
+                                           const short    *vp9_filter) {
+  unsigned int  i, j;
+  int  Temp;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      /* Apply filter */
+      Temp = ((int)src_ptr[0]     * vp9_filter[0]) +
+             ((int)src_ptr[width] * vp9_filter[1]) +
+             (VP9_FILTER_WEIGHT / 2);
+      dst_ptr[j] = (unsigned char)(Temp >> VP9_FILTER_SHIFT);
+      src_ptr++;
+    }
+
+    /* Next row... */
+    dst_ptr += dst_pitch;
+  }
+}
+
+/*
+ * As with filter_block2d_second_pass_avg_6(), the functional difference
+ * between filter_block2d_bil_second_pass() and
+ * filter_block2d_bil_second_pass_avg() is that the former applies a
+ * bilinear filter to the input and stores the result in the output,
+ * while the latter applies the same bilinear filter, averages the result
+ * with the values already present in the output
+ * ((filter_result + dest + 1) >> 1), and stores that average back into
+ * the output.
+ */
+static void filter_block2d_bil_second_pass_avg(unsigned short *src_ptr,
+                                               unsigned char  *dst_ptr,
+                                               int             dst_pitch,
+                                               unsigned int    height,
+                                               unsigned int    width,
+                                               const short    *vp9_filter) {
+  unsigned int  i, j;
+  int  Temp;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      /* Apply filter */
+      Temp = ((int)src_ptr[0]     * vp9_filter[0]) +
+             ((int)src_ptr[width] * vp9_filter[1]) +
+             (VP9_FILTER_WEIGHT / 2);
+      dst_ptr[j] = (unsigned char)(((Temp >> VP9_FILTER_SHIFT) + dst_ptr[j] + 1) >> 1);
+      src_ptr++;
+    }
+
+    /* Next row... */
+    dst_ptr += dst_pitch;
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil
+ *
+ *  INPUTS        : UINT8  *src_ptr   : Pointer to source block.
+ *                  UINT32  src_pitch : Stride of source block.
+ *                  UINT32  dst_pitch : Stride of destination block.
+ *                  INT16  *HFilter   : Array of 2 horizontal filter taps.
+ *                  INT16  *VFilter   : Array of 2 vertical filter taps.
+ *                  INT32   Width     : Block width.
+ *                  INT32   Height    : Block height.
+ *
+ *  OUTPUTS       : UINT8  *dst_ptr   : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 2-D filters an input block by applying a 2-tap
+ *                  bi-linear filter horizontally followed by a 2-tap
+ *                  bi-linear filter vertically on the result.
+ *
+ *  SPECIAL NOTES : The largest block size that can be handled here is 16x16.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil(unsigned char *src_ptr,
+                               unsigned char *dst_ptr,
+                               unsigned int   src_pitch,
+                               unsigned int   dst_pitch,
+                               const short   *HFilter,
+                               const short   *VFilter,
+                               int            Width,
+                               int            Height) {
+  unsigned short FData[17 * 16];  /* Temp data buffer used in filtering */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+  /* then 1-D vertically... */
+  filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
+
+static void filter_block2d_bil_avg(unsigned char *src_ptr,
+                                   unsigned char *dst_ptr,
+                                   unsigned int   src_pitch,
+                                   unsigned int   dst_pitch,
+                                   const short   *HFilter,
+                                   const short   *VFilter,
+                                   int            Width,
+                                   int            Height) {
+  unsigned short FData[17 * 16];  /* Temp data buffer used in filtering */
+
+  /* First filter 1-D horizontally... */
+  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+  /* then 1-D vertically... */
+  filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
+
+void vp9_bilinear_predict4x4_c(unsigned char  *src_ptr,
+                               int   src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int dst_pitch) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+}
+
+void vp9_bilinear_predict_avg4x4_c(unsigned char  *src_ptr,
+                                   int   src_pixels_per_line,
+                                   int  xoffset,
+                                   int  yoffset,
+                                   unsigned char *dst_ptr,
+                                   int dst_pitch) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
+                         dst_pitch, HFilter, VFilter, 4, 4);
+}
+
+void vp9_bilinear_predict8x8_c(unsigned char  *src_ptr,
+                               int  src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int  dst_pitch) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+}
+
+void vp9_bilinear_predict_avg8x8_c(unsigned char  *src_ptr,
+                                   int  src_pixels_per_line,
+                                   int  xoffset,
+                                   int  yoffset,
+                                   unsigned char *dst_ptr,
+                                   int  dst_pitch) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
+                         dst_pitch, HFilter, VFilter, 8, 8);
+}
+
+void vp9_bilinear_predict8x4_c(unsigned char  *src_ptr,
+                               int  src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int  dst_pitch) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+}
+
+void vp9_bilinear_predict16x16_c(unsigned char  *src_ptr,
+                                 int  src_pixels_per_line,
+                                 int  xoffset,
+                                 int  yoffset,
+                                 unsigned char *dst_ptr,
+                                 int  dst_pitch) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+}
+
+void vp9_bilinear_predict_avg16x16_c(unsigned char  *src_ptr,
+                                     int  src_pixels_per_line,
+                                     int  xoffset,
+                                     int  yoffset,
+                                     unsigned char *dst_ptr,
+                                     int  dst_pitch) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
+                         dst_pitch, HFilter, VFilter, 16, 16);
+}
--- /dev/null
+++ b/vp9/common/filter.h
@@ -1,0 +1,28 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef FILTER_H
+#define FILTER_H
+
+#include "vpx_config.h"
+#include "vpx_scale/yv12config.h"
+
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP9_FILTER_WEIGHT 128
+#define VP9_FILTER_SHIFT  7
+
+#define SUBPEL_SHIFTS 16
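+
+/* Each kernel below is indexed by one of the SUBPEL_SHIFTS (16) subpel
+ * positions, and its taps sum to VP9_FILTER_WEIGHT (1 << VP9_FILTER_SHIFT),
+ * so filtered outputs are renormalized with a >> VP9_FILTER_SHIFT. */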
+
+extern const short vp9_bilinear_filters[SUBPEL_SHIFTS][2];
+extern const short vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6];
+extern const short vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
+extern const short vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
+
+#endif // FILTER_H
--- /dev/null
+++ b/vp9/common/findnearmv.c
@@ -1,0 +1,327 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "findnearmv.h"
+#include "vp9/common/sadmxn.h"
+#include <limits.h>
+
+const unsigned char vp9_mbsplit_offset[4][16] = {
+  { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+  { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+  { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
+};
+
+static void lower_mv_precision(int_mv *mv, int usehp) {
+  if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) {
+    if (mv->as_mv.row & 1)
+      mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1);
+    if (mv->as_mv.col & 1)
+      mv->as_mv.col += (mv->as_mv.col > 0 ? -1 : 1);
+  }
+}
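+
+/* For example, with high-precision MVs disabled a row of 5 (5/8 pel) is
+ * truncated towards zero to 4 (1/2 pel), and a row of -5 becomes -4. */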
+
+/* Predict motion vectors using those from already-decoded nearby blocks.
+   Note that we only consider one 4x4 subblock from each candidate 16x16
+   macroblock.   */
+
+void vp9_find_near_mvs(MACROBLOCKD *xd,
+                       const MODE_INFO *here,
+                       const MODE_INFO *lf_here,
+                       int_mv *nearest,
+                       int_mv *nearby,
+                       int_mv *best_mv,
+                       int cnt[4],
+                       int refframe,
+                       int *ref_frame_sign_bias) {
+  const MODE_INFO *above = here - xd->mode_info_stride;
+  const MODE_INFO *left = here - 1;
+  const MODE_INFO *aboveleft = above - 1;
+  const MODE_INFO *third = NULL;
+  int_mv            near_mvs[4];
+  int_mv           *mv = near_mvs;
+  int             *cntx = cnt;
+  enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
+
+  /* Zero accumulators */
+  mv[0].as_int = mv[1].as_int = mv[2].as_int = 0;
+  cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
+
+  /* Process above */
+  if (above->mbmi.ref_frame != INTRA_FRAME) {
+    if (above->mbmi.mv[0].as_int) {
+      ++mv;
+      mv->as_int = above->mbmi.mv[0].as_int;
+      mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame],
+              refframe, mv, ref_frame_sign_bias);
+      ++cntx;
+    }
+    *cntx += 2;
+  }
+
+  /* Process left */
+  if (left->mbmi.ref_frame != INTRA_FRAME) {
+    if (left->mbmi.mv[0].as_int) {
+      int_mv this_mv;
+      this_mv.as_int = left->mbmi.mv[0].as_int;
+      mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame],
+              refframe, &this_mv, ref_frame_sign_bias);
+
+      if (this_mv.as_int != mv->as_int) {
+        ++mv;
+        mv->as_int = this_mv.as_int;
+        ++cntx;
+      }
+      *cntx += 2;
+    } else
+      cnt[CNT_INTRA] += 2;
+  }
+  /* Process above left or the one from last frame */
+  if (aboveleft->mbmi.ref_frame != INTRA_FRAME ||
+      (lf_here->mbmi.ref_frame == LAST_FRAME && refframe == LAST_FRAME)) {
+    if (aboveleft->mbmi.mv[0].as_int) {
+      third = aboveleft;
+    } else if (lf_here->mbmi.mv[0].as_int) {
+      third = lf_here;
+    }
+    if (third) {
+      int_mv this_mv;
+      this_mv.as_int = third->mbmi.mv[0].as_int;
+      mv_bias(ref_frame_sign_bias[third->mbmi.ref_frame],
+              refframe, &this_mv, ref_frame_sign_bias);
+
+      if (this_mv.as_int != mv->as_int) {
+        ++mv;
+        mv->as_int = this_mv.as_int;
+        ++cntx;
+      }
+      *cntx += 1;
+    } else
+      cnt[CNT_INTRA] += 1;
+  }
+
+  /* If we have three distinct MV's ... */
+  if (cnt[CNT_SPLITMV]) {
+    /* See if the third MV can be merged with NEAREST */
+    if (mv->as_int == near_mvs[CNT_NEAREST].as_int)
+      cnt[CNT_NEAREST] += 1;
+  }
+
+  cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
+                      + (left->mbmi.mode == SPLITMV)) * 2
+                     + (
+                       lf_here->mbmi.mode == SPLITMV ||
+                       aboveleft->mbmi.mode == SPLITMV);
+
+  /* Swap near and nearest if necessary */
+  if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
+    int tmp;
+    tmp = cnt[CNT_NEAREST];
+    cnt[CNT_NEAREST] = cnt[CNT_NEAR];
+    cnt[CNT_NEAR] = tmp;
+    tmp = near_mvs[CNT_NEAREST].as_int;
+    near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
+    near_mvs[CNT_NEAR].as_int = tmp;
+  }
+
+  /* Use near_mvs[0] to store the "best" MV */
+  if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA])
+    near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
+
+  /* Set up return values */
+  best_mv->as_int = near_mvs[0].as_int;
+  nearest->as_int = near_mvs[CNT_NEAREST].as_int;
+  nearby->as_int = near_mvs[CNT_NEAR].as_int;
+
+  /* Make sure the 1/8-pel bits of the MVs are zero if high precision is not
+   * in use, by truncating the last bit towards 0.
+   */
+  lower_mv_precision(best_mv, xd->allow_high_precision_mv);
+  lower_mv_precision(nearest, xd->allow_high_precision_mv);
+  lower_mv_precision(nearby, xd->allow_high_precision_mv);
+
+  // TODO: move clamp outside findnearmv
+  clamp_mv2(nearest, xd);
+  clamp_mv2(nearby, xd);
+  clamp_mv2(best_mv, xd);
+}
+
+vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
+                           vp9_prob p[VP9_MVREFS - 1],
+                           const int near_mv_ref_ct[4]) {
+  p[0] = pc->fc.vp8_mode_contexts[near_mv_ref_ct[0]][0];
+  p[1] = pc->fc.vp8_mode_contexts[near_mv_ref_ct[1]][1];
+  p[2] = pc->fc.vp8_mode_contexts[near_mv_ref_ct[2]][2];
+  p[3] = pc->fc.vp8_mode_contexts[near_mv_ref_ct[3]][3];
+  return p;
+}
+
+#if CONFIG_NEWBESTREFMV
+#define SP(x) (((x) & 7) << 1)
+unsigned int vp9_sad3x16_c(
+  const unsigned char *src_ptr,
+  int  src_stride,
+  const unsigned char *ref_ptr,
+  int  ref_stride,
+  int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 3, 16);
+}
+unsigned int vp9_sad16x3_c(
+  const unsigned char *src_ptr,
+  int  src_stride,
+  const unsigned char *ref_ptr,
+  int  ref_stride,
+  int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 3);
+}
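+
+/* For reference, sad_mx_n_c() (vp9/common/sadmxn.h) is assumed to be the
+ * plain SAD loop, roughly:
+ *   for (r = 0; r < n; r++, src_ptr += src_stride, ref_ptr += ref_stride)
+ *     for (c = 0; c < m; c++)
+ *       sad += abs(src_ptr[c] - ref_ptr[c]);
+ * The max_sad argument is accepted only for signature compatibility; these
+ * wrappers do not use it for early termination. */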
+
+/* Check a list of candidate motion vectors by computing a SAD score against a
+ * few rows of pixels above and a few columns of pixels to the left of the
+ * block, and select the best-scoring vector as the reference motion vector.
+ */
+void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
+                           unsigned char *ref_y_buffer,
+                           int ref_y_stride,
+                           int_mv *mvlist,
+                           int_mv *best_mv,
+                           int_mv *nearest,
+                           int_mv *near) {
+  int i, j;
+  unsigned char *above_src;
+  unsigned char *left_src;
+  unsigned char *above_ref;
+  unsigned char *left_ref;
+  int score;
+  int sse;
+  int ref_scores[MAX_MV_REFS] = {0};
+  int_mv sorted_mvs[MAX_MV_REFS];
+  int zero_seen = FALSE;
+
+  // Default all to 0,0 if nothing else available
+  best_mv->as_int = nearest->as_int = near->as_int = 0;
+  vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs));
+
+#if CONFIG_SUBPELREFMV
+  above_src = xd->dst.y_buffer - xd->dst.y_stride * 2;
+  left_src  = xd->dst.y_buffer - 2;
+  above_ref = ref_y_buffer - ref_y_stride * 2;
+  left_ref  = ref_y_buffer - 2;
+#else
+  above_src = xd->dst.y_buffer - xd->dst.y_stride * 3;
+  left_src  = xd->dst.y_buffer - 3;
+  above_ref = ref_y_buffer - ref_y_stride * 3;
+  left_ref  = ref_y_buffer - 3;
+#endif
+
+  // Limit the search to the predicted best 4 candidates rather than all
+  // MAX_MV_REFS entries in the list.
+  for (i = 0; i < 4; ++i) {
+    int_mv this_mv;
+    int offset = 0;
+    int row_offset, col_offset;
+
+    this_mv.as_int = mvlist[i].as_int;
+
+    // If we see a 0,0 vector for a second time we have reached the end of
+    // the list of valid candidate vectors.
+    if (!this_mv.as_int && zero_seen)
+      break;
+
+    zero_seen = zero_seen || !this_mv.as_int;
+
+    clamp_mv(&this_mv,
+             xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24,
+             xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+             xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
+             xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+#if CONFIG_SUBPELREFMV
+    row_offset = this_mv.as_mv.row >> 3;
+    col_offset = this_mv.as_mv.col >> 3;
+    offset = ref_y_stride * row_offset + col_offset;
+    score = 0;
+    if (xd->up_available) {
+      vp9_sub_pixel_variance16x2_c(above_ref + offset, ref_y_stride,
+                                   SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                                   above_src, xd->dst.y_stride, &sse);
+      score += sse;
+    }
+    if (xd->left_available) {
+      vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride,
+                                   SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                                   left_src, xd->dst.y_stride, &sse);
+      score += sse;
+    }
+#else
+    row_offset = (this_mv.as_mv.row > 0) ? ((this_mv.as_mv.row + 3) >> 3)
+                                         : ((this_mv.as_mv.row + 4) >> 3);
+    col_offset = (this_mv.as_mv.col > 0) ? ((this_mv.as_mv.col + 3) >> 3)
+                                         : ((this_mv.as_mv.col + 4) >> 3);
+    offset = ref_y_stride * row_offset + col_offset;
+    score = 0;
+    if (xd->up_available) {
+      score += vp9_sad16x3(above_src, xd->dst.y_stride,
+                           above_ref + offset, ref_y_stride, INT_MAX);
+    }
+    if (xd->left_available) {
+      score += vp9_sad3x16(left_src, xd->dst.y_stride,
+                           left_ref + offset, ref_y_stride, INT_MAX);
+    }
+#endif
+    // Add the entry to our list and then resort the list on score.
+    ref_scores[i] = score;
+    sorted_mvs[i].as_int = this_mv.as_int;
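+    // Insertion sort: bubble the new entry towards the front until the list
+    // is ordered by ascending score (a lower score is a better candidate).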
+    j = i;
+    while (j > 0) {
+      if (ref_scores[j] < ref_scores[j-1]) {
+        ref_scores[j] = ref_scores[j-1];
+        sorted_mvs[j].as_int = sorted_mvs[j-1].as_int;
+        ref_scores[j-1] = score;
+        sorted_mvs[j-1].as_int = this_mv.as_int;
+        j--;
+      } else
+        break;
+    }
+  }
+
+  // Make sure all the candidates are properly clamped etc
+  for (i = 0; i < 4; ++i) {
+    lower_mv_precision(&sorted_mvs[i], xd->allow_high_precision_mv);
+    clamp_mv2(&sorted_mvs[i], xd);
+  }
+
+  // Set the best mv to the first entry in the sorted list
+  best_mv->as_int = sorted_mvs[0].as_int;
+
+  // Provided there are non-zero vectors available, there will not be more
+  // than one 0,0 entry in the sorted list.
+  // The best ref mv is always set to the first entry (which gave the best
+  // result). The nearest is set to the first non-zero vector, if available,
+  // and near to the second non-zero vector, if available.
+  // We do not use 0,0 as nearest or near, as 0,0 has its own mode.
+  if (sorted_mvs[0].as_int) {
+    nearest->as_int = sorted_mvs[0].as_int;
+    if (sorted_mvs[1].as_int)
+      near->as_int = sorted_mvs[1].as_int;
+    else
+      near->as_int = sorted_mvs[2].as_int;
+  } else {
+    nearest->as_int = sorted_mvs[1].as_int;
+    near->as_int = sorted_mvs[2].as_int;
+  }
+
+  // Copy back the re-ordered mv list
+  vpx_memcpy(mvlist, sorted_mvs, sizeof(sorted_mvs));
+}
+
+#endif  // CONFIG_NEWBESTREFMV
--- /dev/null
+++ b/vp9/common/findnearmv.h
@@ -1,0 +1,188 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_FINDNEARMV_H
+#define __INC_FINDNEARMV_H
+
+#include "mv.h"
+#include "blockd.h"
+#include "modecont.h"
+#include "treecoder.h"
+#include "onyxc_int.h"
+
+#if CONFIG_NEWBESTREFMV
+/* Check a list of candidate motion vectors by computing a SAD score against a
+ * few rows of pixels above and a few columns of pixels to the left of the
+ * block, and select the best-scoring vector as the reference motion vector.
+ */
+void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
+                           unsigned char *ref_y_buffer,
+                           int ref_y_stride,
+                           int_mv *mvlist,
+                           int_mv *best_mv,
+                           int_mv *nearest,
+                           int_mv *near);
+#endif
+
+static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp,
+                    const int *ref_frame_sign_bias) {
+  MV xmv;
+  xmv = mvp->as_mv;
+
+  if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) {
+    xmv.row *= -1;
+    xmv.col *= -1;
+  }
+
+  mvp->as_mv = xmv;
+}
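+
+/* Example: if the candidate MV was coded against a reference whose sign bias
+ * differs from that of the frame we are predicting from, the vector points
+ * the opposite way in time, so it is negated before use. */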
+
+#define LEFT_TOP_MARGIN (16 << 3)
+#define RIGHT_BOTTOM_MARGIN (16 << 3)
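+/* The margins are in 1/8-pel units: an MV may point up to 16 pixels outside
+ * the frame border before clamp_mv() pulls it back in. */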
+
+static void clamp_mv(int_mv *mv,
+                     int mb_to_left_edge,
+                     int mb_to_right_edge,
+                     int mb_to_top_edge,
+                     int mb_to_bottom_edge) {
+  mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
+                  mb_to_left_edge : mv->as_mv.col;
+  mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
+                  mb_to_right_edge : mv->as_mv.col;
+  mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ?
+                  mb_to_top_edge : mv->as_mv.row;
+  mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
+                  mb_to_bottom_edge : mv->as_mv.row;
+}
+
+static void clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
+  clamp_mv(mv,
+           xd->mb_to_left_edge - LEFT_TOP_MARGIN,
+           xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+           xd->mb_to_top_edge - LEFT_TOP_MARGIN,
+           xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+}
+
+static unsigned int check_mv_bounds(int_mv *mv,
+                                    int mb_to_left_edge,
+                                    int mb_to_right_edge,
+                                    int mb_to_top_edge,
+                                    int mb_to_bottom_edge) {
+  return (mv->as_mv.col < mb_to_left_edge) ||
+         (mv->as_mv.col > mb_to_right_edge) ||
+         (mv->as_mv.row < mb_to_top_edge) ||
+         (mv->as_mv.row > mb_to_bottom_edge);
+}
+
+void vp9_find_near_mvs(MACROBLOCKD *xd,
+                       const MODE_INFO *here,
+                       const MODE_INFO *lfhere,
+                       int_mv *nearest, int_mv *nearby, int_mv *best,
+                       int near_mv_ref_cts[4],
+                       int refframe,
+                       int *ref_frame_sign_bias);
+
+vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
+                           vp9_prob p[VP9_MVREFS - 1],
+                           const int near_mv_ref_ct[4]);
+
+extern const unsigned char vp9_mbsplit_offset[4][16];
+
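+/* The helpers below address the 16 4x4 subblocks of a macroblock in raster
+ * order, so (b & 3) == 0 means subblock b lies on the left edge of the MB
+ * and (b >> 2) == 0 means it lies on the top edge. */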
+static int left_block_mv(const MODE_INFO *cur_mb, int b) {
+  if (!(b & 3)) {
+    /* On L edge, get from MB to left of us */
+    --cur_mb;
+
+    if (cur_mb->mbmi.mode != SPLITMV)
+      return cur_mb->mbmi.mv[0].as_int;
+    b += 4;
+  }
+
+  return (cur_mb->bmi + b - 1)->as_mv.first.as_int;
+}
+
+static int left_block_second_mv(const MODE_INFO *cur_mb, int b) {
+  if (!(b & 3)) {
+    /* On L edge, get from MB to left of us */
+    --cur_mb;
+
+    if (cur_mb->mbmi.mode != SPLITMV)
+      return cur_mb->mbmi.second_ref_frame ? cur_mb->mbmi.mv[1].as_int
+                                           : cur_mb->mbmi.mv[0].as_int;
+    b += 4;
+  }
+
+  return cur_mb->mbmi.second_ref_frame
+             ? (cur_mb->bmi + b - 1)->as_mv.second.as_int
+             : (cur_mb->bmi + b - 1)->as_mv.first.as_int;
+}
+
+static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
+  if (!(b >> 2)) {
+    /* On top edge, get from MB above us */
+    cur_mb -= mi_stride;
+
+    if (cur_mb->mbmi.mode != SPLITMV)
+      return cur_mb->mbmi.mv[0].as_int;
+    b += 16;
+  }
+
+  return (cur_mb->bmi + b - 4)->as_mv.first.as_int;
+}
+
+static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
+  if (!(b >> 2)) {
+    /* On top edge, get from MB above us */
+    cur_mb -= mi_stride;
+
+    if (cur_mb->mbmi.mode != SPLITMV)
+      return cur_mb->mbmi.second_ref_frame ? cur_mb->mbmi.mv[1].as_int
+                                           : cur_mb->mbmi.mv[0].as_int;
+    b += 16;
+  }
+
+  return cur_mb->mbmi.second_ref_frame
+             ? (cur_mb->bmi + b - 4)->as_mv.second.as_int
+             : (cur_mb->bmi + b - 4)->as_mv.first.as_int;
+}
+
+static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
+  if (!(b & 3)) {
+    /* On L edge, get from MB to left of us */
+    --cur_mb;
+
+    if (cur_mb->mbmi.mode < I8X8_PRED) {
+      return pred_mode_conv(cur_mb->mbmi.mode);
+    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
+      return pred_mode_conv((cur_mb->bmi + 3 + b)->as_mode.first);
+    } else if (cur_mb->mbmi.mode == B_PRED) {
+      return ((cur_mb->bmi + 3 + b)->as_mode.first);
+    } else {
+      return B_DC_PRED;
+    }
+  }
+  return (cur_mb->bmi + b - 1)->as_mode.first;
+}
+
+static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
+                                          int b, int mi_stride) {
+  if (!(b >> 2)) {
+    /* On top edge, get from MB above us */
+    cur_mb -= mi_stride;
+
+    if (cur_mb->mbmi.mode < I8X8_PRED) {
+      return pred_mode_conv(cur_mb->mbmi.mode);
+    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
+      return pred_mode_conv((cur_mb->bmi + 12 + b)->as_mode.first);
+    } else if (cur_mb->mbmi.mode == B_PRED) {
+      return ((cur_mb->bmi + 12 + b)->as_mode.first);
+    } else {
+      return B_DC_PRED;
+    }
+  }
+
+  return (cur_mb->bmi + b - 4)->as_mode.first;
+}
+
+#endif  // __INC_FINDNEARMV_H
--- /dev/null
+++ b/vp9/common/generic/systemdependent.c
@@ -1,0 +1,87 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_rtcd.h"
+#include "vp9/common/subpixel.h"
+#include "vp9/common/loopfilter.h"
+#include "vp9/common/idct.h"
+#include "vp9/common/onyxc_int.h"
+
+extern void vp9_arch_x86_common_init(VP9_COMMON *ctx);
+extern void vp9_arch_arm_common_init(VP9_COMMON *ctx);
+
+void vp9_machine_specific_config(VP9_COMMON *ctx) {
+#if CONFIG_RUNTIME_CPU_DETECT
+  VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
+
+  rtcd->idct.idct1        = vp9_short_idct4x4llm_1_c;
+  rtcd->idct.idct16       = vp9_short_idct4x4llm_c;
+  rtcd->idct.idct1_scalar_add = vp9_dc_only_idct_add_c;
+  rtcd->idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
+  rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_c;
+  rtcd->idct.idct8        = vp9_short_idct8x8_c;
+  rtcd->idct.idct1_scalar_add_8x8 = vp9_dc_only_idct_add_8x8_c;
+  rtcd->idct.ihaar2       = vp9_short_ihaar2x2_c;
+  rtcd->idct.idct16x16    = vp9_short_idct16x16_c;
+
+  rtcd->subpix.eighttap16x16       = vp9_eighttap_predict16x16_c;
+  rtcd->subpix.eighttap8x8         = vp9_eighttap_predict8x8_c;
+  rtcd->subpix.eighttap_avg16x16   = vp9_eighttap_predict_avg16x16_c;
+  rtcd->subpix.eighttap_avg8x8     = vp9_eighttap_predict_avg8x8_c;
+  rtcd->subpix.eighttap_avg4x4     = vp9_eighttap_predict_avg4x4_c;
+  rtcd->subpix.eighttap8x4         = vp9_eighttap_predict8x4_c;
+  rtcd->subpix.eighttap4x4         = vp9_eighttap_predict_c;
+  rtcd->subpix.eighttap16x16_sharp     = vp9_eighttap_predict16x16_sharp_c;
+  rtcd->subpix.eighttap8x8_sharp       = vp9_eighttap_predict8x8_sharp_c;
+  rtcd->subpix.eighttap_avg16x16_sharp = vp9_eighttap_predict_avg16x16_sharp_c;
+  rtcd->subpix.eighttap_avg8x8_sharp   = vp9_eighttap_predict_avg8x8_sharp_c;
+  rtcd->subpix.eighttap_avg4x4_sharp   = vp9_eighttap_predict_avg4x4_sharp_c;
+  rtcd->subpix.eighttap8x4_sharp       = vp9_eighttap_predict8x4_sharp_c;
+  rtcd->subpix.eighttap4x4_sharp       = vp9_eighttap_predict_sharp_c;
+
+  rtcd->subpix.sixtap16x16       = vp9_sixtap_predict16x16_c;
+  rtcd->subpix.sixtap8x8         = vp9_sixtap_predict8x8_c;
+  rtcd->subpix.sixtap_avg16x16   = vp9_sixtap_predict_avg16x16_c;
+  rtcd->subpix.sixtap_avg8x8     = vp9_sixtap_predict_avg8x8_c;
+  rtcd->subpix.sixtap8x4         = vp9_sixtap_predict8x4_c;
+  rtcd->subpix.sixtap4x4         = vp9_sixtap_predict_c;
+  rtcd->subpix.sixtap_avg4x4     = vp9_sixtap_predict_avg_c;
+  rtcd->subpix.bilinear16x16     = vp9_bilinear_predict16x16_c;
+  rtcd->subpix.bilinear8x8       = vp9_bilinear_predict8x8_c;
+  rtcd->subpix.bilinear_avg16x16 = vp9_bilinear_predict_avg16x16_c;
+  rtcd->subpix.bilinear_avg8x8   = vp9_bilinear_predict_avg8x8_c;
+  rtcd->subpix.bilinear8x4       = vp9_bilinear_predict8x4_c;
+  rtcd->subpix.bilinear4x4       = vp9_bilinear_predict4x4_c;
+  rtcd->subpix.bilinear_avg4x4   = vp9_bilinear_predict_avg4x4_c;
+
+#if CONFIG_POSTPROC || (CONFIG_VP9_ENCODER && CONFIG_INTERNAL_STATS)
+  rtcd->postproc.down             = vp9_mbpost_proc_down_c;
+  rtcd->postproc.across           = vp9_mbpost_proc_across_ip_c;
+  rtcd->postproc.downacross       = vp9_post_proc_down_and_across_c;
+  rtcd->postproc.addnoise         = vp9_plane_add_noise_c;
+  rtcd->postproc.blend_mb_inner   = vp9_blend_mb_inner_c;
+  rtcd->postproc.blend_mb_outer   = vp9_blend_mb_outer_c;
+  rtcd->postproc.blend_b          = vp9_blend_b_c;
+#endif
+
+#endif
+
+#if ARCH_X86 || ARCH_X86_64
+  vp9_arch_x86_common_init(ctx);
+#endif
+
+#if ARCH_ARM
+  vp9_arch_arm_common_init(ctx);
+#endif
+
+  vpx_rtcd();
+}
--- /dev/null
+++ b/vp9/common/header.h
@@ -1,0 +1,42 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_HEADER_H
+#define __INC_HEADER_H
+
+/* 24 bits total */
+typedef struct {
+  unsigned int type: 1;
+  unsigned int version: 3;
+  unsigned int show_frame: 1;
+
+  /* Allow 2^20 bytes = 8 megabits for first partition */
+
+  unsigned int first_partition_length_in_bytes: 19;
+
+#ifdef PACKET_TESTING
+  unsigned int frame_number;
+  unsigned int update_gold: 1;
+  unsigned int uses_gold: 1;
+  unsigned int update_last: 1;
+  unsigned int uses_last: 1;
+#endif
+
+} VP9_HEADER;
+
+#ifdef PACKET_TESTING
+#define VP9_HEADER_SIZE 8
+#else
+#define VP9_HEADER_SIZE 3
+#endif
+
+
+#endif
--- /dev/null
+++ b/vp9/common/idct.h
@@ -1,0 +1,144 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_IDCT_H
+#define __INC_IDCT_H
+
+#include "vp9/common/blockd.h"
+
+#define prototype_second_order(sym) \
+  void sym(short *input, short *output)
+
+#define prototype_idct(sym) \
+  void sym(short *input, short *output, int pitch)
+
+#define prototype_idct_scalar_add(sym) \
+  void sym(short input, \
+           unsigned char *pred, unsigned char *output, \
+           int pitch, int stride)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/idct_x86.h"
+#endif
+
+#ifdef _MSC_VER
+/* TODO: remove these after integer implementations are done */
+#define M_PI       3.14159265358979323846
+#define round(x) (((x)>0)? floor((x)+0.5): ceil((x)-0.5))
+#endif
+
+
+#if ARCH_ARM
+#include "arm/idct_arm.h"
+#endif
+
+#if CONFIG_LOSSLESS
+#define WHT_UPSCALE_FACTOR 3
+#define Y2_WHT_UPSCALE_FACTOR 2
+#endif
+
+#ifndef vp9_idct_idct16x16
+#define vp9_idct_idct16x16 vp9_short_idct16x16_c
+#endif
+extern prototype_idct(vp9_idct_idct16x16);
+
+#ifndef vp9_idct_idct8
+#define vp9_idct_idct8 vp9_short_idct8x8_c
+#endif
+extern prototype_idct(vp9_idct_idct8);
+
+#ifndef vp9_idct_idct8_1
+#define vp9_idct_idct8_1 vp9_short_idct8x8_1_c
+#endif
+extern prototype_idct(vp9_idct_idct8_1);
+
+#ifndef vp9_idct_ihaar2
+#define vp9_idct_ihaar2 vp9_short_ihaar2x2_c
+#endif
+extern prototype_idct(vp9_idct_ihaar2);
+
+#ifndef vp9_idct_ihaar2_1
+#define vp9_idct_ihaar2_1 vp9_short_ihaar2x2_1_c
+#endif
+extern prototype_idct(vp9_idct_ihaar2_1);
+
+#ifndef vp9_idct_idct1_scalar_add_8x8
+#define vp9_idct_idct1_scalar_add_8x8 vp9_dc_only_idct_add_8x8_c
+#endif
+extern prototype_idct_scalar_add(vp9_idct_idct1_scalar_add_8x8);
+
+#ifndef vp9_idct_idct1
+#define vp9_idct_idct1 vp9_short_idct4x4llm_1_c
+#endif
+extern prototype_idct(vp9_idct_idct1);
+
+#ifndef vp9_idct_idct16
+#define vp9_idct_idct16 vp9_short_idct4x4llm_c
+#endif
+extern prototype_idct(vp9_idct_idct16);
+
+#ifndef vp9_idct_idct1_scalar_add
+#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_c
+#endif
+extern prototype_idct_scalar_add(vp9_idct_idct1_scalar_add);
+
+
+#ifndef vp9_idct_iwalsh1
+#define vp9_idct_iwalsh1 vp9_short_inv_walsh4x4_1_c
+#endif
+extern prototype_second_order(vp9_idct_iwalsh1);
+
+#ifndef vp9_idct_iwalsh16
+#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_c
+#endif
+extern prototype_second_order(vp9_idct_iwalsh16);
+
+#if CONFIG_LOSSLESS
+extern prototype_idct(vp9_short_inv_walsh4x4_x8_c);
+extern prototype_idct(vp9_short_inv_walsh4x4_1_x8_c);
+extern prototype_idct_scalar_add(vp9_dc_only_inv_walsh_add_c);
+extern prototype_second_order(vp9_short_inv_walsh4x4_lossless_c);
+extern prototype_second_order(vp9_short_inv_walsh4x4_1_lossless_c);
+#endif
+
+void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
+                  TX_TYPE tx_type, int tx_dim);
+
+typedef prototype_idct((*vp9_idct_fn_t));
+typedef prototype_idct_scalar_add((*vp9_idct_scalar_add_fn_t));
+typedef prototype_second_order((*vp9_second_order_fn_t));
+
+typedef struct {
+  vp9_idct_fn_t            idct1;
+  vp9_idct_fn_t            idct16;
+  vp9_idct_scalar_add_fn_t idct1_scalar_add;
+
+  vp9_second_order_fn_t iwalsh1;
+  vp9_second_order_fn_t iwalsh16;
+
+  vp9_idct_fn_t            idct8;
+  vp9_idct_fn_t            idct8_1;
+  vp9_idct_scalar_add_fn_t idct1_scalar_add_8x8;
+  vp9_idct_fn_t ihaar2;
+  vp9_idct_fn_t ihaar2_1;
+
+  vp9_idct_fn_t            idct16x16;
+} vp9_idct_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IDCT_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define IDCT_INVOKE(ctx,fn) vp9_idct_##fn
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/common/idctllm.c
@@ -1,0 +1,1275 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16-bit fixed-point versions of two
+ * multiply constants:
+ *         1.   sqrt(2) * cos (pi/8)
+ *         2.   sqrt(2) * sin (pi/8)
+ * Because the first constant is bigger than 1, to maintain the same 16-bit
+ * fixed-point precision as the second one, we use a trick of
+ *         x * a = x + x*(a-1)
+ * so
+ *         x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+ **************************************************************************/
+#include <assert.h>
+#include <math.h>
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/common/systemdependent.h"
+
+#include "vp9/common/blockd.h"
+
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2      = 35468;
+static const int rounding = 0;
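+
+/* Worked example of the fixed-point trick above:
+ * 20091 / 65536 ~= sqrt(2)*cos(pi/8) - 1 and 35468 / 65536 ~= sqrt(2)*sin(pi/8),
+ * so the butterflies in vp9_short_idct4x4llm_c() below compute
+ * x * sqrt(2) * cos(pi/8) as x + ((x * 20091) >> 16), and
+ * x * sqrt(2) * sin(pi/8) as (x * 35468) >> 16. */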
+
+// TODO: these transforms can be further converted into integer forms to
+//       reduce complexity
+static const float idct_4[16] = {
+  0.500000000000000,   0.653281482438188,   0.500000000000000,   0.270598050073099,
+  0.500000000000000,   0.270598050073099,  -0.500000000000000,  -0.653281482438188,
+  0.500000000000000,  -0.270598050073099,  -0.500000000000000,   0.653281482438188,
+  0.500000000000000,  -0.653281482438188,   0.500000000000000,  -0.270598050073099
+};
+
+static const float iadst_4[16] = {
+  0.228013428883779,   0.577350269189626,   0.656538502008139,   0.428525073124360,
+  0.428525073124360,   0.577350269189626,  -0.228013428883779,  -0.656538502008139,
+  0.577350269189626,                   0,  -0.577350269189626,   0.577350269189626,
+  0.656538502008139,  -0.577350269189626,   0.428525073124359,  -0.228013428883779
+};
+
+static const float idct_8[64] = {
+  0.353553390593274,   0.490392640201615,   0.461939766255643,   0.415734806151273,
+  0.353553390593274,   0.277785116509801,   0.191341716182545,   0.097545161008064,
+  0.353553390593274,   0.415734806151273,   0.191341716182545,  -0.097545161008064,
+ -0.353553390593274,  -0.490392640201615,  -0.461939766255643,  -0.277785116509801,
+  0.353553390593274,   0.277785116509801,  -0.191341716182545,  -0.490392640201615,
+ -0.353553390593274,   0.097545161008064,   0.461939766255643,   0.415734806151273,
+  0.353553390593274,   0.097545161008064,  -0.461939766255643,  -0.277785116509801,
+  0.353553390593274,   0.415734806151273,  -0.191341716182545,  -0.490392640201615,
+  0.353553390593274,  -0.097545161008064,  -0.461939766255643,   0.277785116509801,
+  0.353553390593274,  -0.415734806151273,  -0.191341716182545,   0.490392640201615,
+  0.353553390593274,  -0.277785116509801,  -0.191341716182545,   0.490392640201615,
+ -0.353553390593274,  -0.097545161008064,   0.461939766255643,  -0.415734806151273,
+  0.353553390593274,  -0.415734806151273,   0.191341716182545,   0.097545161008064,
+ -0.353553390593274,   0.490392640201615,  -0.461939766255643,   0.277785116509801,
+  0.353553390593274,  -0.490392640201615,   0.461939766255643,  -0.415734806151273,
+  0.353553390593274,  -0.277785116509801,   0.191341716182545,  -0.097545161008064
+};
+
+static const float iadst_8[64] = {
+  0.089131608307533,   0.255357107325376,   0.387095214016349,   0.466553967085785,
+  0.483002021635509,   0.434217976756762,   0.326790388032145,   0.175227946595735,
+  0.175227946595735,   0.434217976756762,   0.466553967085785,   0.255357107325376,
+ -0.089131608307533,  -0.387095214016348,  -0.483002021635509,  -0.326790388032145,
+  0.255357107325376,   0.483002021635509,   0.175227946595735,  -0.326790388032145,
+ -0.466553967085785,  -0.089131608307533,   0.387095214016349,   0.434217976756762,
+  0.326790388032145,   0.387095214016349,  -0.255357107325376,  -0.434217976756762,
+  0.175227946595735,   0.466553967085786,  -0.089131608307534,  -0.483002021635509,
+  0.387095214016349,   0.175227946595735,  -0.483002021635509,   0.089131608307533,
+  0.434217976756762,  -0.326790388032145,  -0.255357107325377,   0.466553967085785,
+  0.434217976756762,  -0.089131608307533,  -0.326790388032145,   0.483002021635509,
+ -0.255357107325376,  -0.175227946595735,   0.466553967085785,  -0.387095214016348,
+  0.466553967085785,  -0.326790388032145,   0.089131608307533,   0.175227946595735,
+ -0.387095214016348,   0.483002021635509,  -0.434217976756762,   0.255357107325376,
+  0.483002021635509,  -0.466553967085785,   0.434217976756762,  -0.387095214016348,
+  0.326790388032145,  -0.255357107325375,   0.175227946595736,  -0.089131608307532
+};
+
+static const int16_t idct_i4[16] = {
+  8192,  10703,  8192,   4433,
+  8192,   4433, -8192, -10703,
+  8192,  -4433, -8192,  10703,
+  8192, -10703,  8192,  -4433
+};
+
+static const int16_t iadst_i4[16] = {
+   3736,  9459, 10757,   7021,
+   7021,  9459, -3736, -10757,
+   9459,     0, -9459,   9459,
+  10757, -9459,  7021,  -3736
+};
+
+static const int16_t idct_i8[64] = {
+   5793,  8035,  7568,  6811,
+   5793,  4551,  3135,  1598,
+   5793,  6811,  3135, -1598,
+  -5793, -8035, -7568, -4551,
+   5793,  4551, -3135, -8035,
+  -5793,  1598,  7568,  6811,
+   5793,  1598, -7568, -4551,
+   5793,  6811, -3135, -8035,
+   5793, -1598, -7568,  4551,
+   5793, -6811, -3135,  8035,
+   5793, -4551, -3135,  8035,
+  -5793, -1598,  7568, -6811,
+   5793, -6811,  3135,  1598,
+  -5793,  8035, -7568,  4551,
+   5793, -8035,  7568, -6811,
+   5793, -4551,  3135, -1598
+};
+
+static const int16_t iadst_i8[64] = {
+   1460,  4184,  6342,  7644,
+   7914,  7114,  5354,  2871,
+   2871,  7114,  7644,  4184,
+  -1460, -6342, -7914, -5354,
+   4184,  7914,  2871, -5354,
+  -7644, -1460,  6342,  7114,
+   5354,  6342, -4184, -7114,
+   2871,  7644, -1460, -7914,
+   6342,  2871, -7914,  1460,
+   7114, -5354, -4184,  7644,
+   7114, -1460, -5354,  7914,
+  -4184, -2871,  7644, -6342,
+   7644, -5354,  1460,  2871,
+  -6342,  7914, -7114,  4184,
+   7914, -7644,  7114, -6342,
+   5354, -4184,  2871, -1460
+};
+
+static float idct_16[256] = {
+  0.250000,  0.351851,  0.346760,  0.338330,  0.326641,  0.311806,  0.293969,  0.273300,
+  0.250000,  0.224292,  0.196424,  0.166664,  0.135299,  0.102631,  0.068975,  0.034654,
+  0.250000,  0.338330,  0.293969,  0.224292,  0.135299,  0.034654, -0.068975, -0.166664,
+ -0.250000, -0.311806, -0.346760, -0.351851, -0.326641, -0.273300, -0.196424, -0.102631,
+  0.250000,  0.311806,  0.196424,  0.034654, -0.135299, -0.273300, -0.346760, -0.338330,
+ -0.250000, -0.102631,  0.068975,  0.224292,  0.326641,  0.351851,  0.293969,  0.166664,
+  0.250000,  0.273300,  0.068975, -0.166664, -0.326641, -0.338330, -0.196424,  0.034654,
+  0.250000,  0.351851,  0.293969,  0.102631, -0.135299, -0.311806, -0.346760, -0.224292,
+  0.250000,  0.224292, -0.068975, -0.311806, -0.326641, -0.102631,  0.196424,  0.351851,
+  0.250000, -0.034654, -0.293969, -0.338330, -0.135299,  0.166664,  0.346760,  0.273300,
+  0.250000,  0.166664, -0.196424, -0.351851, -0.135299,  0.224292,  0.346760,  0.102631,
+ -0.250000, -0.338330, -0.068975,  0.273300,  0.326641,  0.034654, -0.293969, -0.311806,
+  0.250000,  0.102631, -0.293969, -0.273300,  0.135299,  0.351851,  0.068975, -0.311806,
+ -0.250000,  0.166664,  0.346760,  0.034654, -0.326641, -0.224292,  0.196424,  0.338330,
+  0.250000,  0.034654, -0.346760, -0.102631,  0.326641,  0.166664, -0.293969, -0.224292,
+  0.250000,  0.273300, -0.196424, -0.311806,  0.135299,  0.338330, -0.068975, -0.351851,
+  0.250000, -0.034654, -0.346760,  0.102631,  0.326641, -0.166664, -0.293969,  0.224292,
+  0.250000, -0.273300, -0.196424,  0.311806,  0.135299, -0.338330, -0.068975,  0.351851,
+  0.250000, -0.102631, -0.293969,  0.273300,  0.135299, -0.351851,  0.068975,  0.311806,
+ -0.250000, -0.166664,  0.346760, -0.034654, -0.326641,  0.224292,  0.196424, -0.338330,
+  0.250000, -0.166664, -0.196424,  0.351851, -0.135299, -0.224292,  0.346760, -0.102631,
+ -0.250000,  0.338330, -0.068975, -0.273300,  0.326641, -0.034654, -0.293969,  0.311806,
+  0.250000, -0.224292, -0.068975,  0.311806, -0.326641,  0.102631,  0.196424, -0.351851,
+  0.250000,  0.034654, -0.293969,  0.338330, -0.135299, -0.166664,  0.346760, -0.273300,
+  0.250000, -0.273300,  0.068975,  0.166664, -0.326641,  0.338330, -0.196424, -0.034654,
+  0.250000, -0.351851,  0.293969, -0.102631, -0.135299,  0.311806, -0.346760,  0.224292,
+  0.250000, -0.311806,  0.196424, -0.034654, -0.135299,  0.273300, -0.346760,  0.338330,
+ -0.250000,  0.102631,  0.068975, -0.224292,  0.326641, -0.351851,  0.293969, -0.166664,
+  0.250000, -0.338330,  0.293969, -0.224292,  0.135299, -0.034654, -0.068975,  0.166664,
+ -0.250000,  0.311806, -0.346760,  0.351851, -0.326641,  0.273300, -0.196424,  0.102631,
+  0.250000, -0.351851,  0.346760, -0.338330,  0.326641, -0.311806,  0.293969, -0.273300,
+  0.250000, -0.224292,  0.196424, -0.166664,  0.135299, -0.102631,  0.068975, -0.034654
+};
+
+static float iadst_16[256] = {
+  0.033094,  0.098087,  0.159534,  0.215215,  0.263118,  0.301511,  0.329007,  0.344612,
+  0.347761,  0.338341,  0.316693,  0.283599,  0.240255,  0.188227,  0.129396,  0.065889,
+  0.065889,  0.188227,  0.283599,  0.338341,  0.344612,  0.301511,  0.215215,  0.098087,
+ -0.033094, -0.159534, -0.263118, -0.329007, -0.347761, -0.316693, -0.240255, -0.129396,
+  0.098087,  0.263118,  0.344612,  0.316693,  0.188227,  0.000000, -0.188227, -0.316693,
+ -0.344612, -0.263118, -0.098087,  0.098087,  0.263118,  0.344612,  0.316693,  0.188227,
+  0.129396,  0.316693,  0.329007,  0.159534, -0.098087, -0.301511, -0.338341, -0.188227,
+  0.065889,  0.283599,  0.344612,  0.215215, -0.033094, -0.263118, -0.347761, -0.240255,
+  0.159534,  0.344612,  0.240255, -0.065889, -0.316693, -0.301511, -0.033094,  0.263118,
+  0.338341,  0.129396, -0.188227, -0.347761, -0.215215,  0.098087,  0.329007,  0.283599,
+  0.188227,  0.344612,  0.098087, -0.263118, -0.316693, -0.000000,  0.316693,  0.263118,
+ -0.098087, -0.344612, -0.188227,  0.188227,  0.344612,  0.098087, -0.263118, -0.316693,
+  0.215215,  0.316693, -0.065889, -0.347761, -0.098087,  0.301511,  0.240255, -0.188227,
+ -0.329007,  0.033094,  0.344612,  0.129396, -0.283599, -0.263118,  0.159534,  0.338341,
+  0.240255,  0.263118, -0.215215, -0.283599,  0.188227,  0.301511, -0.159534, -0.316693,
+  0.129396,  0.329007, -0.098087, -0.338341,  0.065889,  0.344612, -0.033094, -0.347761,
+  0.263118,  0.188227, -0.316693, -0.098087,  0.344612,  0.000000, -0.344612,  0.098087,
+  0.316693, -0.188227, -0.263118,  0.263118,  0.188227, -0.316693, -0.098087,  0.344612,
+  0.283599,  0.098087, -0.347761,  0.129396,  0.263118, -0.301511, -0.065889,  0.344612,
+ -0.159534, -0.240255,  0.316693,  0.033094, -0.338341,  0.188227,  0.215215, -0.329007,
+  0.301511,  0.000000, -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,  0.000000,
+ -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,
+  0.316693, -0.098087, -0.188227,  0.344612, -0.263118, -0.000000,  0.263118, -0.344612,
+  0.188227,  0.098087, -0.316693,  0.316693, -0.098087, -0.188227,  0.344612, -0.263118,
+  0.329007, -0.188227, -0.033094,  0.240255, -0.344612,  0.301511, -0.129396, -0.098087,
+  0.283599, -0.347761,  0.263118, -0.065889, -0.159534,  0.316693, -0.338341,  0.215215,
+  0.338341, -0.263118,  0.129396,  0.033094, -0.188227,  0.301511, -0.347761,  0.316693,
+ -0.215215,  0.065889,  0.098087, -0.240255,  0.329007, -0.344612,  0.283599, -0.159534,
+  0.344612, -0.316693,  0.263118, -0.188227,  0.098087,  0.000000, -0.098087,  0.188227,
+ -0.263118,  0.316693, -0.344612,  0.344612, -0.316693,  0.263118, -0.188227,  0.098087,
+  0.347761, -0.344612,  0.338341, -0.329007,  0.316693, -0.301511,  0.283599, -0.263118,
+  0.240255, -0.215215,  0.188227, -0.159534,  0.129396, -0.098087,  0.065889, -0.033094
+};
+
+static const int16_t idct_i16[256] = {
+   4096,  5765,  5681,  5543,  5352,  5109,  4816,  4478,
+   4096,  3675,  3218,  2731,  2217,  1682,  1130,   568,
+   4096,  5543,  4816,  3675,  2217,   568, -1130, -2731,
+  -4096, -5109, -5681, -5765, -5352, -4478, -3218, -1682,
+   4096,  5109,  3218,   568, -2217, -4478, -5681, -5543,
+  -4096, -1682,  1130,  3675,  5352,  5765,  4816,  2731,
+   4096,  4478,  1130, -2731, -5352, -5543, -3218,   568,
+   4096,  5765,  4816,  1682, -2217, -5109, -5681, -3675,
+   4096,  3675, -1130, -5109, -5352, -1682,  3218,  5765,
+   4096,  -568, -4816, -5543, -2217,  2731,  5681,  4478,
+   4096,  2731, -3218, -5765, -2217,  3675,  5681,  1682,
+  -4096, -5543, -1130,  4478,  5352,   568, -4816, -5109,
+   4096,  1682, -4816, -4478,  2217,  5765,  1130, -5109,
+  -4096,  2731,  5681,   568, -5352, -3675,  3218,  5543,
+   4096,   568, -5681, -1682,  5352,  2731, -4816, -3675,
+   4096,  4478, -3218, -5109,  2217,  5543, -1130, -5765,
+   4096,  -568, -5681,  1682,  5352, -2731, -4816,  3675,
+   4096, -4478, -3218,  5109,  2217, -5543, -1130,  5765,
+   4096, -1682, -4816,  4478,  2217, -5765,  1130,  5109,
+  -4096, -2731,  5681,  -568, -5352,  3675,  3218, -5543,
+   4096, -2731, -3218,  5765, -2217, -3675,  5681, -1682,
+  -4096,  5543, -1130, -4478,  5352,  -568, -4816,  5109,
+   4096, -3675, -1130,  5109, -5352,  1682,  3218, -5765,
+   4096,   568, -4816,  5543, -2217, -2731,  5681, -4478,
+   4096, -4478,  1130,  2731, -5352,  5543, -3218,  -568,
+   4096, -5765,  4816, -1682, -2217,  5109, -5681,  3675,
+   4096, -5109,  3218,  -568, -2217,  4478, -5681,  5543,
+  -4096,  1682,  1130, -3675,  5352, -5765,  4816, -2731,
+   4096, -5543,  4816, -3675,  2217,  -568, -1130,  2731,
+  -4096,  5109, -5681,  5765, -5352,  4478, -3218,  1682,
+   4096, -5765,  5681, -5543,  5352, -5109,  4816, -4478,
+   4096, -3675,  3218, -2731,  2217, -1682,  1130,  -568
+};
+
+static const int16_t iadst_i16[256] = {
+    542,  1607,  2614,  3526,  4311,  4940,  5390,  5646,
+   5698,  5543,  5189,  4646,  3936,  3084,  2120,  1080,
+   1080,  3084,  4646,  5543,  5646,  4940,  3526,  1607,
+   -542, -2614, -4311, -5390, -5698, -5189, -3936, -2120,
+   1607,  4311,  5646,  5189,  3084,     0, -3084, -5189,
+  -5646, -4311, -1607,  1607,  4311,  5646,  5189,  3084,
+   2120,  5189,  5390,  2614, -1607, -4940, -5543, -3084,
+   1080,  4646,  5646,  3526, -542,  -4311, -5698, -3936,
+   2614,  5646,  3936, -1080, -5189, -4940,  -542,  4311,
+   5543,  2120, -3084, -5698, -3526,  1607,  5390,  4646,
+   3084,  5646,  1607, -4311, -5189,     0,  5189,  4311,
+  -1607, -5646, -3084,  3084,  5646,  1607, -4311, -5189,
+   3526,  5189, -1080, -5698, -1607,  4940,  3936, -3084,
+  -5390,   542,  5646,  2120, -4646, -4311,  2614,  5543,
+   3936,  4311, -3526, -4646,  3084,  4940, -2614, -5189,
+   2120,  5390, -1607, -5543,  1080,  5646,  -542, -5698,
+   4311,  3084, -5189, -1607,  5646,     0, -5646,  1607,
+   5189, -3084, -4311,  4311,  3084, -5189, -1607,  5646,
+   4646,  1607, -5698,  2120,  4311, -4940, -1080,  5646,
+  -2614, -3936,  5189,   542, -5543,  3084,  3526, -5390,
+   4940,     0, -4940,  4940,     0, -4940,  4940,     0,
+  -4940,  4940,     0, -4940,  4940,     0, -4940,  4940,
+   5189, -1607, -3084,  5646, -4311,     0,  4311, -5646,
+   3084,  1607, -5189,  5189, -1607, -3084,  5646, -4311,
+   5390, -3084,  -542,  3936, -5646,  4940, -2120, -1607,
+   4646, -5698,  4311, -1080, -2614,  5189, -5543,  3526,
+   5543, -4311,  2120,   542, -3084,  4940, -5698,  5189,
+  -3526,  1080,  1607, -3936,  5390, -5646,  4646, -2614,
+   5646, -5189,  4311, -3084,  1607,     0, -1607,  3084,
+  -4311,  5189, -5646,  5646, -5189,  4311, -3084,  1607,
+   5698, -5646,  5543, -5390,  5189, -4940,  4646, -4311,
+   3936, -3526,  3084, -2614,  2120, -1607,  1080,  -542
+};
+
+/* For testing: select which implementation below is built as vp9_ihtllm_c. */
+#define TEST_INT 1
+#if TEST_INT
+#define vp9_ihtllm_int_c vp9_ihtllm_c
+#else
+#define vp9_ihtllm_float_c vp9_ihtllm_c
+#endif
+
+void vp9_ihtllm_float_c(const int16_t *input, int16_t *output, int pitch,
+                  TX_TYPE tx_type, int tx_dim) {
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+  {
+    int i, j, k;
+    float bufa[256], bufb[256];  // buffers for floating-point test purposes;
+                                 // the implementation could be simplified in
+                                 // conjunction with the integer transform
+    const int16_t *ip = input;
+    int16_t *op = output;
+    int shortpitch = pitch >> 1;
+
+    float *pfa = &bufa[0];
+    float *pfb = &bufb[0];
+
+    // pointers to vertical and horizontal transforms
+    const float *ptv, *pth;
+
+    assert(tx_type != DCT_DCT);
+    // load and convert residual array into floating-point
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
+        pfa[i] = (float)ip[i];
+      }
+      pfa += tx_dim;
+      ip  += tx_dim;
+    }
+
+    // vertical transformation
+    pfa = &bufa[0];
+    pfb = &bufb[0];
+
+    switch (tx_type) {
+      case ADST_ADST :
+      case ADST_DCT  :
+        ptv = (tx_dim == 4) ? &iadst_4[0] :
+                              ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
+        break;
+
+      default :
+        ptv = (tx_dim == 4) ? &idct_4[0] :
+                              ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
+        break;
+    }
+
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
+        pfb[i] = 0;
+        for (k = 0; k < tx_dim; k++) {
+          pfb[i] += ptv[k] * pfa[(k * tx_dim)];
+        }
+        pfa += 1;
+      }
+
+      pfb += tx_dim;
+      ptv += tx_dim;
+      pfa = &bufa[0];
+    }
+
+    // horizontal transformation
+    pfa = &bufa[0];
+    pfb = &bufb[0];
+
+    switch (tx_type) {
+      case ADST_ADST :
+      case  DCT_ADST :
+        pth = (tx_dim == 4) ? &iadst_4[0] :
+                              ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
+        break;
+
+      default :
+        pth = (tx_dim == 4) ? &idct_4[0] :
+                              ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
+        break;
+    }
+
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
+        pfa[i] = 0;
+        for (k = 0; k < tx_dim; k++) {
+          pfa[i] += pfb[k] * pth[k];
+        }
+        pth += tx_dim;
+      }
+
+      pfa += tx_dim;
+      pfb += tx_dim;
+
+      switch (tx_type) {
+        case ADST_ADST :
+        case  DCT_ADST :
+          pth = (tx_dim == 4) ? &iadst_4[0] :
+                                ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
+          break;
+
+        default :
+          pth = (tx_dim == 4) ? &idct_4[0] :
+                                ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
+          break;
+      }
+    }
+
+    // convert to short integer format and load BLOCKD buffer
+    op  = output;
+    pfa = &bufa[0];
+
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
+        op[i] = (pfa[i] > 0) ? (int16_t)(pfa[i] / 8 + 0.49)
+                             : -(int16_t)(-pfa[i] / 8 + 0.49);
+      }
+
+      op += shortpitch;
+      pfa += tx_dim;
+    }
+  }
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
+
+/* Converted the transforms to integer form. */
+#define VERTICAL_SHIFT 14  // 16
+#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
+#define HORIZONTAL_SHIFT 17  // 15
+#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
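+/* Note that the rounding offsets are (1 << (shift - 1)) - 1, one below a
+ * conventional round-to-nearest offset, so exact ties round downwards. */
+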
+void vp9_ihtllm_int_c(const int16_t *input, int16_t *output, int pitch,
+                      TX_TYPE tx_type, int tx_dim) {
+  int i, j, k;
+  int16_t imbuf[256];
+
+  const int16_t *ip = input;
+  int16_t *op = output;
+  int16_t *im = &imbuf[0];
+
+  /* pointers to vertical and horizontal transforms. */
+  const int16_t *ptv = NULL, *pth = NULL;
+  int shortpitch = pitch >> 1;
+
+  switch (tx_type) {
+    case ADST_ADST :
+      ptv = pth = (tx_dim == 4) ? &iadst_i4[0]
+                                  : ((tx_dim == 8) ? &iadst_i8[0]
+                                                     : &iadst_i16[0]);
+      break;
+    case ADST_DCT  :
+      ptv = (tx_dim == 4) ? &iadst_i4[0]
+                            : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
+      pth = (tx_dim == 4) ? &idct_i4[0]
+                            : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
+      break;
+    case  DCT_ADST :
+      ptv = (tx_dim == 4) ? &idct_i4[0]
+                            : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
+      pth = (tx_dim == 4) ? &iadst_i4[0]
+                            : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
+      break;
+    case  DCT_DCT :
+      ptv = pth = (tx_dim == 4) ? &idct_i4[0]
+                                  : ((tx_dim == 8) ? &idct_i8[0]
+                                                     : &idct_i16[0]);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  /* vertical transformation */
+  for (j = 0; j < tx_dim; j++) {
+    for (i = 0; i < tx_dim; i++) {
+      int temp = 0;
+
+      for (k = 0; k < tx_dim; k++) {
+        temp += ptv[k] * ip[(k * tx_dim)];
+      }
+
+      im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
+      ip++;
+    }
+    im += tx_dim;  // 16
+    ptv += tx_dim;
+    ip = input;
+  }
+
+  /* horizontal transformation */
+  im = &imbuf[0];
+
+  for (j = 0; j < tx_dim; j++) {
+    const int16_t *pthc = pth;
+
+    for (i = 0; i < tx_dim; i++) {
+      int temp = 0;
+
+      for (k = 0; k < tx_dim; k++) {
+        temp += im[k] * pthc[k];
+      }
+
+      op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
+      pthc += tx_dim;
+    }
+
+    im += tx_dim;  // 16
+    op += shortpitch;
+  }
+}
+
+void vp9_short_idct4x4llm_c(short *input, short *output, int pitch) {
+  int i;
+  int a1, b1, c1, d1;
+
+  short *ip = input;
+  short *op = output;
+  int temp1, temp2;
+  int shortpitch = pitch >> 1;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] + ip[8];
+    b1 = ip[0] - ip[8];
+
+    temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
+    temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
+    c1 = temp1 - temp2;
+
+    temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
+    temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
+    d1 = temp1 + temp2;
+
+    op[shortpitch * 0] = a1 + d1;
+    op[shortpitch * 3] = a1 - d1;
+
+    op[shortpitch * 1] = b1 + c1;
+    op[shortpitch * 2] = b1 - c1;
+
+    ip++;
+    op++;
+  }
+
+  ip = output;
+  op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] + ip[2];
+    b1 = ip[0] - ip[2];
+
+    temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
+    temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
+    c1 = temp1 - temp2;
+
+    temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
+    temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
+    d1 = temp1 + temp2;
+
+    op[0] = (a1 + d1 + 16) >> 5;
+    op[3] = (a1 - d1 + 16) >> 5;
+
+    op[1] = (b1 + c1 + 16) >> 5;
+    op[2] = (b1 - c1 + 16) >> 5;
+
+    ip += shortpitch;
+    op += shortpitch;
+  }
+}
+
+void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch) {
+  int i;
+  int a1;
+  short *op = output;
+  int shortpitch = pitch >> 1;
+  a1 = ((input[0] + 16) >> 5);
+  for (i = 0; i < 4; i++) {
+    op[0] = a1;
+    op[1] = a1;
+    op[2] = a1;
+    op[3] = a1;
+    op += shortpitch;
+  }
+}
+
+void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+                            unsigned char *dst_ptr, int pitch, int stride) {
+  int a1 = ((input_dc + 16) >> 5);
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = a1 + pred_ptr[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dst_ptr[c] = (unsigned char) a;
+    }
+
+    dst_ptr += stride;
+    pred_ptr += pitch;
+  }
+}
+
+void vp9_short_inv_walsh4x4_c(short *input, short *output) {
+  int i;
+  int a1, b1, c1, d1;
+  short *ip = input;
+  short *op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ((ip[0] + ip[3]));
+    b1 = ((ip[1] + ip[2]));
+    c1 = ((ip[1] - ip[2]));
+    d1 = ((ip[0] - ip[3]));
+
+    op[0] = (a1 + b1 + 1) >> 1;
+    op[1] = (c1 + d1) >> 1;
+    op[2] = (a1 - b1) >> 1;
+    op[3] = (d1 - c1) >> 1;
+
+    ip += 4;
+    op += 4;
+  }
+
+  ip = output;
+  op = output;
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] + ip[12];
+    b1 = ip[4] + ip[8];
+    c1 = ip[4] - ip[8];
+    d1 = ip[0] - ip[12];
+    op[0] = (a1 + b1 + 1) >> 1;
+    op[4] = (c1 + d1) >> 1;
+    op[8] = (a1 - b1) >> 1;
+    op[12] = (d1 - c1) >> 1;
+    ip++;
+    op++;
+  }
+}
+
+void vp9_short_inv_walsh4x4_1_c(short *in, short *out) {
+  int i;
+  short tmp[4];
+  short *ip = in;
+  short *op = tmp;
+
+  op[0] = (ip[0] + 1) >> 1;
+  op[1] = op[2] = op[3] = (ip[0] >> 1);
+
+  ip = tmp;
+  op = out;
+  for (i = 0; i < 4; i++) {
+    op[0] = (ip[0] + 1) >> 1;
+    op[4] = op[8] = op[12] = (ip[0] >> 1);
+    ip++;
+    op++;
+  }
+}
+
+#if CONFIG_LOSSLESS
+void vp9_short_inv_walsh4x4_lossless_c(short *input, short *output) {
+  int i;
+  int a1, b1, c1, d1;
+  short *ip = input;
+  short *op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
+    b1 = ((ip[1] + ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
+    c1 = ((ip[1] - ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
+    d1 = ((ip[0] - ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
+
+    op[0] = (a1 + b1 + 1) >> 1;
+    op[1] = (c1 + d1) >> 1;
+    op[2] = (a1 - b1) >> 1;
+    op[3] = (d1 - c1) >> 1;
+
+    ip += 4;
+    op += 4;
+  }
+
+  ip = output;
+  op = output;
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] + ip[12];
+    b1 = ip[4] + ip[8];
+    c1 = ip[4] - ip[8];
+    d1 = ip[0] - ip[12];
+
+
+    op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+    op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+    op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+    op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+
+    ip++;
+    op++;
+  }
+}
+
+void vp9_short_inv_walsh4x4_1_lossless_c(short *in, short *out) {
+  int i;
+  short tmp[4];
+  short *ip = in;
+  short *op = tmp;
+
+  op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1;
+  op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1);
+
+  ip = tmp;
+  op = out;
+  for (i = 0; i < 4; i++) {
+    op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+    op[4] = op[8] = op[12] = ((ip[0] >> 1)) << Y2_WHT_UPSCALE_FACTOR;
+    ip++;
+    op++;
+  }
+}
+
+void vp9_short_inv_walsh4x4_x8_c(short *input, short *output, int pitch) {
+  int i;
+  int a1, b1, c1, d1;
+  short *ip = input;
+  short *op = output;
+  int shortpitch = pitch >> 1;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ((ip[0] + ip[3])) >> WHT_UPSCALE_FACTOR;
+    b1 = ((ip[1] + ip[2])) >> WHT_UPSCALE_FACTOR;
+    c1 = ((ip[1] - ip[2])) >> WHT_UPSCALE_FACTOR;
+    d1 = ((ip[0] - ip[3])) >> WHT_UPSCALE_FACTOR;
+
+    op[0] = (a1 + b1 + 1) >> 1;
+    op[1] = (c1 + d1) >> 1;
+    op[2] = (a1 - b1) >> 1;
+    op[3] = (d1 - c1) >> 1;
+
+    ip += 4;
+    op += shortpitch;
+  }
+
+  ip = output;
+  op = output;
+  for (i = 0; i < 4; i++) {
+    a1 = ip[shortpitch * 0] + ip[shortpitch * 3];
+    b1 = ip[shortpitch * 1] + ip[shortpitch * 2];
+    c1 = ip[shortpitch * 1] - ip[shortpitch * 2];
+    d1 = ip[shortpitch * 0] - ip[shortpitch * 3];
+
+
+    op[shortpitch * 0] = (a1 + b1 + 1) >> 1;
+    op[shortpitch * 1] = (c1 + d1) >> 1;
+    op[shortpitch * 2] = (a1 - b1) >> 1;
+    op[shortpitch * 3] = (d1 - c1) >> 1;
+
+    ip++;
+    op++;
+  }
+}
+
+void vp9_short_inv_walsh4x4_1_x8_c(short *in, short *out, int pitch) {
+  int i;
+  short tmp[4];
+  short *ip = in;
+  short *op = tmp;
+  int shortpitch = pitch >> 1;
+
+  op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
+  op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1);
+
+  ip = tmp;
+  op = out;
+  for (i = 0; i < 4; i++) {
+    op[shortpitch * 0] = (ip[0] + 1) >> 1;
+    op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1;
+    ip++;
+    op++;
+  }
+}
+
+void vp9_dc_only_inv_walsh_add_c(short input_dc, unsigned char *pred_ptr,
+                                 unsigned char *dst_ptr,
+                                 int pitch, int stride) {
+  int r, c;
+  short tmp[16];
+  vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1);
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = tmp[r * 4 + c] + pred_ptr[c];
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dst_ptr[c] = (unsigned char) a;
+    }
+
+    dst_ptr += stride;
+    pred_ptr += pitch;
+  }
+}
+#endif
+
+void vp9_dc_only_idct_add_8x8_c(short input_dc,
+                                unsigned char *pred_ptr,
+                                unsigned char *dst_ptr,
+                                int pitch, int stride) {
+  int a1 = (input_dc + 16) >> 5;
+  int r, c, b;
+  unsigned char *orig_pred = pred_ptr;
+  unsigned char *orig_dst = dst_ptr;
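+  /* The 8x8 area is covered as four 4x4 quadrants; the rebase at the
+   * bottom of the loop, (b + 1) % 2 * 4 columns across and
+   * (b + 1) / 2 * 4 rows down, walks them in the order top-left,
+   * top-right, bottom-left, bottom-right.
+   */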
+  for (b = 0; b < 4; b++) {
+    for (r = 0; r < 4; r++) {
+      for (c = 0; c < 4; c++) {
+        int a = a1 + pred_ptr[c];
+
+        if (a < 0)
+          a = 0;
+
+        if (a > 255)
+          a = 255;
+
+        dst_ptr[c] = (unsigned char) a;
+      }
+
+      dst_ptr += stride;
+      pred_ptr += pitch;
+    }
+    dst_ptr = orig_dst + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * stride;
+    pred_ptr = orig_pred + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * pitch;
+  }
+}
+
+#define W1 2841                 /* 2048*sqrt(2)*cos(1*pi/16) */
+#define W2 2676                 /* 2048*sqrt(2)*cos(2*pi/16) */
+#define W3 2408                 /* 2048*sqrt(2)*cos(3*pi/16) */
+#define W5 1609                 /* 2048*sqrt(2)*cos(5*pi/16) */
+#define W6 1108                 /* 2048*sqrt(2)*cos(6*pi/16) */
+#define W7 565                  /* 2048*sqrt(2)*cos(7*pi/16) */
+
+/* row (horizontal) IDCT
+ *
+ *           7                       pi         1
+ * dst[k] = sum c[l] * src[l] * cos( -- * ( k + - ) * l )
+ *          l=0                      8          2
+ *
+ * where: c[0]    = 128
+ *        c[1..7] = 128*sqrt(2)
+ */
+
+static void idctrow(int *blk) {
+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+  /* shortcut */
+  if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |
+        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {
+    blk[0] = blk[1] = blk[2] = blk[3] = blk[4] =
+        blk[5] = blk[6] = blk[7] = blk[0] << 3;
+    return;
+  }
+
+  x0 = (blk[0] << 11) + 128;    /* for proper rounding in the fourth stage */
+  /* first stage */
+  x8 = W7 * (x4 + x5);
+  x4 = x8 + (W1 - W7) * x4;
+  x5 = x8 - (W1 + W7) * x5;
+  x8 = W3 * (x6 + x7);
+  x6 = x8 - (W3 - W5) * x6;
+  x7 = x8 - (W3 + W5) * x7;
+
+  /* second stage */
+  x8 = x0 + x1;
+  x0 -= x1;
+  x1 = W6 * (x3 + x2);
+  x2 = x1 - (W2 + W6) * x2;
+  x3 = x1 + (W2 - W6) * x3;
+  x1 = x4 + x6;
+  x4 -= x6;
+  x6 = x5 + x7;
+  x5 -= x7;
+
+  /* third stage */
+  x7 = x8 + x3;
+  x8 -= x3;
+  x3 = x0 + x2;
+  x0 -= x2;
+  x2 = (181 * (x4 + x5) + 128) >> 8;
+  x4 = (181 * (x4 - x5) + 128) >> 8;
+
+  /* fourth stage */
+  blk[0] = (x7 + x1) >> 8;
+  blk[1] = (x3 + x2) >> 8;
+  blk[2] = (x0 + x4) >> 8;
+  blk[3] = (x8 + x6) >> 8;
+  blk[4] = (x8 - x6) >> 8;
+  blk[5] = (x0 - x4) >> 8;
+  blk[6] = (x3 - x2) >> 8;
+  blk[7] = (x7 - x1) >> 8;
+}
+
+/* column (vertical) IDCT
+ *
+ *             7                         pi         1
+ * dst[8*k] = sum c[l] * src[8*l] * cos( -- * ( k + - ) * l )
+ *            l=0                        8          2
+ *
+ * where: c[0]    = 1/1024
+ *        c[1..7] = (1/1024)*sqrt(2)
+ */
+static void idctcol(int *blk) {
+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+  /* shortcut */
+  if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
+        (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
+        (x7 = blk[8 * 3]))) {
+    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] =
+        blk[8 * 4] = blk[8 * 5] = blk[8 * 6] = blk[8 * 7] =
+        ((blk[8 * 0] + 32) >> 6);
+    return;
+  }
+
+  x0 = (blk[8 * 0] << 8) + 16384;
+
+  /* first stage */
+  x8 = W7 * (x4 + x5) + 4;
+  x4 = (x8 + (W1 - W7) * x4) >> 3;
+  x5 = (x8 - (W1 + W7) * x5) >> 3;
+  x8 = W3 * (x6 + x7) + 4;
+  x6 = (x8 - (W3 - W5) * x6) >> 3;
+  x7 = (x8 - (W3 + W5) * x7) >> 3;
+
+  /* second stage */
+  x8 = x0 + x1;
+  x0 -= x1;
+  x1 = W6 * (x3 + x2) + 4;
+  x2 = (x1 - (W2 + W6) * x2) >> 3;
+  x3 = (x1 + (W2 - W6) * x3) >> 3;
+  x1 = x4 + x6;
+  x4 -= x6;
+  x6 = x5 + x7;
+  x5 -= x7;
+
+  /* third stage */
+  x7 = x8 + x3;
+  x8 -= x3;
+  x3 = x0 + x2;
+  x0 -= x2;
+  x2 = (181 * (x4 + x5) + 128) >> 8;
+  x4 = (181 * (x4 - x5) + 128) >> 8;
+
+  /* fourth stage */
+  blk[8 * 0] = (x7 + x1) >> 14;
+  blk[8 * 1] = (x3 + x2) >> 14;
+  blk[8 * 2] = (x0 + x4) >> 14;
+  blk[8 * 3] = (x8 + x6) >> 14;
+  blk[8 * 4] = (x8 - x6) >> 14;
+  blk[8 * 5] = (x0 - x4) >> 14;
+  blk[8 * 6] = (x3 - x2) >> 14;
+  blk[8 * 7] = (x7 - x1) >> 14;
+}
+
+#define TX_DIM 8
+void vp9_short_idct8x8_c(short *coefs, short *block, int pitch) {
+  int X[TX_DIM * TX_DIM];
+  int i, j;
+  int shortpitch = pitch >> 1;
+
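+  /* The (c + 1 + (c < 0)) >> 2 below divides each coefficient by 4
+   * with rounding; the (c < 0) term keeps the bias similar for
+   * negative values (e.g. 7 -> 2 and -7 -> -2).
+   */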
+  for (i = 0; i < TX_DIM; i++) {
+    for (j = 0; j < TX_DIM; j++) {
+      X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1
+                                + (coefs[i * TX_DIM + j] < 0)) >> 2;
+    }
+  }
+  for (i = 0; i < 8; i++)
+    idctrow(X + 8 * i);
+
+  for (i = 0; i < 8; i++)
+    idctcol(X + i);
+
+  for (i = 0; i < TX_DIM; i++) {
+    for (j = 0; j < TX_DIM; j++) {
+      block[i * shortpitch + j]  = X[i * TX_DIM + j] >> 1;
+    }
+  }
+}
+
+
+void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) {
+  int i;
+  short *ip = input;  // only coefficients 0, 1, 4 and 8 are used
+  short *op = output;
+  for (i = 0; i < 16; i++) {
+    op[i] = 0;
+  }
+
+  op[0] = (ip[0] + ip[1] + ip[4] + ip[8] + 1) >> 1;
+  op[1] = (ip[0] - ip[1] + ip[4] - ip[8]) >> 1;
+  op[4] = (ip[0] + ip[1] - ip[4] - ip[8]) >> 1;
+  op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1;
+}
+
+
+#if 0
+// Keep a really bad float version as reference for now.
+void vp9_short_idct16x16_c(short *input, short *output, int pitch) {
+
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    double x;
+    const int short_pitch = pitch >> 1;
+    int i, j, k, l;
+    for (l = 0; l < 16; ++l) {
+      for (k = 0; k < 16; ++k) {
+        double s = 0;
+        for (i = 0; i < 16; ++i) {
+          for (j = 0; j < 16; ++j) {
+            x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32;
+            if (i != 0)
+              x *= sqrt(2.0);
+            if (j != 0)
+              x *= sqrt(2.0);
+            s += x;
+          }
+        }
+        output[k*short_pitch+l] = (short)round(s);
+      }
+    }
+  }
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
+#endif
+
+static const double C1 = 0.995184726672197;
+static const double C2 = 0.98078528040323;
+static const double C3 = 0.956940335732209;
+static const double C4 = 0.923879532511287;
+static const double C5 = 0.881921264348355;
+static const double C6 = 0.831469612302545;
+static const double C7 = 0.773010453362737;
+static const double C8 = 0.707106781186548;
+static const double C9 = 0.634393284163646;
+static const double C10 = 0.555570233019602;
+static const double C11 = 0.471396736825998;
+static const double C12 = 0.38268343236509;
+static const double C13 = 0.290284677254462;
+static const double C14 = 0.195090322016128;
+static const double C15 = 0.098017140329561;
+
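+/* The constants above are C[k] = cos(k * pi / 32) for k = 1..15
+ * (so C8 = 1/sqrt(2)): the cosine table used by the 16-point
+ * butterfly below.
+ */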
+
+static void butterfly_16x16_idct_1d(double input[16], double output[16]) {
+
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    double step[16];
+    double intermediate[16];
+    double temp1, temp2;
+
+    // step 1 and 2
+    step[ 0] = input[0] + input[8];
+    step[ 1] = input[0] - input[8];
+
+    temp1 = input[4]*C12;
+    temp2 = input[12]*C4;
+
+    temp1 -= temp2;
+    temp1 *= C8;
+
+    step[ 2] = 2*(temp1);
+
+    temp1 = input[4]*C4;
+    temp2 = input[12]*C12;
+    temp1 += temp2;
+    temp1 *= C8;
+    step[ 3] = 2*(temp1);
+
+    temp1 = input[2]*C8;
+    temp1 = 2*(temp1);
+    temp2 = input[6] + input[10];
+
+    step[ 4] = temp1 + temp2;
+    step[ 5] = temp1 - temp2;
+
+    temp1 = input[14]*C8;
+    temp1 = 2*(temp1);
+    temp2 = input[6] - input[10];
+
+    step[ 6] = temp2 - temp1;
+    step[ 7] = temp2 + temp1;
+
+    // for odd input
+    temp1 = input[3]*C12;
+    temp2 = input[13]*C4;
+    temp1 += temp2;
+    temp1 *= C8;
+    intermediate[ 8] = 2*(temp1);
+
+    temp1 = input[3]*C4;
+    temp2 = input[13]*C12;
+    temp2 -= temp1;
+    temp2 *= C8;
+    intermediate[ 9] = 2*(temp2);
+
+    intermediate[10] = 2*(input[9]*C8);
+    intermediate[11] = input[15] - input[1];
+    intermediate[12] = input[15] + input[1];
+    intermediate[13] = 2*(input[7]*C8);
+
+    temp1 = input[11]*C12;
+    temp2 = input[5]*C4;
+    temp2 -= temp1;
+    temp2 *= C8;
+    intermediate[14] = 2*(temp2);
+
+    temp1 = input[11]*C4;
+    temp2 = input[5]*C12;
+    temp1 += temp2;
+    temp1 *= C8;
+    intermediate[15] = 2*(temp1);
+
+    step[ 8] = intermediate[ 8] + intermediate[14];
+    step[ 9] = intermediate[ 9] + intermediate[15];
+    step[10] = intermediate[10] + intermediate[11];
+    step[11] = intermediate[10] - intermediate[11];
+    step[12] = intermediate[12] + intermediate[13];
+    step[13] = intermediate[12] - intermediate[13];
+    step[14] = intermediate[ 8] - intermediate[14];
+    step[15] = intermediate[ 9] - intermediate[15];
+
+    // step 3
+    output[0] = step[ 0] + step[ 3];
+    output[1] = step[ 1] + step[ 2];
+    output[2] = step[ 1] - step[ 2];
+    output[3] = step[ 0] - step[ 3];
+
+    temp1 = step[ 4]*C14;
+    temp2 = step[ 7]*C2;
+    temp1 -= temp2;
+    output[4] = temp1;
+
+    temp1 = step[ 4]*C2;
+    temp2 = step[ 7]*C14;
+    temp1 += temp2;
+    output[7] = temp1;
+
+    temp1 = step[ 5]*C10;
+    temp2 = step[ 6]*C6;
+    temp1 -= temp2;
+    output[5] = temp1;
+
+    temp1 = step[ 5]*C6;
+    temp2 = step[ 6]*C10;
+    temp1 += temp2;
+    output[6] = temp1;
+
+    output[8] = step[ 8] + step[11];
+    output[9] = step[ 9] + step[10];
+    output[10] = step[ 9] - step[10];
+    output[11] = step[ 8] - step[11];
+    output[12] = step[12] + step[15];
+    output[13] = step[13] + step[14];
+    output[14] = step[13] - step[14];
+    output[15] = step[12] - step[15];
+
+    // output 4
+    step[ 0] = output[0] + output[7];
+    step[ 1] = output[1] + output[6];
+    step[ 2] = output[2] + output[5];
+    step[ 3] = output[3] + output[4];
+    step[ 4] = output[3] - output[4];
+    step[ 5] = output[2] - output[5];
+    step[ 6] = output[1] - output[6];
+    step[ 7] = output[0] - output[7];
+
+    temp1 = output[8]*C7;
+    temp2 = output[15]*C9;
+    temp1 -= temp2;
+    step[ 8] = temp1;
+
+    temp1 = output[9]*C11;
+    temp2 = output[14]*C5;
+    temp1 += temp2;
+    step[ 9] = temp1;
+
+    temp1 = output[10]*C3;
+    temp2 = output[13]*C13;
+    temp1 -= temp2;
+    step[10] = temp1;
+
+    temp1 = output[11]*C15;
+    temp2 = output[12]*C1;
+    temp1 += temp2;
+    step[11] = temp1;
+
+    temp1 = output[11]*C1;
+    temp2 = output[12]*C15;
+    temp2 -= temp1;
+    step[12] = temp2;
+
+    temp1 = output[10]*C13;
+    temp2 = output[13]*C3;
+    temp1 += temp2;
+    step[13] = temp1;
+
+    temp1 = output[9]*C5;
+    temp2 = output[14]*C11;
+    temp2 -= temp1;
+    step[14] = temp2;
+
+    temp1 = output[8]*C9;
+    temp2 = output[15]*C7;
+    temp1 += temp2;
+    step[15] = temp1;
+
+    // step 5
+    output[0] = (step[0] + step[15]);
+    output[1] = (step[1] + step[14]);
+    output[2] = (step[2] + step[13]);
+    output[3] = (step[3] + step[12]);
+    output[4] = (step[4] + step[11]);
+    output[5] = (step[5] + step[10]);
+    output[6] = (step[6] + step[ 9]);
+    output[7] = (step[7] + step[ 8]);
+
+    output[15] = (step[0] - step[15]);
+    output[14] = (step[1] - step[14]);
+    output[13] = (step[2] - step[13]);
+    output[12] = (step[3] - step[12]);
+    output[11] = (step[4] - step[11]);
+    output[10] = (step[5] - step[10]);
+    output[9] = (step[6] - step[ 9]);
+    output[8] = (step[7] - step[ 8]);
+  }
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
+
+// Remove once an int version of iDCT is written
+#if 0
+void reference_16x16_idct_1d(double input[16], double output[16]) {
+
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    const double kPi = 3.141592653589793238462643383279502884;
+    const double kSqrt2 = 1.414213562373095048801688724209698;
+    for (int k = 0; k < 16; k++) {
+      output[k] = 0.0;
+      for (int n = 0; n < 16; n++) {
+        output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0);
+        if (n == 0)
+          output[k] = output[k]/kSqrt2;
+      }
+    }
+  }
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
+#endif
+
+void vp9_short_idct16x16_c(short *input, short *output, int pitch) {
+
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    double out[16*16], out2[16*16];
+    const int short_pitch = pitch >> 1;
+    int i, j;
+      // First transform rows
+    for (i = 0; i < 16; ++i) {
+      double temp_in[16], temp_out[16];
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = input[j + i*short_pitch];
+      butterfly_16x16_idct_1d(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        out[j + i*16] = temp_out[j];
+    }
+    // Then transform columns
+    for (i = 0; i < 16; ++i) {
+      double temp_in[16], temp_out[16];
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j*16 + i];
+      butterfly_16x16_idct_1d(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        out2[j*16 + i] = temp_out[j];
+    }
+    for (i = 0; i < 16*16; ++i)
+      output[i] = round(out2[i]/128);
+  }
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
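+
+/* Like the #if 0 reference above, this is still a floating-point
+ * 16x16 inverse (rows, then columns, through butterfly_16x16_idct_1d,
+ * with a final round(x / 128) rescale); presumably it is meant to be
+ * replaced once an integer version exists.
+ */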
--- /dev/null
+++ b/vp9/common/implicit_segmentation.c
@@ -1,0 +1,255 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/onyxc_int.h"
+
+#define MAX_REGIONS 24000
+#ifndef NULL
+#define NULL 0
+#endif
+
+#define min_mbs_in_region 3
+
+// this linked list structure holds equivalences for connected
+// component labeling
+struct list_el {
+  int label;
+  int seg_value;
+  int count;
+  struct list_el *next;
+};
+typedef struct list_el item;
+
+// connected color segments
+typedef struct {
+  int min_x;
+  int min_y;
+  int max_x;
+  int max_y;
+  long long sum_x;
+  long long sum_y;
+  int pixels;
+  int seg_value;
+  int label;
+} segment_info;
+
+
+typedef enum {
+  SEGMENT_MODE,
+  SEGMENT_MV,
+  SEGMENT_REFFRAME,
+  SEGMENT_SKIPPED
+} SEGMENT_TYPE;
+
+
+// this merges the two equivalence lists and
+// then makes sure that every label points to the same
+// equivalence list
+void merge(item *labels, int u, int v) {
+  item *a = labels[u].next;
+  item *b = labels[v].next;
+  item c;
+  item *it = &c;
+  int count;
+
+  // check if they are already merged
+  if (u == v || a == b)
+    return;
+
+  count = a->count + b->count;
+
+  // merge 2 sorted linked lists.
+  while (a != NULL && b != NULL) {
+    if (a->label < b->label) {
+      it->next = a;
+      a = a->next;
+    } else {
+      it->next = b;
+      b = b->next;
+    }
+
+    it = it->next;
+  }
+
+  if (a == NULL)
+    it->next = b;
+  else
+    it->next = a;
+
+  it = c.next;
+
+  // make sure every equivalence in the linked list points to this new ll
+  while (it != NULL) {
+    labels[it->label].next = c.next;
+    it = it->next;
+  }
+  c.next->count = count;
+}
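+
+/* Example: when labels 3 and 5 are found to touch, merge(labels, 3, 5)
+ * splices their two sorted equivalence lists into one and repoints
+ * every member label at the combined list, so later lookups through
+ * either label reach the same head (a union step without path
+ * compression).
+ */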
+
+void segment_via_mode_info(VP9_COMMON *oci, int how) {
+  MODE_INFO *mi = oci->mi;
+  int i, j;
+  int mb_index = 0;
+
+  int label = 1;
+  int pitch = oci->mb_cols;
+
+  // holds linked list equivalences
+  // the max should probably be allocated at a higher level in oci
+  item equivalences[MAX_REGIONS];
+  int eq_ptr = 0;
+  item labels[MAX_REGIONS];
+  segment_info segments[MAX_REGIONS];
+  int label_count = 1;
+  int labeling[400 * 300];
+  int *lp = labeling;
+
+  label_count = 1;
+  memset(labels, 0, sizeof(labels));
+  memset(segments, 0, sizeof(segments));
+
+  /* Go through each macroblock, first-pass labelling */
+  for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
+    for (j = 0; j < oci->mb_cols; j++) {
+      // seg_values: above (a), left (l) and current (n)
+      int a = -1, l = -1, n = -1;
+
+      // above label, left label
+      int al = -1, ll = -1;
+      if (i) {
+        al = lp[j - pitch];
+        a = labels[al].next->seg_value;
+      }
+      if (j) {
+        ll = lp[j - 1];
+        l = labels[ll].next->seg_value;
+      }
+
+      // what setting are we going to do the implicit segmentation on
+      switch (how) {
+        case SEGMENT_MODE:
+          n = mi[mb_index].mbmi.mode;
+          break;
+        case SEGMENT_MV:
+          n = mi[mb_index].mbmi.mv[0].as_int;
+          if (mi[mb_index].mbmi.ref_frame == INTRA_FRAME)
+            n = -9999999;
+          break;
+        case SEGMENT_REFFRAME:
+          n = mi[mb_index].mbmi.ref_frame;
+          break;
+        case SEGMENT_SKIPPED:
+          n = mi[mb_index].mbmi.mb_skip_coeff;
+          break;
+      }
+
+      // above and left both have the same seg_value
+      if (n == a && n == l) {
+        // pick the lowest label
+        lp[j] = (al < ll ? al : ll);
+        labels[lp[j]].next->count++;
+
+        // merge the above and left equivalencies
+        merge(labels, al, ll);
+      }
+      // this matches above seg_value
+      else if (n == a) {
+        // give it the same label as above
+        lp[j] = al;
+        labels[al].next->count++;
+      }
+      // this matches left seg_value
+      else if (n == l) {
+        // give it the same label as left
+        lp[j] = ll;
+        labels[ll].next->count++;
+      } else {
+        // new label doesn't match either
+        item *e = &labels[label];
+        item *nl = &equivalences[eq_ptr++];
+        lp[j] = label;
+        nl->label = label;
+        nl->next = 0;
+        nl->seg_value = n;
+        nl->count = 1;
+        e->next = nl;
+        label++;
+      }
+      mb_index++;
+    }
+    mb_index++;
+  }
+  lp = labeling;
+
+  // give new labels to regions
+  for (i = 1; i < label; i++)
+    if (labels[i].next->count > min_mbs_in_region &&
+        labels[labels[i].next->label].label == 0) {
+      segment_info *cs = &segments[label_count];
+      cs->label = label_count;
+      labels[labels[i].next->label].label = label_count++;
+      labels[labels[i].next->label].seg_value  = labels[i].next->seg_value;
+      cs->seg_value = labels[labels[i].next->label].seg_value;
+      cs->min_x = oci->mb_cols;
+      cs->min_y = oci->mb_rows;
+      cs->max_x = 0;
+      cs->max_y = 0;
+      cs->sum_x = 0;
+      cs->sum_y = 0;
+      cs->pixels = 0;
+    }
+  lp = labeling;
+
+  // this is just to gather stats...
+  for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
+    for (j = 0; j < oci->mb_cols; j++) {
+      segment_info *cs;
+      int oldlab = labels[lp[j]].next->label;
+      int lab = labels[oldlab].label;
+      lp[j] = lab;
+
+      cs = &segments[lab];
+
+      cs->min_x = (j < cs->min_x ? j : cs->min_x);
+      cs->max_x = (j > cs->max_x ? j : cs->max_x);
+      cs->min_y = (i < cs->min_y ? i : cs->min_y);
+      cs->max_y = (i > cs->max_y ? i : cs->max_y);
+      cs->sum_x += j;
+      cs->sum_y += i;
+      cs->pixels++;
+
+      lp[j] = lab;
+      mb_index++;
+    }
+    mb_index++;
+  }
+
+  {
+    lp = labeling;
+    printf("labelling \n");
+    mb_index = 0;
+    for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
+      for (j = 0; j < oci->mb_cols; j++) {
+        printf("%4d", lp[j]);
+      }
+      printf("            ");
+      for (j = 0; j < oci->mb_cols; j++, mb_index++) {
+        // printf("%3d",mi[mb_index].mbmi.mode );
+        printf("%4d:%4d", mi[mb_index].mbmi.mv[0].as_mv.row,
+            mi[mb_index].mbmi.mv[0].as_mv.col);
+      }
+      printf("\n");
+      ++mb_index;
+    }
+    printf("\n");
+  }
+}
+
--- /dev/null
+++ b/vp9/common/invtrans.c
@@ -1,0 +1,135 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "invtrans.h"
+
+static void recon_dcblock(MACROBLOCKD *xd) {
+  BLOCKD *b = &xd->block[24];
+  int i;
+
+  for (i = 0; i < 16; i++) {
+    xd->block[i].dqcoeff[0] = b->diff[i];
+  }
+}
+
+static void recon_dcblock_8x8(MACROBLOCKD *xd) {
+  BLOCKD *b = &xd->block[24]; // for coeff 0, 2, 8, 10
+
+  xd->block[0].dqcoeff[0] = b->diff[0];
+  xd->block[4].dqcoeff[0] = b->diff[1];
+  xd->block[8].dqcoeff[0] = b->diff[4];
+  xd->block[12].dqcoeff[0] = b->diff[8];
+}
+
+void vp9_inverse_transform_b_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+                                 BLOCKD *b, int pitch) {
+  if (b->eob <= 1)
+    IDCT_INVOKE(rtcd, idct1)(b->dqcoeff, b->diff, pitch);
+  else
+    IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch);
+}
+
+void vp9_inverse_transform_mby_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+                                   MACROBLOCKD *xd) {
+  int i;
+  BLOCKD *blockd = xd->block;
+
+  if (xd->mode_info_context->mbmi.mode != SPLITMV) {
+    /* do 2nd order transform on the dc block */
+    IDCT_INVOKE(rtcd, iwalsh16)(blockd[24].dqcoeff, blockd[24].diff);
+    recon_dcblock(xd);
+  }
+
+  for (i = 0; i < 16; i++) {
+    vp9_inverse_transform_b_4x4(rtcd, &blockd[i], 32);
+  }
+}
+
+void vp9_inverse_transform_mbuv_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+                                    MACROBLOCKD *xd) {
+  int i;
+  BLOCKD *blockd = xd->block;
+
+  for (i = 16; i < 24; i++) {
+    vp9_inverse_transform_b_4x4(rtcd, &blockd[i], 16);
+  }
+}
+
+void vp9_inverse_transform_mb_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+                                  MACROBLOCKD *xd) {
+  vp9_inverse_transform_mby_4x4(rtcd, xd);
+  vp9_inverse_transform_mbuv_4x4(rtcd, xd);
+}
+
+void vp9_inverse_transform_b_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+                                 short *input_dqcoeff, short *output_coeff,
+                                 int pitch) {
+  // int b,i;
+  // if (b->eob > 1)
+  IDCT_INVOKE(rtcd, idct8)(input_dqcoeff, output_coeff, pitch);
+  // else
+  // IDCT_INVOKE(rtcd, idct8_1)(b->dqcoeff, b->diff, pitch);//pitch
+}
+
+void vp9_inverse_transform_mby_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+                                   MACROBLOCKD *xd) {
+  int i;
+  BLOCKD *blockd = xd->block;
+
+  if (xd->mode_info_context->mbmi.mode != SPLITMV) {
+    // do 2nd order transform on the dc block
+    IDCT_INVOKE(rtcd, ihaar2)(blockd[24].dqcoeff, blockd[24].diff, 8);
+    recon_dcblock_8x8(xd); // need to change for 8x8
+  }
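+
+  /* Four 8x8 luma inverses: dqcoeff blocks 0, 4, 8, 12 feed the diff
+   * buffers at block offsets 0, 2, 8, 10 (left and right halves of the
+   * top and bottom 8 rows), matching recon_dcblock_8x8() above.
+   */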
+
+  for (i = 0; i < 9; i += 8) {
+    vp9_inverse_transform_b_8x8(rtcd, &blockd[i].dqcoeff[0],
+                                &blockd[i].diff[0], 32);
+  }
+  for (i = 2; i < 11; i += 8) {
+    vp9_inverse_transform_b_8x8(rtcd, &blockd[i + 2].dqcoeff[0],
+                                &blockd[i].diff[0], 32);
+  }
+}
+
+void vp9_inverse_transform_mbuv_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+                                    MACROBLOCKD *xd) {
+  int i;
+  BLOCKD *blockd = xd->block;
+
+  for (i = 16; i < 24; i += 4) {
+    vp9_inverse_transform_b_8x8(rtcd, &blockd[i].dqcoeff[0],
+                                &blockd[i].diff[0], 16);
+  }
+}
+
+void vp9_inverse_transform_mb_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+                                  MACROBLOCKD *xd) {
+  vp9_inverse_transform_mby_8x8(rtcd, xd);
+  vp9_inverse_transform_mbuv_8x8(rtcd, xd);
+}
+
+void vp9_inverse_transform_b_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+                                   short *input_dqcoeff,
+                                   short *output_coeff, int pitch) {
+  IDCT_INVOKE(rtcd, idct16x16)(input_dqcoeff, output_coeff, pitch);
+}
+
+void vp9_inverse_transform_mby_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+                                     MACROBLOCKD *xd) {
+  vp9_inverse_transform_b_16x16(rtcd, &xd->block[0].dqcoeff[0],
+                                &xd->block[0].diff[0], 32);
+}
+
+void vp9_inverse_transform_mb_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+                                    MACROBLOCKD *xd) {
+  vp9_inverse_transform_mby_16x16(rtcd, xd);
+  vp9_inverse_transform_mbuv_8x8(rtcd, xd);
+}
--- /dev/null
+++ b/vp9/common/invtrans.h
@@ -1,0 +1,53 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_INVTRANS_H
+#define __INC_INVTRANS_H
+
+#include "vpx_ports/config.h"
+#include "idct.h"
+#include "blockd.h"
+
+extern void vp9_inverse_transform_b_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+                                        BLOCKD *b, int pitch);
+
+extern void vp9_inverse_transform_mb_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+                                         MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mby_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+                                          MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mbuv_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
+                                           MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_b_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+                                        short *input_dqcoeff,
+                                        short *output_coeff, int pitch);
+
+extern void vp9_inverse_transform_mb_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+                                         MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mby_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+                                          MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mbuv_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
+                                           MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_b_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+                                          short *input_dqcoeff,
+                                          short *output_coeff, int pitch);
+
+extern void vp9_inverse_transform_mb_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+                                           MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mby_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
+                                            MACROBLOCKD *xd);
+
+#endif  // __INC_INVTRANS_H
--- /dev/null
+++ b/vp9/common/loopfilter.c
@@ -1,0 +1,524 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "loopfilter.h"
+#include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/seg_common.h"
+
+static void lf_init_lut(loop_filter_info_n *lfi) {
+  int filt_lvl;
+
+  for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++) {
+    if (filt_lvl >= 40) {
+      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
+      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
+    } else if (filt_lvl >= 20) {
+      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
+    } else if (filt_lvl >= 15) {
+      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
+    } else {
+      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
+      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
+    }
+  }
+
+  lfi->mode_lf_lut[DC_PRED] = 1;
+  lfi->mode_lf_lut[D45_PRED] = 1;
+  lfi->mode_lf_lut[D135_PRED] = 1;
+  lfi->mode_lf_lut[D117_PRED] = 1;
+  lfi->mode_lf_lut[D153_PRED] = 1;
+  lfi->mode_lf_lut[D27_PRED] = 1;
+  lfi->mode_lf_lut[D63_PRED] = 1;
+  lfi->mode_lf_lut[V_PRED] = 1;
+  lfi->mode_lf_lut[H_PRED] = 1;
+  lfi->mode_lf_lut[TM_PRED] = 1;
+  lfi->mode_lf_lut[B_PRED]  = 0;
+  lfi->mode_lf_lut[I8X8_PRED] = 0;
+  lfi->mode_lf_lut[ZEROMV]  = 1;
+  lfi->mode_lf_lut[NEARESTMV] = 2;
+  lfi->mode_lf_lut[NEARMV] = 2;
+  lfi->mode_lf_lut[NEWMV] = 2;
+  lfi->mode_lf_lut[SPLITMV] = 3;
+}
+
+void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+                                      int sharpness_lvl) {
+  int i;
+
+  /* For each possible value for the loop filter fill out limits */
+  for (i = 0; i <= MAX_LOOP_FILTER; i++) {
+    int filt_lvl = i;
+    int block_inside_limit = 0;
+
+    /* Set loop filter parameters that control sharpness. */
+    block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
+    block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
+
+    if (sharpness_lvl > 0) {
+      if (block_inside_limit > (9 - sharpness_lvl))
+        block_inside_limit = (9 - sharpness_lvl);
+    }
+
+    if (block_inside_limit < 1)
+      block_inside_limit = 1;
+
+    vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
+    vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
+               SIMD_WIDTH);
+    vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
+               SIMD_WIDTH);
+  }
+}
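+
+/* Example: filt_lvl = 32 with sharpness_lvl = 5 gives
+ * block_inside_limit = (32 >> 1) >> 1 = 8, clamped to 9 - 5 = 4, so
+ * lim = 4, blim = 2 * 32 + 4 = 68 and mblim = 2 * (32 + 2) + 4 = 72,
+ * each replicated across SIMD_WIDTH lanes.
+ */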
+
+void vp9_loop_filter_init(VP9_COMMON *cm) {
+  loop_filter_info_n *lfi = &cm->lf_info;
+  int i;
+
+  /* init limits for given sharpness*/
+  vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+  cm->last_sharpness_level = cm->sharpness_level;
+
+  /* init LUT for lvl  and hev thr picking */
+  lf_init_lut(lfi);
+
+  /* init hev threshold const vectors */
+  for (i = 0; i < 4; i++) {
+    vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+  }
+}
+
+void vp9_loop_filter_frame_init(VP9_COMMON *cm,
+                                MACROBLOCKD *xd,
+                                int default_filt_lvl) {
+  int seg,  /* segment number */
+      ref,  /* index in ref_lf_deltas */
+      mode; /* index in mode_lf_deltas */
+
+  loop_filter_info_n *lfi = &cm->lf_info;
+
+  /* update limits if sharpness has changed */
+  if (cm->last_sharpness_level != cm->sharpness_level) {
+    vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+    cm->last_sharpness_level = cm->sharpness_level;
+  }
+
+  for (seg = 0; seg < MAX_MB_SEGMENTS; seg++) {
+    int lvl_seg = default_filt_lvl;
+    int lvl_ref, lvl_mode;
+
+
+    // Set the baseline filter values for each segment
+    if (vp9_segfeature_active(xd, seg, SEG_LVL_ALT_LF)) {
+      /* Abs value */
+      if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
+        lvl_seg = vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);
+      } else { /* Delta Value */
+        lvl_seg += vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);
+        lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0;
+      }
+    }
+
+    if (!xd->mode_ref_lf_delta_enabled) {
+      /* we could get rid of this if we assume that deltas are set to
+       * zero when not in use; encoder always uses deltas
+       */
+      vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4);
+      continue;
+    }
+
+    lvl_ref = lvl_seg;
+
+    /* INTRA_FRAME */
+    ref = INTRA_FRAME;
+
+    /* Apply delta for reference frame */
+    lvl_ref += xd->ref_lf_deltas[ref];
+
+    /* Apply delta for Intra modes */
+    mode = 0; /* B_PRED */
+    /* Only the split mode BPRED has a further special case */
+    lvl_mode = lvl_ref +  xd->mode_lf_deltas[mode];
+    lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+
+    lfi->lvl[seg][ref][mode] = lvl_mode;
+
+    mode = 1; /* all the rest of Intra modes */
+    lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref)  : 0; /* clamp */
+    lfi->lvl[seg][ref][mode] = lvl_mode;
+
+    /* LAST, GOLDEN, ALT */
+    for (ref = 1; ref < MAX_REF_FRAMES; ref++) {
+      int lvl_ref = lvl_seg;
+
+      /* Apply delta for reference frame */
+      lvl_ref += xd->ref_lf_deltas[ref];
+
+      /* Apply delta for Inter modes */
+      for (mode = 1; mode < 4; mode++) {
+        lvl_mode = lvl_ref + xd->mode_lf_deltas[mode];
+        lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+
+        lfi->lvl[seg][ref][mode] = lvl_mode;
+      }
+    }
+  }
+}
+
+void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) {
+  YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+  loop_filter_info_n *lfi_n = &cm->lf_info;
+  struct loop_filter_info lfi;
+
+  FRAME_TYPE frame_type = cm->frame_type;
+
+  int mb_row;
+  int mb_col;
+
+  int filter_level;
+
+  unsigned char *y_ptr, *u_ptr, *v_ptr;
+
+  /* Point at base of Mb MODE_INFO list */
+  const MODE_INFO *mode_info_context = cm->mi;
+
+  /* Initialize the loop filter for this frame. */
+  vp9_loop_filter_frame_init(cm, xd, cm->filter_level);
+
+  /* Set up the buffer pointers */
+  y_ptr = post->y_buffer;
+  u_ptr = post->u_buffer;
+  v_ptr = post->v_buffer;
+
+  /* vp9_filter each macro block */
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                     mode_info_context->mbmi.mode != I8X8_PRED &&
+                     mode_info_context->mbmi.mode != SPLITMV &&
+                     mode_info_context->mbmi.mb_skip_coeff);
+
+      const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+      const int seg = mode_info_context->mbmi.segment_id;
+      const int ref_frame = mode_info_context->mbmi.ref_frame;
+      int tx_type = mode_info_context->mbmi.txfm_size;
+      filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+      if (filter_level) {
+        if (cm->filter_type == NORMAL_LOOPFILTER) {
+          const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+          lfi.mblim = lfi_n->mblim[filter_level];
+          lfi.blim = lfi_n->blim[filter_level];
+          lfi.lim = lfi_n->lim[filter_level];
+          lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+          if (mb_col > 0
+#if CONFIG_SUPERBLOCKS
+              && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
+                   mode_info_context[0].mbmi.mb_skip_coeff &&
+                   mode_info_context[-1].mbmi.mb_skip_coeff)
+#endif
+              )
+            vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post->y_stride,
+                                post->uv_stride, &lfi);
+
+          if (!skip_lf && tx_type != TX_16X16) {
+            if (tx_type == TX_8X8)
+              vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
+                                    post->uv_stride, &lfi);
+            else
+              vp9_loop_filter_bv(y_ptr, u_ptr, v_ptr, post->y_stride,
+                                 post->uv_stride, &lfi);
+
+          }
+
+          /* don't apply across umv border */
+          if (mb_row > 0
+#if CONFIG_SUPERBLOCKS
+              && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
+                   mode_info_context[0].mbmi.mb_skip_coeff &&
+                   mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
+#endif
+              )
+            vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post->y_stride,
+                                post->uv_stride, &lfi);
+
+          if (!skip_lf && tx_type != TX_16X16) {
+            if (tx_type == TX_8X8)
+              vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
+                                    post->uv_stride, &lfi);
+            else
+              vp9_loop_filter_bh(y_ptr, u_ptr, v_ptr, post->y_stride,
+                                 post->uv_stride, &lfi);
+          }
+        } else {
+          // FIXME: Not 8x8 aware
+          if (mb_col > 0
+#if CONFIG_SUPERBLOCKS
+              && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
+                   mode_info_context[0].mbmi.mb_skip_coeff &&
+                   mode_info_context[-1].mbmi.mb_skip_coeff)
+#endif
+              )
+            vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
+                                       lfi_n->mblim[filter_level]);
+
+          if (!skip_lf)
+            vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
+                                      lfi_n->blim[filter_level]);
+
+          /* don't apply across umv border */
+          if (mb_row > 0
+#if CONFIG_SUPERBLOCKS
+              && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
+                   mode_info_context[0].mbmi.mb_skip_coeff &&
+                   mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
+#endif
+              )
+            vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
+                                       lfi_n->mblim[filter_level]);
+
+          if (!skip_lf)
+            vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
+                                      lfi_n->blim[filter_level]);
+        }
+      }
+
+      y_ptr += 16;
+      u_ptr += 8;
+      v_ptr += 8;
+
+      mode_info_context++;     /* step to next MB */
+    }
+
+    y_ptr += post->y_stride  * 16 - post->y_width;
+    u_ptr += post->uv_stride *  8 - post->uv_width;
+    v_ptr += post->uv_stride *  8 - post->uv_width;
+
+    mode_info_context++;         /* Skip border mb */
+  }
+}
+
+void vp9_loop_filter_frame_yonly(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                 int default_filt_lvl) {
+  YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+  unsigned char *y_ptr;
+  int mb_row;
+  int mb_col;
+
+  loop_filter_info_n *lfi_n = &cm->lf_info;
+  struct loop_filter_info lfi;
+
+  int filter_level;
+  FRAME_TYPE frame_type = cm->frame_type;
+
+  /* Point at base of Mb MODE_INFO list */
+  const MODE_INFO *mode_info_context = cm->mi;
+
+#if 0
+  if (default_filt_lvl == 0) /* no filter applied */
+    return;
+#endif
+
+  /* Initialize the loop filter for this frame. */
+  vp9_loop_filter_frame_init(cm, xd, default_filt_lvl);
+
+  /* Set up the buffer pointers */
+  y_ptr = post->y_buffer;
+
+  /* vp9_filter each macro block */
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                     mode_info_context->mbmi.mode != I8X8_PRED &&
+                     mode_info_context->mbmi.mode != SPLITMV &&
+                     mode_info_context->mbmi.mb_skip_coeff);
+
+      const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+      const int seg = mode_info_context->mbmi.segment_id;
+      const int ref_frame = mode_info_context->mbmi.ref_frame;
+      int tx_type = mode_info_context->mbmi.txfm_size;
+      filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+      if (filter_level) {
+        if (cm->filter_type == NORMAL_LOOPFILTER) {
+          const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+          lfi.mblim = lfi_n->mblim[filter_level];
+          lfi.blim = lfi_n->blim[filter_level];
+          lfi.lim = lfi_n->lim[filter_level];
+          lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+          if (mb_col > 0)
+            vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+          if (!skip_lf && tx_type != TX_16X16) {
+            if (tx_type == TX_8X8)
+              vp9_loop_filter_bv8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+            else
+              vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+          }
+
+          /* don't apply across umv border */
+          if (mb_row > 0)
+            vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+          if (!skip_lf && tx_type != TX_16X16) {
+            if (tx_type == TX_8X8)
+              vp9_loop_filter_bh8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+            else
+              vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+          }
+        } else {
+          // FIXME: Not 8x8 aware
+          if (mb_col > 0)
+            vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
+                                       lfi_n->mblim[filter_level]);
+
+          if (!skip_lf)
+            vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
+                                      lfi_n->blim[filter_level]);
+
+          /* don't apply across umv border */
+          if (mb_row > 0)
+            vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
+                                       lfi_n->mblim[filter_level]);
+
+          if (!skip_lf)
+            vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
+                                      lfi_n->blim[filter_level]);
+        }
+      }
+
+      y_ptr += 16;
+      mode_info_context++;        /* step to next MB */
+    }
+
+    y_ptr += post->y_stride  * 16 - post->y_width;
+    mode_info_context++;            /* Skip border mb */
+  }
+}
+
+void vp9_loop_filter_partial_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                   int default_filt_lvl) {
+  YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+  unsigned char *y_ptr;
+  int mb_row;
+  int mb_col;
+  int mb_cols = post->y_width  >> 4;
+
+  int linestocopy, i;
+
+  loop_filter_info_n *lfi_n = &cm->lf_info;
+  struct loop_filter_info lfi;
+
+  int filter_level;
+  int alt_flt_enabled = xd->segmentation_enabled;
+  FRAME_TYPE frame_type = cm->frame_type;
+
+  const MODE_INFO *mode_info_context;
+
+  int lvl_seg[MAX_MB_SEGMENTS];
+
+  mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
+
+  /* 3 is a magic number. 4 is probably magic too */
+  linestocopy = (post->y_height >> (4 + 3));
+
+  if (linestocopy < 1)
+    linestocopy = 1;
+
+  linestocopy <<= 4;
+
+  /* Note the baseline filter values for each segment */
+  /* See vp9_loop_filter_frame_init. Rather than call that for each change
+   * to default_filt_lvl, copy the relevant calculation here.
+   */
+  if (alt_flt_enabled) {
+    for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+      /* Abs value */
+      if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
+        lvl_seg[i] = vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);
+      }
+      /* Delta Value */
+      else {
+        lvl_seg[i] = default_filt_lvl +
+                     vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);
+        lvl_seg[i] = (lvl_seg[i] > 0) ?
+                     ((lvl_seg[i] > 63) ? 63 : lvl_seg[i]) : 0;
+      }
+    }
+  }
+
+  /* Set up the buffer pointers */
+  y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride;
+
+  /* vp9_filter each macro block */
+  for (mb_row = 0; mb_row < (linestocopy >> 4); mb_row++) {
+    for (mb_col = 0; mb_col < mb_cols; mb_col++) {
+      int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                     mode_info_context->mbmi.mode != I8X8_PRED &&
+                     mode_info_context->mbmi.mode != SPLITMV &&
+                     mode_info_context->mbmi.mb_skip_coeff);
+
+      if (alt_flt_enabled)
+        filter_level = lvl_seg[mode_info_context->mbmi.segment_id];
+      else
+        filter_level = default_filt_lvl;
+
+      if (filter_level) {
+        if (cm->filter_type == NORMAL_LOOPFILTER) {
+          const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+          lfi.mblim = lfi_n->mblim[filter_level];
+          lfi.blim = lfi_n->blim[filter_level];
+          lfi.lim = lfi_n->lim[filter_level];
+          lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+          if (mb_col > 0)
+            vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+          if (!skip_lf)
+            vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+          vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+          if (!skip_lf)
+            vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+        } else {
+          if (mb_col > 0)
+            vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
+                                        lfi_n->mblim[filter_level]);
+
+          if (!skip_lf)
+            vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
+                                      lfi_n->blim[filter_level]);
+
+          vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
+                                     lfi_n->mblim[filter_level]);
+
+          if (!skip_lf)
+            vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
+                                      lfi_n->blim[filter_level]);
+        }
+      }
+
+      y_ptr += 16;
+      mode_info_context += 1;      /* step to next MB */
+    }
+
+    y_ptr += post->y_stride  * 16 - post->y_width;
+    mode_info_context += 1;          /* Skip border mb */
+  }
+}
--- /dev/null
+++ b/vp9/common/loopfilter.h
@@ -1,0 +1,104 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef loopfilter_h
+#define loopfilter_h
+
+#include "vpx_ports/mem.h"
+#include "vpx_config.h"
+#include "blockd.h"
+
+#define MAX_LOOP_FILTER 63
+
+typedef enum {
+  NORMAL_LOOPFILTER = 0,
+  SIMPLE_LOOPFILTER = 1
+} LOOPFILTERTYPE;
+
+#if ARCH_ARM
+#define SIMD_WIDTH 1
+#else
+#define SIMD_WIDTH 16
+#endif
+
+/* Need to align this structure so when it is declared and
+ * passed it can be loaded into vector registers.
+ */
+typedef struct {
+  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+                  mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+                  blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+                  lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+                  hev_thr[4][SIMD_WIDTH]);
+  unsigned char lvl[4][4][4];
+  unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
+  unsigned char mode_lf_lut[MB_MODE_COUNT];
+} loop_filter_info_n;
+
+struct loop_filter_info {
+  const unsigned char *mblim;
+  const unsigned char *blim;
+  const unsigned char *lim;
+  const unsigned char *hev_thr;
+};
+
+#define prototype_loopfilter(sym) \
+  void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+           const unsigned char *limit, const unsigned char *thresh, int count)
+
+#define prototype_loopfilter_block(sym) \
+  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
+           int ystride, int uv_stride, struct loop_filter_info *lfi)
+
+#define prototype_simple_loopfilter(sym) \
+  void sym(unsigned char *y, int ystride, const unsigned char *blimit)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/loopfilter_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/loopfilter_arm.h"
+#endif
+
+typedef void loop_filter_uvfunction(unsigned char *u,   /* source pointer */
+                                    int p,              /* pitch */
+                                    const unsigned char *blimit,
+                                    const unsigned char *limit,
+                                    const unsigned char *thresh,
+                                    unsigned char *v);
+
+/* assorted loopfilter functions which get used elsewhere */
+struct VP9Common;
+struct macroblockd;
+
+void vp9_loop_filter_init(struct VP9Common *cm);
+
+void vp9_loop_filter_frame_init(struct VP9Common *cm,
+                                struct macroblockd *mbd,
+                                int default_filt_lvl);
+
+void vp9_loop_filter_frame(struct VP9Common *cm, struct macroblockd *mbd);
+
+void vp9_loop_filter_partial_frame(struct VP9Common *cm,
+                                   struct macroblockd *mbd,
+                                   int default_filt_lvl);
+
+void vp9_loop_filter_frame_yonly(struct VP9Common *cm,
+                                 struct macroblockd *mbd,
+                                 int default_filt_lvl);
+
+void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+                                      int sharpness_lvl);
+
+#endif  // loopfilter_h
--- /dev/null
+++ b/vp9/common/loopfilter_filters.c
@@ -1,0 +1,480 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "vpx_config.h"
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef unsigned char uc;
+
+static __inline signed char signed_char_clamp(int t) {
+  t = (t < -128 ? -128 : t);
+  t = (t > 127 ? 127 : t);
+  return (signed char) t;
+}
+
+
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
+static __inline signed char filter_mask(uc limit, uc blimit,
+                                        uc p3, uc p2, uc p1, uc p0,
+                                        uc q0, uc q1, uc q2, uc q3) {
+  signed char mask = 0;
+  mask |= (abs(p3 - p2) > limit) * -1;
+  mask |= (abs(p2 - p1) > limit) * -1;
+  mask |= (abs(p1 - p0) > limit) * -1;
+  mask |= (abs(q1 - q0) > limit) * -1;
+  mask |= (abs(q2 - q1) > limit) * -1;
+  mask |= (abs(q3 - q2) > limit) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  mask = ~mask;
+  return mask;
+}
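+
+/* E.g. with limit = 2 and blimit = 4, a run of equal pixels p3..q3
+ * trips none of the tests, so the mask comes back as ~0 (11111111,
+ * apply the filter); any single step beyond the limits flips it to
+ * 00000000.
+ */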
+
+/* is there high variance internal edge ( 11111111 yes, 00000000 no) */
+static __inline signed char hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1) {
+  signed char hev = 0;
+  hev  |= (abs(p1 - p0) > thresh) * -1;
+  hev  |= (abs(q1 - q0) > thresh) * -1;
+  return hev;
+}
+
+static __inline void filter(signed char mask, uc hev, uc *op1,
+                            uc *op0, uc *oq0, uc *oq1) {
+  signed char ps0, qs0;
+  signed char ps1, qs1;
+  signed char filter, Filter1, Filter2;
+  signed char u;
+
+  ps1 = (signed char) * op1 ^ 0x80;
+  ps0 = (signed char) * op0 ^ 0x80;
+  qs0 = (signed char) * oq0 ^ 0x80;
+  qs1 = (signed char) * oq1 ^ 0x80;
+
+  /* add outer taps if we have high edge variance */
+  filter = signed_char_clamp(ps1 - qs1);
+  filter &= hev;
+
+  /* inner taps */
+  filter = signed_char_clamp(filter + 3 * (qs0 - ps0));
+  filter &= mask;
+
+  /* save bottom 3 bits so that we round one side +4 and the other +3
+   * if it equals 4 we'll set to adjust by -1 to account for the fact
+   * we'd round 3 the other way
+   */
+  Filter1 = signed_char_clamp(filter + 4);
+  Filter2 = signed_char_clamp(filter + 3);
+  Filter1 >>= 3;
+  Filter2 >>= 3;
+  u = signed_char_clamp(qs0 - Filter1);
+  *oq0 = u ^ 0x80;
+  u = signed_char_clamp(ps0 + Filter2);
+  *op0 = u ^ 0x80;
+  filter = Filter1;
+
+  /* outer tap adjustments */
+  filter += 1;
+  filter >>= 1;
+  filter &= ~hev;
+
+  u = signed_char_clamp(qs1 - filter);
+  *oq1 = u ^ 0x80;
+  u = signed_char_clamp(ps1 + filter);
+  *op1 = u ^ 0x80;
+}
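+
+/* Rounding check for the +4/+3 trick above: filter = 5 gives
+ * Filter1 = 9 >> 3 = 1 and Filter2 = 8 >> 3 = 1, so q0 and p0 move by
+ * the same step here; the (Filter1 + 1) >> 1 value then roughly halves
+ * that step for the outer taps when hev is not set.
+ */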
+
+void vp9_loop_filter_horizontal_edge_c
+(
+  unsigned char *s,
+  int p, /* pitch */
+  const unsigned char *blimit,
+  const unsigned char *limit,
+  const unsigned char *thresh,
+  int count
+) {
+  int  hev = 0; /* high edge variance */
+  signed char mask = 0;
+  int i = 0;
+
+  /* loop filter designed to work using chars so that we can make maximum use
+   * of 8 bit simd instructions.
+   */
+  do {
+    mask = filter_mask(limit[0], blimit[0],
+                       s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
+                       s[0 * p], s[1 * p], s[2 * p], s[3 * p]);
+
+    hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
+
+    filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
+
+    ++s;
+  } while (++i < count * 8);
+}
+
+void vp9_loop_filter_vertical_edge_c(unsigned char *s,
+                                     int p,
+                                     const unsigned char *blimit,
+                                     const unsigned char *limit,
+                                     const unsigned char *thresh,
+                                     int count) {
+  int  hev = 0; /* high edge variance */
+  signed char mask = 0;
+  int i = 0;
+
+  /* loop filter designed to work using chars so that we can make maximum use
+   * of 8 bit simd instructions.
+   */
+  do {
+    mask = filter_mask(limit[0], blimit[0],
+                       s[-4], s[-3], s[-2], s[-1],
+                       s[0], s[1], s[2], s[3]);
+
+    hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+
+    filter(mask, hev, s - 2, s - 1, s, s + 1);
+
+    s += p;
+  } while (++i < count * 8);
+}
+
+static __inline signed char flatmask(uc thresh,
+                                     uc p4, uc p3, uc p2, uc p1, uc p0,
+                                     uc q0, uc q1, uc q2, uc q3, uc q4) {
+  signed char flat = 0;
+  flat |= (abs(p1 - p0) > 1) * -1;
+  flat |= (abs(q1 - q0) > 1) * -1;
+  flat |= (abs(p0 - p2) > 1) * -1;
+  flat |= (abs(q0 - q2) > 1) * -1;
+  flat |= (abs(p3 - p0) > 1) * -1;
+  flat |= (abs(q3 - q0) > 1) * -1;
+  flat |= (abs(p4 - p0) > 1) * -1;
+  flat |= (abs(q4 - q0) > 1) * -1;
+  flat = ~flat;
+  return flat;
+}
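+
+/* Note that flatmask() receives thresh but tests against a hard +/-1:
+ * the edge only counts as flat when every tap is within one level of
+ * the center pixels.
+ */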
+
+static __inline void mbfilter(signed char mask, uc hev, uc flat,
+                              uc *op4, uc *op3, uc *op2, uc *op1, uc *op0,
+                              uc *oq0, uc *oq1, uc *oq2, uc *oq3, uc *oq4) {
+  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
+  if (flat && mask) {
+    unsigned char p0, q0;
+    unsigned char p1, q1;
+    unsigned char p2, q2;
+    unsigned char p3, q3;
+    unsigned char p4, q4;
+
+    p4 = *op4;
+    p3 = *op3;
+    p2 = *op2;
+    p1 = *op1;
+    p0 = *op0;
+    q0 = *oq0;
+    q1 = *oq1;
+    q2 = *oq2;
+    q3 = *oq3;
+    q4 = *oq4;
+
+    *op2 = (p4 + p4 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;
+    *op1 = (p4 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;
+    *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3;
+    *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3;
+    *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q4 + 4) >> 3;
+    *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q4 + q4 + 4) >> 3;
+  } else {
+    signed char ps0, qs0;
+    signed char ps1, qs1;
+    signed char filter, Filter1, Filter2;
+    signed char u;
+
+    ps1 = (signed char) * op1 ^ 0x80;
+    ps0 = (signed char) * op0 ^ 0x80;
+    qs0 = (signed char) * oq0 ^ 0x80;
+    qs1 = (signed char) * oq1 ^ 0x80;
+
+    /* add outer taps if we have high edge variance */
+    filter = signed_char_clamp(ps1 - qs1);
+    filter &= hev;
+
+    /* inner taps */
+    filter = signed_char_clamp(filter + 3 * (qs0 - ps0));
+    filter &= mask;
+
+    Filter1 = signed_char_clamp(filter + 4);
+    Filter2 = signed_char_clamp(filter + 3);
+    Filter1 >>= 3;
+    Filter2 >>= 3;
+
+    u = signed_char_clamp(qs0 - Filter1);
+    *oq0 = u ^ 0x80;
+    u = signed_char_clamp(ps0 + Filter2);
+    *op0 = u ^ 0x80;
+    filter = Filter1;
+
+    /* outer tap adjustments */
+    filter += 1;
+    filter >>= 1;
+    filter &= ~hev;
+
+    u = signed_char_clamp(qs1 - filter);
+    *oq1 = u ^ 0x80;
+    u = signed_char_clamp(ps1 + filter);
+    *op1 = u ^ 0x80;
+  }
+}
+void vp9_mbloop_filter_horizontal_edge_c(unsigned char *s,
+                                         int p,
+                                         const unsigned char *blimit,
+                                         const unsigned char *limit,
+                                         const unsigned char *thresh,
+                                         int count) {
+  signed char hev = 0; /* high edge variance */
+  signed char mask = 0;
+  signed char flat = 0;
+  int i = 0;
+
+  /* loop filter designed to work using chars so that we can make maximum use
+   * of 8 bit simd instructions.
+   */
+  do {
+    mask = filter_mask(limit[0], blimit[0],
+                       s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
+                       s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
+
+    hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
+
+    flat = flatmask(thresh[0],
+                    s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
+                    s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]);
+    mbfilter(mask, hev, flat,
+             s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+             s,       s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p);
+
+    ++s;
+  } while (++i < count * 8);
+}
+void vp9_mbloop_filter_vertical_edge_c(unsigned char *s,
+                                       int p,
+                                       const unsigned char *blimit,
+                                       const unsigned char *limit,
+                                       const unsigned char *thresh,
+                                       int count) {
+  signed char hev = 0; /* high edge variance */
+  signed char mask = 0;
+  signed char flat = 0;
+  int i = 0;
+
+  do {
+    mask = filter_mask(limit[0], blimit[0],
+                       s[-4], s[-3], s[-2], s[-1],
+                       s[0], s[1], s[2], s[3]);
+
+    hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+    flat = flatmask(thresh[0],
+                    s[-5], s[-4], s[-3], s[-2], s[-1],
+                    s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]);
+    mbfilter(mask, hev, flat,
+             s - 5, s - 4, s - 3, s - 2, s - 1,
+             s,     s + 1, s + 2, s + 3, s + 4);
+    s += p;
+  } while (++i < count * 8);
+}
+
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
+static __inline signed char simple_filter_mask(uc blimit,
+                                               uc p1, uc p0,
+                                               uc q0, uc q1) {
+  /* Why does this cause problems for win32?
+   * error C2143: syntax error : missing ';' before 'type'
+   *  (void) limit;
+   */
+  signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= blimit) * -1;
+  return mask;
+}
+
+static __inline void simple_filter(signed char mask,
+                                   uc *op1, uc *op0,
+                                   uc *oq0, uc *oq1) {
+  signed char filter, Filter1, Filter2;
+  signed char p1 = (signed char) *op1 ^ 0x80;
+  signed char p0 = (signed char) *op0 ^ 0x80;
+  signed char q0 = (signed char) *oq0 ^ 0x80;
+  signed char q1 = (signed char) *oq1 ^ 0x80;
+  signed char u;
+
+  filter = signed_char_clamp(p1 - q1);
+  filter = signed_char_clamp(filter + 3 * (q0 - p0));
+  filter &= mask;
+
+  /* save bottom 3 bits so that we round one side +4 and the other +3 */
+  Filter1 = signed_char_clamp(filter + 4);
+  Filter1 >>= 3;
+  u = signed_char_clamp(q0 - Filter1);
+  *oq0  = u ^ 0x80;
+
+  Filter2 = signed_char_clamp(filter + 3);
+  Filter2 >>= 3;
+  u = signed_char_clamp(p0 + Filter2);
+  *op0 = u ^ 0x80;
+}
+
+void vp9_loop_filter_simple_horizontal_edge_c(unsigned char *s,
+                                              int p,
+                                              const unsigned char *blimit) {
+  signed char mask = 0;
+  int i = 0;
+
+  do {
+    mask = simple_filter_mask(blimit[0],
+                              s[-2 * p], s[-1 * p],
+                              s[0 * p], s[1 * p]);
+    simple_filter(mask,
+                  s - 2 * p, s - 1 * p,
+                  s, s + 1 * p);
+    ++s;
+  } while (++i < 16);
+}
+
+void vp9_loop_filter_simple_vertical_edge_c(unsigned char *s,
+                                            int p,
+                                            const unsigned char *blimit) {
+  signed char mask = 0;
+  int i = 0;
+
+  do {
+    mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
+    simple_filter(mask, s - 2, s - 1, s, s + 1);
+    s += p;
+  } while (++i < 16);
+}
+
+/* Vertical MB Filtering */
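+/* For all the wrappers below, count is in units of 8 pixels along the
+ * edge: 2 covers the full 16-pixel luma edge of a macroblock, 1 covers
+ * an 8-pixel chroma edge. */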
+void vp9_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           struct loop_filter_info *lfi) {
+  vp9_mbloop_filter_vertical_edge_c(y_ptr, y_stride,
+                                    lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_mbloop_filter_vertical_edge_c(u_ptr, uv_stride,
+                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_mbloop_filter_vertical_edge_c(v_ptr, uv_stride,
+                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical B Filtering */
+void vp9_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          struct loop_filter_info *lfi) {
+  vp9_loop_filter_vertical_edge_c(y_ptr + 4, y_stride,
+                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_c(y_ptr + 8, y_stride,
+                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_c(y_ptr + 12, y_stride,
+                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal MB filtering */
+void vp9_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           struct loop_filter_info *lfi) {
+  vp9_mbloop_filter_horizontal_edge_c(y_ptr, y_stride,
+                                      lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride,
+                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride,
+                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp9_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          struct loop_filter_info *lfi) {
+  vp9_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
+                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
+                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp9_loop_filter_bh8x8_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                             unsigned char *v_ptr, int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+  vp9_mbloop_filter_horizontal_edge_c(
+    y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+}
+
+void vp9_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit) {
+  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride,
+                                           y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride,
+                                           y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride,
+                                           y_stride, blimit);
+}
+
+void vp9_loop_filter_bv8x8_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                             unsigned char *v_ptr, int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+  vp9_mbloop_filter_vertical_edge_c(
+    y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+}
+
+void vp9_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit) {
+  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
+  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
+  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
+}
--- /dev/null
+++ b/vp9/common/maskingmv.c
@@ -1,0 +1,806 @@
+/*
+ ============================================================================
+ Name        : maskingmv.c
+ Author      : jimbankoski
+ Version     :
+ Copyright   :
+ Description : Standalone experiment with color-segmentation based masked
+               motion search and prediction
+ ============================================================================
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+extern unsigned int vp9_sad16x16_sse3(
+  unsigned char *src_ptr,
+  int  src_stride,
+  unsigned char *ref_ptr,
+  int  ref_stride,
+  int  max_err);
+
+extern void vp9_sad16x16x3_sse3(
+  unsigned char *src_ptr,
+  int  src_stride,
+  unsigned char *ref_ptr,
+  int  ref_stride,
+  int  *results);
+
+extern int vp8_growmaskmb_sse3(
+  unsigned char *om,
+  unsigned char *nm);
+
+extern void vp8_makemask_sse3(
+  unsigned char *y,
+  unsigned char *u,
+  unsigned char *v,
+  unsigned char *ym,
+  int yp,
+  int uvp,
+  int ys,
+  int us,
+  int vs,
+  int yt,
+  int ut,
+  int vt);
+
+unsigned int vp9_sad16x16_unmasked_wmt(
+  unsigned char *src_ptr,
+  int  src_stride,
+  unsigned char *ref_ptr,
+  int  ref_stride,
+  unsigned char *mask);
+
+unsigned int vp9_sad16x16_masked_wmt(
+  unsigned char *src_ptr,
+  int  src_stride,
+  unsigned char *ref_ptr,
+  int  ref_stride,
+  unsigned char *mask);
+
+unsigned int vp8_masked_predictor_wmt(
+  unsigned char *masked,
+  unsigned char *unmasked,
+  int  src_stride,
+  unsigned char *dst_ptr,
+  int  dst_stride,
+  unsigned char *mask);
+unsigned int vp8_masked_predictor_uv_wmt(
+  unsigned char *masked,
+  unsigned char *unmasked,
+  int  src_stride,
+  unsigned char *dst_ptr,
+  int  dst_stride,
+  unsigned char *mask);
+unsigned int vp8_uv_from_y_mask(
+  unsigned char *ymask,
+  unsigned char *uvmask);
+int yp = 16;
+unsigned char sxy[] = {
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90
+};
+
+unsigned char sts[] = {
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+};
+unsigned char str[] = {
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+};
+
+unsigned char y[] = {
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
+  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
+  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
+  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
+  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40,
+  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
+  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40
+};
+int uvp = 8;
+unsigned char u[] = {
+  90, 80, 70, 70, 90, 90, 90, 17,
+  90, 80, 70, 70, 90, 90, 90, 17,
+  84, 70, 70, 90, 90, 90, 17, 17,
+  84, 70, 70, 90, 90, 90, 17, 17,
+  80, 70, 70, 90, 90, 90, 17, 17,
+  90, 80, 70, 70, 90, 90, 90, 17,
+  90, 80, 70, 70, 90, 90, 90, 17,
+  90, 80, 70, 70, 90, 90, 90, 17
+};
+
+unsigned char v[] = {
+  80, 80, 80, 80, 80, 80, 80, 80,
+  80, 80, 80, 80, 80, 80, 80, 80,
+  80, 80, 80, 80, 80, 80, 80, 80,
+  80, 80, 80, 80, 80, 80, 80, 80,
+  80, 80, 80, 80, 80, 80, 80, 80,
+  80, 80, 80, 80, 80, 80, 80, 80,
+  80, 80, 80, 80, 80, 80, 80, 80,
+  80, 80, 80, 80, 80, 80, 80, 80
+};
+
+unsigned char ym[256];
+unsigned char uvm[64];
+typedef struct {
+  unsigned char y;
+  unsigned char yt;
+  unsigned char u;
+  unsigned char ut;
+  unsigned char v;
+  unsigned char vt;
+  unsigned char use;
+} COLOR_SEG_ELEMENT;
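+/* One color segment: (y, u, v) is the segment's center color and
+ * (yt, ut, vt) are per-channel absolute-difference thresholds used by
+ * pixel_mask(); 'use' appears intended as an enable flag but is not
+ * consulted in this file. */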
+
+/*
+COLOR_SEG_ELEMENT segmentation[]=
+{
+    { 60,4,80,17,80,10, 1},
+    { 40,4,15,10,80,10, 1},
+};
+*/
+
+COLOR_SEG_ELEMENT segmentation[] = {
+  { 79, 44, 92, 44, 237, 60, 1},
+};
+
+unsigned char pixel_mask(unsigned char y, unsigned char u, unsigned char v,
+                         COLOR_SEG_ELEMENT sgm[],
+                         int c) {
+  COLOR_SEG_ELEMENT *s = sgm;
+  unsigned char m = 0;
+  int i;
+  for (i = 0; i < c; i++, s++)
+    m |= (abs(y - s->y) < s->yt &&
+          abs(u - s->u) < s->ut &&
+          abs(v - s->v) < s->vt ? 255 : 0);
+
+  return m;
+}
+int neighbors[256][8];
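+/* neighbors[i] lists the indices within a 16x16 block (row = i >> 4,
+ * col = i & 15) that lie within one pel of position i; unused slots keep
+ * the value i. Built once by makeneighbors() and used by grow_ymask()
+ * to dilate a mask by one pixel. */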
+int makeneighbors(void) {
+  int i, j;
+  for (i = 0; i < 256; i++) {
+    int r = (i >> 4), c = (i & 15);
+    int ni = 0;
+    for (j = 0; j < 8; j++)
+      neighbors[i][j] = i;
+    for (j = 0; j < 256; j++) {
+      int nr = (j >> 4), nc = (j & 15);
+      /* skip j == i: the slots are preinitialized to i above, and counting
+       * it would give interior pixels 9 in-range positions, overrunning
+       * the 8-entry row */
+      if (j != i && abs(nr - r) < 2 && abs(nc - c) < 2)
+        neighbors[i][ni++] = j;
+    }
+  }
+  return 0;
+}
+void grow_ymask(unsigned char *ym) {
+  unsigned char nym[256];
+  int i, j;
+
+  for (i = 0; i < 256; i++) {
+    nym[i] = ym[i];
+    for (j = 0; j < 8; j++) {
+      nym[i] |= ym[neighbors[i][j]];
+    }
+  }
+  for (i = 0; i < 256; i++)
+    ym[i] = nym[i];
+}
+void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v,
+                  unsigned char *ym, unsigned char *uvm,
+                  int yp, int uvp,
+                  COLOR_SEG_ELEMENT sgm[],
+                  int count) {
+  int r, c;
+  unsigned char *oym = ym;
+
+  memset(ym, 20, 256);
+  for (r = 0; r < 8; r++, uvm += 8, u += uvp, v += uvp, y += (yp + yp), ym += 32)
+    for (c = 0; c < 8; c++) {
+      int y1 = y[c << 1];
+      int u1 = u[c];
+      int v1 = v[c];
+      int m = pixel_mask(y1, u1, v1, sgm, count);
+      uvm[c] = m;
+      ym[c << 1] = uvm[c]; // = pixel_mask(y[c<<1],u[c],v[c],sgm,count);
+      ym[(c << 1) + 1] = pixel_mask(y[1 + (c << 1)], u[c], v[c], sgm, count);
+      ym[(c << 1) + 16] = pixel_mask(y[yp + (c << 1)], u[c], v[c], sgm, count);
+      ym[(c << 1) + 17] = pixel_mask(y[1 + yp + (c << 1)], u[c], v[c], sgm, count);
+    }
+  grow_ymask(oym);
+}
+
+int masked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
+               unsigned char *ym) {
+  int i, j;
+  unsigned sad = 0;
+  for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
+    for (j = 0; j < 16; j++)
+      if (ym[j])
+        sad += abs(src[j] - dst[j]);
+
+  return sad;
+}
+
+int compare_masks(unsigned char *sym, unsigned char *ym) {
+  int i, j;
+  unsigned sad = 0;
+  for (i = 0; i < 16; i++, sym += 16, ym += 16)
+    for (j = 0; j < 16; j++)
+      sad += (sym[j] != ym[j] ? 1 : 0);
+
+  return sad;
+}
+int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
+                 unsigned char *ym) {
+  int i, j;
+  unsigned sad = 0;
+  for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
+    for (j = 0; j < 16; j++)
+      if (!ym[j])
+        sad += abs(src[j] - dst[j]);
+
+  return sad;
+}
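+/* Exhaustive +/-32 pel search that tries two strategies and keeps the
+ * cheaper combined SAD: (a) find the best unmasked mv, derive the mask
+ * from that destination, then find the best masked mv; (b) derive the
+ * mask from the source block, find the destination whose mask matches
+ * best, then find the best unmasked mv for the remainder. Outputs the
+ * masked mv (*mi,*mj), the unmasked mv (*ui,*uj), and *wm, which appears
+ * to select which destination the prediction mask is rebuilt from. */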
+int masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
+                         int yp, int uvp,
+                         unsigned char *dy, unsigned char *du, unsigned char *dv,
+                         int dyp, int duvp,
+                         COLOR_SEG_ELEMENT sgm[],
+                         int count,
+                         int *mi,
+                         int *mj,
+                         int *ui,
+                         int *uj,
+                         int *wm) {
+  int i, j;
+
+  unsigned char ym[256];
+  unsigned char uvm[64];
+  unsigned char dym[256];
+  unsigned char duvm[64];
+  unsigned int e = 0;
+  int beste = 256;
+  int bmi = -32, bmj = -32;
+  int bui = -32, buj = -32;
+  int beste1 = 256;
+  int bmi1 = -32, bmj1 = -32;
+  int bui1 = -32, buj1 = -32;
+  int obeste;
+
+  // first try finding best mask and then unmasked
+  beste = 0xffffffff;
+
+  // find best unmasked mv
+  for (i = -32; i < 32; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    unsigned char *duz = i / 2 * duvp + du;
+    unsigned char *dvz = i / 2 * duvp + dv;
+    for (j = -32; j < 32; j++) {
+      // 0,0  masked destination
+      make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
+
+      e = unmasked_sad(y, yp, dyz + j, dyp, dym);
+
+      if (e < beste) {
+        bui = i;
+        buj = j;
+        beste = e;
+      }
+    }
+  }
+  // bui=0;buj=0;
+  // best mv masked destination
+  make_mb_mask(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
+               dym, duvm, dyp, duvp, sgm, count);
+
+  obeste = beste;
+  beste = 0xffffffff;
+
+  // find best masked
+  for (i = -32; i < 32; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    for (j = -32; j < 32; j++) {
+      e = masked_sad(y, yp, dyz + j, dyp, dym);
+
+      if (e < beste) {
+        bmi = i;
+        bmj = j;
+        beste = e;
+      }
+    }
+  }
+  beste1 = beste + obeste;
+  bmi1 = bmi;
+  bmj1 = bmj;
+  bui1 = bui;
+  buj1 = buj;
+
+  beste = 0xffffffff;
+  // source mask
+  make_mb_mask(y, u, v, ym, uvm, yp, uvp, sgm, count);
+
+  // find best mask
+  for (i = -32; i < 32; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    unsigned char *duz = i / 2 * duvp + du;
+    unsigned char *dvz = i / 2 * duvp + dv;
+    for (j = -32; j < 32; j++) {
+      // 0,0  masked destination
+      make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count);
+
+      e = compare_masks(ym, dym);
+
+      if (e < beste) {
+        bmi = i;
+        bmj = j;
+        beste = e;
+      }
+    }
+  }
+
+
+  // best mv masked destination
+  make_mb_mask(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
+               dym, duvm, dyp, duvp, sgm, count);
+
+  obeste = masked_sad(y, yp, dy + bmi * dyp + bmj, dyp, dym);
+
+  beste = 0xffffffff;
+
+  // find best unmasked mv
+  for (i = -32; i < 32; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    for (j = -32; j < 32; j++) {
+      e = unmasked_sad(y, yp, dyz + j, dyp, dym);
+
+      if (e < beste) {
+        bui = i;
+        buj = j;
+        beste = e;
+      }
+    }
+  }
+  beste += obeste;
+
+
+  if (beste < beste1) {
+    *mi = bmi;
+    *mj = bmj;
+    *ui = bui;
+    *uj = buj;
+    *wm = 1;
+  } else {
+    *mi = bmi1;
+    *mj = bmj1;
+    *ui = bui1;
+    *uj = buj1;
+    *wm = 0;
+  }
+  return 0;
+}
+
+int predict(unsigned char *src, int p, unsigned char *dst, int dp,
+            unsigned char *ym, unsigned char *prd) {
+  int i, j;
+  for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16, prd += 16)
+    for (j = 0; j < 16; j++)
+      prd[j] = (ym[j] ? src[j] : dst[j]);
+  return 0;
+}
+
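+/* Same two-strategy search as masked_motion_search() above, but driven by
+ * the SSE3 mask/SAD primitives declared at the top of the file; unlike
+ * the C version it returns the winning combined SAD. */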
+int fast_masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,
+                              int yp, int uvp,
+                              unsigned char *dy, unsigned char *du, unsigned char *dv,
+                              int dyp, int duvp,
+                              COLOR_SEG_ELEMENT sgm[],
+                              int count,
+                              int *mi,
+                              int *mj,
+                              int *ui,
+                              int *uj,
+                              int *wm) {
+  int i, j;
+
+  unsigned char ym[256];
+  unsigned char ym2[256];
+  unsigned char uvm[64];
+  unsigned char dym2[256];
+  unsigned char dym[256];
+  unsigned char duvm[64];
+  unsigned int e = 0;
+  int beste = 256;
+  int bmi = -32, bmj = -32;
+  int bui = -32, buj = -32;
+  int beste1 = 256;
+  int bmi1 = -32, bmj1 = -32;
+  int bui1 = -32, buj1 = -32;
+  int obeste;
+
+  // first try finding best mask and then unmasked
+  beste = 0xffffffff;
+
+#if 0
+  for (i = 0; i < 16; i++) {
+    unsigned char *dy = i * yp + y;
+    for (j = 0; j < 16; j++)
+      printf("%2x", dy[j]);
+    printf("\n");
+  }
+  printf("\n");
+
+  for (i = -32; i < 48; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    for (j = -32; j < 48; j++)
+      printf("%2x", dyz[j]);
+    printf("\n");
+  }
+#endif
+
+  // find best unmasked mv
+  for (i = -32; i < 32; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    unsigned char *duz = i / 2 * duvp + du;
+    unsigned char *dvz = i / 2 * duvp + dv;
+    for (j = -32; j < 32; j++) {
+      // 0,0  masked destination
+      vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
+                        sgm[0].y, sgm[0].u, sgm[0].v,
+                        sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+      vp8_growmaskmb_sse3(dym, dym2);
+
+      e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
+
+      if (e < beste) {
+        bui = i;
+        buj = j;
+        beste = e;
+      }
+    }
+  }
+  // bui=0;buj=0;
+  // best mv masked destination
+
+  vp8_makemask_sse3(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2,
+                    dym, dyp, duvp,
+                    sgm[0].y, sgm[0].u, sgm[0].v,
+                    sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+  vp8_growmaskmb_sse3(dym, dym2);
+
+  obeste = beste;
+  beste = 0xffffffff;
+
+  // find best masked
+  for (i = -32; i < 32; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    for (j = -32; j < 32; j++) {
+      e = vp9_sad16x16_masked_wmt(y, yp, dyz + j, dyp, dym2);
+      if (e < beste) {
+        bmi = i;
+        bmj = j;
+        beste = e;
+      }
+    }
+  }
+  beste1 = beste + obeste;
+  bmi1 = bmi;
+  bmj1 = bmj;
+  bui1 = bui;
+  buj1 = buj;
+
+  // source mask
+  vp8_makemask_sse3(y, u, v,
+                    ym, yp, uvp,
+                    sgm[0].y, sgm[0].u, sgm[0].v,
+                    sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+  vp8_growmaskmb_sse3(ym, ym2);
+
+  // find best mask
+  for (i = -32; i < 32; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    unsigned char *duz = i / 2 * duvp + du;
+    unsigned char *dvz = i / 2 * duvp + dv;
+    for (j = -32; j < 32; j++) {
+      // 0,0  masked destination
+      vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp,
+                        sgm[0].y, sgm[0].u, sgm[0].v,
+                        sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+      vp8_growmaskmb_sse3(dym, dym2);
+
+      e = compare_masks(ym2, dym2);
+
+      if (e < beste) {
+        bmi = i;
+        bmj = j;
+        beste = e;
+      }
+    }
+  }
+
+  vp8_makemask_sse3(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2,
+                    dym, dyp, duvp,
+                    sgm[0].y, sgm[0].u, sgm[0].v,
+                    sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+  vp8_growmaskmb_sse3(dym, dym2);
+
+  obeste = vp9_sad16x16_masked_wmt(y, yp, dy + bmi * dyp + bmj, dyp, dym2);
+
+  beste = 0xffffffff;
+
+  // find best unmasked mv
+  for (i = -32; i < 32; i++) {
+    unsigned char *dyz = i * dyp + dy;
+    for (j = -32; j < 32; j++) {
+      e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2);
+
+      if (e < beste) {
+        bui = i;
+        buj = j;
+        beste = e;
+      }
+    }
+  }
+  beste += obeste;
+
+  if (beste < beste1) {
+    *mi = bmi;
+    *mj = bmj;
+    *ui = bui;
+    *uj = buj;
+    *wm = 1;
+  } else {
+    *mi = bmi1;
+    *mj = bmj1;
+    *ui = bui1;
+    *uj = buj1;
+    *wm = 0;
+    beste = beste1;
+  }
+  return beste;
+}
+
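+/* Builds the final prediction for one macroblock: regenerates the mask
+ * from whichever destination won (wm), dilates it, then blends the
+ * masked-mv and unmasked-mv predictors for Y, deriving the UV mask from
+ * the Y mask for chroma. */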
+int predict_all(unsigned char *ym, unsigned char *um, unsigned char *vm,
+                int ymp, int uvmp,
+                unsigned char *yp, unsigned char *up, unsigned char *vp,
+                int ypp, int uvpp,
+                COLOR_SEG_ELEMENT sgm[],
+                int count,
+                int mi,
+                int mj,
+                int ui,
+                int uj,
+                int wm) {
+  int i, j;
+  unsigned char dym[256];
+  unsigned char dym2[256];
+  unsigned char duvm[64];
+  unsigned char *yu = ym, *uu = um, *vu = vm;
+
+  unsigned char *dym3 = dym2;
+
+  ym += mi * ymp + mj;
+  um += mi / 2 * uvmp + mj / 2;
+  vm += mi / 2 * uvmp + mj / 2;
+
+  yu += ui * ymp + uj;
+  uu += ui / 2 * uvmp + uj / 2;
+  vu += ui / 2 * uvmp + uj / 2;
+
+  // best mv masked destination
+  if (wm)
+    vp8_makemask_sse3(ym, um, vm, dym, ymp, uvmp,
+                      sgm[0].y, sgm[0].u, sgm[0].v,
+                      sgm[0].yt, sgm[0].ut, sgm[0].vt);
+  else
+    vp8_makemask_sse3(yu, uu, vu, dym, ymp, uvmp,
+                      sgm[0].y, sgm[0].u, sgm[0].v,
+                      sgm[0].yt, sgm[0].ut, sgm[0].vt);
+
+  vp8_growmaskmb_sse3(dym, dym2);
+  vp8_masked_predictor_wmt(ym, yu, ymp, yp, ypp, dym3);
+  vp8_uv_from_y_mask(dym3, duvm);
+  vp8_masked_predictor_uv_wmt(um, uu, uvmp, up, uvpp, duvm);
+  vp8_masked_predictor_uv_wmt(vm, vu, uvmp, vp, uvpp, duvm);
+
+  return 0;
+}
+
+unsigned char f0p[1280 * 720 * 3 / 2];
+unsigned char f1p[1280 * 720 * 3 / 2];
+unsigned char prd[1280 * 720 * 3 / 2];
+unsigned char msk[1280 * 720 * 3 / 2];
+
+
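+/* Standalone test driver (not wired into the codec): reads raw I420
+ * frames sized by argv[3]/argv[4], runs the masked search over interior
+ * 16x16 blocks of each frame pair and writes the blended prediction to
+ * argv[2]. Note the local pointer yp shadows the file-scope int yp. */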
+int mainz(int argc, char *argv[]) {
+  FILE *f = fopen(argv[1], "rb");
+  FILE *g = fopen(argv[2], "wb");
+  int w = atoi(argv[3]), h = atoi(argv[4]);
+  int y_stride = w, uv_stride = w / 2;
+  int r, c;
+  unsigned char *f0 = f0p, *f1 = f1p, *t;
+  unsigned char ym[256], uvm[64];
+  unsigned char ym2[256], uvm2[64];
+  unsigned char ym3[256], uvm3[64];
+  int a, b;
+
+  COLOR_SEG_ELEMENT last = { 20, 20, 20, 20, 230, 20, 1}, best;
+#if 0
+  makeneighbors();
+  COLOR_SEG_ELEMENT segmentation[] = {
+    { 60, 4, 80, 17, 80, 10, 1},
+    { 40, 4, 15, 10, 80, 10, 1},
+  };
+  make_mb_mask(y, u, v, ym2, uvm2, 16, 8, segmentation, 1);
+
+  vp8_makemask_sse3(y, u, v, ym, (int) 16, (int) 8,
+                    (int) segmentation[0].y, (int) segmentation[0].u, (int) segmentation[0].v,
+                    segmentation[0].yt, segmentation[0].ut, segmentation[0].vt);
+
+  vp8_growmaskmb_sse3(ym, ym3);
+
+  a = vp9_sad16x16_masked_wmt(str, 16, sts, 16, ym3);
+  b = vp9_sad16x16_unmasked_wmt(str, 16, sts, 16, ym3);
+
+  vp8_masked_predictor_wmt(str, sts, 16, ym, 16, ym3);
+
+  vp8_uv_from_y_mask(ym3, uvm3);
+
+  return 4;
+#endif
+  makeneighbors();
+
+
+  memset(prd, 128, w * h * 3 / 2);
+
+  fread(f0, w * h * 3 / 2, 1, f);
+
+  while (!feof(f)) {
+    unsigned char *ys = f1, *yd = f0, *yp = prd;
+    unsigned char *us = f1 + w * h, *ud = f0 + w * h, *up = prd + w * h;
+    unsigned char *vs = f1 + w * h * 5 / 4, *vd = f0 + w * h * 5 / 4, *vp = prd + w * h * 5 / 4;
+    fread(f1, w * h * 3 / 2, 1, f);
+
+    ys += 32 * y_stride;
+    yd += 32 * y_stride;
+    yp += 32 * y_stride;
+    us += 16 * uv_stride;
+    ud += 16 * uv_stride;
+    up += 16 * uv_stride;
+    vs += 16 * uv_stride;
+    vd += 16 * uv_stride;
+    vp += 16 * uv_stride;
+    for (r = 32; r < h - 32; r += 16,
+         ys += 16 * w, yd += 16 * w, yp += 16 * w,
+         us += 8 * uv_stride, ud += 8 * uv_stride, up += 8 * uv_stride,
+         vs += 8 * uv_stride, vd += 8 * uv_stride, vp += 8 * uv_stride) {
+      for (c = 32; c < w - 32; c += 16) {
+        int mi, mj, ui, uj, wm;
+        int bmi, bmj, bui, buj, bwm;
+        unsigned char ym[256];
+
+        if (vp9_sad16x16_sse3(ys + c, y_stride, yd + c, y_stride, 0xffff) == 0)
+          bmi = bmj = bui = buj = bwm = 0;
+        else {
+          COLOR_SEG_ELEMENT cs[5];
+          int j;
+          unsigned int beste = 0xffffffff;
+          unsigned int bestj = 0;
+
+          // try color from last mb segmentation
+          cs[0] = last;
+
+          // try color segs from 4 pixels in mb recon as segmentation
+          cs[1].y = yd[c + y_stride + 1];
+          cs[1].u = ud[c / 2 + uv_stride];
+          cs[1].v = vd[c / 2 + uv_stride];
+          cs[1].yt = cs[1].ut = cs[1].vt = 20;
+          cs[2].y = yd[c + w + 14];
+          cs[2].u = ud[c / 2 + uv_stride + 7];
+          cs[2].v = vd[c / 2 + uv_stride + 7];
+          cs[2].yt = cs[2].ut = cs[2].vt = 20;
+          cs[3].y = yd[c + w * 14 + 1];
+          cs[3].u = ud[c / 2 + uv_stride * 7];
+          cs[3].v = vd[c / 2 + uv_stride * 7];
+          cs[3].yt = cs[3].ut = cs[3].vt = 20;
+          cs[4].y = yd[c + w * 14 + 14];
+          cs[4].u = ud[c / 2 + uv_stride * 7 + 7];
+          cs[4].v = vd[c / 2 + uv_stride * 7 + 7];
+          cs[4].yt = cs[4].ut = cs[4].vt = 20;
+
+          for (j = 0; j < 5; j++) {
+            int e;
+
+            e = fast_masked_motion_search(
+                  ys + c, us + c / 2, vs + c / 2, y_stride, uv_stride,
+                  yd + c, ud + c / 2, vd + c / 2, y_stride, uv_stride,
+                  &cs[j], 1, &mi, &mj, &ui, &uj, &wm);
+
+            if (e < beste) {
+              bmi = mi;
+              bmj = mj;
+              bui = ui;
+              buj = uj;
+              bwm = wm;
+              bestj = j;
+              beste = e;
+            }
+          }
+          best = cs[bestj];
+          // best = segmentation[0];
+          last = best;
+        }
+        predict_all(yd + c, ud + c / 2, vd + c / 2, w, uv_stride,
+                    yp + c, up + c / 2, vp + c / 2, w, uv_stride,
+                    &best, 1, bmi, bmj, bui, buj, bwm);
+
+      }
+    }
+    fwrite(prd, w * h * 3 / 2, 1, g);
+    t = f0;
+    f0 = f1;
+    f1 = t;
+  }
+  fclose(f);
+  fclose(g);
+  return 0;
+}
--- /dev/null
+++ b/vp9/common/mbpitch.c
@@ -1,0 +1,124 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+
+typedef enum {
+  PRED = 0,
+  DEST = 1
+} BLOCKSET;
+
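+/* Selects which set of buffer pointers setup_block() wires up: the
+ * prediction source (PRED, including the second reference used for
+ * compound prediction) or the reconstruction destination (DEST). base2
+ * is only consulted for PRED; mv_stride is currently unused. */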
+static void setup_block(BLOCKD *b,
+                        int mv_stride,
+                        unsigned char **base,
+                        unsigned char **base2,
+                        int Stride,
+                        int offset,
+                        BLOCKSET bs) {
+  if (bs == DEST) {
+    b->dst_stride = Stride;
+    b->dst = offset;
+    b->base_dst = base;
+  } else {
+    b->pre_stride = Stride;
+    b->pre = offset;
+    b->base_pre = base;
+    b->base_second_pre = base2;
+  }
+}
+
+
+static void setup_macroblock(MACROBLOCKD *xd, BLOCKSET bs) {
+  int block;
+
+  unsigned char **y, **u, **v;
+  unsigned char **y2, **u2, **v2;
+  BLOCKD *blockd = xd->block;
+  int stride;
+
+  if (bs == DEST) {
+    y = &xd->dst.y_buffer;
+    u = &xd->dst.u_buffer;
+    v = &xd->dst.v_buffer;
+  } else {
+    y = &xd->pre.y_buffer;
+    u = &xd->pre.u_buffer;
+    v = &xd->pre.v_buffer;
+
+    y2 = &xd->second_pre.y_buffer;
+    u2 = &xd->second_pre.u_buffer;
+    v2 = &xd->second_pre.v_buffer;
+  }
+
+  stride = xd->dst.y_stride;
+  for (block = 0; block < 16; block++) { /* y blocks */
+    setup_block(&blockd[block], stride, y, y2, stride,
+                (block >> 2) * 4 * stride + (block & 3) * 4, bs);
+  }
+
+  stride = xd->dst.uv_stride;
+  for (block = 16; block < 20; block++) { /* U and V blocks */
+    setup_block(&blockd[block], stride, u, u2, stride,
+      ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs);
+
+    setup_block(&blockd[block + 4], stride, v, v2, stride,
+      ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs);
+  }
+}
+
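+/* Wire each of the 25 BLOCKDs to its slice of the shared diff/predictor
+ * buffers: the 16 Y blocks occupy the first 256 entries, the four U and
+ * four V blocks start at offsets 256 and 320, and block 24 (the
+ * second-order block) sits at 384. */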
+void vp9_setup_block_dptrs(MACROBLOCKD *xd) {
+  int r, c;
+  BLOCKD *blockd = xd->block;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      blockd[r * 4 + c].diff = &xd->diff[r * 4 * 16 + c * 4];
+      blockd[r * 4 + c].predictor = xd->predictor + r * 4 * 16 + c * 4;
+    }
+  }
+
+  for (r = 0; r < 2; r++) {
+    for (c = 0; c < 2; c++) {
+      blockd[16 + r * 2 + c].diff = &xd->diff[256 + r * 4 * 8 + c * 4];
+      blockd[16 + r * 2 + c].predictor =
+        xd->predictor + 256 + r * 4 * 8 + c * 4;
+
+    }
+  }
+
+  for (r = 0; r < 2; r++) {
+    for (c = 0; c < 2; c++) {
+      blockd[20 + r * 2 + c].diff = &xd->diff[320 + r * 4 * 8 + c * 4];
+      blockd[20 + r * 2 + c].predictor =
+        xd->predictor + 320 + r * 4 * 8 + c * 4;
+
+    }
+  }
+
+  blockd[24].diff = &xd->diff[384];
+
+  for (r = 0; r < 25; r++) {
+    blockd[r].qcoeff  = xd->qcoeff  + r * 16;
+    blockd[r].dqcoeff = xd->dqcoeff + r * 16;
+  }
+}
+
+void vp9_build_block_doffsets(MACROBLOCKD *xd) {
+  /* set up both the destination and prediction block pointers/strides */
+  setup_macroblock(xd, DEST);
+  setup_macroblock(xd, PRED);
+}
--- /dev/null
+++ b/vp9/common/modecont.c
@@ -1,0 +1,64 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropy.h"
+const int vp9_default_mode_contexts[6][4] = {
+  {
+    /* 0 */
+    7,     1,     1,   183
+  },
+  {
+    /* 1 */
+    14,    18,    14,   147
+  },
+  {
+    /* 2 */
+    135,    64,    57,    68
+  },
+  {
+    /* 3 */
+    60,    56,   128,   65
+  },
+  {
+    /* 4 */
+    159,   134,   128,   34
+  },
+  {
+    /* 5 */
+    234,   188,   128,   28
+  },
+};
+const int vp9_default_mode_contexts_a[6][4] = {
+  {
+    /* 0 */
+    4,     1,    1,   143
+  },
+  {
+    /* 1 */
+    7,     9,    7,   107
+  },
+  {
+    /* 2 */
+    95,    34,   57,    68
+  },
+  {
+    /* 3 */
+    95,    56,   128,   65
+  },
+  {
+    /* 4 */
+    159,   67,   128,   34
+  },
+  {
+    /* 5 */
+    234,   94,   128,   28
+  },
+};
--- /dev/null
+++ b/vp9/common/modecont.h
@@ -1,0 +1,17 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MODECONT_H
+#define __INC_MODECONT_H
+
+extern const int vp9_default_mode_contexts[6][4];
+extern const int vp9_default_mode_contexts_a[6][4];
+#endif
--- /dev/null
+++ b/vp9/common/modecontext.c
@@ -1,0 +1,145 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropymode.h"
+
+const unsigned int vp9_kf_default_bmode_counts[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES] = {
+  {
+    /*Above Mode :  0*/
+    { 43438,   2195,    470,    316,    615,    171,    217,    412,    124,    160, }, /* left_mode 0 */
+    {  5722,   2751,    296,    291,     81,     68,     80,    101,    100,    170, }, /* left_mode 1 */
+    {  1629,    201,    307,     25,     47,     16,     34,     72,     19,     28, }, /* left_mode 2 */
+    {   332,    266,     36,    500,     20,     65,     23,     14,    154,    106, }, /* left_mode 3 */
+    {   450,     97,     10,     24,    117,     10,      2,     12,      8,     71, }, /* left_mode 4 */
+    {   384,     49,     29,     44,     12,    162,     51,      5,     87,     42, }, /* left_mode 5 */
+    {   495,     53,    157,     27,     14,     57,    180,     17,     17,     34, }, /* left_mode 6 */
+    {   695,     64,     62,      9,     27,      5,      3,    147,     10,     26, }, /* left_mode 7 */
+    {   230,     54,     20,    124,     16,    125,     29,     12,    283,     37, }, /* left_mode 8 */
+    {   260,     87,     21,    120,     32,     16,     33,     16,     33,    203, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  1*/
+    {  3934,   2573,    355,    137,    128,     87,    133,    117,     37,     27, }, /* left_mode 0 */
+    {  1036,   1929,    278,    135,     27,     37,     48,     55,     41,     91, }, /* left_mode 1 */
+    {   223,    256,    253,     15,     13,      9,     28,     64,      3,      3, }, /* left_mode 2 */
+    {   120,    129,     17,    316,     15,     11,      9,      4,     53,     74, }, /* left_mode 3 */
+    {   129,     58,      6,     11,     38,      2,      0,      5,      2,     67, }, /* left_mode 4 */
+    {    53,     22,     11,     16,      8,     26,     14,      3,     19,     12, }, /* left_mode 5 */
+    {    59,     26,     61,     11,      4,      9,     35,     13,      8,      8, }, /* left_mode 6 */
+    {   101,     52,     40,      8,      5,      2,      8,     59,      2,     20, }, /* left_mode 7 */
+    {    48,     34,     10,     52,      8,     15,      6,      6,     63,     20, }, /* left_mode 8 */
+    {    96,     48,     22,     63,     11,     14,      5,      8,      9,     96, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  2*/
+    {   709,    461,    506,     36,     27,     33,    151,     98,     24,      6, }, /* left_mode 0 */
+    {   201,    375,    442,     27,     13,      8,     46,     58,      6,     19, }, /* left_mode 1 */
+    {   122,    140,    417,      4,     13,      3,     33,     59,      4,      2, }, /* left_mode 2 */
+    {    36,     17,     22,     16,      6,      8,     12,     17,      9,     21, }, /* left_mode 3 */
+    {    51,     15,      7,      1,     14,      0,      4,      5,      3,     22, }, /* left_mode 4 */
+    {    18,     11,     30,      9,      7,     20,     11,      5,      2,      6, }, /* left_mode 5 */
+    {    38,     21,    103,      9,      4,     12,     79,     13,      2,      5, }, /* left_mode 6 */
+    {    64,     17,     66,      2,     12,      4,      2,     65,      4,      5, }, /* left_mode 7 */
+    {    14,      7,      7,     16,      3,     11,      4,     13,     15,     16, }, /* left_mode 8 */
+    {    36,      8,     32,      9,      9,      4,     14,      7,      6,     24, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  3*/
+    {  1340,    173,     36,    119,     30,     10,     13,     10,     20,     26, }, /* left_mode 0 */
+    {   156,    293,     26,    108,      5,     16,      2,      4,     23,     30, }, /* left_mode 1 */
+    {    60,     34,     13,      7,      3,      3,      0,      8,      4,      5, }, /* left_mode 2 */
+    {    72,     64,      1,    235,      3,      9,      2,      7,     28,     38, }, /* left_mode 3 */
+    {    29,     14,      1,      3,      5,      0,      2,      2,      5,     13, }, /* left_mode 4 */
+    {    22,      7,      4,     11,      2,      5,      1,      2,      6,      4, }, /* left_mode 5 */
+    {    18,     14,      5,      6,      4,      3,     14,      0,      9,      2, }, /* left_mode 6 */
+    {    41,     10,      7,      1,      2,      0,      0,     10,      2,      1, }, /* left_mode 7 */
+    {    23,     19,      2,     33,      1,      5,      2,      0,     51,      8, }, /* left_mode 8 */
+    {    33,     26,      7,     53,      3,      9,      3,      3,      9,     19, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  4*/
+    {   410,    165,     43,     31,     66,     15,     30,     54,      8,     17, }, /* left_mode 0 */
+    {   115,     64,     27,     18,     30,      7,     11,     15,      4,     19, }, /* left_mode 1 */
+    {    31,     23,     25,      1,      7,      2,      2,     10,      0,      5, }, /* left_mode 2 */
+    {    17,      4,      1,      6,      8,      2,      7,      5,      5,     21, }, /* left_mode 3 */
+    {   120,     12,      1,      2,     83,      3,      0,      4,      1,     40, }, /* left_mode 4 */
+    {     4,      3,      1,      2,      1,      2,      5,      0,      3,      6, }, /* left_mode 5 */
+    {    10,      2,     13,      6,      6,      6,      8,      2,      4,      5, }, /* left_mode 6 */
+    {    58,     10,      5,      1,     28,      1,      1,     33,      1,      9, }, /* left_mode 7 */
+    {     8,      2,      1,      4,      2,      5,      1,      1,      2,     10, }, /* left_mode 8 */
+    {    76,      7,      5,      7,     18,      2,      2,      0,      5,     45, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  5*/
+    {   444,     46,     47,     20,     14,    110,     60,     14,     60,      7, }, /* left_mode 0 */
+    {    59,     57,     25,     18,      3,     17,     21,      6,     14,      6, }, /* left_mode 1 */
+    {    24,     17,     20,      6,      4,     13,      7,      2,      3,      2, }, /* left_mode 2 */
+    {    13,     11,      5,     14,      4,      9,      2,      4,     15,      7, }, /* left_mode 3 */
+    {     8,      5,      2,      1,      4,      0,      1,      1,      2,     12, }, /* left_mode 4 */
+    {    19,      5,      5,      7,      4,     40,      6,      3,     10,      4, }, /* left_mode 5 */
+    {    16,      5,      9,      1,      1,     16,     26,      2,     10,      4, }, /* left_mode 6 */
+    {    11,      4,      8,      1,      1,      4,      4,      5,      4,      1, }, /* left_mode 7 */
+    {    15,      1,      3,      7,      3,     21,      7,      1,     34,      5, }, /* left_mode 8 */
+    {    18,      5,      1,      3,      4,      3,      7,      1,      2,      9, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  6*/
+    {   476,    149,     94,     13,     14,     77,    291,     27,     23,      3, }, /* left_mode 0 */
+    {    79,     83,     42,     14,      2,     12,     63,      2,      4,     14, }, /* left_mode 1 */
+    {    43,     36,     55,      1,      3,      8,     42,     11,      5,      1, }, /* left_mode 2 */
+    {     9,      9,      6,     16,      1,      5,      6,      3,     11,     10, }, /* left_mode 3 */
+    {    10,      3,      1,      3,     10,      1,      0,      1,      1,      4, }, /* left_mode 4 */
+    {    14,      6,     15,      5,      1,     20,     25,      2,      5,      0, }, /* left_mode 5 */
+    {    28,      7,     51,      1,      0,      8,    127,      6,      2,      5, }, /* left_mode 6 */
+    {    13,      3,      3,      2,      3,      1,      2,      8,      1,      2, }, /* left_mode 7 */
+    {    10,      3,      3,      3,      3,      8,      2,      2,      9,      3, }, /* left_mode 8 */
+    {    13,      7,     11,      4,      0,      4,      6,      2,      5,      8, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  7*/
+    {   376,    135,    119,      6,     32,      8,     31,    224,      9,      3, }, /* left_mode 0 */
+    {    93,     60,     54,      6,     13,      7,      8,     92,      2,     12, }, /* left_mode 1 */
+    {    74,     36,     84,      0,      3,      2,      9,     67,      2,      1, }, /* left_mode 2 */
+    {    19,      4,      4,      8,      8,      2,      4,      7,      6,     16, }, /* left_mode 3 */
+    {    51,      7,      4,      1,     77,      3,      0,     14,      1,     15, }, /* left_mode 4 */
+    {     7,      7,      5,      7,      4,      7,      4,      5,      0,      3, }, /* left_mode 5 */
+    {    18,      2,     19,      2,      2,      4,     12,     11,      1,      2, }, /* left_mode 6 */
+    {   129,      6,     27,      1,     21,      3,      0,    189,      0,      6, }, /* left_mode 7 */
+    {     9,      1,      2,      8,      3,      7,      0,      5,      3,      3, }, /* left_mode 8 */
+    {    20,      4,      5,     10,      4,      2,      7,     17,      3,     16, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  8*/
+    {   617,     68,     34,     79,     11,     27,     25,     14,     75,     13, }, /* left_mode 0 */
+    {    51,     82,     21,     26,      6,     12,     13,      1,     26,     16, }, /* left_mode 1 */
+    {    29,      9,     12,     11,      3,      7,      1,     10,      2,      2, }, /* left_mode 2 */
+    {    17,     19,     11,     74,      4,      3,      2,      0,     58,     13, }, /* left_mode 3 */
+    {    10,      1,      1,      3,      4,      1,      0,      2,      1,      8, }, /* left_mode 4 */
+    {    14,      4,      5,      5,      1,     13,      2,      0,     27,      8, }, /* left_mode 5 */
+    {    10,      3,      5,      4,      1,      7,      6,      4,      5,      1, }, /* left_mode 6 */
+    {    10,      2,      6,      2,      1,      1,      1,      4,      2,      1, }, /* left_mode 7 */
+    {    14,      8,      5,     23,      2,     12,      6,      2,    117,      5, }, /* left_mode 8 */
+    {     9,      6,      2,     19,      1,      6,      3,      2,      9,      9, }, /* left_mode 9 */
+  },
+  {
+    /*Above Mode :  9*/
+    {   680,     73,     22,     38,     42,      5,     11,      9,      6,     28, }, /* left_mode 0 */
+    {   113,    112,     21,     22,     10,      2,      8,      4,      6,     42, }, /* left_mode 1 */
+    {    44,     20,     24,      6,      5,      4,      3,      3,      1,      2, }, /* left_mode 2 */
+    {    40,     23,      7,     71,      5,      2,      4,      1,      7,     22, }, /* left_mode 3 */
+    {    85,      9,      4,      4,     17,      2,      0,      3,      2,     23, }, /* left_mode 4 */
+    {    13,      4,      2,      6,      1,      7,      0,      1,      7,      6, }, /* left_mode 5 */
+    {    26,      6,      8,      3,      2,      3,      8,      1,      5,      4, }, /* left_mode 6 */
+    {    54,      8,      9,      6,      7,      0,      1,     11,      1,      3, }, /* left_mode 7 */
+    {     9,     10,      4,     13,      2,      5,      4,      2,     14,      8, }, /* left_mode 8 */
+    {    92,      9,      5,     19,     15,      3,      3,      1,      6,     58, }, /* left_mode 9 */
+  },
+};
--- /dev/null
+++ b/vp9/common/mv.h
@@ -1,0 +1,26 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MV_H
+#define __INC_MV_H
+#include "vpx/vpx_integer.h"
+
+typedef struct {
+  short row;
+  short col;
+} MV;
+
+typedef union {
+  uint32_t  as_int;
+  MV        as_mv;
+} int_mv;        /* facilitates faster equality tests and copies */
+
+#endif
--- /dev/null
+++ b/vp9/common/mvref_common.c
@@ -1,0 +1,342 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "mvref_common.h"
+
+#if CONFIG_NEWBESTREFMV
+
+#define MVREF_NEIGHBOURS 8
+static int mv_ref_search[MVREF_NEIGHBOURS][2] =
+  { {0, -1}, {-1, 0}, {-1, -1}, {0, -2},
+    {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2} };
+static int ref_distance_weight[MVREF_NEIGHBOURS] =
+  { 3, 3, 2, 1, 1, 1, 1, 1 };
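+// mv_ref_search holds the (col, row) offsets, in macroblock units, of the
+// eight neighbours examined; ref_distance_weight gives the nearer
+// neighbours a larger base score.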
+
+// clamp_mv
+#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
+static void clamp_mv(const MACROBLOCKD *xd, int_mv *mv) {
+
+  if (mv->as_mv.col < (xd->mb_to_left_edge - MV_BORDER))
+    mv->as_mv.col = xd->mb_to_left_edge - MV_BORDER;
+  else if (mv->as_mv.col > xd->mb_to_right_edge + MV_BORDER)
+    mv->as_mv.col = xd->mb_to_right_edge + MV_BORDER;
+
+  if (mv->as_mv.row < (xd->mb_to_top_edge - MV_BORDER))
+    mv->as_mv.row = xd->mb_to_top_edge - MV_BORDER;
+  else if (mv->as_mv.row > xd->mb_to_bottom_edge + MV_BORDER)
+    mv->as_mv.row = xd->mb_to_bottom_edge + MV_BORDER;
+}
+
+
+// Gets the best matching candidate reference motion vector
+// from the given mode info structure (if available)
+static int get_candidate_mvref(
+  const MODE_INFO *candidate_mi,
+  MV_REFERENCE_FRAME ref_frame,
+  MV_REFERENCE_FRAME *c_ref_frame,
+  int_mv *c_mv,
+  MV_REFERENCE_FRAME *c2_ref_frame,
+  int_mv *c2_mv
+) {
+
+  int ret_val = FALSE;
+  c2_mv->as_int = 0;
+  *c2_ref_frame = INTRA_FRAME;
+
+  // Target ref frame matches candidate first ref frame
+  if (ref_frame == candidate_mi->mbmi.ref_frame) {
+    c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
+    *c_ref_frame = ref_frame;
+    ret_val = TRUE;
+
+    // Is there a second non zero vector we can use.
+    if ((candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) &&
+        (candidate_mi->mbmi.mv[1].as_int != 0) &&
+        (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
+      c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+      *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
+    }
+
+  // Target ref frame matches candidate second ref frame
+  } else if (ref_frame == candidate_mi->mbmi.second_ref_frame) {
+    c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+    *c_ref_frame = ref_frame;
+    ret_val = TRUE;
+
+    // Is there a second non zero vector we can use.
+    if ((candidate_mi->mbmi.ref_frame != INTRA_FRAME) &&
+        (candidate_mi->mbmi.mv[0].as_int != 0) &&
+        (candidate_mi->mbmi.mv[0].as_int != c_mv->as_int)) {
+      c2_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
+      *c2_ref_frame = candidate_mi->mbmi.ref_frame;
+    }
+
+  // No ref frame matches so use first ref mv as first choice
+  } else if (candidate_mi->mbmi.ref_frame != INTRA_FRAME) {
+    c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
+    *c_ref_frame = candidate_mi->mbmi.ref_frame;
+    ret_val = TRUE;
+
+    // Is there a second non zero vector we can use.
+    if ((candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) &&
+        (candidate_mi->mbmi.mv[1].as_int != 0) &&
+        (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
+      c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+      *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
+    }
+
+  // If only the second ref mv is valid (this should not trigger in the
+  // current code base, given the possible compound prediction options).
+  } else if (candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) {
+    c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+    *c_ref_frame = candidate_mi->mbmi.second_ref_frame;
+    ret_val = TRUE;
+  }
+
+  return ret_val;
+}
+
+// Performs mv adjustment based on reference frame and clamps the MV
+// if it goes off the edge of the buffer.
+static void scale_mv(
+  MACROBLOCKD *xd,
+  MV_REFERENCE_FRAME this_ref_frame,
+  MV_REFERENCE_FRAME candidate_ref_frame,
+  int_mv *candidate_mv,
+  int *ref_sign_bias
+) {
+
+  if (candidate_ref_frame != this_ref_frame) {
+
+    //int frame_distances[MAX_REF_FRAMES];
+    //int last_distance = 1;
+    //int gf_distance = xd->frames_since_golden;
+    //int arf_distance = xd->frames_till_alt_ref_frame;
+
+    // Sign inversion where appropriate.
+    if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {
+      candidate_mv->as_mv.row = -candidate_mv->as_mv.row;
+      candidate_mv->as_mv.col = -candidate_mv->as_mv.col;
+    }
+
+    // Scale based on frame distance if the reference frames are not the same.
+    /*frame_distances[INTRA_FRAME] = 1;   // should never be used
+    frame_distances[LAST_FRAME] = 1;
+    frame_distances[GOLDEN_FRAME] =
+      (xd->frames_since_golden) ? xd->frames_since_golden : 1;
+    frame_distances[ALTREF_FRAME] =
+      (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1;
+
+    if (frame_distances[this_ref_frame] &&
+        frame_distances[candidate_ref_frame]) {
+      candidate_mv->as_mv.row =
+        (short)(((int)(candidate_mv->as_mv.row) *
+                 frame_distances[this_ref_frame]) /
+                frame_distances[candidate_ref_frame]);
+
+      candidate_mv->as_mv.col =
+        (short)(((int)(candidate_mv->as_mv.col) *
+                 frame_distances[this_ref_frame]) /
+                frame_distances[candidate_ref_frame]);
+    }
+    */
+  }
+
+  // Clamp the MV so it does not point out of the frame buffer
+  clamp_mv(xd, candidate_mv);
+}
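+
+// A worked example of the sign inversion above (illustrative; the actual
+// bias values come from the ref_sign_bias array passed in): if ALTREF_FRAME
+// carries the opposite sign bias to LAST_FRAME, a candidate ALTREF vector of
+// (row, col) = (4, -8) is negated to (-4, 8) before being reused as a
+// LAST_FRAME prediction, since the two references lie on opposite sides of
+// the current frame in display order.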
+
+// Adds a new candidate reference vector to the list if indeed it is new.
+// If it is not new then the score of the existing candidate that it matches
+// is increased and the list is resorted.
+static void addmv_and_shuffle(
+  int_mv *mv_list,
+  int *mv_scores,
+  int *index,
+  int_mv candidate_mv,
+  int weight
+) {
+
+  int i = *index;
+  int duplicate_found = FALSE;
+
+  // Check for duplicates. If there is one, increment its score.
+  // A duplicate is defined as a vector with an identical as_int value.
+  while (i > 0) {
+    i--;
+
+    if (candidate_mv.as_int == mv_list[i].as_int) {
+      duplicate_found = TRUE;
+      mv_scores[i] += weight;
+      break;
+    }
+  }
+
+  // If no duplicate was found, add the new vector and give it a weight.
+  if (!duplicate_found) {
+    mv_list[*index].as_int = candidate_mv.as_int;
+    mv_scores[*index] = weight;
+    i = *index;
+    (*index)++;
+  }
+
+  // Reshuffle the list so that the highest scoring mvs are at the top.
+  while (i > 0) {
+    if (mv_scores[i] > mv_scores[i-1]) {
+      int tmp_score = mv_scores[i-1];
+      int_mv tmp_mv = mv_list[i-1];
+
+      mv_scores[i-1] = mv_scores[i];
+      mv_list[i-1] = mv_list[i];
+      mv_scores[i] = tmp_score;
+      mv_list[i] = tmp_mv;
+      i--;
+    } else
+      break;
+  }
+}
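+
+// A short worked trace (illustrative values): with mv_list = {A, B} and
+// mv_scores = {16, 3}, adding B again with weight 14 finds the duplicate,
+// raises B's score to 17, and the reshuffle loop swaps B ahead of A, giving
+// mv_list = {B, A}, mv_scores = {17, 16}. A genuinely new vector C would
+// instead be appended at *index and bubbled up according to its own weight.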
+
+// This function searches the neighbourhood of a given MB/SB and populates a
+// list of candidate reference vectors.
+//
+void vp9_find_mv_refs(
+  MACROBLOCKD *xd,
+  MODE_INFO *here,
+  MODE_INFO *lf_here,
+  MV_REFERENCE_FRAME ref_frame,
+  int_mv *mv_ref_list,
+  int *ref_sign_bias
+) {
+
+  int i;
+  MODE_INFO *candidate_mi;
+  int_mv candidate_mvs[MAX_MV_REFS];
+  int_mv c_refmv;
+  MV_REFERENCE_FRAME c_ref_frame;
+  int_mv c2_refmv;
+  MV_REFERENCE_FRAME c2_ref_frame;
+  int candidate_scores[MAX_MV_REFS];
+  int index = 0;
+  int ref_weight = 0;
+  int valid_mv_ref;
+
+  // Blank the reference vector lists and other local structures.
+  vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REFS);
+  vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REFS);
+  vpx_memset(candidate_scores, 0, sizeof(candidate_scores));
+
+  // Populate the list with candidate reference vectors from the two
+  // nearest spatial neighbours.
+  for (i = 0; i < 2; ++i) {
+    if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
+        ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
+
+      candidate_mi = here + mv_ref_search[i][0] +
+                     (mv_ref_search[i][1] * xd->mode_info_stride);
+
+      valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
+                                         &c_ref_frame, &c_refmv,
+                                         &c2_ref_frame, &c2_refmv);
+
+      // If there is a valid MV candidate then add it to the list
+      if (valid_mv_ref) {
+        scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
+        ref_weight = ref_distance_weight[i] +
+                     ((c_ref_frame == ref_frame) << 4);
+
+        addmv_and_shuffle(candidate_mvs, candidate_scores,
+                          &index, c_refmv, ref_weight);
+
+        // If there is a second valid mv then add it as well.
+        if (c2_ref_frame != INTRA_FRAME) {
+          scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
+          ref_weight = ref_distance_weight[i] +
+                       ((c2_ref_frame == ref_frame) << 4);
+
+          addmv_and_shuffle(candidate_mvs, candidate_scores,
+                            &index, c2_refmv, ref_weight);
+        }
+      }
+    }
+  }
+
+  // Look at the corresponding vector in the last frame
+  candidate_mi = lf_here;
+  valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
+                                     &c_ref_frame, &c_refmv,
+                                     &c2_ref_frame, &c2_refmv);
+
+  // If there is a valid MV candidate then add it to the list
+  if (valid_mv_ref) {
+    scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
+    ref_weight = 2 + ((c_ref_frame == ref_frame) << 4);
+    addmv_and_shuffle(candidate_mvs, candidate_scores,
+                      &index, c_refmv, ref_weight);
+
+    // If there is a second valid mv then add it as well.
+    if (c2_ref_frame != INTRA_FRAME) {
+      scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
+      ref_weight = 2 + ((c2_ref_frame == ref_frame) << 4);
+
+      addmv_and_shuffle(candidate_mvs, candidate_scores,
+                        &index, c2_refmv, ref_weight);
+    }
+  }
+
+  // Populate the list with candidate reference vectors from the
+  // remaining spatial neighbours.
+  for (i = 2; i < MVREF_NEIGHBOURS; ++i) {
+    if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
+        ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
+
+      candidate_mi = here + mv_ref_search[i][0] +
+                     (mv_ref_search[i][1] * xd->mode_info_stride);
+
+      valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
+                                         &c_ref_frame, &c_refmv,
+                                         &c2_ref_frame, &c2_refmv);
+
+      // If there is a valid MV candidate then add it to the list
+      if (valid_mv_ref) {
+        scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
+        ref_weight = ref_distance_weight[i] +
+                     ((c_ref_frame == ref_frame) << 4);
+
+        addmv_and_shuffle(candidate_mvs, candidate_scores,
+                          &index, c_refmv, ref_weight);
+
+        // If there is a second valid mv then add it as well.
+        if (c2_ref_frame != INTRA_FRAME) {
+          scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
+          ref_weight = ref_distance_weight[i] +
+                       ((c2_ref_frame == ref_frame) << 4);
+
+          addmv_and_shuffle(candidate_mvs, candidate_scores,
+                            &index, c2_refmv, ref_weight);
+        }
+      }
+    }
+  }
+
+  // 0,0 is always a valid reference.
+  for (i = 0; i < index; ++i)
+    if (candidate_mvs[i].as_int == 0)
+      break;
+  if (i == index) {
+    c_refmv.as_int = 0;
+    addmv_and_shuffle(candidate_mvs, candidate_scores,
+                      &index, c_refmv, candidate_scores[3] + 1);
+  }
+
+  // Copy over the candidate list.
+  vpx_memcpy(mv_ref_list, candidate_mvs, sizeof(candidate_mvs));
+}
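+
+// Note on the weighting above (a sketch of the arithmetic, not a spec): a
+// candidate whose reference frame matches ref_frame receives a +16 bonus
+// ((c_ref_frame == ref_frame) << 4) on top of its distance weight. Assuming
+// the entries of ref_distance_weight stay below 16, any matching candidate
+// therefore outranks every non-matching one; the temporal (last-frame)
+// candidate uses a fixed base weight of 2 instead of a distance weight.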
+
+#endif
--- /dev/null
+++ b/vp9/common/mvref_common.h
@@ -1,0 +1,31 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "onyxc_int.h"
+#include "blockd.h"
+
+// MV reference entropy header file.
+#if CONFIG_NEWBESTREFMV
+
+#ifndef __INC_MVREF_COMMON_H
+#define __INC_MVREF_COMMON_H
+
+void vp9_find_mv_refs(
+  MACROBLOCKD *xd,
+  MODE_INFO *here,
+  MODE_INFO *lf_here,
+  MV_REFERENCE_FRAME ref_frame,
+  int_mv *mv_ref_list,
+  int *ref_sign_bias
+);
+
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/common/onyx.h
@@ -1,0 +1,225 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ONYX_H
+#define __INC_ONYX_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vp8cx.h"
+#include "vpx_scale/yv12config.h"
+#include "type_aliases.h"
+#include "ppflags.h"
+  typedef int *VP9_PTR;
+
+  /* Create/destroy static data structures. */
+
+  typedef enum {
+    NORMAL      = 0,
+    FOURFIVE    = 1,
+    THREEFIVE   = 2,
+    ONETWO      = 3
+
+  } VPX_SCALING;
+
+  typedef enum {
+    VP9_LAST_FLAG = 1,
+    VP9_GOLD_FLAG = 2,
+    VP9_ALT_FLAG = 4
+  } VP9_REFFRAME;
+
+
+  typedef enum {
+    USAGE_STREAM_FROM_SERVER    = 0x0,
+    USAGE_LOCAL_FILE_PLAYBACK   = 0x1,
+    USAGE_CONSTRAINED_QUALITY   = 0x2
+  } END_USAGE;
+
+
+  typedef enum {
+    MODE_GOODQUALITY    = 0x1,
+    MODE_BESTQUALITY    = 0x2,
+    MODE_FIRSTPASS      = 0x3,
+    MODE_SECONDPASS     = 0x4,
+    MODE_SECONDPASS_BEST = 0x5,
+  } MODE;
+
+  typedef enum {
+    FRAMEFLAGS_KEY    = 1,
+    FRAMEFLAGS_GOLDEN = 2,
+    FRAMEFLAGS_ALTREF = 4,
+  } FRAMETYPE_FLAGS;
+
+
+#include <assert.h>
+  static __inline void Scale2Ratio(int mode, int *hr, int *hs) {
+    switch (mode) {
+      case    NORMAL:
+        *hr = 1;
+        *hs = 1;
+        break;
+      case    FOURFIVE:
+        *hr = 4;
+        *hs = 5;
+        break;
+      case    THREEFIVE:
+        *hr = 3;
+        *hs = 5;
+        break;
+      case    ONETWO:
+        *hr = 1;
+        *hs = 2;
+        break;
+      default:
+        *hr = 1;
+        *hs = 1;
+        assert(0);
+        break;
+    }
+  }
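+
+  // Usage sketch (illustrative values): the returned ratio is applied as
+  // dimension * hr / hs, e.g.
+  //
+  //   int hr, hs;
+  //   Scale2Ratio(FOURFIVE, &hr, &hs);    // hr = 4, hs = 5
+  //   int scaled_width = 640 * hr / hs;   // 512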
+
+  typedef struct {
+    int Version;            // 4 bitstream versions defined: 0 = best quality/slowest decode, 3 = lowest quality/fastest decode
+    int Width;              // width of data passed to the compressor
+    int Height;             // height of data passed to the compressor
+    double frame_rate;       // set to the passed-in framerate
+    int target_bandwidth;    // bandwidth to be used in kilobits per second
+
+    int noise_sensitivity;   // parameter used for applying pre-processing blur: recommendation 0
+    int Sharpness;          // parameter used for sharpening output: recommendation 0
+    int cpu_used;
+    unsigned int rc_max_intra_bitrate_pct;
+
+    // mode ->
+    // (0)=Realtime/Live Encoding. This mode is optimized for realtime encoding (for example, capturing
+    //    a television signal or feed from a live camera). ( speed setting controls how fast )
+    // (1)=Good Quality Fast Encoding. The encoder balances quality with the amount of time it takes to
+    //    encode the output. ( speed setting controls how fast )
+    // (2)=One Pass - Best Quality. The encoder places priority on the quality of the output over encoding
+    //    speed. The output is compressed at the highest possible quality. This option takes the longest
+    //    amount of time to encode. ( speed setting ignored )
+    // (3)=Two Pass - First Pass. The encoder generates a file of statistics for use in the second encoding
+    //    pass. ( speed setting controls how fast )
+    // (4)=Two Pass - Second Pass. The encoder uses the statistics that were generated in the first encoding
+    //    pass to create the compressed output. ( speed setting controls how fast )
+    // (5)=Two Pass - Second Pass Best.  The encoder uses the statistics that were generated in the first
+    //    encoding pass to create the compressed output using the highest possible quality, and taking a
+    //    longer amount of time to encode. ( speed setting ignored )
+    int Mode;               //
+
+    // Key Framing Operations
+    int auto_key;            // automatically detect cut scenes and set the keyframes
+    int key_freq;            // maximum distance to key frame.
+
+    int allow_lag;           // allow lagged compression (if 0, lag_in_frames is ignored)
+    int lag_in_frames;        // how many frames to lag before we start encoding
+
+    // ----------------------------------------------------------------
+    // DATARATE CONTROL OPTIONS
+
+    int end_usage; // vbr or cbr
+
+    // buffer targeting aggressiveness
+    int under_shoot_pct;
+    int over_shoot_pct;
+
+    // buffering parameters
+    int starting_buffer_level;  // in seconds
+    int optimal_buffer_level;
+    int maximum_buffer_size;
+
+    // controlling quality
+    int fixed_q;
+    int worst_allowed_q;
+    int best_allowed_q;
+    int cq_level;
+    int lossless;
+
+    // two pass datarate control
+    int two_pass_vbrbias;        // two pass datarate control tweaks
+    int two_pass_vbrmin_section;
+    int two_pass_vbrmax_section;
+    // END DATARATE CONTROL OPTIONS
+    // ----------------------------------------------------------------
+
+
+    // these parameters aren't to be used in the final build; don't use!
+    int play_alternate;
+    int alt_freq;
+
+    int encode_breakout;  // early breakout encode threshold: for video conferencing, recommend 800
+
+    int arnr_max_frames;
+    int arnr_strength;
+    int arnr_type;
+
+    struct vpx_fixed_buf         two_pass_stats_in;
+    struct vpx_codec_pkt_list  *output_pkt_list;
+
+    vp8e_tuning tuning;
+  } VP9_CONFIG;
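+
+  // A minimal configuration sketch (illustrative values only; see the
+  // per-field comments above for the recommended settings):
+  //
+  //   VP9_CONFIG oxcf = {0};
+  //   oxcf.Width = 640;
+  //   oxcf.Height = 480;
+  //   oxcf.frame_rate = 30.0;
+  //   oxcf.target_bandwidth = 1000;   // kilobits per second
+  //   oxcf.Mode = MODE_GOODQUALITY;
+  //   VP9_PTR enc = vp9_create_compressor(&oxcf);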
+
+
+  void vp9_initialize_enc();
+
+  VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf);
+  void vp9_remove_compressor(VP9_PTR *comp);
+
+  void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf);
+
+// Receive a frame's worth of data. The caller can assume that a copy of this
+// frame is made and not just a copy of the pointer.
+  int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags,
+                            YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                            int64_t end_time_stamp);
+
+  int vp9_get_compressed_data(VP9_PTR comp, unsigned int *frame_flags,
+                              unsigned long *size, unsigned char *dest,
+                              int64_t *time_stamp, int64_t *time_end,
+                              int flush);
+
+  int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
+                                vp9_ppflags_t *flags);
+
+  int vp9_use_as_reference(VP9_PTR comp, int ref_frame_flags);
+
+  int vp9_update_reference(VP9_PTR comp, int ref_frame_flags);
+
+  int vp9_get_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,
+                            YV12_BUFFER_CONFIG *sd);
+
+  int vp9_set_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,
+                            YV12_BUFFER_CONFIG *sd);
+
+  int vp9_update_entropy(VP9_PTR comp, int update);
+
+  int vp9_set_roimap(VP9_PTR comp, unsigned char *map,
+                     unsigned int rows, unsigned int cols,
+                     int delta_q[4], int delta_lf[4],
+                     unsigned int threshold[4]);
+
+  int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
+                         unsigned int rows, unsigned int cols);
+
+  int vp9_set_internal_size(VP9_PTR comp,
+                            VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
+
+  int vp9_get_quantizer(VP9_PTR c);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // __INC_ONYX_H
--- /dev/null
+++ b/vp9/common/onyxc_int.h
@@ -1,0 +1,314 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ONYXC_INT_H
+#define __INC_ONYXC_INT_H
+
+#include "vpx_config.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_rtcd.h"
+#include "loopfilter.h"
+#include "entropymv.h"
+#include "entropy.h"
+#include "entropymode.h"
+#include "idct.h"
+#if CONFIG_POSTPROC
+#include "postproc.h"
+#endif
+
+/*#ifdef PACKET_TESTING*/
+#include "header.h"
+/*#endif*/
+
+/* Create/destroy static data structures. */
+
+void vp9_initialize_common(void);
+
+#define MINQ 0
+
+#define MAXQ 255
+#define QINDEX_BITS 8
+
+#define QINDEX_RANGE (MAXQ + 1)
+
+#define NUM_YV12_BUFFERS 4
+
+#define COMP_PRED_CONTEXTS   2
+
+typedef struct frame_contexts {
+  vp9_prob bmode_prob [VP9_BINTRAMODES - 1];
+  vp9_prob ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
+  vp9_prob uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
+  vp9_prob i8x8_mode_prob [VP9_I8X8_MODES - 1];
+  vp9_prob sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
+  vp9_prob mbsplit_prob [VP9_NUMMBSPLITS - 1];
+  vp9_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_prob hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_prob coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_prob hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_prob coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_prob hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+  nmv_context nmvc;
+  nmv_context pre_nmvc;
+  vp9_prob pre_bmode_prob [VP9_BINTRAMODES - 1];
+  vp9_prob pre_ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
+  vp9_prob pre_uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
+  vp9_prob pre_i8x8_mode_prob [VP9_I8X8_MODES - 1];
+  vp9_prob pre_sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
+  vp9_prob pre_mbsplit_prob [VP9_NUMMBSPLITS - 1];
+  unsigned int bmode_counts [VP9_BINTRAMODES];
+  unsigned int ymode_counts [VP9_YMODES];   /* interframe intra mode probs */
+  unsigned int uv_mode_counts [VP9_YMODES][VP9_UV_MODES];
+  unsigned int i8x8_mode_counts [VP9_I8X8_MODES];   /* interframe intra mode probs */
+  unsigned int sub_mv_ref_counts [SUBMVREF_COUNT][VP9_SUBMVREFS];
+  unsigned int mbsplit_counts [VP9_NUMMBSPLITS];
+
+  vp9_prob pre_coef_probs [BLOCK_TYPES] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_prob pre_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+  vp9_prob pre_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_prob pre_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+  vp9_prob pre_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_prob pre_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+  unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+  unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+  unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+  unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+  unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+  unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
+      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+  nmv_context_counts NMVcount;
+  vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
+                                 [VP9_SWITCHABLE_FILTERS - 1];
+
+  int mode_context[6][4];
+  int mode_context_a[6][4];
+  int vp8_mode_contexts[6][4];
+  int mv_ref_ct[6][4][2];
+  int mv_ref_ct_a[6][4][2];
+} FRAME_CONTEXT;
+
+typedef enum {
+  RECON_CLAMP_REQUIRED        = 0,
+  RECON_CLAMP_NOTREQUIRED     = 1
+} CLAMP_TYPE;
+
+typedef enum {
+  SINGLE_PREDICTION_ONLY = 0,
+  COMP_PREDICTION_ONLY   = 1,
+  HYBRID_PREDICTION      = 2,
+  NB_PREDICTION_TYPES    = 3,
+} COMPPREDMODE_TYPE;
+
+typedef enum {
+  ONLY_4X4            = 0,
+  ALLOW_8X8           = 1,
+  ALLOW_16X16         = 2,
+  TX_MODE_SELECT      = 3,
+  NB_TXFM_MODES       = 4,
+} TXFM_MODE;
+
+typedef struct VP9_COMMON_RTCD {
+#if CONFIG_RUNTIME_CPU_DETECT
+  vp9_idct_rtcd_vtable_t        idct;
+  vp9_subpix_rtcd_vtable_t      subpix;
+#if CONFIG_POSTPROC
+  vp9_postproc_rtcd_vtable_t    postproc;
+#endif
+  int                           flags;
+#else
+  int unused;
+#endif
+} VP9_COMMON_RTCD;
+
+typedef struct VP9Common {
+  struct vpx_internal_error_info  error;
+
+  DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][16]);
+
+  int Width;
+  int Height;
+  int horiz_scale;
+  int vert_scale;
+
+  YUV_TYPE clr_type;
+  CLAMP_TYPE  clamp_type;
+
+  YV12_BUFFER_CONFIG *frame_to_show;
+
+  YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
+  int fb_idx_ref_cnt[NUM_YV12_BUFFERS];
+  int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx;
+
+  YV12_BUFFER_CONFIG post_proc_buffer;
+  YV12_BUFFER_CONFIG temp_scale_frame;
+
+
+  FRAME_TYPE last_frame_type;  /* Save last frame's frame type for motion search. */
+  FRAME_TYPE frame_type;
+
+  int show_frame;
+
+  int frame_flags;
+  int MBs;
+  int mb_rows;
+  int mb_cols;
+  int mode_info_stride;
+
+  /* profile settings */
+  int experimental;
+  int mb_no_coeff_skip;
+  TXFM_MODE txfm_mode;
+  COMPPREDMODE_TYPE comp_pred_mode;
+  int no_lpf;
+  int use_bilinear_mc_filter;
+  int full_pixel;
+
+  int base_qindex;
+  int last_kf_gf_q;  /* Q used on the last GF or KF */
+
+  int y1dc_delta_q;
+  int y2dc_delta_q;
+  int y2ac_delta_q;
+  int uvdc_delta_q;
+  int uvac_delta_q;
+
+  unsigned int frames_since_golden;
+  unsigned int frames_till_alt_ref_frame;
+
+  /* We allocate a MODE_INFO struct for each macroblock, together with
+     an extra row on top and column on the left to simplify prediction. */
+
+  MODE_INFO *mip; /* Base of allocated array */
+  MODE_INFO *mi;  /* Corresponds to upper left visible macroblock */
+  MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
+  MODE_INFO *prev_mi;  /* 'mi' from last frame (points into prev_mip) */
+
+
+  // Persistent mb segment id map used in prediction.
+  unsigned char *last_frame_seg_map;
+
+  INTERPOLATIONFILTERTYPE mcomp_filter_type;
+  LOOPFILTERTYPE filter_type;
+
+  loop_filter_info_n lf_info;
+
+  int filter_level;
+  int last_sharpness_level;
+  int sharpness_level;
+
+  int refresh_last_frame;       /* Two state 0 = NO, 1 = YES */
+  int refresh_golden_frame;     /* Two state 0 = NO, 1 = YES */
+  int refresh_alt_ref_frame;     /* Two state 0 = NO, 1 = YES */
+
+  int copy_buffer_to_gf;         /* 0 none, 1 Last to GF, 2 ARF to GF */
+  int copy_buffer_to_arf;        /* 0 none, 1 Last to ARF, 2 GF to ARF */
+
+  int refresh_entropy_probs;    /* Two state 0 = NO, 1 = YES */
+
+  int ref_frame_sign_bias[MAX_REF_FRAMES];    /* Two state 0, 1 */
+
+  /* Y,U,V,Y2 */
+  ENTROPY_CONTEXT_PLANES *above_context;   /* row of context for each plane */
+  ENTROPY_CONTEXT_PLANES left_context[2];  /* (up to) 4 contexts "" */
+
+  /* keyframe block modes are predicted by their above, left neighbors */
+
+  vp9_prob kf_bmode_prob [VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES - 1];
+  vp9_prob kf_ymode_prob[8][VP9_YMODES - 1]; /* keyframe "" */
+#if CONFIG_SUPERBLOCKS
+  vp9_prob sb_kf_ymode_prob[8][VP9_I32X32_MODES - 1];
+#endif
+  int kf_ymode_probs_index;
+  int kf_ymode_probs_update;
+  vp9_prob kf_uv_mode_prob[VP9_YMODES] [VP9_UV_MODES - 1];
+
+  vp9_prob prob_intra_coded;
+  vp9_prob prob_last_coded;
+  vp9_prob prob_gf_coded;
+#if CONFIG_SUPERBLOCKS
+  vp9_prob sb_coded;
+#endif
+
+  // Context probabilities when using predictive coding of segment id
+  vp9_prob segment_pred_probs[PREDICTION_PROBS];
+  unsigned char temporal_update;
+
+  // Context probabilities for reference frame prediction
+  unsigned char ref_scores[MAX_REF_FRAMES];
+  vp9_prob ref_pred_probs[PREDICTION_PROBS];
+  vp9_prob mod_refprobs[MAX_REF_FRAMES][PREDICTION_PROBS];
+
+  vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
+
+  // FIXME contextualize
+  vp9_prob prob_tx[TX_SIZE_MAX - 1];
+
+  vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS];
+
+  FRAME_CONTEXT lfc_a; /* last alt ref entropy */
+  FRAME_CONTEXT lfc; /* last frame entropy */
+  FRAME_CONTEXT fc;  /* this frame entropy */
+
+  // int mv_ref_ct[6][4][2];
+  // int mv_ref_ct_a[6][4][2];
+  // int mode_context[6][4];
+  // int mode_context_a[6][4];
+  // int vp8_mode_contexts[6][4];
+
+  unsigned int current_video_frame;
+  int near_boffset[3];
+  int version;
+
+#ifdef PACKET_TESTING
+  VP9_HEADER oh;
+#endif
+  double bitrate;
+  double framerate;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+  VP9_COMMON_RTCD rtcd;
+#endif
+
+#if CONFIG_POSTPROC
+  struct postproc_state  postproc_state;
+#endif
+
+#if CONFIG_PRED_FILTER
+  /* Prediction filter variables */
+  int pred_filter_mode;   // 0=disabled at the frame level (no MB filtered)
+                          // 1=enabled at the frame level (all MB filtered)
+                          // 2=specified per MB (1=filtered, 0=non-filtered)
+  vp9_prob prob_pred_filter_off;
+#endif
+
+} VP9_COMMON;
+
+#endif  // __INC_ONYXC_INT_H
--- /dev/null
+++ b/vp9/common/onyxd.h
@@ -1,0 +1,68 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ONYXD_H
+#define __INC_ONYXD_H
+
+
+/* Create/destroy static data structures. */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include "type_aliases.h"
+#include "vpx_scale/yv12config.h"
+#include "ppflags.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_codec.h"
+
+  typedef void   *VP9D_PTR;
+  typedef struct {
+    int     Width;
+    int     Height;
+    int     Version;
+    int     postprocess;
+    int     max_threads;
+    int     input_partition;
+  } VP9D_CONFIG;
+  typedef enum {
+    VP9_LAST_FLAG = 1,
+    VP9_GOLD_FLAG = 2,
+    VP9_ALT_FLAG = 4
+  } VP9_REFFRAME;
+
+  void vp9_initialize_dec(void);
+
+  int vp9_receive_compressed_data(VP9D_PTR comp, unsigned long size,
+                                  const unsigned char *dest,
+                                  int64_t time_stamp);
+
+  int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
+                        int64_t *time_stamp, int64_t *time_end_stamp,
+                        vp9_ppflags_t *flags);
+
+  vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR comp,
+                                        VP9_REFFRAME ref_frame_flag,
+                                        YV12_BUFFER_CONFIG *sd);
+
+  vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp,
+                                        VP9_REFFRAME ref_frame_flag,
+                                        YV12_BUFFER_CONFIG *sd);
+
+  VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf);
+
+  void vp9_remove_decompressor(VP9D_PTR comp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // __INC_ONYXD_H
--- /dev/null
+++ b/vp9/common/postproc.c
@@ -1,0 +1,1035 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_scale/yv12config.h"
+#include "postproc.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_scale/vpxscale.h"
+#include "systemdependent.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define RGB_TO_YUV(t)                                            \
+  ( (0.257*(float)(t >> 16))  + (0.504*(float)(t >> 8 & 0xff)) + \
+    (0.098*(float)(t & 0xff)) + 16),                             \
+  (-(0.148*(float)(t >> 16))  - (0.291*(float)(t >> 8 & 0xff)) + \
+    (0.439*(float)(t & 0xff)) + 128),                            \
+  ( (0.439*(float)(t >> 16))  - (0.368*(float)(t >> 8 & 0xff)) - \
+    (0.071*(float)(t & 0xff)) + 128)
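+
+/* A sanity check of the macro above (BT.601 studio-swing constants): for
+ * white, t = 0xFFFFFF, every channel is 255, so
+ *   Y = (0.257 + 0.504 + 0.098) * 255 + 16 ~= 235
+ *   U = (-0.148 - 0.291 + 0.439) * 255 + 128 = 128
+ *   V = (0.439 - 0.368 - 0.071) * 255 + 128 = 128
+ * i.e. white maps to (Y, U, V) ~= (235, 128, 128), the studio-range peak. */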
+
+/* global constants */
+#if CONFIG_POSTPROC_VISUALIZER
+static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = {
+  { RGB_TO_YUV(0x98FB98) },   /* PaleGreen */
+  { RGB_TO_YUV(0x00FF00) },   /* Green */
+  { RGB_TO_YUV(0xADFF2F) },   /* GreenYellow */
+  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
+  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
+  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
+  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
+  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
+  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
+  { RGB_TO_YUV(0x228B22) },   /* ForestGreen */
+  { RGB_TO_YUV(0x006400) },   /* DarkGreen */
+  { RGB_TO_YUV(0x98F5FF) },   /* Cadet Blue */
+  { RGB_TO_YUV(0x6CA6CD) },   /* Sky Blue */
+  { RGB_TO_YUV(0x00008B) },   /* Dark blue */
+  { RGB_TO_YUV(0x551A8B) },   /* Purple */
+  { RGB_TO_YUV(0xCC33FF) },   /* Magenta */
+  { RGB_TO_YUV(0xFF0000) }    /* Red */
+};
+
+static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] = {
+  { RGB_TO_YUV(0x6633ff) },   /* Purple */
+  { RGB_TO_YUV(0xcc33ff) },   /* Magenta */
+  { RGB_TO_YUV(0xff33cc) },   /* Pink */
+  { RGB_TO_YUV(0xff3366) },   /* Coral */
+  { RGB_TO_YUV(0x3366ff) },   /* Blue */
+  { RGB_TO_YUV(0xed00f5) },   /* Dark Blue */
+  { RGB_TO_YUV(0x2e00b8) },   /* Dark Purple */
+  { RGB_TO_YUV(0xff6633) },   /* Orange */
+  { RGB_TO_YUV(0x33ccff) },   /* Light Blue */
+  { RGB_TO_YUV(0x8ab800) },   /* Green */
+  { RGB_TO_YUV(0xffcc33) },   /* Light Orange */
+  { RGB_TO_YUV(0x33ffcc) },   /* Aqua */
+  { RGB_TO_YUV(0x66ff33) },   /* Light Green */
+  { RGB_TO_YUV(0xccff33) },   /* Yellow */
+};
+
+static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = {
+  { RGB_TO_YUV(0x00ff00) },   /* Green */
+  { RGB_TO_YUV(0x0000ff) },   /* Blue */
+  { RGB_TO_YUV(0xffff00) },   /* Yellow */
+  { RGB_TO_YUV(0xff0000) },   /* Red */
+};
+#endif
+
+static const short kernel5[] = {
+  1, 1, 4, 1, 1
+};
+
+const short vp9_rv[] = {
+  8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
+  0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
+  10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
+  8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
+  8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
+  1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
+  3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
+  11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
+  14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
+  4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
+  7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
+  0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+  8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+  3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+  3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+  13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+  5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+  9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+  4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+  3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+  11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+  5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+  0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+  10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+  4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+  0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+  8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+  3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+  3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+  13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+  5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+  9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+  4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+  3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+  11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+  5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+  0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+  10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+  4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+  3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
+  11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
+  14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
+  5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
+  0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
+};
+
+
+extern void vp9_blit_text(const char *msg, unsigned char *address,
+                          const int pitch);
+extern void vp9_blit_line(int x0, int x1, int y0, int y1,
+                          unsigned char *image, const int pitch);
+/****************************************************************************
+ */
+void vp9_post_proc_down_and_across_c(unsigned char *src_ptr,
+                                     unsigned char *dst_ptr,
+                                     int src_pixels_per_line,
+                                     int dst_pixels_per_line,
+                                     int rows,
+                                     int cols,
+                                     int flimit) {
+  unsigned char *p_src, *p_dst;
+  int row;
+  int col;
+  int i;
+  int v;
+  int pitch = src_pixels_per_line;
+  unsigned char d[8];
+  (void)dst_pixels_per_line;
+
+  for (row = 0; row < rows; row++) {
+    /* post_proc_down for one row */
+    p_src = src_ptr;
+    p_dst = dst_ptr;
+
+    for (col = 0; col < cols; col++) {
+
+      int kernel = 4;
+      int v = p_src[col];
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - p_src[col + i * pitch]) > flimit)
+          goto down_skip_convolve;
+
+        kernel += kernel5[2 + i] * p_src[col + i * pitch];
+      }
+
+      v = (kernel >> 3);
+    down_skip_convolve:
+      p_dst[col] = v;
+    }
+
+    /* now post_proc_across */
+    p_src = dst_ptr;
+    p_dst = dst_ptr;
+
+    for (i = 0; i < 8; i++)
+      d[i] = p_src[i];
+
+    for (col = 0; col < cols; col++) {
+      int kernel = 4;
+      v = p_src[col];
+
+      d[col & 7] = v;
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - p_src[col + i]) > flimit)
+          goto across_skip_convolve;
+
+        kernel += kernel5[2 + i] * p_src[col + i];
+      }
+
+      d[col & 7] = (kernel >> 3);
+    across_skip_convolve:
+
+      if (col >= 2)
+        p_dst[col - 2] = d[(col - 2) & 7];
+    }
+
+    /* handle the last two pixels */
+    p_dst[col - 2] = d[(col - 2) & 7];
+    p_dst[col - 1] = d[(col - 1) & 7];
+
+
+    /* next row */
+    src_ptr += pitch;
+    dst_ptr += pitch;
+  }
+}
+
+static int q2mbl(int x) {
+  if (x < 20) x = 20;
+
+  x = 50 + (x - 50) * 10 / 8;
+  return x * x / 3;
+}
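+
+// Worked example (plain integer arithmetic): q2mbl(100) remaps x to
+// 50 + (100 - 50) * 10 / 8 = 112 and returns 112 * 112 / 3 = 4181, so the
+// macroblock-level filter threshold grows roughly quadratically in q.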
+
+void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch,
+                                 int rows, int cols, int flimit) {
+  int r, c, i;
+
+  unsigned char *s = src;
+  unsigned char d[16];
+
+
+  for (r = 0; r < rows; r++) {
+    int sumsq = 0;
+    int sum   = 0;
+
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i] * s[i];
+      sum   += s[i];
+      d[i + 8] = 0;
+    }
+
+    for (c = 0; c < cols + 8; c++) {
+      int x = s[c + 7] - s[c - 8];
+      int y = s[c + 7] + s[c - 8];
+
+      sum  += x;
+      sumsq += x * y;
+
+      d[c & 15] = s[c];
+
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[c & 15] = (8 + sum + s[c]) >> 4;
+      }
+
+      s[c - 8] = d[(c - 8) & 15];
+    }
+
+    s += pitch;
+  }
+}
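+
+// The running update above uses the identity (a - b) * (a + b) = a^2 - b^2,
+// so sumsq tracks the sum of squares over the sliding 15-tap window without
+// re-summing it. The test sumsq * 15 - sum * sum < flimit is
+// n * sum(x^2) - (sum x)^2 with n = 15, i.e. n^2 times the window variance:
+// only smooth spans are replaced by (8 + sum + s[c]) >> 4, the rounded mean
+// of the window with the centre pixel counted twice.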
+
+void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch,
+                            int rows, int cols, int flimit) {
+  int r, c, i;
+  const short *rv3 = &vp9_rv[63 & rand()];
+
+  for (c = 0; c < cols; c++) {
+    unsigned char *s = &dst[c];
+    int sumsq = 0;
+    int sum   = 0;
+    unsigned char d[16];
+    const short *rv2 = rv3 + ((c * 17) & 127);
+
+    for (i = -8; i <= 6; i++) {
+      sumsq += s[i * pitch] * s[i * pitch];
+      sum   += s[i * pitch];
+    }
+
+    for (r = 0; r < rows + 8; r++) {
+      sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
+      sum  += s[7 * pitch] - s[-8 * pitch];
+      d[r & 15] = s[0];
+
+      if (sumsq * 15 - sum * sum < flimit) {
+        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+      }
+
+      s[-8 * pitch] = d[(r - 8) & 15];
+      s += pitch;
+    }
+  }
+}
+
+static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG   *source,
+                                       YV12_BUFFER_CONFIG   *post,
+                                       int                   q,
+                                       int                   low_var_thresh,
+                                       int                   flag,
+                                       vp9_postproc_rtcd_vtable_t *rtcd) {
+  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+  int ppl = (int)(level + .5);
+  (void) low_var_thresh;
+  (void) flag;
+
+  POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer,
+                                    source->y_stride,  post->y_stride,
+                                    source->y_height, source->y_width,  ppl);
+  POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride,
+                                post->y_height, post->y_width, q2mbl(q));
+  POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride,
+                              post->y_height, post->y_width, q2mbl(q));
+
+  POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer,
+                                    source->uv_stride, post->uv_stride,
+                                    source->uv_height, source->uv_width, ppl);
+  POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer,
+                                    source->uv_stride, post->uv_stride,
+                                    source->uv_height, source->uv_width, ppl);
+}
+
+void vp9_deblock(YV12_BUFFER_CONFIG         *source,
+                 YV12_BUFFER_CONFIG         *post,
+                 int                         q,
+                 int                         low_var_thresh,
+                 int                         flag,
+                 vp9_postproc_rtcd_vtable_t *rtcd) {
+  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+  int ppl = (int)(level + .5);
+  (void) low_var_thresh;
+  (void) flag;
+
+  POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer,
+                                    source->y_stride,  post->y_stride,
+                                    source->y_height, source->y_width,   ppl);
+  POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer,
+                                    source->uv_stride, post->uv_stride,
+                                    source->uv_height, source->uv_width, ppl);
+  POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer,
+                                    source->uv_stride, post->uv_stride,
+                                    source->uv_height, source->uv_width, ppl);
+}
+
+void vp9_de_noise(YV12_BUFFER_CONFIG         *src,
+                  YV12_BUFFER_CONFIG         *post,
+                  int                         q,
+                  int                         low_var_thresh,
+                  int                         flag,
+                  vp9_postproc_rtcd_vtable_t *rtcd) {
+  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+  int ppl = (int)(level + .5);
+  (void) post;
+  (void) low_var_thresh;
+  (void) flag;
+
+  POSTPROC_INVOKE(rtcd, downacross)(src->y_buffer + 2 * src->y_stride + 2,
+                                    src->y_buffer + 2 * src->y_stride + 2,
+                                    src->y_stride,
+                                    src->y_stride,
+                                    src->y_height - 4,
+                                    src->y_width - 4,
+                                    ppl);
+  POSTPROC_INVOKE(rtcd, downacross)(src->u_buffer + 2 * src->uv_stride + 2,
+                                    src->u_buffer + 2 * src->uv_stride + 2,
+                                    src->uv_stride,
+                                    src->uv_stride,
+                                    src->uv_height - 4,
+                                    src->uv_width - 4, ppl);
+  POSTPROC_INVOKE(rtcd, downacross)(src->v_buffer + 2 * src->uv_stride + 2,
+                                    src->v_buffer + 2 * src->uv_stride + 2,
+                                    src->uv_stride,
+                                    src->uv_stride,
+                                    src->uv_height - 4,
+                                    src->uv_width - 4, ppl);
+}
+
+double vp9_gaussian(double sigma, double mu, double x) {
+  return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
+         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
+}
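+
+// Spot check: vp9_gaussian(1.0, 0.0, 0.0) evaluates the standard normal
+// density at its peak, 1 / sqrt(2 * pi) ~= 0.3989; fillrd() below scales
+// this by 256 to build the noise lookup table.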
+
+static void fillrd(struct postproc_state *state, int q, int a) {
+  char char_dist[300];
+
+  double sigma;
+  int ai = a, qi = q, i;
+
+  vp9_clear_system_state();
+
+  sigma = ai + .5 + .6 * (63 - qi) / 63.0;
+
+  /* set up a lookup table of 256 entries that matches
+   * a gaussian distribution with sigma determined by q.
+   */
+  {
+    double i;
+    int next, j;
+
+    next = 0;
+
+    for (i = -32; i < 32; i++) {
+      int a = (int)(.5 + 256 * vp9_gaussian(sigma, 0, i));
+
+      if (a) {
+        for (j = 0; j < a; j++) {
+          char_dist[next + j] = (char) i;
+        }
+
+        next = next + j;
+      }
+
+    }
+
+    for (; next < 256; next++)
+      char_dist[next] = 0;
+  }
+
+  for (i = 0; i < 3072; i++) {
+    state->noise[i] = char_dist[rand() & 0xff];
+  }
+
+  for (i = 0; i < 16; i++) {
+    state->blackclamp[i] = -char_dist[0];
+    state->whiteclamp[i] = -char_dist[0];
+    state->bothclamp[i] = -2 * char_dist[0];
+  }
+
+  state->last_q = q;
+  state->last_noise = a;
+}
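+
+// The table built above is an inverse-CDF-style sampler (a sketch of the
+// idea): each noise value i in [-32, 32) occupies roughly
+// 256 * vp9_gaussian(sigma, 0, i) consecutive slots of char_dist, so
+// indexing the table with a uniform rand() & 0xff yields approximately
+// Gaussian-distributed noise without evaluating exp() per pixel.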
+
+/****************************************************************************
+ *
+ *  ROUTINE       : plane_add_noise_c
+ *
+ *  INPUTS        : unsigned char *Start  starting address of buffer to
+ *                                        add gaussian noise to
+ *                  unsigned int Width    width of plane
+ *                  unsigned int Height   height of plane
+ *                  int  Pitch    distance between subsequent lines of frame
+ *                  int  q        quantizer used to determine amount of noise
+ *                                  to add
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void.
+ *
+ *  FUNCTION      : adds gaussian noise to a plane of pixels
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void vp9_plane_add_noise_c(unsigned char *Start, char *noise,
+                           char blackclamp[16],
+                           char whiteclamp[16],
+                           char bothclamp[16],
+                           unsigned int Width, unsigned int Height, int Pitch) {
+  unsigned int i, j;
+
+  for (i = 0; i < Height; i++) {
+    unsigned char *Pos = Start + i * Pitch;
+    char  *Ref = (char *)(noise + (rand() & 0xff));
+
+    for (j = 0; j < Width; j++) {
+      if (Pos[j] < blackclamp[0])
+        Pos[j] = blackclamp[0];
+
+      if (Pos[j] > 255 + whiteclamp[0])
+        Pos[j] = 255 + whiteclamp[0];
+
+      Pos[j] += Ref[j];
+    }
+  }
+}
+
+/* Blend the macro block with a solid colored square.  Leave the
+ * edges unblended to give distinction to macro blocks in areas
+ * filled with the same color block.
+ */
+void vp9_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v,
+                          int y1, int u1, int v1, int alpha, int stride) {
+  int i, j;
+  int y1_const = y1 * ((1 << 16) - alpha);
+  int u1_const = u1 * ((1 << 16) - alpha);
+  int v1_const = v1 * ((1 << 16) - alpha);
+
+  y += 2 * stride + 2;
+  for (i = 0; i < 12; i++) {
+    for (j = 0; j < 12; j++) {
+      y[j] = (y[j] * alpha + y1_const) >> 16;
+    }
+    y += stride;
+  }
+
+  stride >>= 1;
+
+  u += stride + 1;
+  v += stride + 1;
+
+  for (i = 0; i < 6; i++) {
+    for (j = 0; j < 6; j++) {
+      u[j] = (u[j] * alpha + u1_const) >> 16;
+      v[j] = (v[j] * alpha + v1_const) >> 16;
+    }
+    u += stride;
+    v += stride;
+  }
+}
+
+/* Blend only the edge of the macro block.  Leave center
+ * unblended to allow for other visualizations to be layered.
+ */
+void vp9_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v,
+                          int y1, int u1, int v1, int alpha, int stride) {
+  int i, j;
+  int y1_const = y1 * ((1 << 16) - alpha);
+  int u1_const = u1 * ((1 << 16) - alpha);
+  int v1_const = v1 * ((1 << 16) - alpha);
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 16; j++) {
+      y[j] = (y[j] * alpha + y1_const) >> 16;
+    }
+    y += stride;
+  }
+
+  for (i = 0; i < 12; i++) {
+    y[0]  = (y[0] * alpha  + y1_const) >> 16;
+    y[1]  = (y[1] * alpha  + y1_const) >> 16;
+    y[14] = (y[14] * alpha + y1_const) >> 16;
+    y[15] = (y[15] * alpha + y1_const) >> 16;
+    y += stride;
+  }
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 16; j++) {
+      y[j] = (y[j] * alpha + y1_const) >> 16;
+    }
+    y += stride;
+  }
+
+  stride >>= 1;
+
+  for (j = 0; j < 8; j++) {
+    u[j] = (u[j] * alpha + u1_const) >> 16;
+    v[j] = (v[j] * alpha + v1_const) >> 16;
+  }
+  u += stride;
+  v += stride;
+
+  for (i = 0; i < 6; i++) {
+    u[0] = (u[0] * alpha + u1_const) >> 16;
+    v[0] = (v[0] * alpha + v1_const) >> 16;
+
+    u[7] = (u[7] * alpha + u1_const) >> 16;
+    v[7] = (v[7] * alpha + v1_const) >> 16;
+
+    u += stride;
+    v += stride;
+  }
+
+  for (j = 0; j < 8; j++) {
+    u[j] = (u[j] * alpha + u1_const) >> 16;
+    v[j] = (v[j] * alpha + v1_const) >> 16;
+  }
+}
+
+void vp9_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v,
+                   int y1, int u1, int v1, int alpha, int stride) {
+  int i, j;
+  int y1_const = y1 * ((1 << 16) - alpha);
+  int u1_const = u1 * ((1 << 16) - alpha);
+  int v1_const = v1 * ((1 << 16) - alpha);
+
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      y[j] = (y[j] * alpha + y1_const) >> 16;
+    }
+    y += stride;
+  }
+
+  stride >>= 1;
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      u[j] = (u[j] * alpha + u1_const) >> 16;
+      v[j] = (v[j] * alpha + v1_const) >> 16;
+    }
+    u += stride;
+    v += stride;
+  }
+}
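+
+/* All three blend helpers above compute the same 16-bit fixed-point lerp:
+ * with y1_const = y1 * (65536 - alpha), the update
+ *   y[j] = (y[j] * alpha + y1_const) >> 16
+ * equals y[j] * (alpha / 65536) + y1 * (1 - alpha / 65536). The visualizer
+ * code below passes alpha = 0xc000, so the source pixel keeps a 3/4 weight
+ * and the overlay colour contributes 1/4. */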
+
+static void constrain_line(int x0, int *x1, int y0, int *y1,
+                           int width, int height) {
+  int dx;
+  int dy;
+
+  if (*x1 > width) {
+    dx = *x1 - x0;
+    dy = *y1 - y0;
+
+    *x1 = width;
+    if (dx)
+      *y1 = ((width - x0) * dy) / dx + y0;
+  }
+  if (*x1 < 0) {
+    dx = *x1 - x0;
+    dy = *y1 - y0;
+
+    *x1 = 0;
+    if (dx)
+      *y1 = ((0 - x0) * dy) / dx + y0;
+  }
+  if (*y1 > height) {
+    dx = *x1 - x0;
+    dy = *y1 - y0;
+
+    *y1 = height;
+    if (dy)
+      *x1 = ((height - y0) * dx) / dy + x0;
+  }
+  if (*y1 < 0) {
+    dx = *x1 - x0;
+    dy = *y1 - y0;
+
+    *y1 = 0;
+    if (dy)
+      *x1 = ((0 - y0) * dx) / dy + x0;
+  }
+}
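+
+// Worked example (illustrative numbers): clipping a segment from
+// (x0, y0) = (10, 0) to (*x1, *y1) = (30, 10) against a 20-pixel-wide frame
+// takes the first branch, clamping *x1 to 20 and moving *y1 to
+// ((20 - 10) * 10) / 20 + 0 = 5, so the drawn motion-vector line keeps its
+// original slope.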
+
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD_VTABLE(oci) (&(oci)->rtcd.postproc)
+#else
+#define RTCD_VTABLE(oci) NULL
+#endif
+
+int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
+                        vp9_ppflags_t *ppflags) {
+  int q = oci->filter_level * 10 / 6;
+  int flags = ppflags->post_proc_flag;
+  int deblock_level = ppflags->deblocking_level;
+  int noise_level = ppflags->noise_level;
+
+  if (!oci->frame_to_show)
+    return -1;
+
+  if (q > 63)
+    q = 63;
+
+  if (!flags) {
+    *dest = *oci->frame_to_show;
+
+    /* handle problem with extending borders */
+    dest->y_width = oci->Width;
+    dest->y_height = oci->Height;
+    dest->uv_height = dest->y_height / 2;
+    return 0;
+
+  }
+
+#if ARCH_X86 || ARCH_X86_64
+  vpx_reset_mmx_state();
+#endif
+
+  if (flags & VP9D_DEMACROBLOCK) {
+    deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
+                               q + (deblock_level - 5) * 10, 1, 0,
+                               RTCD_VTABLE(oci));
+  } else if (flags & VP9D_DEBLOCK) {
+    vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer,
+                q, 1, 0, RTCD_VTABLE(oci));
+  } else {
+    vp8_yv12_copy_frame_ptr(oci->frame_to_show, &oci->post_proc_buffer);
+  }
+
+  if (flags & VP9D_ADDNOISE) {
+    if (oci->postproc_state.last_q != q
+        || oci->postproc_state.last_noise != noise_level) {
+      fillrd(&oci->postproc_state, 63 - q, noise_level);
+    }
+
+    POSTPROC_INVOKE(RTCD_VTABLE(oci), addnoise)(oci->post_proc_buffer.y_buffer,
+                                                oci->postproc_state.noise,
+                                                oci->postproc_state.blackclamp,
+                                                oci->postproc_state.whiteclamp,
+                                                oci->postproc_state.bothclamp,
+                                                oci->post_proc_buffer.y_width,
+                                                oci->post_proc_buffer.y_height,
+                                                oci->post_proc_buffer.y_stride);
+  }
+
+#if CONFIG_POSTPROC_VISUALIZER
+  if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
+    char message[512];
+    sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
+            (oci->frame_type == KEY_FRAME),
+            oci->refresh_golden_frame,
+            oci->base_qindex,
+            oci->filter_level,
+            flags,
+            oci->mb_cols, oci->mb_rows);
+    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
+                  oci->post_proc_buffer.y_stride);
+  }
+
+  if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
+    int i, j;
+    unsigned char *y_ptr;
+    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    int mb_rows = post->y_height >> 4;
+    int mb_cols = post->y_width  >> 4;
+    int mb_index = 0;
+    MODE_INFO *mi = oci->mi;
+
+    y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+    /* vp9_filter each macro block */
+    for (i = 0; i < mb_rows; i++) {
+      for (j = 0; j < mb_cols; j++) {
+        char zz[4];
+
+        sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
+
+        vp9_blit_text(zz, y_ptr, post->y_stride);
+        mb_index++;
+        y_ptr += 16;
+      }
+
+      mb_index++; /* border */
+      y_ptr += post->y_stride  * 16 - post->y_width;
+
+    }
+  }
+
+  if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
+    int i, j;
+    unsigned char *y_ptr;
+    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    int mb_rows = post->y_height >> 4;
+    int mb_cols = post->y_width  >> 4;
+    int mb_index = 0;
+    MODE_INFO *mi = oci->mi;
+
+    y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+    /* vp9_filter each macro block */
+    for (i = 0; i < mb_rows; i++) {
+      for (j = 0; j < mb_cols; j++) {
+        char zz[4];
+        int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
+                        mi[mb_index].mbmi.mode != SPLITMV &&
+                        mi[mb_index].mbmi.mb_skip_coeff);
+
+        if (oci->frame_type == KEY_FRAME)
+          sprintf(zz, "a");
+        else
+          sprintf(zz, "%c", dc_diff + '0');
+
+        vp9_blit_text(zz, y_ptr, post->y_stride);
+        mb_index++;
+        y_ptr += 16;
+      }
+
+      mb_index++; /* border */
+      y_ptr += post->y_stride  * 16 - post->y_width;
+
+    }
+  }
+
+  if (flags & VP9D_DEBUG_TXT_RATE_INFO) {
+    char message[512];
+    snprintf(message, sizeof(message),
+             "Bitrate: %10.2f frame_rate: %10.2f ",
+             oci->bitrate, oci->framerate);
+    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
+                  oci->post_proc_buffer.y_stride);
+  }
+
+  /* Draw motion vectors */
+  if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
+    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    int width  = post->y_width;
+    int height = post->y_height;
+    unsigned char *y_buffer = oci->post_proc_buffer.y_buffer;
+    int y_stride = oci->post_proc_buffer.y_stride;
+    MODE_INFO *mi = oci->mi;
+    int x0, y0;
+
+    for (y0 = 0; y0 < height; y0 += 16) {
+      for (x0 = 0; x0 < width; x0 += 16) {
+        int x1, y1;
+
+        if (!(ppflags->display_mv_flag & (1 << mi->mbmi.mode))) {
+          mi++;
+          continue;
+        }
+
+        if (mi->mbmi.mode == SPLITMV) {
+          switch (mi->mbmi.partitioning) {
+            case PARTITIONING_16X8 : {  /* mv_top_bottom */
+              union b_mode_info *bmi = &mi->bmi[0];
+              MV *mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 8 + (mv->col >> 3);
+              y1 = y0 + 4 + (mv->row >> 3);
+
+              constrain_line(x0 + 8, &x1, y0 + 4, &y1, width, height);
+              vp9_blit_line(x0 + 8,  x1, y0 + 4,  y1, y_buffer, y_stride);
+
+              bmi = &mi->bmi[8];
+              mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 8 + (mv->col >> 3);
+              y1 = y0 + 12 + (mv->row >> 3);
+
+              constrain_line(x0 + 8, &x1, y0 + 12, &y1, width, height);
+              vp9_blit_line(x0 + 8,  x1, y0 + 12,  y1, y_buffer, y_stride);
+
+              break;
+            }
+            case PARTITIONING_8X16 : {  /* mv_left_right */
+              union b_mode_info *bmi = &mi->bmi[0];
+              MV *mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 4 + (mv->col >> 3);
+              y1 = y0 + 8 + (mv->row >> 3);
+
+              constrain_line(x0 + 4, &x1, y0 + 8, &y1, width, height);
+              vp9_blit_line(x0 + 4,  x1, y0 + 8,  y1, y_buffer, y_stride);
+
+              bmi = &mi->bmi[2];
+              mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 12 + (mv->col >> 3);
+              y1 = y0 + 8 + (mv->row >> 3);
+
+              constrain_line(x0 + 12, &x1, y0 + 8, &y1, width, height);
+              vp9_blit_line(x0 + 12,  x1, y0 + 8,  y1, y_buffer, y_stride);
+
+              break;
+            }
+            case PARTITIONING_8X8 : {  /* mv_quarters   */
+              union b_mode_info *bmi = &mi->bmi[0];
+              MV *mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 4 + (mv->col >> 3);
+              y1 = y0 + 4 + (mv->row >> 3);
+
+              constrain_line(x0 + 4, &x1, y0 + 4, &y1, width, height);
+              vp9_blit_line(x0 + 4,  x1, y0 + 4,  y1, y_buffer, y_stride);
+
+              bmi = &mi->bmi[2];
+              mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 12 + (mv->col >> 3);
+              y1 = y0 + 4 + (mv->row >> 3);
+
+              constrain_line(x0 + 12, &x1, y0 + 4, &y1, width, height);
+              vp9_blit_line(x0 + 12,  x1, y0 + 4,  y1, y_buffer, y_stride);
+
+              bmi = &mi->bmi[8];
+              mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 4 + (mv->col >> 3);
+              y1 = y0 + 12 + (mv->row >> 3);
+
+              constrain_line(x0 + 4, &x1, y0 + 12, &y1, width, height);
+              vp9_blit_line(x0 + 4,  x1, y0 + 12,  y1, y_buffer, y_stride);
+
+              bmi = &mi->bmi[10];
+              mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 12 + (mv->col >> 3);
+              y1 = y0 + 12 + (mv->row >> 3);
+
+              constrain_line(x0 + 12, &x1, y0 + 12, &y1, width, height);
+              vp9_blit_line(x0 + 12,  x1, y0 + 12,  y1, y_buffer, y_stride);
+              break;
+            }
+            case PARTITIONING_4X4:
+            default : {
+              union b_mode_info *bmi = mi->bmi;
+              int bx0, by0;
+
+              for (by0 = y0; by0 < (y0 + 16); by0 += 4) {
+                for (bx0 = x0; bx0 < (x0 + 16); bx0 += 4) {
+                  MV *mv = &bmi->mv.as_mv;
+
+                  x1 = bx0 + 2 + (mv->col >> 3);
+                  y1 = by0 + 2 + (mv->row >> 3);
+
+                  constrain_line(bx0 + 2, &x1, by0 + 2, &y1, width, height);
+                  vp9_blit_line(bx0 + 2,  x1, by0 + 2,  y1, y_buffer, y_stride);
+
+                  bmi++;
+                }
+              }
+            }
+          }
+        } else if (mi->mbmi.mode >= NEARESTMV) {
+          MV *mv = &mi->mbmi.mv.as_mv;
+          const int lx0 = x0 + 8;
+          const int ly0 = y0 + 8;
+
+          x1 = lx0 + (mv->col >> 3);
+          y1 = ly0 + (mv->row >> 3);
+
+          if (x1 != lx0 && y1 != ly0) {
+            constrain_line(lx0, &x1, ly0 - 1, &y1, width, height);
+            vp9_blit_line(lx0,  x1, ly0 - 1,  y1, y_buffer, y_stride);
+
+            constrain_line(lx0, &x1, ly0 + 1, &y1, width, height);
+            vp9_blit_line(lx0,  x1, ly0 + 1,  y1, y_buffer, y_stride);
+          } else
+            vp9_blit_line(lx0,  x1, ly0,  y1, y_buffer, y_stride);
+        }
+
+        mi++;
+      }
+      mi++;
+    }
+  }
+
+  /* Color in block modes */
+  if ((flags & VP9D_DEBUG_CLR_BLK_MODES)
+      && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
+    int y, x;
+    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    int width  = post->y_width;
+    int height = post->y_height;
+    unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
+    unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
+    unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
+    int y_stride = oci->post_proc_buffer.y_stride;
+    MODE_INFO *mi = oci->mi;
+
+    for (y = 0; y < height; y += 16) {
+      for (x = 0; x < width; x += 16) {
+        int Y = 0, U = 0, V = 0;
+
+        if (mi->mbmi.mode == B_PRED &&
+            ((ppflags->display_mb_modes_flag & B_PRED) ||
+             ppflags->display_b_modes_flag)) {
+          int by, bx;
+          unsigned char *yl, *ul, *vl;
+          union b_mode_info *bmi = mi->bmi;
+
+          yl = y_ptr + x;
+          ul = u_ptr + (x >> 1);
+          vl = v_ptr + (x >> 1);
+
+          for (by = 0; by < 16; by += 4) {
+            for (bx = 0; bx < 16; bx += 4) {
+              if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode))
+                  || (ppflags->display_mb_modes_flag & B_PRED)) {
+                Y = B_PREDICTION_MODE_colors[bmi->as_mode.first][0];
+                U = B_PREDICTION_MODE_colors[bmi->as_mode.first][1];
+                V = B_PREDICTION_MODE_colors[bmi->as_mode.first][2];
+
+                POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)(yl + bx,
+                                                           ul + (bx >> 1),
+                                                           vl + (bx >> 1),
+                                                           Y, U, V,
+                                                           0xc000, y_stride);
+              }
+              bmi++;
+            }
+
+            yl += y_stride * 4;
+            ul += y_stride * 1;
+            vl += y_stride * 1;
+          }
+        } else if (ppflags->display_mb_modes_flag & (1 << mi->mbmi.mode)) {
+          Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
+          U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
+          V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
+
+          POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner)(y_ptr + x,
+                                                            u_ptr + (x >> 1),
+                                                            v_ptr + (x >> 1),
+                                                            Y, U, V,
+                                                            0xc000, y_stride);
+        }
+
+        mi++;
+      }
+      y_ptr += y_stride * 16;
+      u_ptr += y_stride * 4;
+      v_ptr += y_stride * 4;
+
+      mi++;
+    }
+  }
+
+  /* Color in frame reference blocks */
+  if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) &&
+      ppflags->display_ref_frame_flag) {
+    int y, x;
+    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    int width  = post->y_width;
+    int height = post->y_height;
+    unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
+    unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
+    unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
+    int y_stride = oci->post_proc_buffer.y_stride;
+    MODE_INFO *mi = oci->mi;
+
+    for (y = 0; y < height; y += 16) {
+      for (x = 0; x < width; x += 16) {
+        int Y = 0, U = 0, V = 0;
+
+        if (ppflags->display_ref_frame_flag & (1 << mi->mbmi.ref_frame)) {
+          Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
+          U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
+          V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
+
+          POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)(y_ptr + x,
+                                                            u_ptr + (x >> 1),
+                                                            v_ptr + (x >> 1),
+                                                            Y, U, V,
+                                                            0xc000, y_stride);
+        }
+
+        mi++;
+      }
+      y_ptr += y_stride * 16;
+      u_ptr += y_stride * 4;
+      v_ptr += y_stride * 4;
+
+      mi++;
+    }
+  }
+#endif
+
+  *dest = oci->post_proc_buffer;
+
+  /* the post-proc buffer has extended borders; report the real frame size */
+  dest->y_width = oci->Width;
+  dest->y_height = oci->Height;
+  dest->uv_height = dest->y_height / 2;
+
+  return 0;
+}
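
The debug overlays above blend a fixed Y/U/V color into each block with
a constant alpha of 0xc000 (roughly 0.75 in Q16).  A minimal scalar
sketch of a blend helper of this shape, assuming a 4x4 luma / 2x2
chroma footprint for blend_b; the actual vp9_blend_b_c may differ:

    /* hypothetical reference model, not the patch's vp9_blend_b_c */
    static void blend_b_sketch(unsigned char *y, unsigned char *u,
                               unsigned char *v, int y1, int u1, int v1,
                               int alpha, int stride) {
      int i, j;
      for (i = 0; i < 4; i++) {          /* 4x4 luma block */
        for (j = 0; j < 4; j++)
          y[j] = (y[j] * (0x10000 - alpha) + y1 * alpha) >> 16;
        y += stride;
      }
      for (i = 0; i < 2; i++) {          /* 2x2 chroma block */
        for (j = 0; j < 2; j++) {
          u[j] = (u[j] * (0x10000 - alpha) + u1 * alpha) >> 16;
          v[j] = (v[j] * (0x10000 - alpha) + v1 * alpha) >> 16;
        }
        u += stride >> 1;                /* chroma pitch is half of luma */
        v += stride >> 1;
      }
    }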
--- /dev/null
+++ b/vp9/common/postproc.h
@@ -1,0 +1,128 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef POSTPROC_H
+#define POSTPROC_H
+
+#define prototype_postproc_inplace(sym)\
+  void sym(unsigned char *dst, int pitch, int rows, int cols, int flimit)
+
+#define prototype_postproc(sym)\
+  void sym(unsigned char *src, unsigned char *dst, int src_pitch, \
+           int dst_pitch, int rows, int cols, int flimit)
+
+#define prototype_postproc_addnoise(sym) \
+  void sym(unsigned char *s, char *noise, char blackclamp[16], \
+           char whiteclamp[16], char bothclamp[16], \
+           unsigned int w, unsigned int h, int pitch)
+
+#define prototype_postproc_blend_mb_inner(sym)\
+  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
+           int y1, int u1, int v1, int alpha, int stride)
+
+#define prototype_postproc_blend_mb_outer(sym)\
+  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
+           int y1, int u1, int v1, int alpha, int stride)
+
+#define prototype_postproc_blend_b(sym)\
+  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
+           int y1, int u1, int v1, int alpha, int stride)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/postproc_x86.h"
+#endif
+
+#ifndef vp9_postproc_down
+#define vp9_postproc_down vp9_mbpost_proc_down_c
+#endif
+extern prototype_postproc_inplace(vp9_postproc_down);
+
+#ifndef vp9_postproc_across
+#define vp9_postproc_across vp9_mbpost_proc_across_ip_c
+#endif
+extern prototype_postproc_inplace(vp9_postproc_across);
+
+#ifndef vp9_postproc_downacross
+#define vp9_postproc_downacross vp9_post_proc_down_and_across_c
+#endif
+extern prototype_postproc(vp9_postproc_downacross);
+
+#ifndef vp9_postproc_addnoise
+#define vp9_postproc_addnoise vp9_plane_add_noise_c
+#endif
+extern prototype_postproc_addnoise(vp9_postproc_addnoise);
+
+#ifndef vp9_postproc_blend_mb_inner
+#define vp9_postproc_blend_mb_inner vp9_blend_mb_inner_c
+#endif
+extern prototype_postproc_blend_mb_inner(vp9_postproc_blend_mb_inner);
+
+#ifndef vp9_postproc_blend_mb_outer
+#define vp9_postproc_blend_mb_outer vp9_blend_mb_outer_c
+#endif
+extern prototype_postproc_blend_mb_outer(vp9_postproc_blend_mb_outer);
+
+#ifndef vp9_postproc_blend_b
+#define vp9_postproc_blend_b vp9_blend_b_c
+#endif
+extern prototype_postproc_blend_b(vp9_postproc_blend_b);
+
+typedef prototype_postproc((*vp9_postproc_fn_t));
+typedef prototype_postproc_inplace((*vp9_postproc_inplace_fn_t));
+typedef prototype_postproc_addnoise((*vp9_postproc_addnoise_fn_t));
+typedef prototype_postproc_blend_mb_inner((*vp9_postproc_blend_mb_inner_fn_t));
+typedef prototype_postproc_blend_mb_outer((*vp9_postproc_blend_mb_outer_fn_t));
+typedef prototype_postproc_blend_b((*vp9_postproc_blend_b_fn_t));
+typedef struct {
+  vp9_postproc_inplace_fn_t           down;
+  vp9_postproc_inplace_fn_t           across;
+  vp9_postproc_fn_t                   downacross;
+  vp9_postproc_addnoise_fn_t          addnoise;
+  vp9_postproc_blend_mb_inner_fn_t    blend_mb_inner;
+  vp9_postproc_blend_mb_outer_fn_t    blend_mb_outer;
+  vp9_postproc_blend_b_fn_t           blend_b;
+} vp9_postproc_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define POSTPROC_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define POSTPROC_INVOKE(ctx,fn) vp9_postproc_##fn
+#endif
+
+#include "vpx_ports/mem.h"
+struct postproc_state {
+  int           last_q;
+  int           last_noise;
+  char          noise[3072];
+  DECLARE_ALIGNED(16, char, blackclamp[16]);
+  DECLARE_ALIGNED(16, char, whiteclamp[16]);
+  DECLARE_ALIGNED(16, char, bothclamp[16]);
+};
+#include "onyxc_int.h"
+#include "ppflags.h"
+int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest,
+                        vp9_ppflags_t *flags);
+
+
+void vp9_de_noise(YV12_BUFFER_CONFIG         *source,
+                  YV12_BUFFER_CONFIG         *post,
+                  int                         q,
+                  int                         low_var_thresh,
+                  int                         flag,
+                  vp9_postproc_rtcd_vtable_t *rtcd);
+
+void vp9_deblock(YV12_BUFFER_CONFIG         *source,
+                 YV12_BUFFER_CONFIG         *post,
+                 int                         q,
+                 int                         low_var_thresh,
+                 int                         flag,
+                 vp9_postproc_rtcd_vtable_t *rtcd);
+#endif
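
POSTPROC_INVOKE abstracts over runtime CPU detection: with
CONFIG_RUNTIME_CPU_DETECT it indexes the vtable carried by the context,
otherwise it resolves at compile time to the vp9_postproc_* defaults
#defined above.  A hedged usage sketch (rtcd here stands in for the
value the RTCD_VTABLE() macro produces; that macro is defined
elsewhere):

    /* rtcd: pointer to the context's vp9_postproc_rtcd_vtable_t.
     * Under runtime detection this expands to rtcd->downacross(...);
     * otherwise it becomes a direct call to
     * vp9_post_proc_down_and_across_c(...). */
    POSTPROC_INVOKE(rtcd, downacross)(src, dst, src_pitch, dst_pitch,
                                      rows, cols, flimit);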
--- /dev/null
+++ b/vp9/common/ppc/copy_altivec.asm
@@ -1,0 +1,47 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl copy_mem16x16_ppc
+
+;# r3 unsigned char *src
+;# r4 int src_stride
+;# r5 unsigned char *dst
+;# r6 int dst_stride
+
+;# Make the assumption that input will not be aligned,
+;#  but the output will be.  So two reads and a perm
+;#  for the input, but only one store for the output.
+copy_mem16x16_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xe000
+    mtspr   256, r12            ;# set VRSAVE
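+    ;# (VRSAVE flags which vector registers are live so the kernel
+    ;#  preserves them across context switches.)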
+
+    li      r10, 16
+    mtctr   r10
+
+cp_16x16_loop:
+    lvsl    v0,  0, r3          ;# permute vector for alignment
+
+    lvx     v1,   0, r3
+    lvx     v2, r10, r3
+
+    vperm   v1, v1, v2, v0
+
+    stvx    v1,  0, r5
+
+    add     r3, r3, r4          ;# increment source pointer
+    add     r5, r5, r6          ;# increment destination pointer
+
+    bdnz    cp_16x16_loop
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
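
For reference, the routine above is equivalent to the scalar C below (a
sketch, not part of the patch).  The AltiVec version needs two loads
and a vperm per row because the source may be unaligned, while the
destination is assumed 16-byte aligned, so a single stvx per row
suffices:

    #include <string.h>

    /* scalar model of copy_mem16x16_ppc */
    static void copy_mem16x16_sketch(unsigned char *src, int src_stride,
                                     unsigned char *dst, int dst_stride) {
      int r;
      for (r = 0; r < 16; r++) {    /* one 16-byte row per iteration */
        memcpy(dst, src, 16);
        src += src_stride;
        dst += dst_stride;
      }
    }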
--- /dev/null
+++ b/vp9/common/ppc/filter_altivec.asm
@@ -1,0 +1,1013 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl sixtap_predict_ppc
+    .globl sixtap_predict8x4_ppc
+    .globl sixtap_predict8x8_ppc
+    .globl sixtap_predict16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+    lis     \R0, \LABEL@ha
+    la      \R1, \LABEL@l(\R0)
+    lvx     \V, \OFF, \R1
+.endm
+
+.macro load_hfilter V0, V1
+    load_c \V0, HFilter, r5, r9, r10
+
+    addi    r5,  r5, 16
+    lvx     \V1, r5, r10
+.endm
+
+;# Vertical filtering
+.macro Vprolog
+    load_c v0, VFilter, r6, r3, r10
+
+    vspltish v5, 8
+    vspltish v6, 3
+    vslh    v6, v5, v6      ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    vspltb  v1, v0, 1
+    vspltb  v2, v0, 2
+    vspltb  v3, v0, 3
+    vspltb  v4, v0, 4
+    vspltb  v5, v0, 5
+    vspltb  v0, v0, 0
+.endm
+
+.macro vpre_load
+    Vprolog
+    li      r10,  16
+    lvx     v10,   0, r9    ;# v10..v14 = first 5 rows
+    lvx     v11, r10, r9
+    addi    r9,   r9, 32
+    lvx     v12,   0, r9
+    lvx     v13, r10, r9
+    addi    r9,   r9, 32
+    lvx     v14,   0, r9
+.endm
+
+.macro Msum Re, Ro, V, T, TMP
+                                ;# (Re,Ro) += (V*T)
+    vmuleub \TMP, \V, \T        ;# trashes \TMP
+    vadduhm \Re, \Re, \TMP      ;# Re = evens, saturation unnecessary
+    vmuloub \TMP, \V, \T
+    vadduhm \Ro, \Ro, \TMP      ;# Ro = odds
+.endm
+
+.macro vinterp_no_store P0 P1 P2 P3 P4 P5
+    vmuleub  v8, \P0, v0        ;# 64 + 4 positive taps
+    vadduhm v16, v6, v8
+    vmuloub  v8, \P0, v0
+    vadduhm v17, v6, v8
+    Msum v16, v17, \P2, v2, v8
+    Msum v16, v17, \P3, v3, v8
+    Msum v16, v17, \P5, v5, v8
+
+    vmuleub v18, \P1, v1        ;# 2 negative taps
+    vmuloub v19, \P1, v1
+    Msum v18, v19, \P4, v4, v8
+
+    vsubuhs v16, v16, v18       ;# subtract neg from pos
+    vsubuhs v17, v17, v19
+    vsrh    v16, v16, v7        ;# divide by 128
+    vsrh    v17, v17, v7        ;# v16 v17 = evens, odds
+    vmrghh  v18, v16, v17       ;# v18 v19 = 16-bit result in order
+    vmrglh  v19, v16, v17
+    vpkuhus  \P0, v18, v19      ;# P0 = 8-bit result
+.endm
+
+.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
+    vmuleub v24, \P0, v13       ;# 64 + 4 positive taps
+    vadduhm v21, v20, v24
+    vmuloub v24, \P0, v13
+    vadduhm v22, v20, v24
+    Msum v21, v22, \P2, v15, v25
+    Msum v21, v22, \P3, v16, v25
+    Msum v21, v22, \P5, v18, v25
+
+    vmuleub v23, \P1, v14       ;# 2 negative taps
+    vmuloub v24, \P1, v14
+    Msum v23, v24, \P4, v17, v25
+
+    vsubuhs v21, v21, v23       ;# subtract neg from pos
+    vsubuhs v22, v22, v24
+    vsrh    v21, v21, v19       ;# divide by 128
+    vsrh    v22, v22, v19       ;# v21 v22 = evens, odds
+    vmrghh  v23, v21, v22       ;# v23 v24 = 16-bit result in order
+    vmrglh  v24, v21, v22
+    vpkuhus \P0, v23, v24       ;# P0 = 8-bit result
+.endm
+
+
+.macro Vinterp P0 P1 P2 P3 P4 P5
+    vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
+    stvx    \P0, 0, r7
+    add     r7, r7, r8      ;# 33 ops per 16 pels
+.endm
+
+
+.macro luma_v P0, P1, P2, P3, P4, P5
+    addi    r9,   r9, 16        ;# P5 = newest input row
+    lvx     \P5,   0, r9
+    Vinterp \P0, \P1, \P2, \P3, \P4, \P5
+.endm
+
+.macro luma_vtwo
+    luma_v v10, v11, v12, v13, v14, v15
+    luma_v v11, v12, v13, v14, v15, v10
+.endm
+
+.macro luma_vfour
+    luma_vtwo
+    luma_v v12, v13, v14, v15, v10, v11
+    luma_v v13, v14, v15, v10, v11, v12
+.endm
+
+.macro luma_vsix
+    luma_vfour
+    luma_v v14, v15, v10, v11, v12, v13
+    luma_v v15, v10, v11, v12, v13, v14
+.endm
+
+.macro Interp4 R I I4
+    vmsummbm \R, v13, \I, v15
+    vmsummbm \R, v14, \I4, \R
+.endm
+
+.macro Read8x8 VD, RS, RP, increment_counter
+    lvsl    v21,  0, \RS        ;# permute vector for alignment
+
+    ;# input to the filter is 13 bytes wide, output is 8 bytes.
+    ;#  the unaligned input can span two vectors.
+    lvx     \VD,   0, \RS
+    lvx     v20, r10, \RS
+
+.if \increment_counter
+    add     \RS, \RS, \RP
+.endif
+
+    vperm   \VD, \VD, v20, v21
+.endm
+
+.macro interp_8x8 R
+    vperm   v20, \R, \R, v16    ;# v20 = 0123 1234 2345 3456
+    vperm   v21, \R, \R, v17    ;# v21 = 4567 5678 6789 789A
+    Interp4 v20, v20,  v21      ;# v20 = result 0 1 2 3
+    vperm   \R, \R, \R, v18     ;# R   = 89AB 9ABC ABCx BCxx
+    Interp4 v21, v21, \R        ;# v21 = result 4 5 6 7
+
+    vpkswus \R, v20, v21        ;#  R = 0 1 2 3 4 5 6 7
+    vsrh    \R, \R, v19
+
+    vpkuhus \R, \R, \R          ;# saturate and pack
+
+.endm
+
+.macro Read4x4 VD, RS, RP, increment_counter
+    lvsl    v21,  0, \RS        ;# permute vector for alignment
+
+    ;# input to the filter is 9 bytes wide, output is 4 bytes.
+    ;#  the row is left-justified with a single load and vperm.
+    lvx     v20,   0, \RS
+
+.if \increment_counter
+    add     \RS, \RS, \RP
+.endif
+
+    vperm   \VD, v20, v20, v21
+.endm
+    .text
+
+    .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+sixtap_predict_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xff87
+    ori     r12, r12, 0xffc0
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    slwi.   r5, r5, 5           ;# index into horizontal filter array
+
+    vspltish v19, 7
+
+    ;# If there is no horizontal filtering to do, skip straight
+    ;#  to the second pass.
+    beq-    vertical_only_4x4
+
+    ;# load up horizontal filter
+    load_hfilter v13, v14
+
+    ;# rounding added in on the multiply
+    vspltisw v16, 8
+    vspltisw v15, 3
+    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
+
+    ;# Load up permutation constants
+    load_c v16, B_0123, 0, r9, r10
+    load_c v17, B_4567, 0, r9, r10
+    load_c v18, B_89AB, 0, r9, r10
+
+    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
+    addi    r3, r3, -2
+
+    addi    r9, r3, 0
+    li      r10, 16
+    Read8x8 v2, r3, r4, 1
+    Read8x8 v3, r3, r4, 1
+    Read8x8 v4, r3, r4, 1
+    Read8x8 v5, r3, r4, 1
+
+    slwi.   r6, r6, 4           ;# index into vertical filter array
+
+    ;# filter a line
+    interp_8x8 v2
+    interp_8x8 v3
+    interp_8x8 v4
+    interp_8x8 v5
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional 5 lines that are needed
+    ;#  for the vertical filter.
+    beq-    store_4x4
+
+    ;# only needed if there is a vertical filter present
+    ;# if the second filter is not null then need to back off by 2*pitch
+    sub     r9, r9, r4
+    sub     r9, r9, r4
+
+    Read8x8 v0, r9, r4, 1
+    Read8x8 v1, r9, r4, 0
+    Read8x8 v6, r3, r4, 1
+    Read8x8 v7, r3, r4, 1
+    Read8x8 v8, r3, r4, 0
+
+    interp_8x8 v0
+    interp_8x8 v1
+    interp_8x8 v6
+    interp_8x8 v7
+    interp_8x8 v8
+
+    b       second_pass_4x4
+
+vertical_only_4x4:
+    ;# only needed if there is a vertical filter present
+    ;# if the second filter is not null then need to back off by 2*pitch
+    sub     r3, r3, r4
+    sub     r3, r3, r4
+    li      r10, 16
+
+    Read8x8 v0, r3, r4, 1
+    Read8x8 v1, r3, r4, 1
+    Read8x8 v2, r3, r4, 1
+    Read8x8 v3, r3, r4, 1
+    Read8x8 v4, r3, r4, 1
+    Read8x8 v5, r3, r4, 1
+    Read8x8 v6, r3, r4, 1
+    Read8x8 v7, r3, r4, 1
+    Read8x8 v8, r3, r4, 0
+
+    slwi    r6, r6, 4           ;# index into vertical filter array
+
+second_pass_4x4:
+    load_c   v20, b_hilo_4x4, 0, r9, r10
+    load_c   v21, b_hilo, 0, r9, r10
+
+    ;# reposition input so that it can go through the
+    ;# filtering phase with one pass.
+    vperm   v0, v0, v1, v20     ;# 0 1 x x
+    vperm   v2, v2, v3, v20     ;# 2 3 x x
+    vperm   v4, v4, v5, v20     ;# 4 5 x x
+    vperm   v6, v6, v7, v20     ;# 6 7 x x
+
+    vperm   v0, v0, v2, v21     ;# 0 1 2 3
+    vperm   v4, v4, v6, v21     ;# 4 5 6 7
+
+    vsldoi  v1, v0, v4, 4
+    vsldoi  v2, v0, v4, 8
+    vsldoi  v3, v0, v4, 12
+
+    vsldoi  v5, v4, v8, 4
+
+    load_c   v13, VFilter, r6, r9, r10
+
+    vspltish v15, 8
+    vspltish v20, 3
+    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    vspltb  v14, v13, 1
+    vspltb  v15, v13, 2
+    vspltb  v16, v13, 3
+    vspltb  v17, v13, 4
+    vspltb  v18, v13, 5
+    vspltb  v13, v13, 0
+
+    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
+
+    stvx    v0, 0, r1
+
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    lwz     r0, 4(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    lwz     r0, 8(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    lwz     r0, 12(r1)
+    stw     r0, 0(r7)
+
+    b       exit_4x4
+
+store_4x4:
+
+    stvx    v2, 0, r1
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    stvx    v3, 0, r1
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    stvx    v4, 0, r1
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    stvx    v5, 0, r1
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+
+exit_4x4:
+
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+.macro w_8x8 V, D, R, P
+    stvx    \V, 0, r1
+    lwz     \R, 0(r1)
+    stw     \R, 0(r7)
+    lwz     \R, 4(r1)
+    stw     \R, 4(r7)
+    add     \D, \D, \P
+.endm
+
+    .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+sixtap_predict8x4_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xffc0
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    slwi.   r5, r5, 5           ;# index into horizontal filter array
+
+    vspltish v19, 7
+
+    ;# If there is no horizontal filtering to do, skip straight
+    ;#  to the second pass.
+    beq-    second_pass_pre_copy_8x4
+
+    load_hfilter v13, v14
+
+    ;# rounding added in on the multiply
+    vspltisw v16, 8
+    vspltisw v15, 3
+    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
+
+    ;# Load up permutation constants
+    load_c v16, B_0123, 0, r9, r10
+    load_c v17, B_4567, 0, r9, r10
+    load_c v18, B_89AB, 0, r9, r10
+
+    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
+    addi    r3, r3, -2
+
+    addi    r9, r3, 0
+    li      r10, 16
+    Read8x8 v2, r3, r4, 1
+    Read8x8 v3, r3, r4, 1
+    Read8x8 v4, r3, r4, 1
+    Read8x8 v5, r3, r4, 1
+
+    slwi.   r6, r6, 4           ;# index into vertical filter array
+
+    ;# filter a line
+    interp_8x8 v2
+    interp_8x8 v3
+    interp_8x8 v4
+    interp_8x8 v5
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional 5 lines that are needed
+    ;#  for the vertical filter.
+    beq-    store_8x4
+
+    ;# only needed if there is a vertical filter present
+    ;# if the second filter is not null then need to back off by 2*pitch
+    sub     r9, r9, r4
+    sub     r9, r9, r4
+
+    Read8x8 v0, r9, r4, 1
+    Read8x8 v1, r9, r4, 0
+    Read8x8 v6, r3, r4, 1
+    Read8x8 v7, r3, r4, 1
+    Read8x8 v8, r3, r4, 0
+
+    interp_8x8 v0
+    interp_8x8 v1
+    interp_8x8 v6
+    interp_8x8 v7
+    interp_8x8 v8
+
+    b       second_pass_8x4
+
+second_pass_pre_copy_8x4:
+    ;# only needed if there is a vertical filter present
+    ;# if the second filter is not null then need to back off by 2*pitch
+    sub     r3, r3, r4
+    sub     r3, r3, r4
+    li      r10, 16
+
+    Read8x8 v0,  r3, r4, 1
+    Read8x8 v1,  r3, r4, 1
+    Read8x8 v2,  r3, r4, 1
+    Read8x8 v3,  r3, r4, 1
+    Read8x8 v4,  r3, r4, 1
+    Read8x8 v5,  r3, r4, 1
+    Read8x8 v6,  r3, r4, 1
+    Read8x8 v7,  r3, r4, 1
+    Read8x8 v8,  r3, r4, 1
+
+    slwi    r6, r6, 4           ;# index into vertical filter array
+
+second_pass_8x4:
+    load_c v13, VFilter, r6, r9, r10
+
+    vspltish v15, 8
+    vspltish v20, 3
+    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    vspltb  v14, v13, 1
+    vspltb  v15, v13, 2
+    vspltb  v16, v13, 3
+    vspltb  v17, v13, 4
+    vspltb  v18, v13, 5
+    vspltb  v13, v13, 0
+
+    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
+    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
+    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
+    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8
+
+    cmpi    cr0, r8, 8
+    beq     cr0, store_aligned_8x4
+
+    w_8x8   v0, r7, r0, r8
+    w_8x8   v1, r7, r0, r8
+    w_8x8   v2, r7, r0, r8
+    w_8x8   v3, r7, r0, r8
+
+    b       exit_8x4
+
+store_aligned_8x4:
+
+    load_c v10, b_hilo, 0, r9, r10
+
+    vperm   v0, v0, v1, v10
+    vperm   v2, v2, v3, v10
+
+    stvx    v0, 0, r7
+    addi    r7, r7, 16
+    stvx    v2, 0, r7
+
+    b       exit_8x4
+
+store_8x4:
+    cmpi    cr0, r8, 8
+    beq     cr0, store_aligned2_8x4
+
+    w_8x8   v2, r7, r0, r8
+    w_8x8   v3, r7, r0, r8
+    w_8x8   v4, r7, r0, r8
+    w_8x8   v5, r7, r0, r8
+
+    b       exit_8x4
+
+store_aligned2_8x4:
+    load_c v10, b_hilo, 0, r9, r10
+
+    vperm   v2, v2, v3, v10
+    vperm   v4, v4, v5, v10
+
+    stvx    v2, 0, r7
+    addi    r7, r7, 16
+    stvx    v4, 0, r7
+
+exit_8x4:
+
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+
+    blr
+
+    .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+;# Because the width that needs to be filtered fits in a single AltiVec
+;#  register, there is no need to loop.  Everything can stay in registers.
+sixtap_predict8x8_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xffc0
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    slwi.   r5, r5, 5           ;# index into horizontal filter array
+
+    vspltish v19, 7
+
+    ;# If there is no horizontal filtering to do, skip straight
+    ;#  to the second pass.
+    beq-    second_pass_pre_copy_8x8
+
+    load_hfilter v13, v14
+
+    ;# rounding added in on the multiply
+    vspltisw v16, 8
+    vspltisw v15, 3
+    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
+
+    ;# Load up permutation constants
+    load_c v16, B_0123, 0, r9, r10
+    load_c v17, B_4567, 0, r9, r10
+    load_c v18, B_89AB, 0, r9, r10
+
+    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
+    addi    r3, r3, -2
+
+    addi    r9, r3, 0
+    li      r10, 16
+    Read8x8 v2, r3, r4, 1
+    Read8x8 v3, r3, r4, 1
+    Read8x8 v4, r3, r4, 1
+    Read8x8 v5, r3, r4, 1
+    Read8x8 v6, r3, r4, 1
+    Read8x8 v7, r3, r4, 1
+    Read8x8 v8, r3, r4, 1
+    Read8x8 v9, r3, r4, 1
+
+    slwi.   r6, r6, 4           ;# index into vertical filter array
+
+    ;# filter a line
+    interp_8x8 v2
+    interp_8x8 v3
+    interp_8x8 v4
+    interp_8x8 v5
+    interp_8x8 v6
+    interp_8x8 v7
+    interp_8x8 v8
+    interp_8x8 v9
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional 5 lines that are needed
+    ;#  for the vertical filter.
+    beq-    store_8x8
+
+    ;# only needed if there is a vertical filter present
+    ;# if the second filter is not null then need to back off by 2*pitch
+    sub     r9, r9, r4
+    sub     r9, r9, r4
+
+    Read8x8 v0,  r9, r4, 1
+    Read8x8 v1,  r9, r4, 0
+    Read8x8 v10, r3, r4, 1
+    Read8x8 v11, r3, r4, 1
+    Read8x8 v12, r3, r4, 0
+
+    interp_8x8 v0
+    interp_8x8 v1
+    interp_8x8 v10
+    interp_8x8 v11
+    interp_8x8 v12
+
+    b       second_pass_8x8
+
+second_pass_pre_copy_8x8:
+    ;# only needed if there is a vertical filter present
+    ;# if the second filter is not null then need to back off by 2*pitch
+    sub     r3, r3, r4
+    sub     r3, r3, r4
+    li      r10, 16
+
+    Read8x8 v0,  r3, r4, 1
+    Read8x8 v1,  r3, r4, 1
+    Read8x8 v2,  r3, r4, 1
+    Read8x8 v3,  r3, r4, 1
+    Read8x8 v4,  r3, r4, 1
+    Read8x8 v5,  r3, r4, 1
+    Read8x8 v6,  r3, r4, 1
+    Read8x8 v7,  r3, r4, 1
+    Read8x8 v8,  r3, r4, 1
+    Read8x8 v9,  r3, r4, 1
+    Read8x8 v10, r3, r4, 1
+    Read8x8 v11, r3, r4, 1
+    Read8x8 v12, r3, r4, 0
+
+    slwi    r6, r6, 4           ;# index into vertical filter array
+
+second_pass_8x8:
+    load_c v13, VFilter, r6, r9, r10
+
+    vspltish v15, 8
+    vspltish v20, 3
+    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    vspltb  v14, v13, 1
+    vspltb  v15, v13, 2
+    vspltb  v16, v13, 3
+    vspltb  v17, v13, 4
+    vspltb  v18, v13, 5
+    vspltb  v13, v13, 0
+
+    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
+    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
+    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
+    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8
+    vinterp_no_store_8x8 v4, v5, v6, v7,  v8,  v9
+    vinterp_no_store_8x8 v5, v6, v7, v8,  v9,  v10
+    vinterp_no_store_8x8 v6, v7, v8, v9,  v10, v11
+    vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12
+
+    cmpi    cr0, r8, 8
+    beq     cr0, store_aligned_8x8
+
+    w_8x8   v0, r7, r0, r8
+    w_8x8   v1, r7, r0, r8
+    w_8x8   v2, r7, r0, r8
+    w_8x8   v3, r7, r0, r8
+    w_8x8   v4, r7, r0, r8
+    w_8x8   v5, r7, r0, r8
+    w_8x8   v6, r7, r0, r8
+    w_8x8   v7, r7, r0, r8
+
+    b       exit_8x8
+
+store_aligned_8x8:
+
+    load_c v10, b_hilo, 0, r9, r10
+
+    vperm   v0, v0, v1, v10
+    vperm   v2, v2, v3, v10
+    vperm   v4, v4, v5, v10
+    vperm   v6, v6, v7, v10
+
+    stvx    v0, 0, r7
+    addi    r7, r7, 16
+    stvx    v2, 0, r7
+    addi    r7, r7, 16
+    stvx    v4, 0, r7
+    addi    r7, r7, 16
+    stvx    v6, 0, r7
+
+    b       exit_8x8
+
+store_8x8:
+    cmpi    cr0, r8, 8
+    beq     cr0, store_aligned2_8x8
+
+    w_8x8   v2, r7, r0, r8
+    w_8x8   v3, r7, r0, r8
+    w_8x8   v4, r7, r0, r8
+    w_8x8   v5, r7, r0, r8
+    w_8x8   v6, r7, r0, r8
+    w_8x8   v7, r7, r0, r8
+    w_8x8   v8, r7, r0, r8
+    w_8x8   v9, r7, r0, r8
+
+    b       exit_8x8
+
+store_aligned2_8x8:
+    load_c v10, b_hilo, 0, r9, r10
+
+    vperm   v2, v2, v3, v10
+    vperm   v4, v4, v5, v10
+    vperm   v6, v6, v7, v10
+    vperm   v8, v8, v9, v10
+
+    stvx    v2, 0, r7
+    addi    r7, r7, 16
+    stvx    v4, 0, r7
+    addi    r7, r7, 16
+    stvx    v6, 0, r7
+    addi    r7, r7, 16
+    stvx    v8, 0, r7
+
+exit_8x8:
+
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+;# Two pass filtering.  First pass is Horizontal edges, second pass is vertical
+;#  edges.  One of the filters can be null, but both won't be.  Needs to use a
+;#  temporary buffer because the source buffer can't be modified and the buffer
+;#  for the destination is not large enough to hold the temporary data.
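+;#  The temporary buffer is carved out of the 416-byte stack frame set
+;#  up below: 21 rows x 16 bytes of first-pass output, stored at 48(r1).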
+sixtap_predict16x16_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xf000
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-416(r1)         ;# create space on the stack
+
+    ;# Three possibilities:
+    ;#  1. First filter is null.  Don't use a temp buffer.
+    ;#  2. Second filter is null.  Don't use a temp buffer.
+    ;#  3. Neither is null; use a temp buffer.
+
+    ;# First pass (horizontal edge)
+    ;#  set up pointers for src
+    ;#  if possibility (1) holds, point src at the original buffer and
+    ;#  jump to the second pass.  this depends on whether x_offset is 0.
+
+    ;# load up horizontal filter
+    slwi.   r5, r5, 5           ;# index into horizontal filter array
+
+    load_hfilter v4, v5
+
+    beq-    copy_horizontal_16x21
+
+    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
+    addi    r3, r3, -2
+
+    slwi.   r6, r6, 4           ;# index into vertical filter array
+
+    ;# setup constants
+    ;# v14 permute vector to reorder the filter output
+    load_c v14, b_hperm, 0, r9, r10
+
+    ;# These statements assume there won't be a second pass; if there
+    ;#  is one, they are set again inside the bypass below.
+    li      r0, 16              ;# prepare for no vertical filter
+
+    ;# Change the output pointer and pitch to be the actual
+    ;#  destination instead of a temporary buffer.
+    addi    r9, r7, 0
+    addi    r5, r8, 0
+
+    ;# no vertical filter, so write the output from the first pass
+    ;#  directly into the output buffer.
+    beq-    no_vertical_filter_bypass
+
+    ;# if the second filter is not null then need to back off by 2*pitch
+    sub     r3, r3, r4
+    sub     r3, r3, r4
+
+    ;# setup counter for the number of lines that are going to be filtered
+    li      r0, 21
+
+    ;# use the stack as temporary storage
+    la      r9, 48(r1)
+    li      r5, 16
+
+no_vertical_filter_bypass:
+
+    mtctr   r0
+
+    ;# rounding added in on the multiply
+    vspltisw v10, 8
+    vspltisw v12, 3
+    vslw    v12, v10, v12       ;# 0x00000040000000400000004000000040
+
+    ;# downshift by 7 ( divide by 128 ) at the end
+    vspltish v13, 7
+
+    ;# index to the next set of vectors in the row.
+    li      r10, 16
+    li      r12, 32
+
+horizontal_loop_16x16:
+
+    lvsl    v15,  0, r3         ;# permute vector for alignment
+
+    ;# input to the filter is 21 bytes wide, output is 16 bytes.
+    ;#  the unaligned input can span three vectors.
+    lvx     v1,   0, r3
+    lvx     v2, r10, r3
+    lvx     v3, r12, r3
+
+    vperm   v8, v1, v2, v15
+    vperm   v9, v2, v3, v15     ;# v8 v9 = 21 input pixels left-justified
+
+    vsldoi  v11, v8, v9, 4
+
+    ;# set 0
+    vmsummbm v6, v4, v8, v12    ;# taps times elements
+    vmsummbm v0, v5, v11, v6
+
+    ;# set 1
+    vsldoi  v10, v8, v9, 1
+    vsldoi  v11, v8, v9, 5
+
+    vmsummbm v6, v4, v10, v12
+    vmsummbm v1, v5, v11, v6
+
+    ;# set 2
+    vsldoi  v10, v8, v9, 2
+    vsldoi  v11, v8, v9, 6
+
+    vmsummbm v6, v4, v10, v12
+    vmsummbm v2, v5, v11, v6
+
+    ;# set 3
+    vsldoi  v10, v8, v9, 3
+    vsldoi  v11, v8, v9, 7
+
+    vmsummbm v6, v4, v10, v12
+    vmsummbm v3, v5, v11, v6
+
+    vpkswus v0, v0, v1          ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
+    vpkswus v1, v2, v3          ;# v1 = 2 6 A E 3 7 B F
+
+    vsrh    v0, v0, v13         ;# divide v0, v1 by 128
+    vsrh    v1, v1, v13
+
+    vpkuhus v0, v0, v1          ;# v0 = scrambled 8-bit result
+    vperm   v0, v0, v0, v14     ;# v0 = correctly-ordered result
+
+    stvx    v0,  0, r9
+    add     r9, r9, r5
+
+    add     r3, r3, r4
+
+    bdnz    horizontal_loop_16x16
+
+    ;# check again to see if vertical filter needs to be done.
+    cmpi    cr0, r6, 0
+    beq     cr0, end_16x16
+
+    ;# yes there is, so go to the second pass
+    b       second_pass_16x16
+
+copy_horizontal_16x21:
+    li      r10, 21
+    mtctr   r10
+
+    li      r10, 16
+
+    sub     r3, r3, r4
+    sub     r3, r3, r4
+
+    ;# this is done above if there is a horizontal filter;
+    ;#  if not, it needs to be done down here.
+    slwi    r6, r6, 4           ;# index into vertical filter array
+
+    ;# always write to the stack when doing a horizontal copy
+    la      r9, 48(r1)
+
+copy_horizontal_loop_16x21:
+    lvsl    v15,  0, r3         ;# permute vector for alignment
+
+    lvx     v1,   0, r3
+    lvx     v2, r10, r3
+
+    vperm   v8, v1, v2, v15
+
+    stvx    v8,  0, r9
+    addi    r9, r9, 16
+
+    add     r3, r3, r4
+
+    bdnz    copy_horizontal_loop_16x21
+
+second_pass_16x16:
+
+    ;# always read from the stack when doing a vertical filter
+    la      r9, 48(r1)
+
+    ;# downshift by 7 ( divide by 128 ) at the end
+    vspltish v7, 7
+
+    vpre_load
+
+    luma_vsix
+    luma_vsix
+    luma_vfour
+
+end_16x16:
+
+    addi    r1, r1, 416         ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .data
+
+    .align 4
+HFilter:
+    .byte     0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0
+    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte     0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12
+    .byte    -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0
+    .byte     2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36
+    .byte    -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0
+    .byte     0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50
+    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0
+    .byte     3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77
+    .byte   -16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0
+    .byte     0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93
+    .byte    -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0
+    .byte     1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108
+    .byte   -11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0
+    .byte     0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123
+    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0
+
+    .align 4
+VFilter:
+    .byte     0,  0,128,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte     0,  6,123, 12,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte     2, 11,108, 36,  8,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte     0,  9, 93, 50,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte     3, 16, 77, 77, 16,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte     0,  6, 50, 93,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte     1,  8, 36,108, 11,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte     0,  1, 12,123,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+
+    .align 4
+b_hperm:
+    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
+
+    .align 4
+B_0123:
+    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
+
+    .align 4
+B_4567:
+    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
+
+    .align 4
+B_89AB:
+    .byte     8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+
+    .align 4
+b_hilo:
+    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
+
+    .align 4
+b_hilo_4x4:
+    .byte     0,  1,  2,  3, 16, 17, 18, 19,  0,  0,  0,  0,  0,  0,  0,  0
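
The HFilter/VFilter tables above hold 128-scaled 6-tap coefficients
replicated across lanes (the horizontal table splits each row into
positive and negated taps for the two vmsummbm passes).  Per output
pixel the arithmetic reduces to the scalar model below, a sketch under
the rounding rules the comments describe: +64 folded in before the
shift, and "divide by 128" as a final shift right by 7.  The vector
code synthesizes the 64 as 8 << 3 because vector splat immediates top
out at 15:

    /* scalar model of one 6-tap output pixel; taps[] is one row of
     * the filter bank and always sums to 128 */
    static unsigned char sixtap_pixel(const unsigned char *src,
                                      const int taps[6]) {
      /* src points two samples before the output position */
      int i, sum = 64;              /* rounding term */
      for (i = 0; i < 6; i++)
        sum += src[i] * taps[i];
      sum >>= 7;                    /* divide by 128 */
      if (sum < 0) sum = 0;         /* saturate, as vpkuhus does */
      if (sum > 255) sum = 255;
      return (unsigned char)sum;
    }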
--- /dev/null
+++ b/vp9/common/ppc/filter_bilinear_altivec.asm
@@ -1,0 +1,677 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl bilinear_predict4x4_ppc
+    .globl bilinear_predict8x4_ppc
+    .globl bilinear_predict8x8_ppc
+    .globl bilinear_predict16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+    lis     \R0, \LABEL@ha
+    la      \R1, \LABEL@l(\R0)
+    lvx     \V, \OFF, \R1
+.endm
+
+.macro load_vfilter V0, V1
+    load_c \V0, vfilter_b, r6, r9, r10
+
+    addi    r6,  r6, 16
+    lvx     \V1, r6, r10
+.endm
+
+.macro HProlog jump_label
+    ;# load up horizontal filter
+    slwi.   r5, r5, 4           ;# index into horizontal filter array
+
+    ;# index to the next set of vectors in the row.
+    li      r10, 16
+    li      r12, 32
+
+    ;# downshift by 7 ( divide by 128 ) at the end
+    vspltish v19, 7
+
+    ;# If there is no horizontal filtering to do, skip straight
+    ;#  to the second pass.
+    beq     \jump_label
+
+    load_c v20, hfilter_b, r5, r9, r0
+
+    ;# setup constants
+    ;# v14 permutation value for alignment
+    load_c v28, b_hperm_b, 0, r9, r0
+
+    ;# rounding added in on the multiply
+    vspltisw v21, 8
+    vspltisw v18, 3
+    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040
+
+    slwi.   r6, r6, 5           ;# index into vertical filter array
+.endm
+
+;# Filters a horizontal line
+;# expects:
+;#  r3  src_ptr
+;#  r4  pitch
+;#  r10 16
+;#  r12 32
+;#  v17 perm input
+;#  v18 rounding
+;#  v19 shift
+;#  v20 filter taps
+;#  v21 tmp
+;#  v22 tmp
+;#  v23 tmp
+;#  v24 tmp
+;#  v25 tmp
+;#  v26 tmp
+;#  v27 tmp
+;#  v28 perm output
+;#
+.macro HFilter V
+    vperm   v24, v21, v21, v10  ;# v24 = 0123 1234 2345 3456
+    vperm   v25, v21, v21, v11  ;# v25 = 4567 5678 6789 789A
+
+    vmsummbm v24, v20, v24, v18
+    vmsummbm v25, v20, v25, v18
+
+    vpkswus v24, v24, v25       ;# v24 = 0 1 2 3 4 5 6 7 (16-bit)
+
+    vsrh    v24, v24, v19       ;# divide v24 by 128
+
+    vpkuhus \V, v24, v24        ;# \V = 8-bit result, duplicated
+.endm
+
+.macro hfilter_8 V, increment_counter
+    lvsl    v17,  0, r3         ;# permute vector for alignment
+
+    ;# input to filter is 9 bytes wide, output is 8 bytes.
+    lvx     v21,   0, r3
+    lvx     v22, r10, r3
+
+.if \increment_counter
+    add     r3, r3, r4
+.endif
+    vperm   v21, v21, v22, v17
+
+    HFilter \V
+.endm
+
+
+.macro load_and_align_8 V, increment_counter
+    lvsl    v17,  0, r3         ;# permute vector for alignment
+
+    ;# load and left-justify one row; the unaligned input can
+    ;#  span two vectors.
+    lvx     v21,   0, r3
+    lvx     v22, r10, r3
+
+.if \increment_counter
+    add     r3, r3, r4
+.endif
+
+    vperm   \V, v21, v22, v17
+.endm
+
+.macro write_aligned_8 V, increment_counter
+    stvx    \V,  0, r7
+
+.if \increment_counter
+    add     r7, r7, r8
+.endif
+.endm
+
+.macro vfilter_16 P0 P1
+    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
+    vadduhm v22, v18, v22
+    vmuloub v23, \P0, v20
+    vadduhm v23, v18, v23
+
+    vmuleub v24, \P1, v21
+    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
+    vmuloub v25, \P1, v21
+    vadduhm v23, v23, v25       ;# Ro = odds
+
+    vsrh    v22, v22, v19       ;# divide by 128
+    vsrh    v23, v23, v19       ;# v22 v23 = evens, odds
+    vmrghh  \P0, v22, v23       ;# \P0 v23 = 16-bit result in order
+    vmrglh  v23, v22, v23
+    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
+.endm
+
+
+.macro w_8x8 V, D, R, P
+    stvx    \V, 0, r1
+    lwz     \R, 0(r1)
+    stw     \R, 0(r7)
+    lwz     \R, 4(r1)
+    stw     \R, 4(r7)
+    add     \D, \D, \P
+.endm
+
+
+    .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict4x4_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xf830
+    ori     r12, r12, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    HProlog second_pass_4x4_pre_copy_b
+
+    ;# Load up permutation constants
+    load_c v10, b_0123_b, 0, r9, r12
+    load_c v11, b_4567_b, 0, r9, r12
+
+    hfilter_8 v0, 1
+    hfilter_8 v1, 1
+    hfilter_8 v2, 1
+    hfilter_8 v3, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     store_out_4x4_b
+
+    hfilter_8 v4, 0
+
+    b   second_pass_4x4_b
+
+second_pass_4x4_pre_copy_b:
+    slwi    r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_8  v0, 1
+    load_and_align_8  v1, 1
+    load_and_align_8  v2, 1
+    load_and_align_8  v3, 1
+    load_and_align_8  v4, 1
+
+second_pass_4x4_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+
+store_out_4x4_b:
+
+    stvx    v0, 0, r1
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    stvx    v1, 0, r1
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    stvx    v2, 0, r1
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+    add     r7, r7, r8
+
+    stvx    v3, 0, r1
+    lwz     r0, 0(r1)
+    stw     r0, 0(r7)
+
+exit_4x4:
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict8x4_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xf830
+    ori     r12, r12, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    HProlog second_pass_8x4_pre_copy_b
+
+    ;# Load up permutation constants
+    load_c v10, b_0123_b, 0, r9, r12
+    load_c v11, b_4567_b, 0, r9, r12
+
+    hfilter_8 v0, 1
+    hfilter_8 v1, 1
+    hfilter_8 v2, 1
+    hfilter_8 v3, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     store_out_8x4_b
+
+    hfilter_8 v4, 0
+
+    b   second_pass_8x4_b
+
+second_pass_8x4_pre_copy_b:
+    slwi    r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_8  v0, 1
+    load_and_align_8  v1, 1
+    load_and_align_8  v2, 1
+    load_and_align_8  v3, 1
+    load_and_align_8  v4, 1
+
+second_pass_8x4_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+
+store_out_8x4_b:
+
+    cmpi    cr0, r8, 8
+    beq     cr0, store_aligned_8x4_b
+
+    w_8x8   v0, r7, r0, r8
+    w_8x8   v1, r7, r0, r8
+    w_8x8   v2, r7, r0, r8
+    w_8x8   v3, r7, r0, r8
+
+    b       exit_8x4
+
+store_aligned_8x4_b:
+    load_c v10, b_hilo_b, 0, r9, r10
+
+    vperm   v0, v0, v1, v10
+    vperm   v2, v2, v3, v10
+
+    stvx    v0, 0, r7
+    addi    r7, r7, 16
+    stvx    v2, 0, r7
+
+exit_8x4:
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict8x8_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xfff0
+    ori     r12, r12, 0xffff
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    HProlog second_pass_8x8_pre_copy_b
+
+    ;# Load up permutation constants
+    load_c v10, b_0123_b, 0, r9, r12
+    load_c v11, b_4567_b, 0, r9, r12
+
+    hfilter_8 v0, 1
+    hfilter_8 v1, 1
+    hfilter_8 v2, 1
+    hfilter_8 v3, 1
+    hfilter_8 v4, 1
+    hfilter_8 v5, 1
+    hfilter_8 v6, 1
+    hfilter_8 v7, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     store_out_8x8_b
+
+    hfilter_8 v8, 0
+
+    b   second_pass_8x8_b
+
+second_pass_8x8_pre_copy_b:
+    slwi    r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_8  v0, 1
+    load_and_align_8  v1, 1
+    load_and_align_8  v2, 1
+    load_and_align_8  v3, 1
+    load_and_align_8  v4, 1
+    load_and_align_8  v5, 1
+    load_and_align_8  v6, 1
+    load_and_align_8  v7, 1
+    load_and_align_8  v8, 0
+
+second_pass_8x8_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+    vfilter_16 v4,  v5
+    vfilter_16 v5,  v6
+    vfilter_16 v6,  v7
+    vfilter_16 v7,  v8
+
+store_out_8x8_b:
+
+    cmpi    cr0, r8, 8
+    beq     cr0, store_aligned_8x8_b
+
+    w_8x8   v0, r7, r0, r8
+    w_8x8   v1, r7, r0, r8
+    w_8x8   v2, r7, r0, r8
+    w_8x8   v3, r7, r0, r8
+    w_8x8   v4, r7, r0, r8
+    w_8x8   v5, r7, r0, r8
+    w_8x8   v6, r7, r0, r8
+    w_8x8   v7, r7, r0, r8
+
+    b       exit_8x8
+
+store_aligned_8x8_b:
+    load_c v10, b_hilo_b, 0, r9, r10
+
+    vperm   v0, v0, v1, v10
+    vperm   v2, v2, v3, v10
+    vperm   v4, v4, v5, v10
+    vperm   v6, v6, v7, v10
+
+    stvx    v0, 0, r7
+    addi    r7, r7, 16
+    stvx    v2, 0, r7
+    addi    r7, r7, 16
+    stvx    v4, 0, r7
+    addi    r7, r7, 16
+    stvx    v6, 0, r7
+
+exit_8x8:
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+;# Filters a horizontal line
+;# expects:
+;#  r3  src_ptr
+;#  r4  pitch
+;#  r10 16
+;#  r12 32
+;#  v17 perm input
+;#  v18 rounding
+;#  v19 shift
+;#  v20 filter taps
+;#  v21 tmp
+;#  v22 tmp
+;#  v23 tmp
+;#  v24 tmp
+;#  v25 tmp
+;#  v26 tmp
+;#  v27 tmp
+;#  v28 perm output
+;#
+.macro hfilter_16 V, increment_counter
+
+    lvsl    v17,  0, r3         ;# permute vector for alignment
+
+    ;# input to the filter is 21 bytes wide, output is 16 bytes.
+    ;#  the unaligned input can span three vectors.
+    lvx     v21,   0, r3
+    lvx     v22, r10, r3
+    lvx     v23, r12, r3
+
+.if \increment_counter
+    add     r3, r3, r4
+.endif
+    vperm   v21, v21, v22, v17
+    vperm   v22, v22, v23, v17  ;# v21 v22 = 21 input pixels left-justified
+
+    ;# set 0
+    vmsummbm v24, v20, v21, v18 ;# taps times elements
+
+    ;# set 1
+    vsldoi  v23, v21, v22, 1
+    vmsummbm v25, v20, v23, v18
+
+    ;# set 2
+    vsldoi  v23, v21, v22, 2
+    vmsummbm v26, v20, v23, v18
+
+    ;# set 3
+    vsldoi  v23, v21, v22, 3
+    vmsummbm v27, v20, v23, v18
+
+    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F
+
+    vsrh    v24, v24, v19       ;# divide v24, v25 by 128
+    vsrh    v25, v25, v19
+
+    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
+    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
+.endm
+
+.macro load_and_align_16 V, increment_counter
+    lvsl    v17,  0, r3         ;# permute vector for alignment
+
+    ;# load and left-justify one 16-byte row; the unaligned input
+    ;#  can span two vectors.
+    lvx     v21,   0, r3
+    lvx     v22, r10, r3
+
+.if \increment_counter
+    add     r3, r3, r4
+.endif
+
+    vperm   \V, v21, v22, v17
+.endm
+
+.macro write_16 V, increment_counter
+    stvx    \V,  0, r7
+
+.if \increment_counter
+    add     r7, r7, r8
+.endif
+.endm
+
+    .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict16x16_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    HProlog second_pass_16x16_pre_copy_b
+
+    hfilter_16 v0,  1
+    hfilter_16 v1,  1
+    hfilter_16 v2,  1
+    hfilter_16 v3,  1
+    hfilter_16 v4,  1
+    hfilter_16 v5,  1
+    hfilter_16 v6,  1
+    hfilter_16 v7,  1
+    hfilter_16 v8,  1
+    hfilter_16 v9,  1
+    hfilter_16 v10, 1
+    hfilter_16 v11, 1
+    hfilter_16 v12, 1
+    hfilter_16 v13, 1
+    hfilter_16 v14, 1
+    hfilter_16 v15, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     store_out_16x16_b
+
+    hfilter_16 v16, 0
+
+    b   second_pass_16x16_b
+
+second_pass_16x16_pre_copy_b:
+    slwi    r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_16  v0,  1
+    load_and_align_16  v1,  1
+    load_and_align_16  v2,  1
+    load_and_align_16  v3,  1
+    load_and_align_16  v4,  1
+    load_and_align_16  v5,  1
+    load_and_align_16  v6,  1
+    load_and_align_16  v7,  1
+    load_and_align_16  v8,  1
+    load_and_align_16  v9,  1
+    load_and_align_16  v10, 1
+    load_and_align_16  v11, 1
+    load_and_align_16  v12, 1
+    load_and_align_16  v13, 1
+    load_and_align_16  v14, 1
+    load_and_align_16  v15, 1
+    load_and_align_16  v16, 0
+
+second_pass_16x16_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+    vfilter_16 v4,  v5
+    vfilter_16 v5,  v6
+    vfilter_16 v6,  v7
+    vfilter_16 v7,  v8
+    vfilter_16 v8,  v9
+    vfilter_16 v9,  v10
+    vfilter_16 v10, v11
+    vfilter_16 v11, v12
+    vfilter_16 v12, v13
+    vfilter_16 v13, v14
+    vfilter_16 v14, v15
+    vfilter_16 v15, v16
+
+store_out_16x16_b:
+
+    write_16 v0,  1
+    write_16 v1,  1
+    write_16 v2,  1
+    write_16 v3,  1
+    write_16 v4,  1
+    write_16 v5,  1
+    write_16 v6,  1
+    write_16 v7,  1
+    write_16 v8,  1
+    write_16 v9,  1
+    write_16 v10, 1
+    write_16 v11, 1
+    write_16 v12, 1
+    write_16 v13, 1
+    write_16 v14, 1
+    write_16 v15, 0
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .data
+
+    .align 4
+hfilter_b:
+    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
+    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
+    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
+    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
+    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
+    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
+    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
+    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
+
+    .align 4
+vfilter_b:
+    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+
+    .align 4
+b_hperm_b:
+    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
+
+    .align 4
+b_0123_b:
+    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
+
+    .align 4
+b_4567_b:
+    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
+
+b_hilo_b:
+    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
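
Each weight pair in hfilter_b/vfilter_b sums to 128 (for example 112/16
at sub-pel offset 1), so the bilinear output is a two-tap weighted
average with the same +64, shift-right-by-7 rounding as the 6-tap code.
A scalar model (a sketch, not part of the patch):

    /* one bilinear output pixel for sub-pel offset k in [0, 8) */
    static unsigned char bilinear_pixel(unsigned char a, unsigned char b,
                                        int k) {
      const int w0 = 128 - (k << 4);    /* e.g. k == 1 -> 112 */
      const int w1 = k << 4;            /*                16  */
      return (unsigned char)((a * w0 + b * w1 + 64) >> 7);
    }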
--- /dev/null
+++ b/vp9/common/ppc/idctllm_altivec.asm
@@ -1,0 +1,189 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl short_idct4x4llm_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+    lis     \R0, \LABEL@ha
+    la      \R1, \LABEL@l(\R0)
+    lvx     \V, \OFF, \R1
+.endm
+
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+    .align 2
+short_idct4x4llm_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    load_c v8, sinpi8sqrt2, 0, r9, r10
+    load_c v9, cospi8sqrt2minus1, 0, r9, r10
+    load_c v10, hi_hi, 0, r9, r10
+    load_c v11, lo_lo, 0, r9, r10
+    load_c v12, shift_16, 0, r9, r10
+
+    li      r10,  16
+    lvx     v0,   0, r3         ;# input ip[0], ip[ 4]
+    lvx     v1, r10, r3         ;# input ip[8], ip[12]
+
+    ;# first pass
+    vupkhsh v2, v0
+    vupkhsh v3, v1
+    vaddsws v6, v2, v3          ;# a1 = ip[0]+ip[8]
+    vsubsws v7, v2, v3          ;# b1 = ip[0]-ip[8]
+
+    vupklsh v0, v0
+    vmulosh v4, v0, v8
+    vsraw   v4, v4, v12
+    vaddsws v4, v4, v0          ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+    vupklsh v1, v1
+    vmulosh v5, v1, v9
+    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
+    vaddsws v5, v5, v1
+
+    vsubsws v4, v4, v5          ;# c1
+
+    vmulosh v3, v1, v8
+    vsraw   v3, v3, v12
+    vaddsws v3, v3, v1          ;# ip[12] * sin(pi/8) * sqrt(2)
+
+    vmulosh v5, v0, v9
+    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
+    vaddsws v5, v5, v0
+
+    vaddsws v3, v3, v5          ;# d1
+
+    vaddsws v0, v6, v3          ;# a1 + d1
+    vsubsws v3, v6, v3          ;# a1 - d1
+
+    vaddsws v1, v7, v4          ;# b1 + c1
+    vsubsws v2, v7, v4          ;# b1 - c1
+
+    ;# transpose input
+    vmrghw  v4, v0, v1          ;# a0 b0 a1 b1
+    vmrghw  v5, v2, v3          ;# c0 d0 c1 d1
+
+    vmrglw  v6, v0, v1          ;# a2 b2 a3 b3
+    vmrglw  v7, v2, v3          ;# c2 d2 c3 d3
+
+    vperm   v0, v4, v5, v10     ;# a0 b0 c0 d0
+    vperm   v1, v4, v5, v11     ;# a1 b1 c1 d1
+
+    vperm   v2, v6, v7, v10     ;# a2 b2 c2 d2
+    vperm   v3, v6, v7, v11     ;# a3 b3 c3 d3
+
+    ;# second pass
+    vaddsws v6, v0, v2          ;# a1 = ip[0]+ip[8]
+    vsubsws v7, v0, v2          ;# b1 = ip[0]-ip[8]
+
+    vmulosh v4, v1, v8
+    vsraw   v4, v4, v12
+    vaddsws v4, v4, v1          ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+    vmulosh v5, v3, v9
+    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
+    vaddsws v5, v5, v3
+
+    vsubsws v4, v4, v5          ;# c1
+
+    vmulosh v2, v3, v8
+    vsraw   v2, v2, v12
+    vaddsws v2, v2, v3          ;# ip[12] * sin(pi/8) * sqrt(2)
+
+    vmulosh v5, v1, v9
+    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
+    vaddsws v5, v5, v1
+
+    vaddsws v3, v2, v5          ;# d1
+
+    vaddsws v0, v6, v3          ;# a1 + d1
+    vsubsws v3, v6, v3          ;# a1 - d1
+
+    vaddsws v1, v7, v4          ;# b1 + c1
+    vsubsws v2, v7, v4          ;# b1 - c1
+
+    vspltish v6, 4
+    vspltish v7, 3
+
+    vpkswss v0, v0, v1
+    vpkswss v1, v2, v3
+
+    vaddshs v0, v0, v6
+    vaddshs v1, v1, v6
+
+    vsrah   v0, v0, v7
+    vsrah   v1, v1, v7
+
+    ;# transpose output
+    vmrghh  v2, v0, v1          ;# a0 c0 a1 c1 a2 c2 a3 c3
+    vmrglh  v3, v0, v1          ;# b0 d0 b1 d1 b2 d2 b3 d3
+
+    vmrghh  v0, v2, v3          ;# a0 b0 c0 d0 a1 b1 c1 d1
+    vmrglh  v1, v2, v3          ;# a2 b2 c2 d2 a3 b3 c3 d3
+
+    stwu    r1,-416(r1)         ;# create space on the stack
+
+    stvx    v0,  0, r1
+    lwz     r6, 0(r1)
+    stw     r6, 0(r4)
+    lwz     r6, 4(r1)
+    stw     r6, 4(r4)
+
+    add     r4, r4, r5
+
+    lwz     r6,  8(r1)
+    stw     r6,  0(r4)
+    lwz     r6, 12(r1)
+    stw     r6,  4(r4)
+
+    add     r4, r4, r5
+
+    stvx    v1,  0, r1
+    lwz     r6, 0(r1)
+    stw     r6, 0(r4)
+    lwz     r6, 4(r1)
+    stw     r6, 4(r4)
+
+    add     r4, r4, r5
+
+    lwz     r6,  8(r1)
+    stw     r6,  0(r4)
+    lwz     r6, 12(r1)
+    stw     r6,  4(r4)
+
+    addi    r1, r1, 416         ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 4
+sinpi8sqrt2:
+    .short  35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
+
+    .align 4
+cospi8sqrt2minus1:
+    .short  20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
+
+    .align 4
+shift_16:
+    .long      16,    16,    16,    16
+
+    .align 4
+hi_hi:
+    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
+
+    .align 4
+lo_lo:
+    .byte     8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
--- /dev/null
+++ b/vp9/common/ppc/loopfilter_altivec.c
@@ -1,0 +1,127 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef void loop_filter_function_y_ppc
+(
+  unsigned char *s,   // source pointer
+  int p,              // pitch
+  const signed char *flimit,
+  const signed char *limit,
+  const signed char *thresh
+);
+
+typedef void loop_filter_function_uv_ppc
+(
+  unsigned char *u,   // source pointer
+  unsigned char *v,   // source pointer
+  int p,              // pitch
+  const signed char *flimit,
+  const signed char *limit,
+  const signed char *thresh
+);
+
+typedef void loop_filter_function_s_ppc
+(
+  unsigned char *s,   // source pointer
+  int p,              // pitch
+  const signed char *flimit
+);
+
+loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
+loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
+loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
+loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
+
+loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
+loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
+loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
+loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
+
+loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
+loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
+
+// Horizontal MB filtering
+void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                         int y_stride, int uv_stride, loop_filter_info *lfi) {
+  mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
+
+  if (u_ptr)
+    mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                          int y_stride, int uv_stride, loop_filter_info *lfi) {
+  (void)u_ptr;
+  (void)v_ptr;
+  (void)uv_stride;
+  loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
+}
+
+// Vertical MB Filtering
+void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                         int y_stride, int uv_stride, loop_filter_info *lfi) {
+  mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
+
+  if (u_ptr)
+    mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                          int y_stride, int uv_stride, loop_filter_info *lfi) {
+  (void)u_ptr;
+  (void)v_ptr;
+  (void)uv_stride;
+  loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
+}
+
+// Horizontal B Filtering
+void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                        int y_stride, int uv_stride, loop_filter_info *lfi) {
+  // These should all be done at once with one call, instead of 3
+  loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+  loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+  loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+
+  if (u_ptr)
+    loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                         int y_stride, int uv_stride, loop_filter_info *lfi) {
+  (void)u_ptr;
+  (void)v_ptr;
+  (void)uv_stride;
+  loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
+  loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
+  loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
+}
+
+// Vertical B Filtering
+void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                        int y_stride, int uv_stride, loop_filter_info *lfi) {
+  loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
+
+  if (u_ptr)
+    loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                         int y_stride, int uv_stride, loop_filter_info *lfi) {
+  (void)u_ptr;
+  (void)v_ptr;
+  (void)uv_stride;
+  loop_filter_simple_vertical_edge_ppc(y_ptr + 4,  y_stride, lfi->flim);
+  loop_filter_simple_vertical_edge_ppc(y_ptr + 8,  y_stride, lfi->flim);
+  loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
+}
--- /dev/null
+++ b/vp9/common/ppc/loopfilter_filters_altivec.asm
@@ -1,0 +1,1253 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl mbloop_filter_horizontal_edge_y_ppc
+    .globl loop_filter_horizontal_edge_y_ppc
+    .globl mbloop_filter_vertical_edge_y_ppc
+    .globl loop_filter_vertical_edge_y_ppc
+
+    .globl mbloop_filter_horizontal_edge_uv_ppc
+    .globl loop_filter_horizontal_edge_uv_ppc
+    .globl mbloop_filter_vertical_edge_uv_ppc
+    .globl loop_filter_vertical_edge_uv_ppc
+
+    .globl loop_filter_simple_horizontal_edge_ppc
+    .globl loop_filter_simple_vertical_edge_ppc
+
+    .text
+;# We often need to perform transposes (and other transpose-like operations)
+;#   on matrices of data.  This is simplified by the fact that we usually
+;#   operate on hunks of data whose dimensions are powers of 2, or at least
+;#   divisible by highish powers of 2.
+;#
+;#   These operations can be very confusing.  They become more straightforward
+;#   when we think of them as permutations of address bits: Concatenate a
+;#   group of vector registers and think of it as occupying a block of
+;#   memory beginning at address zero.  The low four bits 0...3 of the
+;#   address then correspond to position within a register, the higher-order
+;#   address bits select the register.
+;#
+;#   Although register selection, at the code level, is arbitrary, things
+;#   are simpler if we use contiguous ranges of register numbers, simpler
+;#   still if the low-order bits of the register number correspond to
+;#   conceptual address bits.  We do this whenever reasonable.
+;#
+;#   A 16x16 transpose can then be thought of as an operation on
+;#   a 256-element block of memory.  It takes 8 bits 0...7 to address this
+;#   memory and the effect of a transpose is to interchange address bit
+;#   0 with 4, 1 with 5, 2 with 6, and 3 with 7.  Bits 0...3 index the
+;#   column, which is interchanged with the row addressed by bits 4..7.
+;#
+;#   The altivec merge instructions provide a rapid means of effecting
+;#   many of these transforms.  They operate at three widths (8,16,32).
+;#   Writing V(x) for vector register #x, paired merges permute address
+;#   indices as follows.
+;#
+;#   0->1  1->2  2->3  3->(4+d)  (4+s)->0:
+;#
+;#      vmrghb  V( x),          V( y), V( y + (1<<s))
+;#      vmrglb  V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;#   =0=   1->2  2->3  3->(4+d)  (4+s)->1:
+;#
+;#      vmrghh  V( x),          V( y), V( y + (1<<s))
+;#      vmrglh  V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;#   =0=   =1=   2->3  3->(4+d)  (4+s)->2:
+;#
+;#      vmrghw  V( x),          V( y), V( y + (1<<s))
+;#      vmrglw  V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;#   Unfortunately, there is no doubleword merge instruction.
+;#   The following sequence uses "vperm" as a substitute.
+;#   Assuming that the selection masks b_hihi and b_lolo (defined as
+;#   _B_hihi/_B_lolo in the data section at the end of this file)
+;#   are in registers Vhihi and Vlolo, we can also effect the permutation
+;#
+;#   =0=   =1=   =2=   3->(4+d)  (4+s)->3   by the sequence:
+;#
+;#      vperm   V( x),          V( y), V( y + (1<<s)), Vhihi
+;#      vperm   V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
+;#
+;#
+;#   Except for bits s and d, the other relationships between register
+;#   number (= high-order part of address) bits are at the disposal of
+;#   the programmer.
+;#
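+;#   As a scalar sketch of this address-bit view (illustration only, not
+;#   code from the tree): a 16x16 byte transpose of in[256] is just
+;#
+;#      for (i = 0; i < 256; i++)
+;#          out[((i & 15) << 4) | (i >> 4)] = in[i];
+;#
+;#   i.e. address bits 0..3 and bits 4..7 trade places.
+;#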
+
+;# To avoid excess transposes, we filter all 3 vertical luma subblock
+;#   edges together.  This requires a single 16x16 transpose, which, in
+;#   the above language, amounts to the following permutation of address
+;#   indices:  0<->4   1<->5  2<->6  3<->7, which we accomplish by
+;#   4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
+;#
+;#   Except for the fact that the destination registers get written
+;#   before we are done referencing the old contents, the cyclic transform
+;#   is effected by
+;#
+;#      x = 0;  do {
+;#          vmrghb V(2x),   V(x), V(x+8);
+;#          vmrglb V(2x+1), V(x), V(x+8);
+;#      } while( ++x < 8);
+;#
+;#   For clarity, and because we can afford it, we do this transpose
+;#   using all 32 registers, alternating the banks 0..15  and  16 .. 31,
+;#   leaving the final result in 16 .. 31, as the lower registers are
+;#   used in the filtering itself.
+;#
+.macro Tpair A, B, X, Y
+    vmrghb  \A, \X, \Y
+    vmrglb  \B, \X, \Y
+.endm
+
+;# Each step takes 8*2 = 16 instructions
+
+.macro t16_even
+    Tpair v16,v17,  v0,v8
+    Tpair v18,v19,  v1,v9
+    Tpair v20,v21,  v2,v10
+    Tpair v22,v23,  v3,v11
+    Tpair v24,v25,  v4,v12
+    Tpair v26,v27,  v5,v13
+    Tpair v28,v29,  v6,v14
+    Tpair v30,v31,  v7,v15
+.endm
+
+.macro t16_odd
+    Tpair v0,v1, v16,v24
+    Tpair v2,v3, v17,v25
+    Tpair v4,v5, v18,v26
+    Tpair v6,v7, v19,v27
+    Tpair v8,v9, v20,v28
+    Tpair v10,v11, v21,v29
+    Tpair v12,v13, v22,v30
+    Tpair v14,v15, v23,v31
+.endm
+
+;# Whole transpose takes 4*16 = 64 instructions
+
+.macro t16_full
+    t16_odd
+    t16_even
+    t16_odd
+    t16_even
+.endm
+
+;# Vertical edge filtering requires transposes.  For the simple filter,
+;#   we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
+;#   each.  Writing 0 ... 63 for the pixel indices, the desired result is:
+;#
+;#  v0 =  0  1 ... 14 15
+;#  v1 = 16 17 ... 30 31
+;#  v2 = 32 33 ... 46 47
+;#  v3 = 48 49 ... 62 63
+;#
+;#  In frame-buffer memory, the layout is:
+;#
+;#     0  16  32  48
+;#     1  17  33  49
+;#     ...
+;#    15  31  47  63.
+;#
+;#  We begin by reading the data 32 bits at a time (using scalar operations)
+;#  into a temporary array, reading the rows of the array into vector registers,
+;#  with the following layout:
+;#
+;#  v0 =  0 16 32 48  4 20 36 52  8 24 40 56  12 28 44 60
+;#  v1 =  1 17 33 49  5 21 ...                      45 61
+;#  v2 =  2 18 ...                                  46 62
+;#  v3 =  3 19 ...                                  47 63
+;#
+;#  From the "address-bit" perspective discussed above, we simply need to
+;#  interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
+;#  In other words, we transpose each of the four 4x4 submatrices.
+;#
+;#  This transformation is its own inverse, and we need to perform it
+;#  again before writing the pixels back into the frame buffer.
+;#
+;#  It acts in place on registers v0...v3, uses v4...v7 as temporaries,
+;#  and assumes that v14/v15 contain the b_hihi/b_lolo selectors
+;#  defined above.  We think of both groups of 4 registers as having
+;#  "addresses" {0,1,2,3} * 16.
+;#
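+;#  A scalar model of the same permutation (illustration only): with the
+;#  64 bytes of v0...v3 indexed 0..63,
+;#
+;#      for (i = 0; i < 64; i++)
+;#          out[((i & 3) << 4) | (i & 0x0c) | (i >> 4)] = in[i];
+;#
+;#  swaps address bits 0<->4 and 1<->5 and leaves bits 2..3 alone.
+;#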
+.macro Transpose4times4x4 Vlo, Vhi
+
+    ;# d=s=0        0->1  1->2  2->3  3->4  4->0  =5=
+
+    vmrghb  v4, v0, v1
+    vmrglb  v5, v0, v1
+    vmrghb  v6, v2, v3
+    vmrglb  v7, v2, v3
+
+    ;# d=0 s=1      =0=   1->2  2->3  3->4  4->5  5->1
+
+    vmrghh  v0, v4, v6
+    vmrglh  v1, v4, v6
+    vmrghh  v2, v5, v7
+    vmrglh  v3, v5, v7
+
+    ;# d=s=0        =0=   =1=   2->3  3->4  4->2  =5=
+
+    vmrghw  v4, v0, v1
+    vmrglw  v5, v0, v1
+    vmrghw  v6, v2, v3
+    vmrglw  v7, v2, v3
+
+    ;# d=0  s=1     =0=   =1=   =2=   3->4  4->5  5->3
+
+    vperm   v0, v4, v6, \Vlo
+    vperm   v1, v4, v6, \Vhi
+    vperm   v2, v5, v7, \Vlo
+    vperm   v3, v5, v7, \Vhi
+.endm
+;# end Transpose4times4x4
+
+
+;# Normal mb vertical edge filter transpose.
+;#
+;#   We read 8 columns of data, initially in the following pattern:
+;#
+;#  (0,0)  (1,0) ... (7,0)  (0,1)  (1,1) ... (7,1)
+;#  (0,2)  (1,2) ... (7,2)  (0,3)  (1,3) ... (7,3)
+;#  ...
+;#  (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
+;#
+;#   and wish to convert to:
+;#
+;#  (0,0) ... (0,15)
+;#  (1,0) ... (1,15)
+;#  ...
+;#  (7,0) ... (7,15).
+;#
+;#  In "address bit" language, we wish to map
+;#
+;#  0->4  1->5  2->6  3->0  4->1  5->2  6->3, i.e., I -> (I+4) mod 7.
+;#
+;#  This can be accomplished by 4 iterations of the cyclic transform
+;#
+;#  I -> (I+1) mod 7;
+;#
+;#  each iteration can be realized by (d=0, s=2):
+;#
+;#  x = 0;  do  Tpair( V(2x),V(2x+1),  V(x),V(x+4))  while( ++x < 4);
+;#
+;#  The input/output is in registers v0...v7.  We use v10...v17 as mirrors;
+;#  preserving v8 = sign converter.
+;#
+;#  Inverse transpose is similar, except here I -> (I+3) mod 7 and the
+;#  result lands in the "mirror" registers v10...v17
+;#
+.macro t8x16_odd
+    Tpair v10, v11,  v0, v4
+    Tpair v12, v13,  v1, v5
+    Tpair v14, v15,  v2, v6
+    Tpair v16, v17,  v3, v7
+.endm
+
+.macro t8x16_even
+    Tpair v0, v1,  v10, v14
+    Tpair v2, v3,  v11, v15
+    Tpair v4, v5,  v12, v16
+    Tpair v6, v7,  v13, v17
+.endm
+
+.macro transpose8x16_fwd
+    t8x16_odd
+    t8x16_even
+    t8x16_odd
+    t8x16_even
+.endm
+
+.macro transpose8x16_inv
+    t8x16_odd
+    t8x16_even
+    t8x16_odd
+.endm
+
+.macro Transpose16x16
+    vmrghb  v0, v16, v24
+    vmrglb  v1, v16, v24
+    vmrghb  v2, v17, v25
+    vmrglb  v3, v17, v25
+    vmrghb  v4, v18, v26
+    vmrglb  v5, v18, v26
+    vmrghb  v6, v19, v27
+    vmrglb  v7, v19, v27
+    vmrghb  v8, v20, v28
+    vmrglb  v9, v20, v28
+    vmrghb  v10, v21, v29
+    vmrglb  v11, v21, v29
+    vmrghb  v12, v22, v30
+    vmrglb  v13, v22, v30
+    vmrghb  v14, v23, v31
+    vmrglb  v15, v23, v31
+    vmrghb  v16, v0, v8
+    vmrglb  v17, v0, v8
+    vmrghb  v18, v1, v9
+    vmrglb  v19, v1, v9
+    vmrghb  v20, v2, v10
+    vmrglb  v21, v2, v10
+    vmrghb  v22, v3, v11
+    vmrglb  v23, v3, v11
+    vmrghb  v24, v4, v12
+    vmrglb  v25, v4, v12
+    vmrghb  v26, v5, v13
+    vmrglb  v27, v5, v13
+    vmrghb  v28, v6, v14
+    vmrglb  v29, v6, v14
+    vmrghb  v30, v7, v15
+    vmrglb  v31, v7, v15
+    vmrghb  v0, v16, v24
+    vmrglb  v1, v16, v24
+    vmrghb  v2, v17, v25
+    vmrglb  v3, v17, v25
+    vmrghb  v4, v18, v26
+    vmrglb  v5, v18, v26
+    vmrghb  v6, v19, v27
+    vmrglb  v7, v19, v27
+    vmrghb  v8, v20, v28
+    vmrglb  v9, v20, v28
+    vmrghb  v10, v21, v29
+    vmrglb  v11, v21, v29
+    vmrghb  v12, v22, v30
+    vmrglb  v13, v22, v30
+    vmrghb  v14, v23, v31
+    vmrglb  v15, v23, v31
+    vmrghb  v16, v0, v8
+    vmrglb  v17, v0, v8
+    vmrghb  v18, v1, v9
+    vmrglb  v19, v1, v9
+    vmrghb  v20, v2, v10
+    vmrglb  v21, v2, v10
+    vmrghb  v22, v3, v11
+    vmrglb  v23, v3, v11
+    vmrghb  v24, v4, v12
+    vmrglb  v25, v4, v12
+    vmrghb  v26, v5, v13
+    vmrglb  v27, v5, v13
+    vmrghb  v28, v6, v14
+    vmrglb  v29, v6, v14
+    vmrghb  v30, v7, v15
+    vmrglb  v31, v7, v15
+.endm
+
+;# load_g loads a global vector (whose address is in the local variable Gptr)
+;#   into vector register Vreg.  Trashes r0
+.macro load_g Vreg, Gptr
+    lwz     r0, \Gptr
+    lvx     \Vreg, 0, r0
+.endm
+
+;# Exploit the saturation here: if A-B is negative, vsububs clamps it
+;# to 0, and ORing 0 with the positive difference from the other
+;# subtraction order yields the absolute value.
+;# RES = abs( A-B), trashes TMP
+.macro Abs RES, TMP, A, B
+    vsububs \RES, \A, \B
+    vsububs \TMP, \B, \A
+    vor     \RES, \RES, \TMP
+.endm
+
+;# RES = Max( RES, abs( A-B)), trashes TMP
+.macro max_abs RES, TMP, A, B
+    vsububs \TMP, \A, \B
+    vmaxub  \RES, \RES, \TMP
+    vsububs \TMP, \B, \A
+    vmaxub  \RES, \RES, \TMP
+.endm
+
+.macro Masks
+    ;# build masks
+    ;# Input is all 8-bit unsigned (0-255).  We need abs(val_a - val_b)
+    ;# > limit, but there is no need to compare each difference to the
+    ;# limit: find the max of the absolute differences and compare that
+    ;# to the limit once.
+    ;# First hev
+    Abs     v14, v13, v2, v3    ;# |P1 - P0|
+    max_abs  v14, v13, v5, v4    ;# |Q1 - Q0|
+
+    vcmpgtub v10, v14, v10      ;# HEV = true if thresh exceeded
+
+    ;# Next limit
+    max_abs  v14, v13, v0, v1    ;# |P3 - P2|
+    max_abs  v14, v13, v1, v2    ;# |P2 - P1|
+    max_abs  v14, v13, v6, v5    ;# |Q2 - Q1|
+    max_abs  v14, v13, v7, v6    ;# |Q3 - Q2|
+
+    vcmpgtub v9, v14, v9        ;# R = true if limit exceeded
+
+    ;# flimit
+    Abs     v14, v13, v3, v4    ;# |P0 - Q0|
+
+    vcmpgtub v8, v14, v8        ;# X = true if flimit exceeded
+
+    vor     v8, v8, v9          ;# R = true if flimit or limit exceeded
+    ;# done building masks
+.endm
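+
+;# Rough C model of what Masks computes (illustration only):
+;#   hev  = max(|P1-P0|, |Q1-Q0|) > thresh                         (v10)
+;#   over = max(|P1-P0|, |Q1-Q0|, |P3-P2|, |P2-P1|,
+;#              |Q2-Q1|, |Q3-Q2|) > limit
+;#   over = over || (|P0-Q0| > flimit)                             (v8)
+;# The adjustments below apply only where !over (the vandc in
+;# common_adjust clears the filter value wherever over is true).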
+
+.macro build_constants RFL, RLI, RTH, FL, LI, TH
+    ;# build constants
+    lvx     \FL, 0, \RFL        ;# flimit
+    lvx     \LI, 0, \RLI        ;# limit
+    lvx     \TH, 0, \RTH        ;# thresh
+
+    vspltisb v11, 8
+    vspltisb v12, 4
+    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
+.endm
+
+.macro load_data_y
+    ;# setup strides/pointers to be able to access
+    ;# all of the data
+    add     r5, r4, r4          ;# r5 = 2 * stride
+    sub     r6, r3, r5          ;# r6 -> 2 rows back
+    neg     r7, r4              ;# r7 = -stride
+
+    ;# load 16 pixels worth of data to work on
+    sub     r0, r6, r5          ;# r0 -> 4 rows back (temp)
+    lvx     v0,  0, r0          ;# P3  (read only)
+    lvx     v1, r7, r6          ;# P2
+    lvx     v2,  0, r6          ;# P1
+    lvx     v3, r7, r3          ;# P0
+    lvx     v4,  0, r3          ;# Q0
+    lvx     v5, r4, r3          ;# Q1
+    lvx     v6, r5, r3          ;# Q2
+    add     r0, r3, r5          ;# r0 -> 2 rows fwd (temp)
+    lvx     v7, r4, r0          ;# Q3  (read only)
+.endm
+
+;# Expects
+;#  v10 == HEV
+;#  v13 == tmp
+;#  v14 == tmp
+.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
+    vxor    \P1, \P1, v11       ;# SP1
+    vxor    \P0, \P0, v11       ;# SP0
+    vxor    \Q0, \Q0, v11       ;# SQ0
+    vxor    \Q1, \Q1, v11       ;# SQ1
+
+    vsubsbs v13, \P1, \Q1       ;# f  = c (P1 - Q1)
+.if \HEV_PRESENT
+    vand    v13, v13, v10       ;# f &= hev
+.endif
+    vsubsbs v14, \Q0, \P0       ;# -126 <=  X = Q0-P0  <= +126
+    vaddsbs v13, v13, v14
+    vaddsbs v13, v13, v14
+    vaddsbs v13, v13, v14       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+
+    vandc   v13, v13, v8        ;# f &= mask
+
+    vspltisb v8, 3
+    vspltisb v9, 4
+
+    vaddsbs v14, v13, v9        ;# f1 = c (f+4)
+    vaddsbs v15, v13, v8        ;# f2 = c (f+3)
+
+    vsrab   v13, v14, v8        ;# f1 >>= 3
+    vsrab   v15, v15, v8        ;# f2 >>= 3
+
+    vsubsbs \Q0, \Q0, v13       ;# u1 = c (SQ0 - f1)
+    vaddsbs \P0, \P0, v15       ;# u2 = c (SP0 + f2)
+.endm
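+
+;# Equivalent C sketch of the adjustment (illustration only; the macro
+;# first XORs the pels with 0x80 to make them signed, and c() saturates
+;# to [-128,127]):
+;#   f  = c(P1 - Q1);              /* optionally f &= hev */
+;#   f  = c(f + 3*(Q0 - P0)) & ~over;
+;#   Q0 = c(Q0 - (c(f + 4) >> 3));
+;#   P0 = c(P0 + (c(f + 3) >> 3));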
+
+.macro vp8_mbfilter
+    Masks
+
+    ;# start the filtering here
+    vxor    v1, v1, v11         ;# SP2
+    vxor    v2, v2, v11         ;# SP1
+    vxor    v3, v3, v11         ;# SP0
+    vxor    v4, v4, v11         ;# SQ0
+    vxor    v5, v5, v11         ;# SQ1
+    vxor    v6, v6, v11         ;# SQ2
+
+    ;# add outer taps if we have high edge variance
+    vsubsbs v13, v2, v5         ;# f  = c (SP1-SQ1)
+
+    vsubsbs v14, v4, v3         ;# SQ0-SP0
+    vaddsbs v13, v13, v14
+    vaddsbs v13, v13, v14
+    vaddsbs v13, v13, v14       ;# f  = c( c(SP1-SQ1) + 3*(SQ0-SP0))
+
+    vandc   v13, v13, v8        ;# f &= mask
+    vand    v15, v13, v10       ;# f2 = f & hev
+
+    ;# save bottom 3 bits so that we round one side +4 and the other +3
+    vspltisb v8, 3
+    vspltisb v9, 4
+
+    vaddsbs v14, v15, v9        ;# f1 = c (f+4)
+    vaddsbs v15, v15, v8        ;# f2 = c (f+3)
+
+    vsrab   v14, v14, v8        ;# f1 >>= 3
+    vsrab   v15, v15, v8        ;# f2 >>= 3
+
+    vsubsbs v4, v4, v14         ;# u1 = c (SQ0 - f1)
+    vaddsbs v3, v3, v15         ;# u2 = c (SP0 + f2)
+
+    ;# only apply wider filter if not high edge variance
+    vandc   v13, v13, v10       ;# f &= ~hev
+
+    vspltisb v9, 2
+    vnor    v8, v8, v8
+    vsrb    v9, v8, v9          ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
+    vupkhsb v9, v9              ;# 0x003f003f003f003f003f003f003f003f
+    vspltisb v8, 9
+
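+    ;# The three passes below each compute, in C terms (illustration only),
+    ;#   u = (w * f + 63) >> 7   with w = 9, then 18, then 27
+    ;# (v12 steps w by 9 each pass), applied to P2/Q2, P1/Q1, P0/Q0 in turn.
+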
+    ;# roughly 1/7th difference across boundary
+    vspltish v10, 7
+    vmulosb v14, v8, v13        ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+    vmulesb v15, v8, v13
+    vaddshs v14, v14, v9        ;# +=  63
+    vaddshs v15, v15, v9
+    vsrah   v14, v14, v10       ;# >>= 7
+    vsrah   v15, v15, v10
+    vmrglh  v10, v15, v14
+    vmrghh  v15, v15, v14
+
+    vpkshss v10, v15, v10       ;# X = saturated down to bytes
+
+    vsubsbs v6, v6, v10         ;# subtract from Q and add to P
+    vaddsbs v1, v1, v10
+
+    vxor    v6, v6, v11
+    vxor    v1, v1, v11
+
+    ;# roughly 2/7th difference across boundary
+    vspltish v10, 7
+    vaddubm v12, v8, v8
+    vmulosb v14, v12, v13       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+    vmulesb v15, v12, v13
+    vaddshs v14, v14, v9
+    vaddshs v15, v15, v9
+    vsrah   v14, v14, v10       ;# >>= 7
+    vsrah   v15, v15, v10
+    vmrglh  v10, v15, v14
+    vmrghh  v15, v15, v14
+
+    vpkshss v10, v15, v10       ;# X = saturated down to bytes
+
+    vsubsbs v5, v5, v10         ;# subtract from Q and add to P
+    vaddsbs v2, v2, v10
+
+    vxor    v5, v5, v11
+    vxor    v2, v2, v11
+
+    ;# roughly 3/7th difference across boundary
+    vspltish v10, 7
+    vaddubm v12, v12, v8
+    vmulosb v14, v12, v13       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+    vmulesb v15, v12, v13
+    vaddshs v14, v14, v9
+    vaddshs v15, v15, v9
+    vsrah   v14, v14, v10       ;# >>= 7
+    vsrah   v15, v15, v10
+    vmrglh  v10, v15, v14
+    vmrghh  v15, v15, v14
+
+    vpkshss v10, v15, v10       ;# X = saturated down to bytes
+
+    vsubsbs v4, v4, v10         ;# subtract from Q and add to P
+    vaddsbs v3, v3, v10
+
+    vxor    v4, v4, v11
+    vxor    v3, v3, v11
+.endm
+
+.macro SBFilter
+    Masks
+
+    common_adjust v3, v4, v2, v5, 1
+
+    ;# outer tap adjustments
+    vspltisb v8, 1
+
+    vaddubm v13, v13, v8        ;# f  += 1
+    vsrab   v13, v13, v8        ;# f >>= 1
+
+    vandc   v13, v13, v10       ;# f &= ~hev
+
+    vsubsbs v5, v5, v13         ;# u1 = c (SQ1 - f)
+    vaddsbs v2, v2, v13         ;# u2 = c (SP1 + f)
+
+    vxor    v2, v2, v11
+    vxor    v3, v3, v11
+    vxor    v4, v4, v11
+    vxor    v5, v5, v11
+.endm
+
+    .align 2
+mbloop_filter_horizontal_edge_y_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    mtspr   256, r12            ;# set VRSAVE
+
+    build_constants r5, r6, r7, v8, v9, v10
+
+    load_data_y
+
+    vp8_mbfilter
+
+    stvx     v1, r7, r6         ;# P2
+    stvx     v2,  0, r6         ;# P1
+    stvx     v3, r7, r3         ;# P0
+    stvx     v4,  0, r3         ;# Q0
+    stvx     v5, r4, r3         ;# Q1
+    stvx     v6, r5, r3         ;# Q2
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;#  r3 unsigned char *s
+;#  r4 int p
+;#  r5 const signed char *flimit
+;#  r6 const signed char *limit
+;#  r7 const signed char *thresh
+loop_filter_horizontal_edge_y_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    mtspr   256, r12            ;# set VRSAVE
+
+    build_constants r5, r6, r7, v8, v9, v10
+
+    load_data_y
+
+    SBFilter
+
+    stvx     v2,  0, r6         ;# P1
+    stvx     v3, r7, r3         ;# P0
+    stvx     v4,  0, r3         ;# Q0
+    stvx     v5, r4, r3         ;# Q1
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+;# Filtering a vertical mb.  Each mb is aligned on a 16-byte boundary, so
+;#  an entire mb can be read with aligned loads.  Filtering the mb edge is
+;#  the problem: the loopfilter needs 4 bytes before the mb and 4 after,
+;#  8 in total, and reading 16 bytes in order to get 4 is a bit of a
+;#  waste.  So this is an even uglier way to get around that:
+;# using the regular register file, words are read in and then saved back
+;#  out to memory to align and order them up, and only then read in again
+;#  through the vector register file.
+.macro RLVmb V, R
+    lwzux   r0, r3, r4
+    stw     r0, 4(\R)
+    lwz     r0,-4(r3)
+    stw     r0, 0(\R)
+    lwzux   r0, r3, r4
+    stw     r0,12(\R)
+    lwz     r0,-4(r3)
+    stw     r0, 8(\R)
+    lvx     \V, 0, \R
+.endm
+
+.macro WLVmb V, R
+    stvx    \V, 0, \R
+    lwz     r0,12(\R)
+    stwux   r0, r3, r4
+    lwz     r0, 8(\R)
+    stw     r0,-4(r3)
+    lwz     r0, 4(\R)
+    stwux   r0, r3, r4
+    lwz     r0, 0(\R)
+    stw     r0,-4(r3)
+.endm
+
+    .align 2
+;#  r3 unsigned char *s
+;#  r4 int p
+;#  r5 const signed char *flimit
+;#  r6 const signed char *limit
+;#  r7 const signed char *thresh
+mbloop_filter_vertical_edge_y_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xc000
+    mtspr   256, r12            ;# set VRSAVE
+
+    la      r9, -48(r1)         ;# temporary space for reading in vectors
+    sub     r3, r3, r4
+
+    RLVmb v0, r9
+    RLVmb v1, r9
+    RLVmb v2, r9
+    RLVmb v3, r9
+    RLVmb v4, r9
+    RLVmb v5, r9
+    RLVmb v6, r9
+    RLVmb v7, r9
+
+    transpose8x16_fwd
+
+    build_constants r5, r6, r7, v8, v9, v10
+
+    vp8_mbfilter
+
+    transpose8x16_inv
+
+    add r3, r3, r4
+    neg r4, r4
+
+    WLVmb v17, r9
+    WLVmb v16, r9
+    WLVmb v15, r9
+    WLVmb v14, r9
+    WLVmb v13, r9
+    WLVmb v12, r9
+    WLVmb v11, r9
+    WLVmb v10, r9
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+.macro RL V, R, P
+    lvx     \V, 0,  \R
+    add     \R, \R, \P
+.endm
+
+.macro WL V, R, P
+    stvx    \V, 0,  \R
+    add     \R, \R, \P
+.endm
+
+.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
+                                ;# K = |P0-P1| already
+    Abs     v14, v13, \Q0, \Q1  ;# M = |Q0-Q1|
+    vmaxub  v14, v14, v4        ;# M = max( |P0-P1|, |Q0-Q1|)
+    vcmpgtub v10, v14, v0
+
+    Abs     v4, v5, \Q2, \Q3    ;# K = |Q2-Q3| = next |P0-P1|
+
+    max_abs  v14, v13, \Q1, \Q2  ;# M = max( M, |Q1-Q2|)
+    max_abs  v14, v13, \P1, \P2  ;# M = max( M, |P1-P2|)
+    max_abs  v14, v13, \P2, \P3  ;# M = max( M, |P2-P3|)
+
+    vmaxub   v14, v14, v4       ;# M = max interior abs diff
+    vcmpgtub v9, v14, v2        ;# M = true if int_l exceeded
+
+    Abs     v14, v13, \P0, \Q0  ;# X = Abs( P0-Q0)
+    vcmpgtub v8, v14, v3        ;# X = true if edge_l exceeded
+    vor     v8, v8, v9          ;# M = true if edge_l or int_l exceeded
+
+    ;# replace P1,Q1 w/signed versions
+    common_adjust \P0, \Q0, \P1, \Q1, 1
+
+    vaddubm v13, v13, v1        ;# -16 <= M <= 15, saturation irrelevant
+    vsrab   v13, v13, v1
+    vandc   v13, v13, v10       ;# adjust P1,Q1 by (M+1)>>1  if ! hev
+    vsubsbs \Q1, \Q1, v13
+    vaddsbs \P1, \P1, v13
+
+    vxor    \P1, \P1, v11       ;# P1
+    vxor    \P0, \P0, v11       ;# P0
+    vxor    \Q0, \Q0, v11       ;# Q0
+    vxor    \Q1, \Q1, v11       ;# Q1
+.endm
+
+
+    .align 2
+;#  r3 unsigned char *s
+;#  r4 int p
+;#  r5 const signed char *flimit
+;#  r6 const signed char *limit
+;#  r7 const signed char *thresh
+loop_filter_vertical_edge_y_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xffff
+    mtspr   256, r12            ;# set VRSAVE
+
+    addi    r9, r3, 0
+    RL      v16, r9, r4
+    RL      v17, r9, r4
+    RL      v18, r9, r4
+    RL      v19, r9, r4
+    RL      v20, r9, r4
+    RL      v21, r9, r4
+    RL      v22, r9, r4
+    RL      v23, r9, r4
+    RL      v24, r9, r4
+    RL      v25, r9, r4
+    RL      v26, r9, r4
+    RL      v27, r9, r4
+    RL      v28, r9, r4
+    RL      v29, r9, r4
+    RL      v30, r9, r4
+    lvx     v31, 0, r9
+
+    Transpose16x16
+
+    vspltisb v1, 1
+
+    build_constants r5, r6, r7, v3, v2, v0
+
+    Abs v4, v5, v19, v18                            ;# K(v4) = first |P0-P1|
+
+    Fil v16, v17, v18, v19,  v20, v21, v22, v23
+    Fil v20, v21, v22, v23,  v24, v25, v26, v27
+    Fil v24, v25, v26, v27,  v28, v29, v30, v31
+
+    Transpose16x16
+
+    addi    r9, r3, 0
+    WL      v16, r9, r4
+    WL      v17, r9, r4
+    WL      v18, r9, r4
+    WL      v19, r9, r4
+    WL      v20, r9, r4
+    WL      v21, r9, r4
+    WL      v22, r9, r4
+    WL      v23, r9, r4
+    WL      v24, r9, r4
+    WL      v25, r9, r4
+    WL      v26, r9, r4
+    WL      v27, r9, r4
+    WL      v28, r9, r4
+    WL      v29, r9, r4
+    WL      v30, r9, r4
+    stvx    v31, 0, r9
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+.macro active_chroma_sel V
+    andi.   r7, r3, 8       ;# row origin modulo 16
+    add     r7, r7, r7      ;# selects selectors
+    lis     r12, _chromaSelectors@ha
+    la      r0,  _chromaSelectors@l(r12)
+    lwzux   r0, r7, r0      ;# leave selector addr in r7
+
+    lvx     \V, 0, r0       ;# mask to concatenate active U,V pels
+.endm
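+
+;# (r3 & 8) selects one of the two 16-byte rows of _chromaSelectors in the
+;# data section below:
+;#   offset  0: _B_hihi, _B_Ures0, _B_Vres0  -- active pels in the low  half
+;#   offset 16: _B_lolo, _B_Ures8, _B_Vres8  -- active pels in the high half
+;# uresult_sel/vresult_sel later fetch the U/V writeback masks from r7+4
+;# and r7+8.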
+
+.macro hread_uv Dest, U, V, Offs, VMask
+    lvx     \U, \Offs, r3
+    lvx     \V, \Offs, r4
+    vperm   \Dest, \U, \V, \VMask   ;# Dest = active part of U then V
+.endm
+
+.macro hwrite_uv New, U, V, Offs, Umask, Vmask
+    vperm   \U, \New, \U, \Umask    ;# Combine new pels with siblings
+    vperm   \V, \New, \V, \Vmask
+    stvx    \U, \Offs, r3           ;# Write to frame buffer
+    stvx    \V, \Offs, r4
+.endm
+
+;# Process U,V in parallel.
+.macro load_chroma_h
+    neg     r9, r5          ;# r9 = -1 * stride
+    add     r8, r9, r9      ;# r8 = -2 * stride
+    add     r10, r5, r5     ;# r10 = 2 * stride
+
+    active_chroma_sel v12
+
+    ;# P3, Q3 are read-only; need not save addresses or sibling pels
+    add     r6, r8, r8      ;# r6 = -4 * stride
+    hread_uv v0, v14, v15, r6, v12
+    add     r6, r10, r5     ;# r6 =  3 * stride
+    hread_uv v7, v14, v15, r6, v12
+
+    ;# Others are read/write; save addresses and sibling pels
+
+    add     r6, r8, r9      ;# r6 = -3 * stride
+    hread_uv v1, v16, v17, r6,  v12
+    hread_uv v2, v18, v19, r8,  v12
+    hread_uv v3, v20, v21, r9,  v12
+    hread_uv v4, v22, v23, 0,   v12
+    hread_uv v5, v24, v25, r5,  v12
+    hread_uv v6, v26, v27, r10, v12
+.endm
+
+.macro uresult_sel V
+    load_g   \V, 4(r7)
+.endm
+
+.macro vresult_sel V
+    load_g   \V, 8(r7)
+.endm
+
+;# always write P1,P0,Q0,Q1
+.macro store_chroma_h
+    uresult_sel v11
+    vresult_sel v12
+    hwrite_uv v2, v18, v19, r8, v11, v12
+    hwrite_uv v3, v20, v21, r9, v11, v12
+    hwrite_uv v4, v22, v23, 0,  v11, v12
+    hwrite_uv v5, v24, v25, r5, v11, v12
+.endm
+
+    .align 2
+;#  r3 unsigned char *u
+;#  r4 unsigned char *v
+;#  r5 int p
+;#  r6 const signed char *flimit
+;#  r7 const signed char *limit
+;#  r8 const signed char *thresh
+mbloop_filter_horizontal_edge_uv_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xffff
+    mtspr   256, r12            ;# set VRSAVE
+
+    build_constants r6, r7, r8, v8, v9, v10
+
+    load_chroma_h
+
+    vp8_mbfilter
+
+    store_chroma_h
+
+    hwrite_uv v1, v16, v17, r6,  v11, v12    ;# v1 == P2
+    hwrite_uv v6, v26, v27, r10, v11, v12    ;# v6 == Q2
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;#  r3 unsigned char *u
+;#  r4 unsigned char *v
+;#  r5 int p
+;#  r6 const signed char *flimit
+;#  r7 const signed char *limit
+;#  r8 const signed char *thresh
+loop_filter_horizontal_edge_uv_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xffff
+    mtspr   256, r12            ;# set VRSAVE
+
+    build_constants r6, r7, r8, v8, v9, v10
+
+    load_chroma_h
+
+    SBFilter
+
+    store_chroma_h
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+.macro R V, R
+    lwzux   r0, r3, r5
+    stw     r0, 4(\R)
+    lwz     r0,-4(r3)
+    stw     r0, 0(\R)
+    lwzux   r0, r4, r5
+    stw     r0,12(\R)
+    lwz     r0,-4(r4)
+    stw     r0, 8(\R)
+    lvx     \V, 0, \R
+.endm
+
+
+.macro W V, R
+    stvx    \V, 0, \R
+    lwz     r0,12(\R)
+    stwux   r0, r4, r5
+    lwz     r0, 8(\R)
+    stw     r0,-4(r4)
+    lwz     r0, 4(\R)
+    stwux   r0, r3, r5
+    lwz     r0, 0(\R)
+    stw     r0,-4(r3)
+.endm
+
+.macro chroma_vread R
+    sub r3, r3, r5          ;# back up one line for simplicity
+    sub r4, r4, r5
+
+    R v0, \R
+    R v1, \R
+    R v2, \R
+    R v3, \R
+    R v4, \R
+    R v5, \R
+    R v6, \R
+    R v7, \R
+
+    transpose8x16_fwd
+.endm
+
+.macro chroma_vwrite R
+
+    transpose8x16_inv
+
+    add     r3, r3, r5
+    add     r4, r4, r5
+    neg     r5, r5          ;# Write rows back in reverse order
+
+    W v17, \R
+    W v16, \R
+    W v15, \R
+    W v14, \R
+    W v13, \R
+    W v12, \R
+    W v11, \R
+    W v10, \R
+.endm
+
+    .align 2
+;#  r3 unsigned char *u
+;#  r4 unsigned char *v
+;#  r5 int p
+;#  r6 const signed char *flimit
+;#  r7 const signed char *limit
+;#  r8 const signed char *thresh
+mbloop_filter_vertical_edge_uv_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xc000
+    mtspr   256, r12            ;# set VRSAVE
+
+    la      r9, -48(r1)         ;# temporary space for reading in vectors
+
+    chroma_vread r9
+
+    build_constants r6, r7, r8, v8, v9, v10
+
+    vp8_mbfilter
+
+    chroma_vwrite r9
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;#  r3 unsigned char *u
+;#  r4 unsigned char *v
+;#  r5 int p
+;#  r6 const signed char *flimit
+;#  r7 const signed char *limit
+;#  r8 const signed char *thresh
+loop_filter_vertical_edge_uv_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xc000
+    mtspr   256, r12            ;# set VRSAVE
+
+    la      r9, -48(r1)         ;# temporary space for reading in vectors
+
+    chroma_vread r9
+
+    build_constants r6, r7, r8, v8, v9, v10
+
+    SBFilter
+
+    chroma_vwrite r9
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+.macro vp8_simple_filter
+    Abs v14, v13, v1, v2    ;# M = abs( P0 - Q0)
+    vcmpgtub v8, v14, v8    ;# v8 = true if _over_ limit
+
+    ;# preserve unsigned v0 and v3
+    common_adjust v1, v2, v0, v3, 0
+
+    vxor v1, v1, v11
+    vxor v2, v2, v11        ;# cvt Q0, P0 back to pels
+.endm
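+
+;# Net effect (rough model): only P0/Q0 are adjusted, the mask is just
+;# |P0-Q0| > flimit, and there is no high-edge-variance term (the final
+;# macro argument 0 assembles the hev AND out of common_adjust).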
+
+.macro simple_vertical
+    addi    r8,  0, 16
+    addi    r7, r5, 32
+
+    lvx     v0,  0, r5
+    lvx     v1, r8, r5
+    lvx     v2,  0, r7
+    lvx     v3, r8, r7
+
+    lis     r12, _B_hihi@ha
+    la      r0,  _B_hihi@l(r12)
+    lvx     v16, 0, r0
+
+    lis     r12, _B_lolo@ha
+    la      r0,  _B_lolo@l(r12)
+    lvx     v17, 0, r0
+
+    Transpose4times4x4 v16, v17
+    vp8_simple_filter
+
+    vxor v0, v0, v11
+    vxor v3, v3, v11        ;# cvt Q0, P0 back to pels
+
+    Transpose4times4x4 v16, v17
+
+    stvx    v0,  0, r5
+    stvx    v1, r8, r5
+    stvx    v2,  0, r7
+    stvx    v3, r8, r7
+.endm
+
+    .align 2
+;#  r3 unsigned char *s
+;#  r4 int p
+;#  r5 const signed char *flimit
+loop_filter_simple_horizontal_edge_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    mtspr   256, r12            ;# set VRSAVE
+
+    ;# build constants
+    lvx     v8, 0, r5           ;# flimit
+
+    vspltisb v11, 8
+    vspltisb v12, 4
+    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
+
+    neg     r5, r4              ;# r5 = -1 * stride
+    add     r6, r5, r5          ;# r6 = -2 * stride
+
+    lvx     v0, r6, r3          ;# v0 = P1 = 16 pels two rows above edge
+    lvx     v1, r5, r3          ;# v1 = P0 = 16 pels one row  above edge
+    lvx     v2,  0, r3          ;# v2 = Q0 = 16 pels one row  below edge
+    lvx     v3, r4, r3          ;# v3 = Q1 = 16 pels two rows below edge
+
+    vp8_simple_filter
+
+    stvx    v1, r5, r3          ;# store P0
+    stvx    v2,  0, r3          ;# store Q0
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+.macro RLV Offs
+    stw     r0, (\Offs*4)(r5)
+    lwzux   r0, r7, r4
+.endm
+
+.macro WLV Offs
+    lwz     r0, (\Offs*4)(r5)
+    stwux   r0, r7, r4
+.endm
+
+    .align 2
+;#  r3 unsigned char *s
+;#  r4 int p
+;#  r5 const signed char *flimit
+loop_filter_simple_vertical_edge_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xc000
+    mtspr   256, r12            ;# set VRSAVE
+
+    ;# build constants
+    lvx     v8, 0, r5           ;# flimit
+
+    vspltisb v11, 8
+    vspltisb v12, 4
+    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
+
+    la r5, -96(r1)              ;# temporary space for reading in vectors
+
+    ;# Store 4 pels at word "Offs" in temp array, then advance r7
+    ;#   to next row and read another 4 pels from the frame buffer.
+
+    subi    r7, r3,  2          ;# r7 -> 2 pels before start
+    lwzx    r0,  0, r7          ;# read first 4 pels
+
+    ;# 16 unaligned word accesses
+    RLV 0
+    RLV 4
+    RLV 8
+    RLV 12
+    RLV 1
+    RLV 5
+    RLV 9
+    RLV 13
+    RLV 2
+    RLV 6
+    RLV 10
+    RLV 14
+    RLV 3
+    RLV 7
+    RLV 11
+
+    stw     r0, (15*4)(r5)      ;# write last 4 pels
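+
+    ;# Row k of the frame has landed at temp word (k%4)*4 + k/4, so the
+    ;# four vectors read by simple_vertical hold rows {0,4,8,12},
+    ;# {1,5,9,13}, {2,6,10,14} and {3,7,11,15} -- the interleaved layout
+    ;# that Transpose4times4x4 expects.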
+
+    simple_vertical
+
+    ;# Read temp array, write frame buffer.
+    subi    r7, r3,  2          ;# r7 -> 2 pels before start
+    lwzx    r0,  0, r5          ;# read/write first 4 pels
+    stwx    r0,  0, r7
+
+    WLV 4
+    WLV 8
+    WLV 12
+    WLV 1
+    WLV 5
+    WLV 9
+    WLV 13
+    WLV 2
+    WLV 6
+    WLV 10
+    WLV 14
+    WLV 3
+    WLV 7
+    WLV 11
+    WLV 15
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .data
+
+_chromaSelectors:
+    .long   _B_hihi
+    .long   _B_Ures0
+    .long   _B_Vres0
+    .long   0
+    .long   _B_lolo
+    .long   _B_Ures8
+    .long   _B_Vres8
+    .long   0
+
+    .align 4
+_B_Vres8:
+    .byte   16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15
+
+    .align 4
+_B_Ures8:
+    .byte   16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7
+
+    .align 4
+_B_lolo:
+    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
+
+    .align 4
+_B_Vres0:
+    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
+
+    .align 4
+_B_Ures0:
+    .byte    0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31
+
+    .align 4
+_B_hihi:
+    .byte    0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
--- /dev/null
+++ b/vp9/common/ppc/platform_altivec.asm
@@ -1,0 +1,59 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl save_platform_context
+    .globl restore_platform_context
+
+.macro W V P
+    stvx    \V,  0, \P
+    addi    \P, \P, 16
+.endm
+
+.macro R V P
+    lvx     \V,  0, \P
+    addi    \P, \P, 16
+.endm
+
+;# r3 context_ptr
+    .align 2
+save_platform_context:
+    W v20, r3
+    W v21, r3
+    W v22, r3
+    W v23, r3
+    W v24, r3
+    W v25, r3
+    W v26, r3
+    W v27, r3
+    W v28, r3
+    W v29, r3
+    W v30, r3
+    W v31, r3
+
+    blr
+
+;# r3 context_ptr
+    .align 2
+restore_platform_context:
+    R v20, r3
+    R v21, r3
+    R v22, r3
+    R v23, r3
+    R v24, r3
+    R v25, r3
+    R v26, r3
+    R v27, r3
+    R v28, r3
+    R v29, r3
+    R v30, r3
+    R v31, r3
+
+    blr
--- /dev/null
+++ b/vp9/common/ppc/recon_altivec.asm
@@ -1,0 +1,175 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl recon4b_ppc
+    .globl recon2b_ppc
+    .globl recon_b_ppc
+
+.macro row_of16 Diff Pred Dst Stride
+    lvx     v1,  0, \Pred           ;# v1 = pred = p0..p15
+    addi    \Pred, \Pred, 16        ;# next pred
+    vmrghb  v2, v0, v1              ;# v2 = 16-bit p0..p7
+    lvx     v3,  0, \Diff           ;# v3 = d0..d7
+    vaddshs v2, v2, v3              ;# v2 = r0..r7
+    vmrglb  v1, v0, v1              ;# v1 = 16-bit p8..p15
+    lvx     v3, r8, \Diff           ;# v3 = d8..d15
+    addi    \Diff, \Diff, 32        ;# next diff
+    vaddshs v3, v3, v1              ;# v3 = r8..r15
+    vpkshus v2, v2, v3              ;# v2 = 8-bit r0..r15
+    stvx    v2,  0, \Dst            ;# to dst
+    add     \Dst, \Dst, \Stride     ;# next dst
+.endm
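+
+;# Per row this is dst[i] = clip_to_u8(pred[i] + diff[i]) for i = 0..15:
+;# bytes are widened to 16 bits (merged with the zero vector v0, cleared
+;# by the caller), added to the 16-bit residual, then packed back down
+;# with unsigned saturation.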
+
+    .text
+    .align 2
+;#  r3 = short *diff_ptr,
+;#  r4 = unsigned char *pred_ptr,
+;#  r5 = unsigned char *dst_ptr,
+;#  r6 = int stride
+recon4b_ppc:
+    mfspr   r0, 256                     ;# get old VRSAVE
+    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
+    oris    r0, r0, 0xf000
+    mtspr   256,r0                      ;# set VRSAVE
+
+    vxor    v0, v0, v0
+    li      r8, 16
+
+    row_of16 r3, r4, r5, r6
+    row_of16 r3, r4, r5, r6
+    row_of16 r3, r4, r5, r6
+    row_of16 r3, r4, r5, r6
+
+    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
+    mtspr   256, r12                    ;# reset old VRSAVE
+
+    blr
+
+.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
+    lvx     v1,  0, \Pred       ;# v1 = pred = p0..p15
+    vmrghb  v2, v0, v1          ;# v2 = 16-bit p0..p7
+    lvx     v3,  0, \Diff       ;# v3 = d0..d7
+    vaddshs v2, v2, v3          ;# v2 = r0..r7
+    vmrglb  v1, v0, v1          ;# v1 = 16-bit p8..p15
+    lvx     v3, r8, \Diff       ;# v3 = d8..d15
+    vaddshs v3, v3, v1          ;# v3 = r8..r15
+    vpkshus v2, v2, v3          ;# v2 = 8-bit r0..r15
+    stvx    v2,  0, r10         ;# 2 rows to dst from buf
+    lwz     r0, 0(r10)
+.if \write_first_four_pels
+    stw     r0, 0(\Dst)
+    .else
+    stwux   r0, \Dst, \Stride
+.endif
+    lwz     r0, 4(r10)
+    stw     r0, 4(\Dst)
+    lwz     r0, 8(r10)
+    stwux   r0, \Dst, \Stride       ;# advance dst to next row
+    lwz     r0, 12(r10)
+    stw     r0, 4(\Dst)
+.endm
+
+    .align 2
+;#  r3 = short *diff_ptr,
+;#  r4 = unsigned char *pred_ptr,
+;#  r5 = unsigned char *dst_ptr,
+;#  r6 = int stride
+
+recon2b_ppc:
+    mfspr   r0, 256                     ;# get old VRSAVE
+    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
+    oris    r0, r0, 0xf000
+    mtspr   256,r0                      ;# set VRSAVE
+
+    vxor    v0, v0, v0
+    li      r8, 16
+
+    la      r10, -48(r1)                ;# buf
+
+    two_rows_of8 r3, r4, r5, r6, 1
+
+    addi    r4, r4, 16                  ;# next pred
+    addi    r3, r3, 32                  ;# next diff
+
+    two_rows_of8 r3, r4, r5, r6, 0
+
+    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
+    mtspr   256, r12                    ;# reset old VRSAVE
+
+    blr
+
+.macro get_two_diff_rows
+    stw     r0, 0(r10)
+    lwz     r0, 4(r3)
+    stw     r0, 4(r10)
+    lwzu    r0, 32(r3)
+    stw     r0, 8(r10)
+    lwz     r0, 4(r3)
+    stw     r0, 12(r10)
+    lvx     v3, 0, r10
+.endm
+
+    .align 2
+;#  r3 = short *diff_ptr,
+;#  r4 = unsigned char *pred_ptr,
+;#  r5 = unsigned char *dst_ptr,
+;#  r6 = int stride
+recon_b_ppc:
+    mfspr   r0, 256                     ;# get old VRSAVE
+    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
+    oris    r0, r0, 0xf000
+    mtspr   256,r0                      ;# set VRSAVE
+
+    vxor    v0, v0, v0
+
+    la      r10, -48(r1)    ;# buf
+
+    lwz     r0, 0(r4)
+    stw     r0, 0(r10)
+    lwz     r0, 16(r4)
+    stw     r0, 4(r10)
+    lwz     r0, 32(r4)
+    stw     r0, 8(r10)
+    lwz     r0, 48(r4)
+    stw     r0, 12(r10)
+
+    lvx     v1,  0, r10     ;# v1 = pred = p0..p15
+
+    lwz r0, 0(r3)           ;# v3 = d0..d7
+
+    get_two_diff_rows
+
+    vmrghb  v2, v0, v1      ;# v2 = 16-bit p0..p7
+    vaddshs v2, v2, v3      ;# v2 = r0..r7
+
+    lwzu r0, 32(r3)         ;# v3 = d8..d15
+
+    get_two_diff_rows
+
+    vmrglb  v1, v0, v1      ;# v1 = 16-bit p8..p15
+    vaddshs v3, v3, v1      ;# v3 = r8..r15
+
+    vpkshus v2, v2, v3      ;# v2 = 8-bit r0..r15
+    stvx    v2,  0, r10     ;# 16 pels to dst from buf
+
+    lwz     r0, 0(r10)
+    stw     r0, 0(r5)
+    lwz     r0, 4(r10)
+    stwux   r0, r5, r6
+    lwz     r0, 8(r10)
+    stwux   r0, r5, r6
+    lwz     r0, 12(r10)
+    stwx    r0, r5, r6
+
+    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
+    mtspr   256, r12                    ;# reset old VRSAVE
+
+    blr
--- /dev/null
+++ b/vp9/common/ppc/systemdependent.c
@@ -1,0 +1,167 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "subpixel.h"
+#include "loopfilter.h"
+#include "recon.h"
+#include "idct.h"
+#include "onyxc_int.h"
+
+void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
+void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
+void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
+
+extern void (*vp9_post_proc_down_and_across)(
+  unsigned char *src_ptr,
+  unsigned char *dst_ptr,
+  int src_pixels_per_line,
+  int dst_pixels_per_line,
+  int rows,
+  int cols,
+  int flimit
+);
+
+extern void (*vp9_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
+extern void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit);
+extern void (*vp9_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
+extern void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
+
+extern void vp9_post_proc_down_and_across_c
+(
+  unsigned char *src_ptr,
+  unsigned char *dst_ptr,
+  int src_pixels_per_line,
+  int dst_pixels_per_line,
+  int rows,
+  int cols,
+  int flimit
+);
+void vp9_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
+
+extern copy_mem_block_function *vp9_copy_mem16x16;
+extern copy_mem_block_function *vp9_copy_mem8x8;
+extern copy_mem_block_function *vp9_copy_mem8x4;
+
+// PPC
+extern subpixel_predict_function sixtap_predict_ppc;
+extern subpixel_predict_function sixtap_predict8x4_ppc;
+extern subpixel_predict_function sixtap_predict8x8_ppc;
+extern subpixel_predict_function sixtap_predict16x16_ppc;
+extern subpixel_predict_function bilinear_predict4x4_ppc;
+extern subpixel_predict_function bilinear_predict8x4_ppc;
+extern subpixel_predict_function bilinear_predict8x8_ppc;
+extern subpixel_predict_function bilinear_predict16x16_ppc;
+
+extern copy_mem_block_function copy_mem16x16_ppc;
+
+void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+
+extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);
+
+// Generic C
+extern subpixel_predict_function vp9_sixtap_predict_c;
+extern subpixel_predict_function vp9_sixtap_predict8x4_c;
+extern subpixel_predict_function vp9_sixtap_predict8x8_c;
+extern subpixel_predict_function vp9_sixtap_predict16x16_c;
+extern subpixel_predict_function vp9_bilinear_predict4x4_c;
+extern subpixel_predict_function vp9_bilinear_predict8x4_c;
+extern subpixel_predict_function vp9_bilinear_predict8x8_c;
+extern subpixel_predict_function vp9_bilinear_predict16x16_c;
+
+extern copy_mem_block_function vp9_copy_mem16x16_c;
+extern copy_mem_block_function vp9_copy_mem8x8_c;
+extern copy_mem_block_function vp9_copy_mem8x4_c;
+
+void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+
+extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch);
+extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
+
+// PPC
+extern loop_filter_block_function loop_filter_mbv_ppc;
+extern loop_filter_block_function loop_filter_bv_ppc;
+extern loop_filter_block_function loop_filter_mbh_ppc;
+extern loop_filter_block_function loop_filter_bh_ppc;
+
+extern loop_filter_block_function loop_filter_mbvs_ppc;
+extern loop_filter_block_function loop_filter_bvs_ppc;
+extern loop_filter_block_function loop_filter_mbhs_ppc;
+extern loop_filter_block_function loop_filter_bhs_ppc;
+
+// Generic C
+extern loop_filter_block_function vp9_loop_filter_mbv_c;
+extern loop_filter_block_function vp9_loop_filter_bv_c;
+extern loop_filter_block_function vp9_loop_filter_mbh_c;
+extern loop_filter_block_function vp9_loop_filter_bh_c;
+
+extern loop_filter_block_function vp9_loop_filter_mbvs_c;
+extern loop_filter_block_function vp9_loop_filter_bvs_c;
+extern loop_filter_block_function vp9_loop_filter_mbhs_c;
+extern loop_filter_block_function vp9_loop_filter_bhs_c;
+
+extern loop_filter_block_function *vp8_lf_mbvfull;
+extern loop_filter_block_function *vp8_lf_mbhfull;
+extern loop_filter_block_function *vp8_lf_bvfull;
+extern loop_filter_block_function *vp8_lf_bhfull;
+
+extern loop_filter_block_function *vp8_lf_mbvsimple;
+extern loop_filter_block_function *vp8_lf_mbhsimple;
+extern loop_filter_block_function *vp8_lf_bvsimple;
+extern loop_filter_block_function *vp8_lf_bhsimple;
+
+void vp9_clear_c(void) {
+}
+
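+// RTCD-style dispatch: every function pointer above is bound exactly once
+// here (presumably at codec initialization) rather than per call.  PPC
+// AltiVec routines are plugged in where they exist; the rest fall back to
+// the generic C versions.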
+void vp9_machine_specific_config(void) {
+  // Pure C:
+  vp9_clear_system_state        = vp9_clear_c;
+  vp9_recon_b                   = vp9_recon_b_c;
+  vp9_recon4b                   = vp9_recon4b_c;
+  vp9_recon2b                   = vp9_recon2b_c;
+
+  vp9_bilinear_predict16x16     = bilinear_predict16x16_ppc;
+  vp9_bilinear_predict8x8       = bilinear_predict8x8_ppc;
+  vp9_bilinear_predict8x4       = bilinear_predict8x4_ppc;
+  vp8_bilinear_predict          = bilinear_predict4x4_ppc;
+
+  vp9_sixtap_predict16x16       = sixtap_predict16x16_ppc;
+  vp9_sixtap_predict8x8         = sixtap_predict8x8_ppc;
+  vp9_sixtap_predict8x4         = sixtap_predict8x4_ppc;
+  vp9_sixtap_predict            = sixtap_predict_ppc;
+
+  vp8_short_idct4x4_1           = vp9_short_idct4x4llm_1_c;
+  vp8_short_idct4x4             = short_idct4x4llm_ppc;
+  vp8_dc_only_idct              = vp8_dc_only_idct_c;
+
+  vp8_lf_mbvfull                = loop_filter_mbv_ppc;
+  vp8_lf_bvfull                 = loop_filter_bv_ppc;
+  vp8_lf_mbhfull                = loop_filter_mbh_ppc;
+  vp8_lf_bhfull                 = loop_filter_bh_ppc;
+
+  vp8_lf_mbvsimple              = loop_filter_mbvs_ppc;
+  vp8_lf_bvsimple               = loop_filter_bvs_ppc;
+  vp8_lf_mbhsimple              = loop_filter_mbhs_ppc;
+  vp8_lf_bhsimple               = loop_filter_bhs_ppc;
+
+  vp9_post_proc_down_and_across = vp9_post_proc_down_and_across_c;
+  vp9_mbpost_proc_down          = vp9_mbpost_proc_down_c;
+  vp9_mbpost_proc_across_ip     = vp9_mbpost_proc_across_ip_c;
+  vp9_plane_add_noise           = vp9_plane_add_noise_c;
+
+  vp9_copy_mem16x16             = copy_mem16x16_ppc;
+  vp9_copy_mem8x8               = vp9_copy_mem8x8_c;
+  vp9_copy_mem8x4               = vp9_copy_mem8x4_c;
+}
--- /dev/null
+++ b/vp9/common/ppflags.h
@@ -1,0 +1,38 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PPFLAGS_H
+#define __INC_PPFLAGS_H
+enum {
+  VP9D_NOFILTERING            = 0,
+  VP9D_DEBLOCK                = 1 << 0,
+  VP9D_DEMACROBLOCK           = 1 << 1,
+  VP9D_ADDNOISE               = 1 << 2,
+  VP9D_DEBUG_TXT_FRAME_INFO   = 1 << 3,
+  VP9D_DEBUG_TXT_MBLK_MODES   = 1 << 4,
+  VP9D_DEBUG_TXT_DC_DIFF      = 1 << 5,
+  VP9D_DEBUG_TXT_RATE_INFO    = 1 << 6,
+  VP9D_DEBUG_DRAW_MV          = 1 << 7,
+  VP9D_DEBUG_CLR_BLK_MODES    = 1 << 8,
+  VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9
+};
+
+typedef struct {
+  int post_proc_flag;
+  int deblocking_level;
+  int noise_level;
+  int display_ref_frame_flag;
+  int display_mb_modes_flag;
+  int display_b_modes_flag;
+  int display_mv_flag;
+} vp9_ppflags_t;
+
+#endif
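
Because each VP9D_* constant occupies its own bit, callers compose a post-processing request by OR-ing flags into post_proc_flag, and the decoder tests them with a mask. A hedged usage sketch with local stand-in names (the real enum is the one in this header):

    #include <stdio.h>

    enum { DEBLOCK = 1 << 0, DEMACROBLOCK = 1 << 1, ADDNOISE = 1 << 2 };

    int main(void) {
      int post_proc_flag = DEBLOCK | ADDNOISE;      /* request two passes */

      if (post_proc_flag & DEBLOCK)
        printf("deblocking requested\n");
      if (!(post_proc_flag & DEMACROBLOCK))
        printf("demacroblocking not requested\n");
      return 0;
    }
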
--- /dev/null
+++ b/vp9/common/pragmas.h
@@ -1,0 +1,19 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifdef __INTEL_COMPILER
+#pragma warning(disable:997 1011 170)
+#endif
+#ifdef _MSC_VER
+#pragma warning(disable:4799)
+#endif
--- /dev/null
+++ b/vp9/common/pred_common.c
@@ -1,0 +1,463 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/pred_common.h"
+#include "vp9/common/seg_common.h"
+
+// TBD prediction functions for various bitstream signals
+
+// Returns a context number for the given MB prediction signal
+unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
+                                   const MACROBLOCKD *const xd,
+                                   PRED_ID pred_id) {
+  int pred_context;
+  MODE_INFO *m = xd->mode_info_context;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialised to 0.
+  switch (pred_id) {
+    case PRED_SEG_ID:
+      pred_context = (m - 1)->mbmi.seg_id_predicted +
+                     (m - cm->mode_info_stride)->mbmi.seg_id_predicted;
+      break;
+
+
+    case PRED_REF:
+      pred_context = (m - 1)->mbmi.ref_predicted +
+                     (m - cm->mode_info_stride)->mbmi.ref_predicted;
+      break;
+
+    case PRED_COMP:
+      // Context based on use of comp pred flag by neighbours
+      // pred_context =
+      //   ((m - 1)->mbmi.second_ref_frame != INTRA_FRAME) +
+      //    ((m - cm->mode_info_stride)->mbmi.second_ref_frame != INTRA_FRAME);
+
+      // Context based on mode and reference frame
+      // if ( m->mbmi.ref_frame == LAST_FRAME )
+      //    pred_context = 0 + (m->mbmi.mode != ZEROMV);
+      // else if ( m->mbmi.ref_frame == GOLDEN_FRAME )
+      //    pred_context = 2 + (m->mbmi.mode != ZEROMV);
+      // else
+      //    pred_context = 4 + (m->mbmi.mode != ZEROMV);
+
+      if (m->mbmi.ref_frame == LAST_FRAME)
+        pred_context = 0;
+      else
+        pred_context = 1;
+
+      break;
+
+    case PRED_MBSKIP:
+      pred_context = (m - 1)->mbmi.mb_skip_coeff +
+                     (m - cm->mode_info_stride)->mbmi.mb_skip_coeff;
+      break;
+
+    case PRED_SWITCHABLE_INTERP:
+      {
+        int left_in_image = (m - 1)->mbmi.mb_in_image;
+        int above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
+        int left_mode = (m - 1)->mbmi.mode;
+        int above_mode = (m - cm->mode_info_stride)->mbmi.mode;
+        int left_interp, above_interp;
+        if (left_in_image && left_mode >= NEARESTMV && left_mode <= SPLITMV)
+          left_interp = vp9_switchable_interp_map[(m - 1)->mbmi.interp_filter];
+        else
+          left_interp = VP9_SWITCHABLE_FILTERS;
+        if (above_in_image && above_mode >= NEARESTMV && above_mode <= SPLITMV)
+          above_interp = vp9_switchable_interp_map[
+              (m - cm->mode_info_stride)->mbmi.interp_filter];
+        else
+          above_interp = VP9_SWITCHABLE_FILTERS;
+
+        if (left_interp == above_interp)
+          pred_context = left_interp;
+        else if (left_interp == VP9_SWITCHABLE_FILTERS &&
+                 above_interp != VP9_SWITCHABLE_FILTERS)
+          pred_context = above_interp;
+        else if (left_interp != VP9_SWITCHABLE_FILTERS &&
+                 above_interp == VP9_SWITCHABLE_FILTERS)
+          pred_context = left_interp;
+        else
+          pred_context = VP9_SWITCHABLE_FILTERS;
+      }
+      break;
+
+    default:
+      // TODO *** add error trap code.
+      pred_context = 0;
+      break;
+  }
+
+  return pred_context;
+}
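
For the flag-type cases above (PRED_SEG_ID, PRED_REF, PRED_MBSKIP) the context is simply the sum of the left and above neighbours' flags, so it takes the values 0, 1 or 2; the zero-initialised one-entry border guarantees the neighbour reads stay in bounds. A toy model of that rule (names illustrative, not the library's):

    #include <assert.h>

    /* flags[] models per-MB flags stored with a zeroed one-entry border,
     * so the left/above reads below never leave the array. */
    static int pred_context(const unsigned char *flags, int stride, int pos) {
      return flags[pos - 1] + flags[pos - stride];  /* 0, 1 or 2 */
    }

    int main(void) {
      unsigned char flags[3 * 3] = {0};  /* row 0 and column 0 are the border */
      flags[1 * 3 + 1] = 1;              /* left neighbour of entry (1,2) */
      assert(pred_context(flags, 3, 1 * 3 + 2) == 1);
      return 0;
    }
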
+
+// This function returns a context probability for coding a given
+// prediction signal
+vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
+                          const MACROBLOCKD *const xd,
+                          PRED_ID pred_id) {
+  vp9_prob pred_probability;
+  int pred_context;
+
+  // Get the appropriate prediction context
+  pred_context = vp9_get_pred_context(cm, xd, pred_id);
+
+  switch (pred_id) {
+    case PRED_SEG_ID:
+      pred_probability = cm->segment_pred_probs[pred_context];
+      break;
+
+    case PRED_REF:
+      pred_probability = cm->ref_pred_probs[pred_context];
+      break;
+
+    case PRED_COMP:
+      // In keeping with convention elsewhere the probability returned is
+      // the probability of a "0" outcome which in this case means the
+      // probability of comp pred off.
+      pred_probability = cm->prob_comppred[pred_context];
+      break;
+
+    case PRED_MBSKIP:
+      pred_probability = cm->mbskip_pred_probs[pred_context];
+      break;
+
+    default:
+      // TODO *** add error trap code.
+      pred_probability = 128;
+      break;
+  }
+
+  return pred_probability;
+}
+
+// This function returns a context probability ptr for coding a given
+// prediction signal
+const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
+                                   const MACROBLOCKD *const xd,
+                                   PRED_ID pred_id) {
+  const vp9_prob *pred_probability;
+  int pred_context;
+
+  // Get the appropriate prediction context
+  pred_context = vp9_get_pred_context(cm, xd, pred_id);
+
+  switch (pred_id) {
+    case PRED_SEG_ID:
+      pred_probability = &cm->segment_pred_probs[pred_context];
+      break;
+
+    case PRED_REF:
+      pred_probability = &cm->ref_pred_probs[pred_context];
+      break;
+
+    case PRED_COMP:
+      // In keeping with convention elsewhere the probability returned is
+      // the probability of a "0" outcome which in this case means the
+      // probability of comp pred off.
+      pred_probability = &cm->prob_comppred[pred_context];
+      break;
+
+    case PRED_MBSKIP:
+      pred_probability = &cm->mbskip_pred_probs[pred_context];
+      break;
+
+    case PRED_SWITCHABLE_INTERP:
+      pred_probability = &cm->fc.switchable_interp_prob[pred_context][0];
+      break;
+
+    default:
+      // TODO *** add error trap code.
+      pred_probability = NULL;
+      break;
+  }
+
+  return pred_probability;
+}
+
+// This function returns the status of the given prediction signal,
+// i.e. whether the predicted value for the given signal is correct.
+unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
+                                PRED_ID pred_id) {
+  unsigned char pred_flag = 0;
+
+  switch (pred_id) {
+    case PRED_SEG_ID:
+      pred_flag = xd->mode_info_context->mbmi.seg_id_predicted;
+      break;
+
+    case PRED_REF:
+      pred_flag = xd->mode_info_context->mbmi.ref_predicted;
+      break;
+
+    case PRED_MBSKIP:
+      pred_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
+      break;
+
+    default:
+      // TODO *** add error trap code.
+      pred_flag = 0;
+      break;
+  }
+
+  return pred_flag;
+}
+
+// This function sets the status of the given prediction signal,
+// i.e. whether the predicted value for the given signal was correct.
+void vp9_set_pred_flag(MACROBLOCKD *const xd,
+                       PRED_ID pred_id,
+                       unsigned char pred_flag) {
+#if CONFIG_SUPERBLOCKS
+  const int mis = xd->mode_info_stride;
+#endif
+
+  switch (pred_id) {
+    case PRED_SEG_ID:
+      xd->mode_info_context->mbmi.seg_id_predicted = pred_flag;
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        if (xd->mb_to_right_edge > 0)
+          xd->mode_info_context[1].mbmi.seg_id_predicted = pred_flag;
+        if (xd->mb_to_bottom_edge > 0) {
+          xd->mode_info_context[mis].mbmi.seg_id_predicted = pred_flag;
+          if (xd->mb_to_right_edge > 0)
+            xd->mode_info_context[mis + 1].mbmi.seg_id_predicted = pred_flag;
+        }
+      }
+#endif
+      break;
+
+    case PRED_REF:
+      xd->mode_info_context->mbmi.ref_predicted = pred_flag;
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        if (xd->mb_to_right_edge > 0)
+          xd->mode_info_context[1].mbmi.ref_predicted = pred_flag;
+        if (xd->mb_to_bottom_edge > 0) {
+          xd->mode_info_context[mis].mbmi.ref_predicted = pred_flag;
+          if (xd->mb_to_right_edge > 0)
+            xd->mode_info_context[mis + 1].mbmi.ref_predicted = pred_flag;
+        }
+      }
+#endif
+      break;
+
+    case PRED_MBSKIP:
+      xd->mode_info_context->mbmi.mb_skip_coeff = pred_flag;
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        if (xd->mb_to_right_edge > 0)
+          xd->mode_info_context[1].mbmi.mb_skip_coeff = pred_flag;
+        if (xd->mb_to_bottom_edge > 0) {
+          xd->mode_info_context[mis].mbmi.mb_skip_coeff = pred_flag;
+          if (xd->mb_to_right_edge > 0)
+            xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = pred_flag;
+        }
+      }
+#endif
+      break;
+
+    default:
+      // TODO *** add error trap code.
+      break;
+  }
+}
+
+
+// The following contains the guts of the prediction code used to
+// predict various bitstream signals.
+
+// Macroblock segment id prediction function
+unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
+                                    const MACROBLOCKD *const xd, int MbIndex) {
+  // Currently the prediction for the macroblock segment ID is
+  // the value stored for this macroblock in the previous frame.
+#if CONFIG_SUPERBLOCKS
+  if (!xd->mode_info_context->mbmi.encoded_as_sb) {
+#endif
+    return cm->last_frame_seg_map[MbIndex];
+#if CONFIG_SUPERBLOCKS
+  } else {
+    int seg_id = cm->last_frame_seg_map[MbIndex];
+    int mb_col = MbIndex % cm->mb_cols;
+    int mb_row = MbIndex / cm->mb_cols;
+    if (mb_col + 1 < cm->mb_cols)
+      seg_id = seg_id && cm->last_frame_seg_map[MbIndex + 1];
+    if (mb_row + 1 < cm->mb_rows) {
+      seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols];
+      if (mb_col + 1 < cm->mb_cols)
+        seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols + 1];
+    }
+    return seg_id;
+  }
+#endif
+}
+
+MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
+                                    const MACROBLOCKD *const xd) {
+  MODE_INFO *m = xd->mode_info_context;
+
+  MV_REFERENCE_FRAME left;
+  MV_REFERENCE_FRAME above;
+  MV_REFERENCE_FRAME above_left;
+  MV_REFERENCE_FRAME pred_ref = LAST_FRAME;
+
+  int segment_id = xd->mode_info_context->mbmi.segment_id;
+  int seg_ref_active;
+  int i;
+
+  unsigned char frame_allowed[MAX_REF_FRAMES] = {1, 1, 1, 1};
+  unsigned char ref_score[MAX_REF_FRAMES];
+  unsigned char best_score = 0;
+  unsigned char left_in_image;
+  unsigned char above_in_image;
+  unsigned char above_left_in_image;
+
+  // Is segment coding enabled?
+  seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
+
+  // Special case treatment if segment coding is enabled.
+  // Don't allow prediction of a reference frame that the segment
+  // does not allow
+  if (seg_ref_active) {
+    for (i = 0; i < MAX_REF_FRAMES; i++) {
+      frame_allowed[i] =
+        vp9_check_segref(xd, segment_id, i);
+
+      // Score set to 0 if ref frame not allowed
+      ref_score[i] = cm->ref_scores[i] * frame_allowed[i];
+    }
+  } else
+    vpx_memcpy(ref_score, cm->ref_scores, sizeof(ref_score));
+
+  // Reference frames used by neighbours
+  left = (m - 1)->mbmi.ref_frame;
+  above = (m - cm->mode_info_stride)->mbmi.ref_frame;
+  above_left = (m - 1 - cm->mode_info_stride)->mbmi.ref_frame;
+
+  // Are neighbours in image
+  left_in_image = (m - 1)->mbmi.mb_in_image;
+  above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
+  above_left_in_image = (m - 1 - cm->mode_info_stride)->mbmi.mb_in_image;
+
+  // Adjust scores for candidate reference frames based on neighbours
+  if (frame_allowed[left] && left_in_image) {
+    ref_score[left] += 16;
+    if (above_left_in_image && (left == above_left))
+      ref_score[left] += 4;
+  }
+  if (frame_allowed[above] && above_in_image) {
+    ref_score[above] += 16;
+    if (above_left_in_image && (above == above_left))
+      ref_score[above] += 4;
+  }
+
+  // Now choose the candidate with the highest score
+  for (i = 0; i < MAX_REF_FRAMES; i++) {
+    if (ref_score[i] > best_score) {
+      pred_ref = i;
+      best_score = ref_score[i];
+    }
+  }
+
+  return pred_ref;
+}
+
+// Functions to compute a set of modified reference frame probabilities
+// to use when prediction of the reference frame value fails.
+void vp9_calc_ref_probs(int *count, vp9_prob *probs) {
+  int tot_count;
+
+  tot_count = count[0] + count[1] + count[2] + count[3];
+  if (tot_count) {
+    probs[0] = (vp9_prob)((count[0] * 255 + (tot_count >> 1)) / tot_count);
+    probs[0] += !probs[0];
+  } else
+    probs[0] = 128;
+
+  tot_count -= count[0];
+  if (tot_count) {
+    probs[1] = (vp9_prob)((count[1] * 255 + (tot_count >> 1)) / tot_count);
+    probs[1] += !probs[1];
+  } else
+    probs[1] = 128;
+
+  tot_count -= count[1];
+  if (tot_count) {
+    probs[2] = (vp9_prob)((count[2] * 255 + (tot_count >> 1)) / tot_count);
+    probs[2] += !probs[2];
+  } else
+    probs[2] = 128;
+
+}
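
Each probs[i] above is the i-th count scaled to 1..255 against the as-yet-unassigned remainder, rounded to nearest and bumped away from zero (a zero probability is illegal for the arithmetic coder). A standalone rerun of the same arithmetic on hypothetical counts:

    #include <stdio.h>

    int main(void) {
      int count[4] = {60, 120, 40, 35};   /* hypothetical frequencies */
      unsigned char probs[3];
      int tot = count[0] + count[1] + count[2] + count[3];
      int i;

      for (i = 0; i < 3; i++) {
        probs[i] = tot ? (count[i] * 255 + (tot >> 1)) / tot : 128;
        probs[i] += !probs[i];   /* probabilities must stay nonzero */
        tot -= count[i];         /* condition the next branch on "not i" */
      }
      printf("%d %d %d\n", probs[0], probs[1], probs[2]);  /* 60 157 136 */
      return 0;
    }
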
+
+// Computes a set of modified conditional probabilities for the reference
+// frame. Values will be set to 0 for reference frame options that are not
+// possible, either because they were predicted and prediction failed, or
+// because they are not allowed for a given segment.
+void vp9_compute_mod_refprobs(VP9_COMMON *const cm) {
+  int norm_cnt[MAX_REF_FRAMES];
+  int intra_count;
+  int inter_count;
+  int last_count;
+  int gfarf_count;
+  int gf_count;
+  int arf_count;
+
+  intra_count = cm->prob_intra_coded;
+  inter_count = (255 - intra_count);
+  last_count = (inter_count * cm->prob_last_coded) / 255;
+  gfarf_count = inter_count - last_count;
+  gf_count = (gfarf_count * cm->prob_gf_coded) / 255;
+  arf_count = gfarf_count - gf_count;
+
+  // Work out modified reference frame probabilities to use where prediction
+  // of the reference frame fails
+  norm_cnt[0] = 0;
+  norm_cnt[1] = last_count;
+  norm_cnt[2] = gf_count;
+  norm_cnt[3] = arf_count;
+  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[INTRA_FRAME]);
+  cm->mod_refprobs[INTRA_FRAME][0] = 0;    // This branch implicit
+
+  norm_cnt[0] = intra_count;
+  norm_cnt[1] = 0;
+  norm_cnt[2] = gf_count;
+  norm_cnt[3] = arf_count;
+  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[LAST_FRAME]);
+  cm->mod_refprobs[LAST_FRAME][1] = 0;    // This branch implicit
+
+  norm_cnt[0] = intra_count;
+  norm_cnt[1] = last_count;
+  norm_cnt[2] = 0;
+  norm_cnt[3] = arf_count;
+  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[GOLDEN_FRAME]);
+  cm->mod_refprobs[GOLDEN_FRAME][2] = 0;  // This branch implicit
+
+  norm_cnt[0] = intra_count;
+  norm_cnt[1] = last_count;
+  norm_cnt[2] = gf_count;
+  norm_cnt[3] = 0;
+  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[ALTREF_FRAME]);
+  cm->mod_refprobs[ALTREF_FRAME][2] = 0;  // This branch implicit
+
+  // Score the reference frames based on overall frequency.
+  // These scores contribute to the prediction choices.
+  // Max score 17, min 1.
+  cm->ref_scores[INTRA_FRAME] = 1 + (intra_count * 16 / 255);
+  cm->ref_scores[LAST_FRAME] = 1 + (last_count * 16 / 255);
+  cm->ref_scores[GOLDEN_FRAME] = 1 + (gf_count * 16 / 255);
+  cm->ref_scores[ALTREF_FRAME] = 1 + (arf_count * 16 / 255);
+}
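
Putting the last two functions together: vp9_get_pred_ref() starts each allowed frame at its frequency score (1..17, computed above), adds 16 for each in-image neighbour coded from that frame plus 4 when that neighbour agrees with the above-left one, and picks the highest total. A worked example with hypothetical scores:

    #include <stdio.h>

    int main(void) {
      int score[4] = {3, 9, 6, 2};   /* hypothetical intra/last/golden/altref */
      int left = 1, above = 2, above_left = 2;  /* neighbours' ref frames */
      int best = 0, i;

      score[left]  += 16;      /* left neighbour is in the image */
      score[above] += 16;      /* above neighbour is in the image */
      if (above == above_left)
        score[above] += 4;     /* agreement bonus with above-left */

      for (i = 1; i < 4; i++)
        if (score[i] > score[best])
          best = i;
      printf("predicted ref = %d\n", best);  /* 2 (golden): 6 + 16 + 4 = 26 */
      return 0;
    }
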
--- /dev/null
+++ b/vp9/common/pred_common.h
@@ -1,0 +1,56 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "type_aliases.h"
+#include "onyxc_int.h"
+#include "vp9/common/blockd.h"
+
+#ifndef __INC_PRED_COMMON_H__
+#define __INC_PRED_COMMON_H__ 1
+
+
+// Predicted items
+typedef enum {
+  PRED_SEG_ID = 0,               // Segment identifier
+  PRED_REF = 1,
+  PRED_COMP = 2,
+  PRED_MBSKIP = 3,
+  PRED_SWITCHABLE_INTERP = 4
+} PRED_ID;
+
+extern unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
+                                          const MACROBLOCKD *const xd,
+                                          PRED_ID pred_id);
+
+extern vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
+                                  const MACROBLOCKD *const xd,
+                                  PRED_ID pred_id);
+
+extern const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
+                                          const MACROBLOCKD *const xd,
+                                          PRED_ID pred_id);
+
+extern unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
+                                       PRED_ID pred_id);
+
+extern void vp9_set_pred_flag(MACROBLOCKD *const xd,
+                              PRED_ID pred_id,
+                              unsigned char pred_flag);
+
+
+extern unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
+                                           const MACROBLOCKD *const xd,
+                                           int MbIndex);
+
+extern MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
+                                           const MACROBLOCKD *const xd);
+extern void vp9_compute_mod_refprobs(VP9_COMMON *const cm);
+
+#endif /* __INC_PRED_COMMON_H__ */
--- /dev/null
+++ b/vp9/common/quant_common.c
@@ -1,0 +1,125 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "quant_common.h"
+
+static int dc_qlookup[QINDEX_RANGE];
+static int ac_qlookup[QINDEX_RANGE];
+
+#define ACDC_MIN 4
+
+void vp9_init_quant_tables(void) {
+  int i;
+  int current_val = 4;
+  int last_val = 4;
+  int ac_val;
+
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    ac_qlookup[i] = current_val;
+    current_val = (int)((double)current_val * 1.02);
+    if (current_val == last_val)
+      current_val++;
+    last_val = current_val;
+
+    ac_val = ac_qlookup[i];
+    dc_qlookup[i] = (0.000000305 * ac_val * ac_val * ac_val) +
+                    (-0.00065 * ac_val * ac_val) +
+                    (0.9 * ac_val) + 0.5;
+    if (dc_qlookup[i] < ACDC_MIN)
+      dc_qlookup[i] = ACDC_MIN;
+  }
+}
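
The AC step size above grows roughly 2% per Q index, with a +1 bump so the table stays strictly increasing even while integer truncation keeps small values flat, and the DC step is a cubic fit of the AC value clamped to ACDC_MIN. A standalone rerun of the construction (QINDEX_RANGE assumed to be 256 here purely for illustration):

    #include <stdio.h>

    #define Q_RANGE  256   /* assumption for this sketch */
    #define ACDC_MIN 4

    int main(void) {
      int ac[Q_RANGE], dc[Q_RANGE];
      int i, cur = 4;

      for (i = 0; i < Q_RANGE; i++) {
        ac[i] = cur;
        cur = (int)((double)cur * 1.02);
        if (cur == ac[i])
          cur++;                       /* force strict growth at small q */
        dc[i] = (int)(0.000000305 * ac[i] * ac[i] * ac[i]
                      - 0.00065 * ac[i] * ac[i] + 0.9 * ac[i] + 0.5);
        if (dc[i] < ACDC_MIN)
          dc[i] = ACDC_MIN;
      }
      printf("q=0:   ac=%d dc=%d\n", ac[0], dc[0]);
      printf("q=%d: ac=%d dc=%d\n", Q_RANGE - 1,
             ac[Q_RANGE - 1], dc[Q_RANGE - 1]);
      return 0;
    }
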
+
+int vp9_dc_quant(int QIndex, int Delta) {
+  int retval;
+
+  QIndex = QIndex + Delta;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = dc_qlookup[ QIndex ];
+  return retval;
+}
+
+int vp9_dc2quant(int QIndex, int Delta) {
+  int retval;
+
+  QIndex = QIndex + Delta;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = dc_qlookup[ QIndex ];
+
+  return retval;
+}
+
+int vp9_dc_uv_quant(int QIndex, int Delta) {
+  int retval;
+
+  QIndex = QIndex + Delta;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = dc_qlookup[ QIndex ];
+
+  return retval;
+}
+
+int vp9_ac_yquant(int QIndex) {
+  int retval;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = ac_qlookup[ QIndex ];
+  return retval;
+}
+
+int vp9_ac2quant(int QIndex, int Delta) {
+  int retval;
+
+  QIndex = QIndex + Delta;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = (ac_qlookup[ QIndex ] * 775) / 1000;
+  if (retval < 4)
+    retval = 4;
+
+  return retval;
+}
+
+int vp9_ac_uv_quant(int QIndex, int Delta) {
+  int retval;
+
+  QIndex = QIndex + Delta;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = ac_qlookup[ QIndex ];
+  return retval;
+}
--- /dev/null
+++ b/vp9/common/quant_common.h
@@ -1,0 +1,22 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_QUANT_COMMON_H__
+#define __INC_QUANT_COMMON_H__
+
+#include <string.h>
+#include "blockd.h"
+#include "onyxc_int.h"
+
+extern void vp9_init_quant_tables(void);
+extern int vp9_ac_yquant(int QIndex);
+extern int vp9_dc_quant(int QIndex, int Delta);
+extern int vp9_dc2quant(int QIndex, int Delta);
+extern int vp9_ac2quant(int QIndex, int Delta);
+extern int vp9_dc_uv_quant(int QIndex, int Delta);
+extern int vp9_ac_uv_quant(int QIndex, int Delta);
--- /dev/null
+++ b/vp9/common/recon.c
@@ -1,0 +1,197 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_rtcd.h"
+#include "blockd.h"
+
+void vp9_recon_b_c(unsigned char *pred_ptr,
+                   short *diff_ptr,
+                   unsigned char *dst_ptr,
+                   int stride) {
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = diff_ptr[c] + pred_ptr[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dst_ptr[c] = (unsigned char) a;
+    }
+
+    dst_ptr += stride;
+    diff_ptr += 16;
+    pred_ptr += 16;
+  }
+}
+
+void vp9_recon_uv_b_c(unsigned char *pred_ptr,
+                      short *diff_ptr,
+                      unsigned char *dst_ptr,
+                      int stride) {
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = diff_ptr[c] + pred_ptr[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dst_ptr[c] = (unsigned char) a;
+    }
+
+    dst_ptr += stride;
+    diff_ptr += 8;
+    pred_ptr += 8;
+  }
+}
+
+void vp9_recon4b_c(unsigned char *pred_ptr,
+                   short *diff_ptr,
+                   unsigned char *dst_ptr,
+                   int stride) {
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 16; c++) {
+      int a = diff_ptr[c] + pred_ptr[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dst_ptr[c] = (unsigned char) a;
+    }
+
+    dst_ptr += stride;
+    diff_ptr += 16;
+    pred_ptr += 16;
+  }
+}
+
+void vp9_recon2b_c(unsigned char *pred_ptr,
+                   short *diff_ptr,
+                   unsigned char *dst_ptr,
+                   int stride) {
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 8; c++) {
+      int a = diff_ptr[c] + pred_ptr[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dst_ptr[c] = (unsigned char) a;
+    }
+
+    dst_ptr += stride;
+    diff_ptr += 8;
+    pred_ptr += 8;
+  }
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_recon_mby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
+  int x, y;
+  BLOCKD *b = &xd->block[0];
+  int stride = b->dst_stride;
+  short *diff = b->diff;
+
+  for (y = 0; y < 16; y++) {
+    for (x = 0; x < 16; x++) {
+      int a = dst[x] + diff[x];
+      if (a < 0)
+        a = 0;
+      else if (a > 255)
+        a = 255;
+      dst[x] = a;
+    }
+    dst += stride;
+    diff += 16;
+  }
+}
+
+void vp9_recon_mbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
+  int x, y, i;
+  uint8_t *dst = udst;
+
+  for (i = 0; i < 2; i++, dst = vdst) {
+    BLOCKD *b = &xd->block[16 + 4 * i];
+    int stride = b->dst_stride;
+    short *diff = b->diff;
+
+    for (y = 0; y < 8; y++) {
+      for (x = 0; x < 8; x++) {
+        int a = dst[x] + diff[x];
+        if (a < 0)
+          a = 0;
+        else if (a > 255)
+          a = 255;
+        dst[x] = a;
+      }
+      dst += stride;
+      diff += 8;
+    }
+  }
+}
+#endif
+
+void vp9_recon_mby_c(MACROBLOCKD *xd) {
+  int i;
+
+  for (i = 0; i < 16; i += 4) {
+    BLOCKD *b = &xd->block[i];
+
+    vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+  }
+}
+
+void vp9_recon_mb_c(MACROBLOCKD *xd) {
+  int i;
+
+  for (i = 0; i < 16; i += 4) {
+    BLOCKD *b = &xd->block[i];
+
+    vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+  }
+
+  for (i = 16; i < 24; i += 2) {
+    BLOCKD *b = &xd->block[i];
+
+    vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+  }
+}
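
Every kernel in this file reduces to dst = clip(pred + diff) over some block shape, with the residual laid out at the predictor's stride. A compact generic sketch of that operation (hypothetical helper, not part of the codec):

    #include <stdio.h>

    static unsigned char clip255(int v) {
      return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    static void recon_block(const unsigned char *pred, const short *diff,
                            unsigned char *dst, int width, int height,
                            int pred_stride, int dst_stride) {
      int r, c;
      for (r = 0; r < height; r++) {
        for (c = 0; c < width; c++)
          dst[c] = clip255(pred[c] + diff[c]);
        pred += pred_stride;
        diff += pred_stride;  /* diff shares the predictor's stride here */
        dst  += dst_stride;
      }
    }

    int main(void) {
      unsigned char pred[16] = {250, 10}, dst[16];
      short diff[16] = {20, -30};
      recon_block(pred, diff, dst, 4, 4, 4, 4);
      printf("%d %d\n", dst[0], dst[1]);  /* 255 0: both ends clipped */
      return 0;
    }
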
--- /dev/null
+++ b/vp9/common/reconinter.c
@@ -1,0 +1,1145 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx/vpx_integer.h"
+#include "subpixel.h"
+#include "blockd.h"
+#include "reconinter.h"
+#if CONFIG_RUNTIME_CPU_DETECT
+#include "onyxc_int.h"
+#endif
+
+void vp9_setup_interp_filters(MACROBLOCKD *xd,
+                              INTERPOLATIONFILTERTYPE mcomp_filter_type,
+                              VP9_COMMON *cm) {
+  if (mcomp_filter_type == SIXTAP) {
+    xd->subpixel_predict        = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, sixtap4x4);
+    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, sixtap8x4);
+    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, sixtap8x8);
+    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, sixtap16x16);
+    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, sixtap_avg4x4);
+    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, sixtap_avg8x8);
+    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, sixtap_avg16x16);
+  } else if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {
+    xd->subpixel_predict        = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap4x4);
+    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap8x4);
+    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap8x8);
+    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap16x16);
+    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap_avg4x4);
+    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap_avg8x8);
+    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap_avg16x16);
+  } else if (mcomp_filter_type == EIGHTTAP_SHARP) {
+    xd->subpixel_predict        = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap4x4_sharp);
+    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap8x4_sharp);
+    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap8x8_sharp);
+    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap16x16_sharp);
+    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap_avg4x4_sharp);
+    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap_avg8x8_sharp);
+    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, eighttap_avg16x16_sharp);
+  } else {
+    xd->subpixel_predict        = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, bilinear4x4);
+    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, bilinear8x4);
+    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, bilinear8x8);
+    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, bilinear16x16);
+    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, bilinear_avg4x4);
+    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, bilinear_avg8x8);
+    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
+        &cm->rtcd.subpix, bilinear_avg16x16);
+  }
+}
+
+void vp9_copy_mem16x16_c(unsigned char *src,
+                         int src_stride,
+                         unsigned char *dst,
+                         int dst_stride) {
+  int r;
+
+  for (r = 0; r < 16; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
+    dst[4] = src[4];
+    dst[5] = src[5];
+    dst[6] = src[6];
+    dst[7] = src[7];
+    dst[8] = src[8];
+    dst[9] = src[9];
+    dst[10] = src[10];
+    dst[11] = src[11];
+    dst[12] = src[12];
+    dst[13] = src[13];
+    dst[14] = src[14];
+    dst[15] = src[15];
+
+#else
+    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
+    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+    ((uint32_t *)dst)[2] = ((uint32_t *)src)[2];
+    ((uint32_t *)dst)[3] = ((uint32_t *)src)[3];
+
+#endif
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_avg_mem16x16_c(unsigned char *src,
+                        int src_stride,
+                        unsigned char *dst,
+                        int dst_stride) {
+  int r;
+
+  for (r = 0; r < 16; r++) {
+    int n;
+
+    for (n = 0; n < 16; n++) {
+      dst[n] = (dst[n] + src[n] + 1) >> 1;
+    }
+
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_copy_mem8x8_c(unsigned char *src,
+                       int src_stride,
+                       unsigned char *dst,
+                       int dst_stride) {
+  int r;
+
+  for (r = 0; r < 8; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
+    dst[4] = src[4];
+    dst[5] = src[5];
+    dst[6] = src[6];
+    dst[7] = src[7];
+#else
+    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
+    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+#endif
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_avg_mem8x8_c(unsigned char *src,
+                      int src_stride,
+                      unsigned char *dst,
+                      int dst_stride) {
+  int r;
+
+  for (r = 0; r < 8; r++) {
+    int n;
+
+    for (n = 0; n < 8; n++) {
+      dst[n] = (dst[n] + src[n] + 1) >> 1;
+    }
+
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_copy_mem8x4_c(unsigned char *src,
+                       int src_stride,
+                       unsigned char *dst,
+                       int dst_stride) {
+  int r;
+
+  for (r = 0; r < 4; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
+    dst[4] = src[4];
+    dst[5] = src[5];
+    dst[6] = src[6];
+    dst[7] = src[7];
+#else
+    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
+    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+#endif
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
+  int r;
+  unsigned char *ptr_base;
+  unsigned char *ptr;
+  unsigned char *pred_ptr = d->predictor;
+  int_mv mv;
+
+  ptr_base = *(d->base_pre);
+  mv.as_int = d->bmi.as_mv.first.as_int;
+
+  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
+    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+          (mv.as_mv.col >> 3);
+    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
+         pred_ptr, pitch);
+  } else {
+    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+                (mv.as_mv.col >> 3);
+    ptr = ptr_base;
+
+    for (r = 0; r < 4; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+      pred_ptr[0]  = ptr[0];
+      pred_ptr[1]  = ptr[1];
+      pred_ptr[2]  = ptr[2];
+      pred_ptr[3]  = ptr[3];
+#else
+      *(uint32_t *)pred_ptr = *(uint32_t *)ptr;
+#endif
+      pred_ptr     += pitch;
+      ptr         += d->pre_stride;
+    }
+  }
+}
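
The motion vectors here are in 1/8-pel units: mv >> 3 selects the integer pixel, and (mv & 7) << 1 turns the eighth-pel remainder into the 1/16-pel phase the subpel predictors expect; when both remainders are zero the code falls back to a straight copy. A small illustration (the negative shift relies on arithmetic right shift, as the codec does):

    #include <stdio.h>

    int main(void) {
      int mv_row = 13, mv_col = -6;        /* 1.625 px down, 0.75 px left */
      int int_row   = mv_row >> 3;         /*  1  full-pel rows */
      int int_col   = mv_col >> 3;         /* -1  (arithmetic shift floors) */
      int phase_row = (mv_row & 7) << 1;   /* 10  1/16-pel filter phase */
      int phase_col = (mv_col & 7) << 1;   /*  4  (-1 + 2/8 = -0.75) */

      printf("offset (%d,%d), phase (%d,%d)\n",
             int_row, int_col, phase_row, phase_col);
      return 0;
    }
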
+
+/*
+ * Similar to vp9_build_inter_predictors_b(), but instead of storing the
+ * results in d->predictor, we average the contents of d->predictor (which
+ * come from an earlier call to vp9_build_inter_predictors_b()) with the
+ * predictor of the second reference frame / motion vector.
+ */
+void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
+                                      vp9_subpix_fn_t sppf) {
+  int r;
+  unsigned char *ptr_base;
+  unsigned char *ptr;
+  unsigned char *pred_ptr = d->predictor;
+  int_mv mv;
+
+  ptr_base = *(d->base_second_pre);
+  mv.as_int = d->bmi.as_mv.second.as_int;
+
+  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
+    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+          (mv.as_mv.col >> 3);
+    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
+         pred_ptr, pitch);
+  } else {
+    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+                (mv.as_mv.col >> 3);
+    ptr = ptr_base;
+
+    for (r = 0; r < 4; r++) {
+      pred_ptr[0]  = (pred_ptr[0] + ptr[0] + 1) >> 1;
+      pred_ptr[1]  = (pred_ptr[1] + ptr[1] + 1) >> 1;
+      pred_ptr[2]  = (pred_ptr[2] + ptr[2] + 1) >> 1;
+      pred_ptr[3]  = (pred_ptr[3] + ptr[3] + 1) >> 1;
+      pred_ptr    += pitch;
+      ptr         += d->pre_stride;
+    }
+  }
+}
+
+void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
+  unsigned char *ptr_base;
+  unsigned char *ptr;
+  unsigned char *pred_ptr = d->predictor;
+  int_mv mv;
+
+  ptr_base = *(d->base_pre);
+  mv.as_int = d->bmi.as_mv.first.as_int;
+  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+        (mv.as_mv.col >> 3);
+
+  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
+    xd->subpixel_predict8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
+                            (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
+  } else {
+    vp9_copy_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
+  }
+}
+
+/*
+ * Similar to vp9_build_inter_predictors4b(), but instead of storing the
+ * results in d->predictor, we average the contents of d->predictor (which
+ * come from an earlier call to vp9_build_inter_predictors4b()) with the
+ * predictor of the second reference frame / motion vector.
+ */
+void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
+                                      BLOCKD *d, int pitch) {
+  unsigned char *ptr_base;
+  unsigned char *ptr;
+  unsigned char *pred_ptr = d->predictor;
+  int_mv mv;
+
+  ptr_base = *(d->base_second_pre);
+  mv.as_int = d->bmi.as_mv.second.as_int;
+  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+        (mv.as_mv.col >> 3);
+
+  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
+    xd->subpixel_predict_avg8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
+                               (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
+  } else {
+    vp9_avg_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
+  }
+}
+
+static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
+  unsigned char *ptr_base;
+  unsigned char *ptr;
+  unsigned char *pred_ptr = d->predictor;
+  int_mv mv;
+
+  ptr_base = *(d->base_pre);
+  mv.as_int = d->bmi.as_mv.first.as_int;
+  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+        (mv.as_mv.col >> 3);
+
+  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
+    xd->subpixel_predict8x4(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
+                           (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
+  } else {
+    vp9_copy_mem8x4(ptr, d->pre_stride, pred_ptr, pitch);
+  }
+}
+
+
+/*encoder only*/
+#if CONFIG_PRED_FILTER
+
+// Select the thresholded or non-thresholded filter
+#define USE_THRESH_FILTER 0
+
+#define PRED_FILT_LEN 5
+
+static const int filt_shift = 4;
+static const int pred_filter[PRED_FILT_LEN] = {1, 2, 10, 2, 1};
+// Alternative filter {1, 1, 4, 1, 1}
+
+#if !USE_THRESH_FILTER
+void filter_mb(unsigned char *src, int src_stride,
+               unsigned char *dst, int dst_stride,
+               int width, int height) {
+  int i, j, k;
+  unsigned int Temp[32 * 32];
+  unsigned int  *pTmp = Temp;
+  unsigned char *pSrc = src - (1 + src_stride) * (PRED_FILT_LEN / 2);
+
+  // Horizontal
+  for (i = 0; i < height + PRED_FILT_LEN - 1; i++) {
+    for (j = 0; j < width; j++) {
+      int sum = 0;
+      for (k = 0; k < PRED_FILT_LEN; k++)
+        sum += pSrc[j + k] * pred_filter[k];
+      pTmp[j] = sum;
+    }
+
+    pSrc += src_stride;
+    pTmp += width;
+  }
+
+  // Vertical
+  pTmp = Temp;
+  for (i = 0; i < width; i++) {
+    unsigned char *pDst = dst + i;
+    for (j = 0; j < height; j++) {
+      int sum = 0;
+      for (k = 0; k < PRED_FILT_LEN; k++)
+        sum += pTmp[(j + k) * width] * pred_filter[k];
+      // Round
+      sum = (sum + ((1 << (filt_shift << 1)) >> 1)) >> (filt_shift << 1);
+      pDst[j * dst_stride] = (sum < 0 ? 0 : sum > 255 ? 255 : sum);
+    }
+    ++pTmp;
+  }
+}
+#else
+// Based on vp9_post_proc_down_and_across_c (postproc.c)
+void filter_mb(unsigned char *src, int src_stride,
+               unsigned char *dst, int dst_stride,
+               int width, int height) {
+  unsigned char *pSrc, *pDst;
+  int row;
+  int col;
+  int i;
+  int v;
+  unsigned char d[8];
+
+  /* TODO flimit should be linked to the quantizer value */
+  int flimit = 7;
+
+  for (row = 0; row < height; row++) {
+    /* post_proc_down for one row */
+    pSrc = src;
+    pDst = dst;
+
+    for (col = 0; col < width; col++) {
+      int kernel = (1 << (filt_shift - 1));
+      int v = pSrc[col];
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - pSrc[col + i * src_stride]) > flimit)
+          goto down_skip_convolve;
+
+        kernel += pred_filter[2 + i] * pSrc[col + i * src_stride];
+      }
+
+      v = (kernel >> filt_shift);
+    down_skip_convolve:
+      pDst[col] = v;
+    }
+
+    /* now post_proc_across */
+    pSrc = dst;
+    pDst = dst;
+
+    for (i = 0; i < 8; i++)
+      d[i] = pSrc[i];
+
+    for (col = 0; col < width; col++) {
+      int kernel = (1 << (filt_shift - 1));
+      v = pSrc[col];
+
+      d[col & 7] = v;
+
+      for (i = -2; i <= 2; i++) {
+        if (abs(v - pSrc[col + i]) > flimit)
+          goto across_skip_convolve;
+
+        kernel += pred_filter[2 + i] * pSrc[col + i];
+      }
+
+      d[col & 7] = (kernel >> filt_shift);
+    across_skip_convolve:
+
+      if (col >= 2)
+        pDst[col - 2] = d[(col - 2) & 7];
+    }
+
+    /* handle the last two pixels */
+    pDst[col - 2] = d[(col - 2) & 7];
+    pDst[col - 1] = d[(col - 1) & 7];
+
+    /* next row */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+#endif  // !USE_THRESH_FILTER
+
+#endif  // CONFIG_PRED_FILTER
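
The prediction filter is separable with taps {1, 2, 10, 2, 1} summing to 16 (1 << filt_shift), so after the horizontal and vertical passes the accumulated product is renormalised by >> (2 * filt_shift) with a rounding term. A one-dimensional illustration of a single pass:

    #include <stdio.h>

    int main(void) {
      static const int taps[5] = {1, 2, 10, 2, 1};
      static const int shift = 4;           /* taps sum to 1 << 4 */
      unsigned char src[9] = {100, 100, 100, 200, 100, 100, 100, 100, 100};
      int i, k;

      for (i = 2; i < 7; i++) {
        int sum = (1 << shift) >> 1;        /* rounding term */
        for (k = 0; k < 5; k++)
          sum += src[i + k - 2] * taps[k];
        printf("%d ", sum >> shift);        /* smoothed sample */
      }
      printf("\n");                         /* 113 163 113 106 100 */
      return 0;
    }
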
+
+/*encoder only*/
+void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
+  int i, j;
+  BLOCKD *blockd = xd->block;
+
+  /* build uv mvs */
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      int yoffset = i * 8 + j * 2;
+      int uoffset = 16 + i * 2 + j;
+      int voffset = 20 + i * 2 + j;
+      int temp;
+
+      temp = blockd[yoffset  ].bmi.as_mv.first.as_mv.row
+             + blockd[yoffset + 1].bmi.as_mv.first.as_mv.row
+             + blockd[yoffset + 4].bmi.as_mv.first.as_mv.row
+             + blockd[yoffset + 5].bmi.as_mv.first.as_mv.row;
+
+      if (temp < 0) temp -= 4;
+      else temp += 4;
+
+      xd->block[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &
+        xd->fullpixel_mask;
+
+      temp = blockd[yoffset  ].bmi.as_mv.first.as_mv.col
+             + blockd[yoffset + 1].bmi.as_mv.first.as_mv.col
+             + blockd[yoffset + 4].bmi.as_mv.first.as_mv.col
+             + blockd[yoffset + 5].bmi.as_mv.first.as_mv.col;
+
+      if (temp < 0) temp -= 4;
+      else temp += 4;
+
+      blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &
+        xd->fullpixel_mask;
+
+      blockd[voffset].bmi.as_mv.first.as_mv.row =
+        blockd[uoffset].bmi.as_mv.first.as_mv.row;
+      blockd[voffset].bmi.as_mv.first.as_mv.col =
+        blockd[uoffset].bmi.as_mv.first.as_mv.col;
+
+      if (xd->mode_info_context->mbmi.second_ref_frame) {
+        temp = blockd[yoffset  ].bmi.as_mv.second.as_mv.row
+               + blockd[yoffset + 1].bmi.as_mv.second.as_mv.row
+               + blockd[yoffset + 4].bmi.as_mv.second.as_mv.row
+               + blockd[yoffset + 5].bmi.as_mv.second.as_mv.row;
+
+        if (temp < 0) {
+          temp -= 4;
+        } else {
+          temp += 4;
+        }
+
+        blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &
+          xd->fullpixel_mask;
+
+        temp = blockd[yoffset  ].bmi.as_mv.second.as_mv.col
+               + blockd[yoffset + 1].bmi.as_mv.second.as_mv.col
+               + blockd[yoffset + 4].bmi.as_mv.second.as_mv.col
+               + blockd[yoffset + 5].bmi.as_mv.second.as_mv.col;
+
+        if (temp < 0) {
+          temp -= 4;
+        } else {
+          temp += 4;
+        }
+
+        blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &
+          xd->fullpixel_mask;
+
+        blockd[voffset].bmi.as_mv.second.as_mv.row =
+          blockd[uoffset].bmi.as_mv.second.as_mv.row;
+        blockd[voffset].bmi.as_mv.second.as_mv.col =
+          blockd[uoffset].bmi.as_mv.second.as_mv.col;
+      }
+    }
+  }
+
+  for (i = 16; i < 24; i += 2) {
+    BLOCKD *d0 = &blockd[i];
+    BLOCKD *d1 = &blockd[i + 1];
+
+    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
+      build_inter_predictors2b(xd, d0, 8);
+    else {
+      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict);
+      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict);
+    }
+
+    if (xd->mode_info_context->mbmi.second_ref_frame) {
+      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg);
+      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg);
+    }
+  }
+}
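
Each chroma MV above is the rounded average of its four luma sub-block MVs: sum the four, bias by +/-4 so the truncating division rounds away from zero, then divide by 8 (/4 for the average, /2 for 4:2:0 subsampling); the fullpixel mask applied afterwards is omitted in this isolated check:

    #include <stdio.h>

    static int uv_mv(int a, int b, int c, int d) {
      int t = a + b + c + d;
      t += (t < 0) ? -4 : 4;   /* round away from zero */
      return t / 8;            /* /4 to average, /2 for chroma */
    }

    int main(void) {
      printf("%d %d\n", uv_mv(5, 6, 7, 9), uv_mv(-5, -6, -7, -9));  /* 3 -3 */
      return 0;
    }
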
+
+static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
+  /* If the MV points so far into the UMV border that no visible pixels
+   * are used for reconstruction, the subpel part of the MV can be
+   * discarded and the MV limited to 16 pixels with equivalent results.
+   *
+   * This limit kicks in at 19 pixels for the top and left edges, for
+   * the 16 pixels plus 3 taps right of the central pixel when subpel
+   * filtering. The bottom and right edges use 16 pixels plus 2 pixels
+   * left of the central pixel when filtering.
+   */
+  if (mv->col < (xd->mb_to_left_edge - ((16 + INTERP_EXTEND) << 3)))
+    mv->col = xd->mb_to_left_edge - (16 << 3);
+  else if (mv->col > xd->mb_to_right_edge + ((15 + INTERP_EXTEND) << 3))
+    mv->col = xd->mb_to_right_edge + (16 << 3);
+
+  if (mv->row < (xd->mb_to_top_edge - ((16 + INTERP_EXTEND) << 3)))
+    mv->row = xd->mb_to_top_edge - (16 << 3);
+  else if (mv->row > xd->mb_to_bottom_edge + ((15 + INTERP_EXTEND) << 3))
+    mv->row = xd->mb_to_bottom_edge + (16 << 3);
+}
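
A numeric check of the clamp rule described in the comment above, with INTERP_EXTEND assumed to be 4 purely for illustration; edge distances are in the same 1/8-pel units as the MVs:

    #include <stdio.h>

    #define EXTEND 4   /* stand-in for INTERP_EXTEND */

    static int clamp_col(int col, int to_left_edge, int to_right_edge) {
      if (col < to_left_edge - ((16 + EXTEND) << 3))
        return to_left_edge - (16 << 3);
      if (col > to_right_edge + ((15 + EXTEND) << 3))
        return to_right_edge + (16 << 3);
      return col;
    }

    int main(void) {
      int to_left = -64 << 3;  /* MB sits 64 px right of the left edge */

      /* far into the border: snapped to exactly 16 px beyond the edge */
      printf("%d\n", clamp_col(-800 << 3, to_left, 0));  /* -640 */
      /* mild MV: untouched, subpel part preserved */
      printf("%d\n", clamp_col(-83, to_left, 0));        /* -83 */
      return 0;
    }
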
+
+/* A version of the above function for chroma block MVs.*/
+static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
+  mv->col = (2 * mv->col < (xd->mb_to_left_edge - ((16 + INTERP_EXTEND) << 3))) ?
+            (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col;
+  mv->col = (2 * mv->col > xd->mb_to_right_edge + ((15 + INTERP_EXTEND) << 3)) ?
+            (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col;
+
+  mv->row = (2 * mv->row < (xd->mb_to_top_edge - ((16 + INTERP_EXTEND) << 3))) ?
+            (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row;
+  mv->row = (2 * mv->row > xd->mb_to_bottom_edge + ((15 + INTERP_EXTEND) << 3)) ?
+            (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row;
+}
+
+/*encoder only*/
+void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
+                                             unsigned char *dst_y,
+                                             int dst_ystride,
+                                             int clamp_mvs) {
+  unsigned char *ptr_base = xd->pre.y_buffer;
+  unsigned char *ptr;
+  int pre_stride = xd->block[0].pre_stride;
+  int_mv ymv;
+
+  ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
+
+  if (clamp_mvs)
+    clamp_mv_to_umv_border(&ymv.as_mv, xd);
+
+  ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3);
+
+#if CONFIG_PRED_FILTER
+  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
+    if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
+      // Sub-pel filter needs extended input
+      int len = 15 + (INTERP_EXTEND << 1);
+      unsigned char Temp[32 * 32]; // Data required by sub-pel filter
+      unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
+
+      // Copy extended MB into Temp array, applying the spatial filter
+      filter_mb(ptr - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
+                Temp, len, len, len);
+
+      // Sub-pel interpolation
+      xd->subpixel_predict16x16(pTemp, len,
+                                (ymv.as_mv.col & 7) << 1,
+                                (ymv.as_mv.row & 7) << 1,
+                                dst_y, dst_ystride);
+    } else {
+      // Apply spatial filter to create the prediction directly
+      filter_mb(ptr, pre_stride, dst_y, dst_ystride, 16, 16);
+    }
+  } else
+#endif
+    if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
+      xd->subpixel_predict16x16(ptr, pre_stride,
+                                (ymv.as_mv.col & 7) << 1,
+                                (ymv.as_mv.row & 7) << 1,
+                                dst_y, dst_ystride);
+    } else {
+      vp9_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
+    }
+}
+
+void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
+                                              unsigned char *dst_u,
+                                              unsigned char *dst_v,
+                                              int dst_uvstride) {
+  int offset;
+  unsigned char *uptr, *vptr;
+  int pre_stride = xd->block[0].pre_stride;
+  int_mv _o16x16mv;
+  int_mv _16x16mv;
+
+  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
+
+  if (xd->mode_info_context->mbmi.need_to_clamp_mvs)
+    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
+
+  _o16x16mv = _16x16mv;
+  /* calc uv motion vectors */
+  if (_16x16mv.as_mv.row < 0)
+    _16x16mv.as_mv.row -= 1;
+  else
+    _16x16mv.as_mv.row += 1;
+
+  if (_16x16mv.as_mv.col < 0)
+    _16x16mv.as_mv.col -= 1;
+  else
+    _16x16mv.as_mv.col += 1;
+
+  _16x16mv.as_mv.row /= 2;
+  _16x16mv.as_mv.col /= 2;
+
+  _16x16mv.as_mv.row &= xd->fullpixel_mask;
+  _16x16mv.as_mv.col &= xd->fullpixel_mask;
+
+  pre_stride >>= 1;
+  offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
+  uptr = xd->pre.u_buffer + offset;
+  vptr = xd->pre.v_buffer + offset;
+
+#if CONFIG_PRED_FILTER
+  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
+    int i;
+    unsigned char *pSrc = uptr;
+    unsigned char *pDst = dst_u;
+    int len = 7 + (INTERP_EXTEND << 1);
+    unsigned char Temp[32 * 32]; // Data required by the sub-pel filter
+    unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
+
+    // U & V
+    for (i = 0; i < 2; i++) {
+      if (_o16x16mv.as_int & 0x000f000f) {
+        // Copy extended MB into Temp array, applying the spatial filter
+        filter_mb(pSrc - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
+                  Temp, len, len, len);
+
+        // Sub-pel filter
+        xd->subpixel_predict8x8(pTemp, len,
+                                _o16x16mv.as_mv.col & 15,
+                                _o16x16mv.as_mv.row & 15,
+                                pDst, dst_uvstride);
+      } else {
+        filter_mb(pSrc, pre_stride, pDst, dst_uvstride, 8, 8);
+      }
+
+      // V
+      pSrc = vptr;
+      pDst = dst_v;
+    }
+  } else
+#endif
+    if (_o16x16mv.as_int & 0x000f000f) {
+      xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15,
+                              _o16x16mv.as_mv.row & 15, dst_u, dst_uvstride);
+      xd->subpixel_predict8x8(vptr, pre_stride, _o16x16mv.as_mv.col & 15,
+                              _o16x16mv.as_mv.row & 15, dst_v, dst_uvstride);
+    } else {
+      vp9_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
+      vp9_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
+    }
+}
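
The chroma MV derivation above halves the luma MV with rounding away from zero: add the MV's sign before dividing by two (the fullpixel mask is then applied for full-pel-only coding). Checked in isolation:

    #include <stdio.h>

    static int halve_mv(int v) {
      return (v < 0 ? v - 1 : v + 1) / 2;  /* C division truncates toward 0 */
    }

    int main(void) {
      printf("%d %d %d %d\n",
             halve_mv(7), halve_mv(-7), halve_mv(8), halve_mv(-8));
      /* 4 -4 4 -4: magnitude rounds up, sign preserved */
      return 0;
    }
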
+
+
+void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
+                                            unsigned char *dst_y,
+                                            unsigned char *dst_u,
+                                            unsigned char *dst_v,
+                                            int dst_ystride, int dst_uvstride) {
+  vp9_build_1st_inter16x16_predictors_mby(xd, dst_y, dst_ystride,
+      xd->mode_info_context->mbmi.need_to_clamp_mvs);
+  vp9_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
+                                        unsigned char *dst_y,
+                                        unsigned char *dst_u,
+                                        unsigned char *dst_v,
+                                        int dst_ystride,
+                                        int dst_uvstride) {
+  uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
+  uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,
+          *v2 = x->second_pre.v_buffer;
+  int n;
+
+  for (n = 0; n < 4; n++)
+  {
+    const int x_idx = n & 1, y_idx = n >> 1;
+
+    x->pre.y_buffer = y1 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;
+    x->pre.u_buffer = u1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+    x->pre.v_buffer = v1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+
+    vp9_build_1st_inter16x16_predictors_mb(x,
+      dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
+      dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
+      dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
+      dst_ystride, dst_uvstride);
+    if (x->mode_info_context->mbmi.second_ref_frame) {
+      x->second_pre.y_buffer = y2 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;
+      x->second_pre.u_buffer = u2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+      x->second_pre.v_buffer = v2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+
+      vp9_build_2nd_inter16x16_predictors_mb(x,
+        dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
+        dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
+        dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
+        dst_ystride, dst_uvstride);
+    }
+  }
+
+  x->pre.y_buffer = y1;
+  x->pre.u_buffer = u1;
+  x->pre.v_buffer = v1;
+
+  if (x->mode_info_context->mbmi.second_ref_frame) {
+    x->second_pre.y_buffer = y2;
+    x->second_pre.u_buffer = u2;
+    x->second_pre.v_buffer = v2;
+  }
+}
+#endif
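
The superblock path visits its four 16x16 macroblocks in raster order by unpacking the loop counter: x_idx = n & 1, y_idx = n >> 1, stepping 16 luma and 8 chroma pixels per macroblock (4:2:0). In isolation:

    #include <stdio.h>

    int main(void) {
      int n;
      for (n = 0; n < 4; n++) {
        int x_idx = n & 1, y_idx = n >> 1;
        printf("mb %d -> luma (%2d,%2d), chroma (%d,%d)\n",
               n, x_idx * 16, y_idx * 16, x_idx * 8, y_idx * 8);
      }
      return 0;
    }
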
+
+/*
+ * The following functions should be called after an initial
+ * call to vp9_build_1st_inter16x16_predictors_mb() or _mby()/_mbuv().
+ * They run a second sixtap filter on a (different) ref
+ * frame and average the result with the output of the
+ * first sixtap filter. The second reference frame is stored
+ * in x->second_pre (the reference frame index is in
+ * x->mode_info_context->mbmi.second_ref_frame). The second
+ * motion vector is x->mode_info_context->mbmi.second_mv.
+ *
+ * This allows blending prediction from two reference frames,
+ * which sometimes leads to better prediction than from a
+ * single reference frame.
+ */
+void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
+                                             unsigned char *dst_y,
+                                             int dst_ystride) {
+  unsigned char *ptr;
+
+  int_mv _16x16mv;
+  int mv_row;
+  int mv_col;
+
+  unsigned char *ptr_base = xd->second_pre.y_buffer;
+  int pre_stride = xd->block[0].pre_stride;
+
+  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
+
+  if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)
+    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
+
+  mv_row = _16x16mv.as_mv.row;
+  mv_col = _16x16mv.as_mv.col;
+
+  ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+#if CONFIG_PRED_FILTER
+  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
+    if ((mv_row | mv_col) & 7) {
+      // Sub-pel filter needs extended input
+      int len = 15 + (INTERP_EXTEND << 1);
+      unsigned char Temp[32 * 32]; // Data required by sub-pel filter
+      unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
+
+      // Copy extended MB into Temp array, applying the spatial filter
+      filter_mb(ptr - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
+                Temp, len, len, len);
+
+      // Sub-pel filter
+      xd->subpixel_predict_avg16x16(pTemp, len, (mv_col & 7) << 1,
+                                    (mv_row & 7) << 1, dst_y, dst_ystride);
+    } else {
+      // TODO Needs to AVERAGE with the dst_y
+      // For now, do not apply the prediction filter in these cases!
+      vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
+    }
+  } else
+#endif  // CONFIG_PRED_FILTER
+  {
+    if ((mv_row | mv_col) & 7) {
+      xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1,
+                                    (mv_row & 7) << 1, dst_y, dst_ystride);
+    } else {
+      vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
+    }
+  }
+}
+
+void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
+                                              unsigned char *dst_u,
+                                              unsigned char *dst_v,
+                                              int dst_uvstride) {
+  int offset;
+  unsigned char *uptr, *vptr;
+
+  int_mv _16x16mv;
+  int mv_row;
+  int mv_col;
+  int omv_row, omv_col;
+
+  int pre_stride = xd->block[0].pre_stride;
+
+  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
+
+  if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)
+    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
+
+  mv_row = _16x16mv.as_mv.row;
+  mv_col = _16x16mv.as_mv.col;
+
+  /* calc uv motion vectors */
+  omv_row = mv_row;
+  omv_col = mv_col;
+  mv_row = (mv_row + (mv_row > 0)) >> 1;
+  mv_col = (mv_col + (mv_col > 0)) >> 1;
+
+  mv_row &= xd->fullpixel_mask;
+  mv_col &= xd->fullpixel_mask;
+
+  pre_stride >>= 1;
+  offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+  uptr = xd->second_pre.u_buffer + offset;
+  vptr = xd->second_pre.v_buffer + offset;
+
+#if CONFIG_PRED_FILTER
+  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
+    int i;
+    int len = 7 + (INTERP_EXTEND << 1);
+    unsigned char Temp[32 * 32]; // Data required by sub-pel filter
+    unsigned char *pTemp = Temp + (INTERP_EXTEND - 1) * (len + 1);
+    unsigned char *pSrc = uptr;
+    unsigned char *pDst = dst_u;
+
+    // U & V
+    for (i = 0; i < 2; i++) {
+      if ((omv_row | omv_col) & 15) {
+        // Copy extended MB into Temp array, applying the spatial filter
+        filter_mb(pSrc - (INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
+                  Temp, len, len, len);
+
+        // Sub-pel filter
+        xd->subpixel_predict_avg8x8(pTemp, len, omv_col & 15,
+                                    omv_row & 15, pDst, dst_uvstride);
+      } else {
+        // TODO: needs to average with dst_[u|v].
+        // For now, do not apply the prediction filter here!
+        vp9_avg_mem8x8(pSrc, pre_stride, pDst, dst_uvstride);
+      }
+
+      // V
+      pSrc = vptr;
+      pDst = dst_v;
+    }
+  } else
+#endif  // CONFIG_PRED_FILTER
+    if ((omv_row | omv_col) & 15) {
+      xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15,
+                                  omv_row & 15, dst_u, dst_uvstride);
+      xd->subpixel_predict_avg8x8(vptr, pre_stride, omv_col & 15,
+                                  omv_row & 15, dst_v, dst_uvstride);
+    } else {
+      vp9_avg_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
+      vp9_avg_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
+    }
+}
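+
+/* Chroma MV derivation, worked through: the luma MV is halved with rounding
+ * away from zero, e.g. mv_row = 5 -> (5 + 1) >> 1 = 3 and
+ * mv_row = -5 -> (-5 + 0) >> 1 = -3 (arithmetic shift rounds toward
+ * -infinity, so the bias is only added on the positive side).  The
+ * fullpixel_mask then clears the fractional bits for full-pel-only frames.
+ */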
+
+void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
+                                            unsigned char *dst_y,
+                                            unsigned char *dst_u,
+                                            unsigned char *dst_v,
+                                            int dst_ystride,
+                                            int dst_uvstride) {
+  vp9_build_2nd_inter16x16_predictors_mby(xd, dst_y, dst_ystride);
+  vp9_build_2nd_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
+}
+
+static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
+  int i;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  BLOCKD *blockd = xd->block;
+
+  if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
+    blockd[ 0].bmi = xd->mode_info_context->bmi[ 0];
+    blockd[ 2].bmi = xd->mode_info_context->bmi[ 2];
+    blockd[ 8].bmi = xd->mode_info_context->bmi[ 8];
+    blockd[10].bmi = xd->mode_info_context->bmi[10];
+
+    if (mbmi->need_to_clamp_mvs) {
+      clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.first.as_mv, xd);
+      clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.first.as_mv, xd);
+      clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.first.as_mv, xd);
+      clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.first.as_mv, xd);
+      if (mbmi->second_ref_frame) {
+        clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.second.as_mv, xd);
+        clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.second.as_mv, xd);
+        clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.second.as_mv, xd);
+        clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.second.as_mv, xd);
+      }
+    }
+
+    vp9_build_inter_predictors4b(xd, &blockd[ 0], 16);
+    vp9_build_inter_predictors4b(xd, &blockd[ 2], 16);
+    vp9_build_inter_predictors4b(xd, &blockd[ 8], 16);
+    vp9_build_inter_predictors4b(xd, &blockd[10], 16);
+
+    if (mbmi->second_ref_frame) {
+      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 0], 16);
+      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 2], 16);
+      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 8], 16);
+      vp9_build_2nd_inter_predictors4b(xd, &blockd[10], 16);
+    }
+  } else {
+    for (i = 0; i < 16; i += 2) {
+      BLOCKD *d0 = &blockd[i];
+      BLOCKD *d1 = &blockd[i + 1];
+
+      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
+      blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
+
+      if (mbmi->need_to_clamp_mvs) {
+        clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.first.as_mv, xd);
+        clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.first.as_mv, xd);
+        if (mbmi->second_ref_frame) {
+          clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.second.as_mv, xd);
+          clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.second.as_mv, xd);
+        }
+      }
+
+      if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
+        build_inter_predictors2b(xd, d0, 16);
+      else {
+        vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict);
+        vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict);
+      }
+
+      if (mbmi->second_ref_frame) {
+        vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg);
+        vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg);
+      }
+    }
+  }
+
+  for (i = 16; i < 24; i += 2) {
+    BLOCKD *d0 = &blockd[i];
+    BLOCKD *d1 = &blockd[i + 1];
+
+    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
+      build_inter_predictors2b(xd, d0, 8);
+    else {
+      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict);
+      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict);
+    }
+
+    if (mbmi->second_ref_frame) {
+      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg);
+      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg);
+    }
+  }
+}
+
+static void build_4x4uvmvs(MACROBLOCKD *xd) {
+  int i, j;
+  BLOCKD *blockd = xd->block;
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      int yoffset = i * 8 + j * 2;
+      int uoffset = 16 + i * 2 + j;
+      int voffset = 20 + i * 2 + j;
+
+      int temp;
+
+      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.row
+             + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.row
+             + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.row
+             + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.row;
+
+      if (temp < 0) temp -= 4;
+      else temp += 4;
+
+      blockd[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &
+                                                  xd->fullpixel_mask;
+
+      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.col
+             + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.col
+             + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.col
+             + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.col;
+
+      if (temp < 0) temp -= 4;
+      else temp += 4;
+
+      blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &
+        xd->fullpixel_mask;
+
+      // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+      clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd);
+
+      blockd[voffset].bmi.as_mv.first.as_mv.row =
+        blockd[uoffset].bmi.as_mv.first.as_mv.row;
+      blockd[voffset].bmi.as_mv.first.as_mv.col =
+        blockd[uoffset].bmi.as_mv.first.as_mv.col;
+
+      if (xd->mode_info_context->mbmi.second_ref_frame) {
+        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.row
+               + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.row
+               + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.row
+               + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.row;
+
+        if (temp < 0) temp -= 4;
+        else temp += 4;
+
+        blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &
+                                                     xd->fullpixel_mask;
+
+        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.col
+               + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.col
+               + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.col
+               + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.col;
+
+        if (temp < 0) temp -= 4;
+        else temp += 4;
+
+        blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &
+                                                     xd->fullpixel_mask;
+
+        // if (mbmi->need_to_clamp_mvs)
+        clamp_uvmv_to_umv_border(
+          &blockd[uoffset].bmi.as_mv.second.as_mv, xd);
+
+        blockd[voffset].bmi.as_mv.second.as_mv.row =
+          blockd[uoffset].bmi.as_mv.second.as_mv.row;
+        blockd[voffset].bmi.as_mv.second.as_mv.col =
+          blockd[uoffset].bmi.as_mv.second.as_mv.col;
+      }
+    }
+  }
+}
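+
+/* Averaging sketch: each chroma 4x4 covers four luma 4x4s, so its MV is the
+ * rounded mean of theirs at half resolution (sum / 8).  With rows
+ * {5, 5, 6, 6}: temp = 22, +4 rounding gives 26, and 26 / 8 = 3.  C integer
+ * division truncates toward zero, which is why negative sums subtract the
+ * rounding term instead. */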
+
+void vp9_build_inter_predictors_mb(MACROBLOCKD *xd) {
+  if (xd->mode_info_context->mbmi.mode != SPLITMV) {
+    vp9_build_1st_inter16x16_predictors_mb(xd, xd->predictor,
+                                           &xd->predictor[256],
+                                           &xd->predictor[320], 16, 8);
+
+    if (xd->mode_info_context->mbmi.second_ref_frame) {
+      /* 256 = offset of U plane in Y+U+V buffer;
+       * 320 = offset of V plane in Y+U+V buffer.
+       * (256=16x16, 320=16x16+8x8). */
+      vp9_build_2nd_inter16x16_predictors_mb(xd, xd->predictor,
+                                             &xd->predictor[256],
+                                             &xd->predictor[320], 16, 8);
+    }
+  } else {
+    build_4x4uvmvs(xd);
+    build_inter4x4_predictors_mb(xd);
+  }
+}
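+
+/* Predictor buffer layout implied by the offsets above:
+ *
+ *   xd->predictor[  0..255]  16x16 Y, stride 16
+ *   xd->predictor[256..319]  8x8   U, stride 8
+ *   xd->predictor[320..383]  8x8   V, stride 8
+ */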
--- /dev/null
+++ b/vp9/common/reconinter.h
@@ -1,0 +1,78 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_RECONINTER_H
+#define __INC_RECONINTER_H
+
+#include "onyxc_int.h"
+
+extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
+                                                    unsigned char *dst_y,
+                                                    int dst_ystride,
+                                                    int clamp_mvs);
+
+extern void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
+                                                     unsigned char *dst_u,
+                                                     unsigned char *dst_v,
+                                                     int dst_uvstride);
+
+extern void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
+                                                   unsigned char *dst_y,
+                                                   unsigned char *dst_u,
+                                                   unsigned char *dst_v,
+                                                   int dst_ystride,
+                                                   int dst_uvstride);
+
+extern void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
+                                                    unsigned char *dst_y,
+                                                    int dst_ystride);
+
+extern void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
+                                                     unsigned char *dst_u,
+                                                     unsigned char *dst_v,
+                                                     int dst_uvstride);
+
+extern void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
+                                                   unsigned char *dst_y,
+                                                   unsigned char *dst_u,
+                                                   unsigned char *dst_v,
+                                                   int dst_ystride,
+                                                   int dst_uvstride);
+
+#if CONFIG_SUPERBLOCKS
+extern void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
+                                               unsigned char *dst_y,
+                                               unsigned char *dst_u,
+                                               unsigned char *dst_v,
+                                               int dst_ystride,
+                                               int dst_uvstride);
+#endif
+
+extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);
+
+extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
+                                         vp9_subpix_fn_t sppf);
+
+extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
+                                             vp9_subpix_fn_t sppf);
+
+extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d,
+                                         int pitch);
+
+extern void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
+                                             BLOCKD *d, int pitch);
+
+extern void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd);
+
+extern void vp9_setup_interp_filters(MACROBLOCKD *xd,
+                                     INTERPOLATIONFILTERTYPE filter,
+                                     VP9_COMMON *cm);
+
+#endif  // __INC_RECONINTER_H
--- /dev/null
+++ b/vp9/common/reconintra.c
@@ -1,0 +1,490 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include "vpx_ports/config.h"
+#include "vpx_rtcd.h"
+#include "reconintra.h"
+#include "vpx_mem/vpx_mem.h"
+
+/* For skip_recon_mb(), add vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd)
+ * and vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd).
+ */
+
+static void d27_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+                          uint8_t *yabove_row, uint8_t *yleft_col) {
+  int r, c, h, w, v;
+  int a, b;
+  r = 0;
+  for (c = 0; c < n - 2; c++) {
+    if (c & 1)
+      a = yleft_col[r + 1];
+    else
+      a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
+    b = yabove_row[c + 2];
+    ypred_ptr[c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
+  }
+  for (r = 1; r < n / 2 - 1; r++) {
+    for (c = 0; c < n - 2 - 2 * r; c++) {
+      if (c & 1)
+        a = yleft_col[r + 1];
+      else
+        a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
+      b = ypred_ptr[(r - 1) * y_stride + c + 2];
+      ypred_ptr[r * y_stride + c] =
+          (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
+    }
+  }
+  for (; r < n - 1; ++r) {
+    for (c = 0; c < n; c++) {
+      v = (c & 1 ? yleft_col[r + 1]
+                 : (yleft_col[r] + yleft_col[r + 1] + 1) >> 1);
+      h = r - c / 2;
+      ypred_ptr[h * y_stride + c] = v;
+    }
+  }
+  c = 0;
+  r = n - 1;
+  ypred_ptr[r * y_stride] = (ypred_ptr[(r - 1) * y_stride] +
+                             yleft_col[r] + 1) >> 1;
+  for (r = n - 2; r >= n / 2; --r) {
+    w = c + (n - 1 - r) * 2;
+    ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +
+                                   ypred_ptr[r * y_stride + w - 1] + 1) >> 1;
+  }
+  for (c = 1; c < n; c++) {
+    for (r = n - 1; r >= n / 2 + c / 2; --r) {
+      w = c + (n - 1 - r) * 2;
+      ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +
+                                     ypred_ptr[r * y_stride + w - 1] + 1) >> 1;
+    }
+  }
+}
+
+static void d63_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+                          uint8_t *yabove_row, uint8_t *yleft_col) {
+  int r, c, h, w, v;
+  int a, b;
+  c = 0;
+  for (r = 0; r < n - 2; r++) {
+    if (r & 1)
+      a = yabove_row[c + 1];
+    else
+      a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
+    b = yleft_col[r + 2];
+    ypred_ptr[r * y_stride] = (2 * a + (r + 1) * b + (r + 3) / 2) / (r + 3);
+  }
+  for (c = 1; c < n / 2 - 1; c++) {
+    for (r = 0; r < n - 2 - 2 * c; r++) {
+      if (r & 1)
+        a = yabove_row[c + 1];
+      else
+        a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
+      b = ypred_ptr[(r + 2) * y_stride + c - 1];
+      ypred_ptr[r * y_stride + c] =
+          (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
+    }
+  }
+  for (; c < n - 1; ++c) {
+    for (r = 0; r < n; r++) {
+      v = (r & 1 ? yabove_row[c + 1]
+                 : (yabove_row[c] + yabove_row[c + 1] + 1) >> 1);
+      w = c - r / 2;
+      ypred_ptr[r * y_stride + w] = v;
+    }
+  }
+  r = 0;
+  c = n - 1;
+  ypred_ptr[c] = (ypred_ptr[(c - 1)] + yabove_row[c] + 1) >> 1;
+  for (c = n - 2; c >= n / 2; --c) {
+    h = r + (n - 1 - c) * 2;
+    ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] +
+                                   ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1;
+  }
+  for (r = 1; r < n; r++) {
+    for (c = n - 1; c >= n / 2 + r / 2; --c) {
+      h = r + (n - 1 - c) * 2;
+      ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] +
+                                     ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1;
+    }
+  }
+}
+
+static void d45_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+                          uint8_t *yabove_row, uint8_t *yleft_col) {
+  int r, c;
+  for (r = 0; r < n - 1; ++r) {
+    for (c = 0; c <= r; ++c) {
+      ypred_ptr[(r - c) * y_stride + c] =
+        (yabove_row[r + 1] * (c + 1) +
+         yleft_col[r + 1] * (r - c + 1) + r / 2 + 1) / (r + 2);
+    }
+  }
+  for (c = 0; c <= r; ++c) {
+    int yabove_ext = yabove_row[r]; // 2*yabove_row[r] - yabove_row[r-1];
+    int yleft_ext = yleft_col[r]; // 2*yleft_col[r] - yleft_col[r-1];
+    yabove_ext = (yabove_ext > 255 ? 255 : (yabove_ext < 0 ? 0 : yabove_ext));
+    yleft_ext = (yleft_ext > 255 ? 255 : (yleft_ext < 0 ? 0 : yleft_ext));
+    ypred_ptr[(r - c) * y_stride + c] =
+      (yabove_ext * (c + 1) +
+       yleft_ext * (r - c + 1) + r / 2 + 1) / (r + 2);
+  }
+  for (r = 1; r < n; ++r) {
+    for (c = n - r; c < n; ++c)
+      ypred_ptr[r * y_stride + c] = (ypred_ptr[(r - 1) * y_stride + c] +
+                                     ypred_ptr[r * y_stride + c - 1] + 1) >> 1;
+  }
+}
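+
+/* D45 weighting, worked through: each pixel on the 45-degree anti-diagonal
+ * blends one above-row and one left-column sample.  For r = 2, c = 1 the
+ * result is (yabove_row[3] * 2 + yleft_col[3] * 2 + 2) / 4, i.e. an even
+ * mix with weights (c + 1) and (r - c + 1) out of (r + 2). */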
+
+static void d117_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+                           uint8_t *yabove_row, uint8_t *yleft_col) {
+  int r, c;
+  for (c = 0; c < n; c++)
+    ypred_ptr[c] = (yabove_row[c - 1] + yabove_row[c] + 1) >> 1;
+  ypred_ptr += y_stride;
+  for (c = 0; c < n; c++)
+    ypred_ptr[c] = yabove_row[c - 1];
+  ypred_ptr += y_stride;
+  for (r = 2; r < n; ++r) {
+    ypred_ptr[0] = yleft_col[r - 2];
+    for (c = 1; c < n; c++)
+      ypred_ptr[c] = ypred_ptr[-2 * y_stride + c - 1];
+    ypred_ptr += y_stride;
+  }
+}
+
+static void d135_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+                           uint8_t *yabove_row, uint8_t *yleft_col) {
+  int r, c;
+  ypred_ptr[0] = yabove_row[-1];
+  for (c = 1; c < n; c++)
+    ypred_ptr[c] = yabove_row[c - 1];
+  for (r = 1; r < n; ++r)
+    ypred_ptr[r * y_stride] = yleft_col[r - 1];
+
+  ypred_ptr += y_stride;
+  for (r = 1; r < n; ++r) {
+    for (c = 1; c < n; c++) {
+      ypred_ptr[c] = ypred_ptr[-y_stride + c - 1];
+    }
+    ypred_ptr += y_stride;
+  }
+}
+
+static void d153_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+                           uint8_t *yabove_row, uint8_t *yleft_col) {
+  int r, c;
+  ypred_ptr[0] = (yabove_row[-1] + yleft_col[0] + 1) >> 1;
+  for (r = 1; r < n; r++)
+    ypred_ptr[r * y_stride] = (yleft_col[r - 1] + yleft_col[r] + 1) >> 1;
+  ypred_ptr++;
+  ypred_ptr[0] = yabove_row[-1];
+  for (r = 1; r < n; r++)
+    ypred_ptr[r * y_stride] = yleft_col[r - 1];
+  ypred_ptr++;
+
+  for (c = 0; c < n - 2; c++)
+    ypred_ptr[c] = yabove_row[c];
+  ypred_ptr += y_stride;
+  for (r = 1; r < n; ++r) {
+    for (c = 0; c < n - 2; c++)
+      ypred_ptr[c] = ypred_ptr[-y_stride + c - 2];
+    ypred_ptr += y_stride;
+  }
+}
+
+void vp9_recon_intra_mbuv(MACROBLOCKD *xd) {
+  int i;
+
+  for (i = 16; i < 24; i += 2) {
+    BLOCKD *b = &xd->block[i];
+    vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+  }
+}
+
+void vp9_build_intra_predictors_internal(unsigned char *src, int src_stride,
+                                         unsigned char *ypred_ptr,
+                                         int y_stride, int mode, int bsize,
+                                         int up_available, int left_available) {
+  unsigned char *yabove_row = src - src_stride;
+  unsigned char yleft_col[32];
+  unsigned char ytop_left = yabove_row[-1];
+  int r, c, i;
+
+  for (i = 0; i < bsize; i++) {
+    yleft_col[i] = src[i * src_stride - 1];
+  }
+
+  /* for Y */
+  switch (mode) {
+    case DC_PRED: {
+      int expected_dc;
+      int i;
+      int shift;
+      int average = 0;
+      int log2_bsize_minus_1;
+
+      assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32);
+      if (bsize == 4) {
+        log2_bsize_minus_1 = 1;
+      } else if (bsize == 8) {
+        log2_bsize_minus_1 = 2;
+      } else if (bsize == 16) {
+        log2_bsize_minus_1 = 3;
+      } else /* bsize == 32 */ {
+        log2_bsize_minus_1 = 4;
+      }
+
+      if (up_available || left_available) {
+        if (up_available) {
+          for (i = 0; i < bsize; i++) {
+            average += yabove_row[i];
+          }
+        }
+
+        if (left_available) {
+          for (i = 0; i < bsize; i++) {
+            average += yleft_col[i];
+          }
+        }
+        shift = log2_bsize_minus_1 + up_available + left_available;
+        expected_dc = (average + (1 << (shift - 1))) >> shift;
+      } else {
+        expected_dc = 128;
+      }
+
+      for (r = 0; r < bsize; r++) {
+        vpx_memset(ypred_ptr, expected_dc, bsize);
+        ypred_ptr += y_stride;
+      }
+    }
+    break;
+    case V_PRED: {
+      for (r = 0; r < bsize; r++) {
+        vpx_memcpy(ypred_ptr, yabove_row, bsize);
+        ypred_ptr += y_stride;
+      }
+    }
+    break;
+    case H_PRED: {
+      for (r = 0; r < bsize; r++) {
+        vpx_memset(ypred_ptr, yleft_col[r], bsize);
+        ypred_ptr += y_stride;
+      }
+    }
+    break;
+    case TM_PRED: {
+      for (r = 0; r < bsize; r++) {
+        for (c = 0; c < bsize; c++) {
+          int pred = yleft_col[r] + yabove_row[c] - ytop_left;
+
+          if (pred < 0)
+            pred = 0;
+
+          if (pred > 255)
+            pred = 255;
+
+          ypred_ptr[c] = pred;
+        }
+
+        ypred_ptr += y_stride;
+      }
+    }
+    break;
+    case D45_PRED: {
+      d45_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+    }
+    break;
+    case D135_PRED: {
+      d135_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+    }
+    break;
+    case D117_PRED: {
+      d117_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+    }
+    break;
+    case D153_PRED: {
+      d153_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+    }
+    break;
+    case D27_PRED: {
+      d27_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+    }
+    break;
+    case D63_PRED: {
+      d63_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col);
+    }
+    break;
+    case I8X8_PRED:
+    case B_PRED:
+    case NEARESTMV:
+    case NEARMV:
+    case ZEROMV:
+    case NEWMV:
+    case SPLITMV:
+    case MB_MODE_COUNT:
+      break;
+  }
+}
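+
+/* DC_PRED rounding, worked through: with bsize = 16 and both edges
+ * available, shift = log2_bsize_minus_1 + 1 + 1 = 5, so the 32 border
+ * samples average as (sum + 16) >> 5; with a single edge the shift drops to
+ * 4 and the 16 samples still divide out exactly. */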
+
+void vp9_build_intra_predictors_mby(MACROBLOCKD *xd) {
+  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->predictor, 16,
+                                      xd->mode_info_context->mbmi.mode, 16,
+                                      xd->up_available, xd->left_available);
+}
+
+void vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd) {
+  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->mode_info_context->mbmi.mode, 16,
+                                      xd->up_available, xd->left_available);
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd) {
+  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->mode_info_context->mbmi.mode, 32,
+                                      xd->up_available, xd->left_available);
+}
+#endif
+
+#if CONFIG_COMP_INTRA_PRED
+void vp9_build_comp_intra_predictors_mby(MACROBLOCKD *xd) {
+  unsigned char predictor[2][256];
+  int i;
+
+  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+                                      predictor[0], 16,
+                                      xd->mode_info_context->mbmi.mode,
+                                      16, xd->up_available,
+                                      xd->left_available);
+  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+                                      predictor[1], 16,
+                                      xd->mode_info_context->mbmi.second_mode,
+                                      16, xd->up_available,
+                                      xd->left_available);
+
+  for (i = 0; i < 256; i++) {
+    xd->predictor[i] = (predictor[0][i] + predictor[1][i] + 1) >> 1;
+  }
+}
+#endif
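+
+/* Compound-prediction rounding, worked through: the two predictors are
+ * merged with the rounded average used throughout this file, e.g.
+ * (200 + 101 + 1) >> 1 = 151; the +1 makes ties round up rather than
+ * truncate. */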
+
+void vp9_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd,
+                                              unsigned char *upred_ptr,
+                                              unsigned char *vpred_ptr,
+                                              int uv_stride,
+                                              int mode, int bsize) {
+  vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
+                                      upred_ptr, uv_stride, mode, bsize,
+                                      xd->up_available, xd->left_available);
+  vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,
+                                      vpred_ptr, uv_stride, mode, bsize,
+                                      xd->up_available, xd->left_available);
+}
+
+void vp9_build_intra_predictors_mbuv(MACROBLOCKD *xd) {
+  vp9_build_intra_predictors_mbuv_internal(xd, &xd->predictor[256],
+                                           &xd->predictor[320], 8,
+                                           xd->mode_info_context->mbmi.uv_mode,
+                                           8);
+}
+
+void vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd) {
+  vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
+                                           xd->dst.v_buffer,
+                                           xd->dst.uv_stride,
+                                           xd->mode_info_context->mbmi.uv_mode,
+                                           8);
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd) {
+  vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
+                                           xd->dst.v_buffer, xd->dst.uv_stride,
+                                           xd->mode_info_context->mbmi.uv_mode,
+                                           16);
+}
+#endif
+
+#if CONFIG_COMP_INTRA_PRED
+void vp9_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) {
+  unsigned char predictor[2][2][64];
+  int i;
+
+  vp9_build_intra_predictors_mbuv_internal(
+    xd, predictor[0][0], predictor[1][0], 8,
+    xd->mode_info_context->mbmi.uv_mode, 8);
+  vp9_build_intra_predictors_mbuv_internal(
+    xd, predictor[0][1], predictor[1][1], 8,
+    xd->mode_info_context->mbmi.second_uv_mode, 8);
+  for (i = 0; i < 64; i++) {
+    xd->predictor[256 + i] = (predictor[0][0][i] + predictor[0][1][i] + 1) >> 1;
+    xd->predictor[256 + 64 + i] = (predictor[1][0][i] +
+                                   predictor[1][1][i] + 1) >> 1;
+  }
+}
+#endif
+
+void vp9_intra8x8_predict(BLOCKD *xd,
+                          int mode,
+                          unsigned char *predictor) {
+  vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
+                                      xd->dst_stride, predictor, 16,
+                                      mode, 8, 1, 1);
+}
+
+#if CONFIG_COMP_INTRA_PRED
+void vp9_comp_intra8x8_predict(BLOCKD *xd,
+                               int mode, int second_mode,
+                               unsigned char *out_predictor) {
+  unsigned char predictor[2][8 * 16];
+  int i, j;
+
+  vp9_intra8x8_predict(xd, mode, predictor[0]);
+  vp9_intra8x8_predict(xd, second_mode, predictor[1]);
+
+  for (i = 0; i < 8 * 16; i += 16) {
+    for (j = i; j < i + 8; j++) {
+      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
+    }
+  }
+}
+#endif
+
+void vp9_intra_uv4x4_predict(BLOCKD *xd,
+                             int mode,
+                             unsigned char *predictor) {
+  vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
+                                      xd->dst_stride, predictor, 8,
+                                      mode, 4, 1, 1);
+}
+
+#if CONFIG_COMP_INTRA_PRED
+void vp9_comp_intra_uv4x4_predict(BLOCKD *xd,
+                                  int mode, int mode2,
+                                  unsigned char *out_predictor) {
+  unsigned char predictor[2][8 * 4];
+  int i, j;
+
+  vp9_intra_uv4x4_predict(xd, mode, predictor[0]);
+  vp9_intra_uv4x4_predict(xd, mode2, predictor[1]);
+
+  for (i = 0; i < 4 * 8; i += 8) {
+    for (j = i; j < i + 4; j++) {
+      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
+    }
+  }
+}
+#endif
+
+/* TODO: try different ways of using the Y-UV mode correlation.
+ * The current code assumes that a UV 4x4 block uses the same mode
+ * as the corresponding Y 8x8 area.
+ */
--- /dev/null
+++ b/vp9/common/reconintra.h
@@ -1,0 +1,18 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_RECONINTRA_H
+#define __INC_RECONINTRA_H
+
+#include "blockd.h"
+
+extern void init_intra_left_above_pixels(MACROBLOCKD *xd);
+
+#endif  // __INC_RECONINTRA_H
--- /dev/null
+++ b/vp9/common/reconintra4x4.c
@@ -1,0 +1,321 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "reconintra.h"
+#include "vpx_rtcd.h"
+
+void vp9_intra4x4_predict_c(BLOCKD *x, int b_mode,
+                            unsigned char *predictor) {
+  int i, r, c;
+
+  unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride;
+  unsigned char Left[4];
+  unsigned char top_left = Above[-1];
+
+  Left[0] = (*(x->base_dst))[x->dst - 1];
+  Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
+  Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
+  Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];
+
+  switch (b_mode) {
+    case B_DC_PRED: {
+      int expected_dc = 0;
+
+      for (i = 0; i < 4; i++) {
+        expected_dc += Above[i];
+        expected_dc += Left[i];
+      }
+
+      expected_dc = (expected_dc + 4) >> 3;
+
+      for (r = 0; r < 4; r++) {
+        for (c = 0; c < 4; c++) {
+          predictor[c] = expected_dc;
+        }
+
+        predictor += 16;
+      }
+    }
+    break;
+    case B_TM_PRED: {
+      /* prediction similar to true_motion prediction */
+      for (r = 0; r < 4; r++) {
+        for (c = 0; c < 4; c++) {
+          int pred = Above[c] - top_left + Left[r];
+
+          if (pred < 0)
+            pred = 0;
+
+          if (pred > 255)
+            pred = 255;
+
+          predictor[c] = pred;
+        }
+
+        predictor += 16;
+      }
+    }
+    break;
+
+    case B_VE_PRED: {
+      unsigned int ap[4];
+
+      ap[0] = Above[0];
+      ap[1] = Above[1];
+      ap[2] = Above[2];
+      ap[3] = Above[3];
+
+      for (r = 0; r < 4; r++) {
+        for (c = 0; c < 4; c++) {
+          predictor[c] = ap[c];
+        }
+
+        predictor += 16;
+      }
+    }
+    break;
+
+    case B_HE_PRED: {
+      unsigned int lp[4];
+      lp[0] = Left[0];
+      lp[1] = Left[1];
+      lp[2] = Left[2];
+      lp[3] = Left[3];
+
+      for (r = 0; r < 4; r++) {
+        for (c = 0; c < 4; c++) {
+          predictor[c] = lp[r];
+        }
+
+        predictor += 16;
+      }
+    }
+    break;
+    case B_LD_PRED: {
+      unsigned char *ptr = Above;
+      predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
+      predictor[0 * 16 + 1] =
+        predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
+      predictor[0 * 16 + 2] =
+        predictor[1 * 16 + 1] =
+          predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
+      predictor[0 * 16 + 3] =
+        predictor[1 * 16 + 2] =
+          predictor[2 * 16 + 1] =
+            predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
+      predictor[1 * 16 + 3] =
+        predictor[2 * 16 + 2] =
+          predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
+      predictor[2 * 16 + 3] =
+        predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
+      predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
+    }
+    break;
+    case B_RD_PRED: {
+
+      unsigned char pp[9];
+
+      pp[0] = Left[3];
+      pp[1] = Left[2];
+      pp[2] = Left[1];
+      pp[3] = Left[0];
+      pp[4] = top_left;
+      pp[5] = Above[0];
+      pp[6] = Above[1];
+      pp[7] = Above[2];
+      pp[8] = Above[3];
+
+      predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+      predictor[3 * 16 + 1] =
+        predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+      predictor[3 * 16 + 2] =
+        predictor[2 * 16 + 1] =
+          predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+      predictor[3 * 16 + 3] =
+        predictor[2 * 16 + 2] =
+          predictor[1 * 16 + 1] =
+            predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+      predictor[2 * 16 + 3] =
+        predictor[1 * 16 + 2] =
+          predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+      predictor[1 * 16 + 3] =
+        predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+      predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+    }
+    break;
+    case B_VR_PRED: {
+
+      unsigned char pp[9];
+
+      pp[0] = Left[3];
+      pp[1] = Left[2];
+      pp[2] = Left[1];
+      pp[3] = Left[0];
+      pp[4] = top_left;
+      pp[5] = Above[0];
+      pp[6] = Above[1];
+      pp[7] = Above[2];
+      pp[8] = Above[3];
+
+      predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+      predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+      predictor[3 * 16 + 1] =
+        predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+      predictor[2 * 16 + 1] =
+        predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
+      predictor[3 * 16 + 2] =
+        predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+      predictor[2 * 16 + 2] =
+        predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
+      predictor[3 * 16 + 3] =
+        predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+      predictor[2 * 16 + 3] =
+        predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
+      predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+      predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
+    }
+    break;
+    case B_VL_PRED: {
+
+      unsigned char *pp = Above;
+
+      predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
+      predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+      predictor[2 * 16 + 0] =
+        predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
+      predictor[1 * 16 + 1] =
+        predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+      predictor[2 * 16 + 1] =
+        predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
+      predictor[3 * 16 + 1] =
+        predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+      predictor[0 * 16 + 3] =
+        predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
+      predictor[1 * 16 + 3] =
+        predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+      predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+      predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+    }
+    break;
+
+    case B_HD_PRED: {
+      unsigned char pp[9];
+      pp[0] = Left[3];
+      pp[1] = Left[2];
+      pp[2] = Left[1];
+      pp[3] = Left[0];
+      pp[4] = top_left;
+      pp[5] = Above[0];
+      pp[6] = Above[1];
+      pp[7] = Above[2];
+      pp[8] = Above[3];
+
+      predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
+      predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+      predictor[2 * 16 + 0] =
+        predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
+      predictor[2 * 16 + 1] =
+        predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+      predictor[2 * 16 + 2] =
+        predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
+      predictor[2 * 16 + 3] =
+        predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+      predictor[1 * 16 + 2] =
+        predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
+      predictor[1 * 16 + 3] =
+        predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+      predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+      predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+    }
+    break;
+
+    case B_HU_PRED: {
+      unsigned char *pp = Left;
+      predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
+      predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+      predictor[0 * 16 + 2] =
+        predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
+      predictor[0 * 16 + 3] =
+        predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+      predictor[1 * 16 + 2] =
+        predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
+      predictor[1 * 16 + 3] =
+        predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
+      predictor[2 * 16 + 2] =
+        predictor[2 * 16 + 3] =
+          predictor[3 * 16 + 0] =
+            predictor[3 * 16 + 1] =
+              predictor[3 * 16 + 2] =
+                predictor[3 * 16 + 3] = pp[3];
+    }
+    break;
+  }
+}
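+
+/* 3-tap smoothing, worked through: the directional modes above share the
+ * filter (a + 2 * b + c + 2) >> 2.  With a = 10, b = 20, c = 30 this gives
+ * (10 + 40 + 30 + 2) >> 2 = 20, i.e. a [1 2 1] / 4 blur centred on b. */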
+
+#if CONFIG_COMP_INTRA_PRED
+void vp9_comp_intra4x4_predict_c(BLOCKD *x,
+                                 int b_mode, int b_mode2,
+                                 unsigned char *out_predictor) {
+  unsigned char predictor[2][4 * 16];
+  int i, j;
+
+  vp9_intra4x4_predict(x, b_mode, predictor[0]);
+  vp9_intra4x4_predict(x, b_mode2, predictor[1]);
+
+  for (i = 0; i < 16 * 4; i += 16) {
+    for (j = i; j < i + 4; j++) {
+      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
+    }
+  }
+}
+#endif
+
+/* Copy 4 bytes from above-right downward so that the 4x4 prediction modes
+ * that use pixels above and to the right have valid pixels to work from.
+ */
+void vp9_intra_prediction_down_copy(MACROBLOCKD *xd) {
+  int extend_edge = (xd->mb_to_right_edge == 0 && xd->mb_index < 2);
+  unsigned char *above_right = *(xd->block[0].base_dst) + xd->block[0].dst -
+                               xd->block[0].dst_stride + 16;
+  unsigned int *src_ptr = (unsigned int *)
+      (above_right - (xd->mb_index == 3 ? 16 * xd->block[0].dst_stride : 0));
+
+  unsigned int *dst_ptr0 = (unsigned int *)above_right;
+  unsigned int *dst_ptr1 =
+    (unsigned int *)(above_right + 4 * xd->block[0].dst_stride);
+  unsigned int *dst_ptr2 =
+    (unsigned int *)(above_right + 8 * xd->block[0].dst_stride);
+  unsigned int *dst_ptr3 =
+    (unsigned int *)(above_right + 12 * xd->block[0].dst_stride);
+
+  if (extend_edge) {
+    *src_ptr = ((uint8_t *) src_ptr)[-1] * 0x01010101U;
+  }
+
+  *dst_ptr0 = *src_ptr;
+  *dst_ptr1 = *src_ptr;
+  *dst_ptr2 = *src_ptr;
+  *dst_ptr3 = *src_ptr;
+}
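+
+/* Byte-replication trick, worked through: multiplying a byte by 0x01010101U
+ * copies it into all four lanes of a 32-bit word (0xAB * 0x01010101 ==
+ * 0xABABABAB), which is how the edge pixel is splatted when no above-right
+ * macroblock exists. */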
--- /dev/null
+++ b/vp9/common/reconintra4x4.h
@@ -1,0 +1,17 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECONINTRA4x4_H
+#define __INC_RECONINTRA4x4_H
+
+extern void vp9_intra_prediction_down_copy(MACROBLOCKD *xd);
+
+#endif  // __INC_RECONINTRA4x4_H
--- /dev/null
+++ b/vp9/common/rtcd.c
@@ -1,0 +1,105 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "vpx_config.h"
+#define RTCD_C
+#include "vpx_rtcd.h"
+
+#if CONFIG_MULTITHREAD && defined(_WIN32)
+#include <windows.h>
+#include <stdlib.h>
+static void once(void (*func)(void))
+{
+    static CRITICAL_SECTION *lock;
+    static LONG waiters;
+    static int done;
+    void *lock_ptr = &lock;
+
+    /* If the initialization is complete, return early. This isn't just an
+     * optimization, it prevents races on the destruction of the global
+     * lock.
+     */
+    if (done)
+        return;
+
+    InterlockedIncrement(&waiters);
+
+    /* Get a lock. We create one and try to make it the one-true-lock,
+     * throwing it away if we lost the race.
+     */
+
+    {
+        /* Scope to protect access to new_lock */
+        CRITICAL_SECTION *new_lock = malloc(sizeof(CRITICAL_SECTION));
+        InitializeCriticalSection(new_lock);
+        if (InterlockedCompareExchangePointer(lock_ptr, new_lock, NULL) != NULL)
+        {
+            DeleteCriticalSection(new_lock);
+            free(new_lock);
+        }
+    }
+
+    /* At this point, we have a lock that can be synchronized on. We don't
+     * care which thread actually performed the allocation.
+     */
+
+    EnterCriticalSection(lock);
+
+    if (!done)
+    {
+        func();
+        done = 1;
+    }
+
+    LeaveCriticalSection(lock);
+
+    /* Last one out should free resources. The destructed objects are
+     * protected by checking if(done) above.
+     */
+    if (!InterlockedDecrement(&waiters))
+    {
+        DeleteCriticalSection(lock);
+        free(lock);
+        lock = NULL;
+    }
+}
+
+
+#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
+#include <pthread.h>
+static void once(void (*func)(void))
+{
+    static pthread_once_t lock = PTHREAD_ONCE_INIT;
+    pthread_once(&lock, func);
+}
+
+
+#else
+/* No-op version that performs no synchronization. vpx_rtcd() is idempotent,
+ * so as long as your platform provides atomic loads/stores of pointers
+ * no synchronization is strictly necessary.
+ */
+
+static void once(void (*func)(void))
+{
+    static int done;
+
+    if (!done)
+    {
+        func();
+        done = 1;
+    }
+}
+#endif
+
+
+void vpx_rtcd(void)
+{
+    once(setup_rtcd_internal);
+}
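+
+/* Usage sketch: vpx_rtcd() is idempotent and, with CONFIG_MULTITHREAD, safe
+ * to call from racing threads via once() above, so callers simply invoke it
+ * before the first codec call:
+ *
+ *   vpx_rtcd();   // runs setup_rtcd_internal() exactly once
+ */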
--- /dev/null
+++ b/vp9/common/rtcd_defs.sh
@@ -1,0 +1,482 @@
+common_forward_decls() {
+cat <<EOF
+
+struct loop_filter_info;
+struct blockd;
+struct macroblockd;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+EOF
+}
+forward_decls common_forward_decls
+
+prototype void vp9_filter_block2d_4x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
+prototype void vp9_filter_block2d_8x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
+prototype void vp9_filter_block2d_8x8_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
+prototype void vp9_filter_block2d_16x16_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
+
+# At the very least, MSVC 2008 has a compiler bug exhibited by this code;
+# the code compiles warning-free, but a disassembly of the generated code
+# shows bugs. To be on the safe side, the 4x4 SSE versions are only enabled
+# when compiling with gcc.
+if [ "$CONFIG_GCC" = "yes" ]; then
+    specialize vp9_filter_block2d_4x4_8 sse4_1 sse2
+fi
+specialize vp9_filter_block2d_8x4_8 ssse3 #sse4_1 sse2
+specialize vp9_filter_block2d_8x8_8 ssse3 #sse4_1 sse2
+specialize vp9_filter_block2d_16x16_8 ssse3 #sse4_1 sse2
+
+#
+# Dequant
+#
+prototype void vp9_dequantize_b "struct blockd *x"
+specialize vp9_dequantize_b mmx
+
+prototype void vp9_dequantize_b_2x2 "struct blockd *x"
+specialize vp9_dequantize_b_2x2
+
+prototype void vp9_dequant_dc_idct_add_y_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc, struct macroblockd *xd"
+specialize vp9_dequant_dc_idct_add_y_block_8x8
+
+prototype void vp9_dequant_idct_add_y_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, struct macroblockd *xd"
+specialize vp9_dequant_idct_add_y_block_8x8
+
+prototype void vp9_dequant_idct_add_uv_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, char *eobs, struct macroblockd *xd"
+specialize vp9_dequant_idct_add_uv_block_8x8
+
+prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
+specialize vp9_dequant_idct_add_16x16
+
+prototype void vp9_dequant_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
+specialize vp9_dequant_idct_add
+
+prototype void vp9_dequant_dc_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc"
+specialize vp9_dequant_dc_idct_add
+
+prototype void vp9_dequant_dc_idct_add_y_block "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc"
+specialize vp9_dequant_dc_idct_add_y_block mmx
+
+prototype void vp9_dequant_idct_add_y_block "short *q, short *dq, unsigned char *pre, unsigned char *dst, int stride, char *eobs"
+specialize vp9_dequant_idct_add_y_block mmx
+
+prototype void vp9_dequant_idct_add_uv_block "short *q, short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, char *eobs"
+specialize vp9_dequant_idct_add_uv_block mmx
+
+#
+# RECON
+#
+prototype void vp9_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp9_copy_mem16x16 mmx sse2 media neon dspr2
+vp9_copy_mem16x16_media=vp9_copy_mem16x16_v6
+vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
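+
+# Naming sketch: "specialize FN ext..." expects an implementation named
+# FN_<ext> for each listed extension; the assignments above override that
+# default when the actual symbol differs (the "media" flavor maps to the
+# armv6 routine).  The dspr2 line is the identity mapping, kept for clarity.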
+
+prototype void vp9_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp9_copy_mem8x8 mmx media neon dspr2
+vp9_copy_mem8x8_media=vp9_copy_mem8x8_v6
+vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
+
+prototype void vp9_intra4x4_predict "unsigned char *Above, unsigned char *yleft, int left_stride, B_PREDICTION_MODE b_mode, unsigned char *dst, int dst_stride, unsigned char top_left"
+specialize vp9_intra4x4_predict
+
+prototype void vp9_avg_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp9_avg_mem16x16
+
+prototype void vp9_avg_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp9_avg_mem8x8
+
+prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp9_copy_mem8x4 mmx media neon dspr2
+vp9_copy_mem8x4_media=vp9_copy_mem8x4_v6
+vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2
+
+prototype void vp9_recon_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
+specialize vp9_recon_b
+
+prototype void vp9_recon_uv_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
+specialize vp9_recon_uv_b
+
+prototype void vp9_recon2b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
+specialize vp9_recon2b sse2
+
+prototype void vp9_recon4b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
+specialize vp9_recon4b sse2
+
+prototype void vp9_recon_mb "struct macroblockd *x"
+specialize vp9_recon_mb
+
+prototype void vp9_recon_mby "struct macroblockd *x"
+specialize vp9_recon_mby
+
+prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
+specialize vp9_build_intra_predictors_mby_s
+
+prototype void vp9_build_intra_predictors_sby_s "struct macroblockd *x"
+specialize vp9_build_intra_predictors_sby_s
+
+prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x"
+specialize vp9_build_intra_predictors_sbuv_s
+
+prototype void vp9_build_intra_predictors_mby "struct macroblockd *x"
+specialize vp9_build_intra_predictors_mby
+
+prototype void vp9_build_comp_intra_predictors_mby "struct macroblockd *x"
+specialize vp9_build_comp_intra_predictors_mby
+
+prototype void vp9_build_intra_predictors_mbuv "struct macroblockd *x"
+specialize vp9_build_intra_predictors_mbuv
+
+prototype void vp9_build_intra_predictors_mbuv_s "struct macroblockd *x"
+specialize vp9_build_intra_predictors_mbuv_s
+
+prototype void vp9_build_comp_intra_predictors_mbuv "struct macroblockd *x"
+specialize vp9_build_comp_intra_predictors_mbuv
+
+prototype void vp9_intra4x4_predict "struct blockd *x, int b_mode, unsigned char *predictor"
+specialize vp9_intra4x4_predict
+
+prototype void vp9_comp_intra4x4_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
+specialize vp9_comp_intra4x4_predict
+
+prototype void vp9_intra8x8_predict "struct blockd *x, int b_mode, unsigned char *predictor"
+specialize vp9_intra8x8_predict
+
+prototype void vp9_comp_intra8x8_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
+specialize vp9_comp_intra8x8_predict
+
+prototype void vp9_intra_uv4x4_predict "struct blockd *x, int b_mode, unsigned char *predictor"
+specialize vp9_intra_uv4x4_predict
+
+prototype void vp9_comp_intra_uv4x4_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
+specialize vp9_comp_intra_uv4x4_predict
+
+#
+# Loopfilter
+#
+prototype void vp9_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_mbv sse2
+
+prototype void vp9_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_bv sse2
+
+prototype void vp9_loop_filter_bv8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_bv8x8 sse2
+
+prototype void vp9_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_mbh sse2
+
+prototype void vp9_loop_filter_bh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_bh sse2
+
+prototype void vp9_loop_filter_bh8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_loop_filter_bh8x8 sse2
+
+prototype void vp9_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp9_loop_filter_simple_mbv mmx sse2 media neon
+vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
+vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
+vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2
+vp9_loop_filter_simple_mbv_media=vp9_loop_filter_simple_vertical_edge_armv6
+vp9_loop_filter_simple_mbv_neon=vp9_loop_filter_mbvs_neon
+
+prototype void vp9_loop_filter_simple_mbh "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp9_loop_filter_simple_mbh mmx sse2 media neon
+vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
+vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
+vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2
+vp9_loop_filter_simple_mbh_media=vp9_loop_filter_simple_horizontal_edge_armv6
+vp9_loop_filter_simple_mbh_neon=vp9_loop_filter_mbhs_neon
+
+prototype void vp9_loop_filter_simple_bv "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp9_loop_filter_simple_bv mmx sse2 media neon
+vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
+vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
+vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2
+vp9_loop_filter_simple_bv_media=vp9_loop_filter_bvs_armv6
+vp9_loop_filter_simple_bv_neon=vp9_loop_filter_bvs_neon
+
+prototype void vp9_loop_filter_simple_bh "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp9_loop_filter_simple_bh mmx sse2 media neon
+vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
+vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
+vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
+vp9_loop_filter_simple_bh_media=vp9_loop_filter_bhs_armv6
+vp9_loop_filter_simple_bh_neon=vp9_loop_filter_bhs_neon
+
+#
+# sad 16x3, 3x16
+#
+if [ "$CONFIG_NEWBESTREFMV" = "yes" ]; then
+prototype unsigned int vp9_sad16x3 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+specialize vp9_sad16x3 sse2
+
+prototype unsigned int vp9_sad3x16 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+specialize vp9_sad3x16 sse2
+fi
+
+#
+# Encoder functions below this point.
+#
+if [ "$CONFIG_VP9_ENCODER" = "yes" ]; then
+
+
+# variance
+[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
+
+prototype unsigned int vp9_variance32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance32x32
+
+prototype unsigned int vp9_variance16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance16x16 mmx sse2
+vp9_variance16x16_sse2=vp9_variance16x16_wmt
+vp9_variance16x16_mmx=vp9_variance16x16_mmx
+
+prototype unsigned int vp9_variance16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance16x8 mmx sse2
+vp9_variance16x8_sse2=vp9_variance16x8_wmt
+vp9_variance16x8_mmx=vp9_variance16x8_mmx
+
+prototype unsigned int vp9_variance8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance8x16 mmx sse2
+vp9_variance8x16_sse2=vp9_variance8x16_wmt
+vp9_variance8x16_mmx=vp9_variance8x16_mmx
+
+prototype unsigned int vp9_variance8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance8x8 mmx sse2
+vp9_variance8x8_sse2=vp9_variance8x8_wmt
+vp9_variance8x8_mmx=vp9_variance8x8_mmx
+
+prototype unsigned int vp9_variance4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance4x4 mmx sse2
+vp9_variance4x4_sse2=vp9_variance4x4_wmt
+vp9_variance4x4_mmx=vp9_variance4x4_mmx
+
+prototype unsigned int vp9_sub_pixel_variance32x32 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance32x32
+
+prototype unsigned int vp9_sub_pixel_variance16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
+vp9_sub_pixel_variance16x16_sse2=vp9_sub_pixel_variance16x16_wmt
+
+prototype unsigned int vp9_sub_pixel_variance8x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance8x16 sse2 mmx
+vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
+
+prototype unsigned int vp9_sub_pixel_variance16x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
+vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
+
+prototype unsigned int vp9_sub_pixel_variance8x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance8x8 sse2 mmx
+vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
+
+prototype unsigned int vp9_sub_pixel_variance4x4 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance4x4 sse2 mmx
+vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
+
+prototype unsigned int vp9_sad32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp9_sad32x32
+
+prototype unsigned int vp9_sad16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp9_sad16x16 mmx sse2 sse3
+vp9_sad16x16_sse2=vp9_sad16x16_wmt
+
+prototype unsigned int vp9_sad16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp9_sad16x8 mmx sse2
+vp9_sad16x8_sse2=vp9_sad16x8_wmt
+
+prototype unsigned int vp9_sad8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp9_sad8x16 mmx sse2
+vp9_sad8x16_sse2=vp9_sad8x16_wmt
+
+prototype unsigned int vp9_sad8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp9_sad8x8 mmx sse2
+vp9_sad8x8_sse2=vp9_sad8x8_wmt
+
+prototype unsigned int vp9_sad4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp9_sad4x4 mmx sse2
+vp9_sad4x4_sse2=vp9_sad4x4_wmt
+
+prototype unsigned int vp9_variance_halfpixvar16x16_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar16x16_h mmx sse2
+vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
+
+prototype unsigned int vp9_variance_halfpixvar16x16_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar16x16_v mmx sse2
+vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt
+
+prototype unsigned int vp9_variance_halfpixvar16x16_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar16x16_hv mmx sse2
+vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt
+
+prototype unsigned int vp9_variance_halfpixvar32x32_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar32x32_h
+
+prototype unsigned int vp9_variance_halfpixvar32x32_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar32x32_v
+
+prototype unsigned int vp9_variance_halfpixvar32x32_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar32x32_hv
+
+prototype void vp9_sad32x32x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad32x32x3
+
+prototype void vp9_sad16x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad16x16x3 sse3 ssse3
+
+prototype void vp9_sad16x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad16x8x3 sse3 ssse3
+
+prototype void vp9_sad8x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad8x16x3 sse3
+
+prototype void vp9_sad8x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad8x8x3 sse3
+
+prototype void vp9_sad4x4x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad4x4x3 sse3
+
+prototype void vp9_sad32x32x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp9_sad32x32x8
+
+prototype void vp9_sad16x16x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp9_sad16x16x8 sse4
+
+prototype void vp9_sad16x8x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp9_sad16x8x8 sse4
+
+prototype void vp9_sad8x16x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp9_sad8x16x8 sse4
+
+prototype void vp9_sad8x8x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp9_sad8x8x8 sse4
+
+prototype void vp9_sad4x4x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+specialize vp9_sad4x4x8 sse4
+
+prototype void vp9_sad32x32x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad32x32x4d
+
+prototype void vp9_sad16x16x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad16x16x4d sse3
+
+prototype void vp9_sad16x8x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad16x8x4d sse3
+
+prototype void vp9_sad8x16x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad8x16x4d sse3
+
+prototype void vp9_sad8x8x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad8x8x4d sse3
+
+prototype void vp9_sad4x4x4d "const unsigned char *src_ptr, int  src_stride, unsigned char *ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad4x4x4d sse3
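
The x3/x8/x4d variants batch several SAD evaluations per call so the motion search can amortize source loads across candidates. As an illustrative sketch (not code from this patch), an x4d entry point compares one source block against four candidate references at once:

    #include <stdlib.h>

    /* Illustrative C semantics of vp9_sad8x8x4d: one 8x8 source block,
     * four reference candidates, four SADs out. */
    static void sad8x8x4d_sketch(const unsigned char *src_ptr, int src_stride,
                                 unsigned char *ref_ptr[4], int ref_stride,
                                 unsigned int *sad_array) {
      int i, r, c;
      for (i = 0; i < 4; i++) {
        const unsigned char *src = src_ptr;
        const unsigned char *ref = ref_ptr[i];
        unsigned int sad = 0;
        for (r = 0; r < 8; r++) {
          for (c = 0; c < 8; c++)
            sad += abs(src[c] - ref[c]);
          src += src_stride;
          ref += ref_stride;
        }
        sad_array[i] = sad;
      }
    }
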
+
+#
+# Block copy
+#
+case $arch in
+    x86*)
+    prototype void vp9_copy32xn "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n"
+    specialize vp9_copy32xn sse2 sse3
+    ;;
+esac
+
+prototype unsigned int vp9_sub_pixel_mse16x16 "const unsigned char  *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
+specialize vp9_sub_pixel_mse16x16 sse2 mmx
+vp9_sub_pixel_mse16x16_sse2=vp9_sub_pixel_mse16x16_wmt
+
+prototype unsigned int vp9_mse16x16 "const unsigned char *src_ptr, int  source_stride, const unsigned char *ref_ptr, int  recon_stride, unsigned int *sse"
+specialize vp9_mse16x16 mmx sse2
+vp9_mse16x16_sse2=vp9_mse16x16_wmt
+
+prototype unsigned int vp9_sub_pixel_mse32x32 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_mse32x32
+
+prototype unsigned int vp9_get_mb_ss "const short *"
+specialize vp9_get_mb_ss mmx sse2
+
+# ENCODEMB INVOKE
+prototype int vp9_mbblock_error "struct macroblock *mb, int dc"
+specialize vp9_mbblock_error mmx sse2
+vp9_mbblock_error_sse2=vp9_mbblock_error_xmm
+
+prototype int vp9_block_error "short *coeff, short *dqcoeff, int block_size"
+specialize vp9_block_error mmx sse2
+vp9_block_error_sse2=vp9_block_error_xmm
+
+prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
+specialize vp9_subtract_b mmx sse2
+
+prototype int vp9_mbuverror "struct macroblock *mb"
+specialize vp9_mbuverror mmx sse2
+vp9_mbuverror_sse2=vp9_mbuverror_xmm
+
+prototype void vp9_subtract_mby "short *diff, unsigned char *src, unsigned char *pred, int stride"
+specialize vp9_subtract_mby mmx sse2
+
+prototype void vp9_subtract_mbuv "short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride"
+specialize vp9_subtract_mbuv mmx sse2
+
+#
+# Structured Similarity (SSIM)
+#
+if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
+    [ $arch = "x86_64" ] && sse2_on_x86_64=sse2
+
+    prototype void vp9_ssim_parms_8x8 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
+    specialize vp9_ssim_parms_8x8 $sse2_on_x86_64
+
+    prototype void vp9_ssim_parms_16x16 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
+    specialize vp9_ssim_parms_16x16 $sse2_on_x86_64
+fi
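
The vp9_ssim_parms_* kernels only accumulate the window sums (sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr); the SSIM score itself is formed from those sums by the internal-stats code using the standard definition, reproduced here for reference:

    \mathrm{SSIM}(s,r) = \frac{(2\mu_s\mu_r + C_1)(2\sigma_{sr} + C_2)}
                              {(\mu_s^2 + \mu_r^2 + C_1)(\sigma_s^2 + \sigma_r^2 + C_2)}

where, over the N pixels of the window, \mu_s = \tfrac{1}{N}\sum s, \sigma_s^2 = \tfrac{1}{N}\sum s^2 - \mu_s^2, and \sigma_{sr} = \tfrac{1}{N}\sum sr - \mu_s\mu_r, which is exactly what the five accumulated sums provide.
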
+
+# fdct functions
+prototype void vp9_fht "const short *input, int pitch, short *output, int tx_type, int tx_dim"
+specialize vp9_fht
+
+prototype void vp9_short_fdct8x8 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_fdct8x8
+
+prototype void vp9_short_fhaar2x2 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_fhaar2x2
+
+prototype void vp9_short_fdct4x4 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_fdct4x4
+
+prototype void vp9_short_fdct8x4 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_fdct8x4
+
+prototype void vp9_short_walsh4x4 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_walsh4x4
+
+prototype void vp9_short_fdct16x16 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_fdct16x16
+
+prototype void vp9_short_walsh4x4_lossless "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_walsh4x4_lossless
+
+prototype void vp9_short_walsh4x4_x8 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_walsh4x4_x8
+
+prototype void vp9_short_walsh8x4_x8 "short *InputData, short *OutputData, int pitch"
+specialize vp9_short_walsh8x4_x8
+
+fi
+# end encoder functions
--- /dev/null
+++ b/vp9/common/sadmxn.h
@@ -1,0 +1,37 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_SAD_H
+#define __INC_SAD_H
+
+#include <stdlib.h>  /* for abs() */
+
+static __inline
+unsigned int sad_mx_n_c(
+  const unsigned char *src_ptr,
+  int  src_stride,
+  const unsigned char *ref_ptr,
+  int  ref_stride,
+  int m,
+  int n) {
+  int r, c;
+  unsigned int sad = 0;
+
+  for (r = 0; r < n; r++) {
+    for (c = 0; c < m; c++) {
+      sad += abs(src_ptr[c] - ref_ptr[c]);
+    }
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+
+  return sad;
+}
+
+#endif
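
A minimal usage sketch for the helper above (buffers, strides, and values invented for illustration):

    #include <stdio.h>
    #include "vp9/common/sadmxn.h"

    int main(void) {
      /* Two 4x4 blocks at the origin of 8-byte-stride buffers. */
      unsigned char src[32] = {0};
      unsigned char ref[32] = {0};
      ref[0] = 10;  /* one differing pixel -> expected SAD of 10 */
      printf("%u\n", sad_mx_n_c(src, 8, ref, 8, 4, 4));
      return 0;
    }
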
--- /dev/null
+++ b/vp9/common/seg_common.c
@@ -1,0 +1,103 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/seg_common.h"
+
+static const int segfeaturedata_signed[SEG_LVL_MAX] = { 1, 1, 0, 0, 0, 0 };
+static const int seg_feature_data_bits[SEG_LVL_MAX] = { QINDEX_BITS, 6, 4, 4, 6, 2 };
+
+// These functions provide access to new segment level features.
+// Eventually these functions may be "optimized out" but, for the moment,
+// the coding mechanism is still subject to change, so these provide a
+// convenient single point of change.
+
+int vp9_segfeature_active(const MACROBLOCKD *xd,
+                          int segment_id,
+                          SEG_LVL_FEATURES feature_id) {
+  // Return true if mask bit set and segmentation enabled.
+  return (xd->segmentation_enabled &&
+          (xd->segment_feature_mask[segment_id] &
+           (0x01 << feature_id)));
+}
+
+void vp9_clearall_segfeatures(MACROBLOCKD *xd) {
+  vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+  vpx_memset(xd->segment_feature_mask, 0, sizeof(xd->segment_feature_mask));
+}
+
+void vp9_enable_segfeature(MACROBLOCKD *xd,
+                           int segment_id,
+                           SEG_LVL_FEATURES feature_id) {
+  xd->segment_feature_mask[segment_id] |= (0x01 << feature_id);
+}
+
+void vp9_disable_segfeature(MACROBLOCKD *xd,
+                            int segment_id,
+                            SEG_LVL_FEATURES feature_id) {
+  xd->segment_feature_mask[segment_id] &= ~(1 << feature_id);
+}
+
+int vp9_seg_feature_data_bits(SEG_LVL_FEATURES feature_id) {
+  return seg_feature_data_bits[feature_id];
+}
+
+int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
+  return (segfeaturedata_signed[feature_id]);
+}
+
+void vp9_clear_segdata(MACROBLOCKD *xd,
+                       int segment_id,
+                       SEG_LVL_FEATURES feature_id) {
+  xd->segment_feature_data[segment_id][feature_id] = 0;
+}
+
+void vp9_set_segdata(MACROBLOCKD *xd,
+                     int segment_id,
+                     SEG_LVL_FEATURES feature_id,
+                     int seg_data) {
+  xd->segment_feature_data[segment_id][feature_id] = seg_data;
+}
+
+int vp9_get_segdata(const MACROBLOCKD *xd,
+                    int segment_id,
+                    SEG_LVL_FEATURES feature_id) {
+  return xd->segment_feature_data[segment_id][feature_id];
+}
+
+void vp9_clear_segref(MACROBLOCKD *xd, int segment_id) {
+  xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] = 0;
+}
+
+void vp9_set_segref(MACROBLOCKD *xd,
+                    int segment_id,
+                    MV_REFERENCE_FRAME ref_frame) {
+  xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] |=
+    (1 << ref_frame);
+}
+
+int vp9_check_segref(const MACROBLOCKD *xd,
+                     int segment_id,
+                     MV_REFERENCE_FRAME ref_frame) {
+  return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] &
+          (1 << ref_frame)) ? 1 : 0;
+}
+
+int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id) {
+  return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] &
+          ~(1 << INTRA_FRAME)) ? 1 : 0;
+}
+
+int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id) {
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_TRANSFORM))
+    return vp9_get_segdata(xd, segment_id, SEG_LVL_TRANSFORM);
+  else
+    return TX_4X4;
+}
+// TBD? Functions to read and write segment data with range / validity checking
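
A hedged sketch of how a caller drives these accessors (the SEG_LVL_ALT_Q feature id is assumed to come from the feature enum in blockd.h; the sequence is illustrative, not code from this patch):

    #include "vp9/common/seg_common.h"

    /* Mark segment 1 as carrying an alternate quantizer delta of -16,
     * then read it back only when the feature is active.
     * (SEG_LVL_ALT_Q: assumed feature id from blockd.h.) */
    static void segfeature_usage_sketch(MACROBLOCKD *xd) {
      vp9_clearall_segfeatures(xd);
      vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
      vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, -16);
      if (vp9_segfeature_active(xd, 1, SEG_LVL_ALT_Q)) {
        const int qdelta = vp9_get_segdata(xd, 1, SEG_LVL_ALT_Q);
        (void)qdelta;
      }
    }
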
--- /dev/null
+++ b/vp9/common/seg_common.h
@@ -1,0 +1,64 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "type_aliases.h"
+#include "onyxc_int.h"
+#include "vp9/common/blockd.h"
+
+#ifndef __INC_SEG_COMMON_H__
+#define __INC_SEG_COMMON_H__ 1
+
+int vp9_segfeature_active(const MACROBLOCKD *xd,
+                          int segment_id,
+                          SEG_LVL_FEATURES feature_id);
+
+void vp9_clearall_segfeatures(MACROBLOCKD *xd);
+
+void vp9_enable_segfeature(MACROBLOCKD *xd,
+                           int segment_id,
+                           SEG_LVL_FEATURES feature_id);
+
+void vp9_disable_segfeature(MACROBLOCKD *xd,
+                            int segment_id,
+                            SEG_LVL_FEATURES feature_id);
+
+int vp9_seg_feature_data_bits(SEG_LVL_FEATURES feature_id);
+
+int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id);
+
+void vp9_clear_segdata(MACROBLOCKD *xd,
+                       int segment_id,
+                       SEG_LVL_FEATURES feature_id);
+
+void vp9_set_segdata(MACROBLOCKD *xd,
+                     int segment_id,
+                     SEG_LVL_FEATURES feature_id,
+                     int seg_data);
+
+int vp9_get_segdata(const MACROBLOCKD *xd,
+                    int segment_id,
+                    SEG_LVL_FEATURES feature_id);
+
+void vp9_clear_segref(MACROBLOCKD *xd, int segment_id);
+
+void vp9_set_segref(MACROBLOCKD *xd,
+                    int segment_id,
+                    MV_REFERENCE_FRAME ref_frame);
+
+int vp9_check_segref(const MACROBLOCKD *xd,
+                     int segment_id,
+                     MV_REFERENCE_FRAME ref_frame);
+
+int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id);
+
+int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id);
+
+#endif /* __INC_SEG_COMMON_H__ */
+
--- /dev/null
+++ b/vp9/common/setupintrarecon.c
@@ -1,0 +1,31 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "setupintrarecon.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) {
+  int i;
+
+  /* set up the new frame's borders for intra coded blocks */
+  vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+  for (i = 0; i < ybf->y_height; i++)
+    ybf->y_buffer[ybf->y_stride * i - 1] = (unsigned char) 129;
+
+  vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+  for (i = 0; i < ybf->uv_height; i++)
+    ybf->u_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129;
+
+  vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+  for (i = 0; i < ybf->uv_height; i++)
+    ybf->v_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129;
+
+}
--- /dev/null
+++ b/vp9/common/setupintrarecon.h
@@ -1,0 +1,13 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+extern void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
--- /dev/null
+++ b/vp9/common/subpixel.h
@@ -1,0 +1,204 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef SUBPIXEL_H
+#define SUBPIXEL_H
+
+#define prototype_subpixel_predict(sym) \
+  void sym(unsigned char *src, int src_pitch, int xofst, int yofst, \
+           unsigned char *dst, int dst_pitch)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/subpixel_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/subpixel_arm.h"
+#endif
+
+#ifndef vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap16x16);
+
+#ifndef vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap8x8);
+
+#ifndef vp9_subpix_sixtap_avg16x16
+#define vp9_subpix_sixtap_avg16x16 vp9_sixtap_predict_avg16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap_avg16x16);
+
+#ifndef vp9_subpix_sixtap_avg8x8
+#define vp9_subpix_sixtap_avg8x8 vp9_sixtap_predict_avg8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap_avg8x8);
+
+#ifndef vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap8x4);
+
+#ifndef vp9_subpix_sixtap4x4
+#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap4x4);
+
+#ifndef vp9_subpix_sixtap_avg4x4
+#define vp9_subpix_sixtap_avg4x4 vp9_sixtap_predict_avg_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_sixtap_avg4x4);
+
+#ifndef vp9_subpix_eighttap16x16
+#define vp9_subpix_eighttap16x16 vp9_eighttap_predict16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap16x16);
+
+#ifndef vp9_subpix_eighttap8x8
+#define vp9_subpix_eighttap8x8 vp9_eighttap_predict8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap8x8);
+
+#ifndef vp9_subpix_eighttap_avg16x16
+#define vp9_subpix_eighttap_avg16x16 vp9_eighttap_predict_avg16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg16x16);
+
+#ifndef vp9_subpix_eighttap_avg8x8
+#define vp9_subpix_eighttap_avg8x8 vp9_eighttap_predict_avg8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg8x8);
+
+#ifndef vp9_subpix_eighttap8x4
+#define vp9_subpix_eighttap8x4 vp9_eighttap_predict8x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap8x4);
+
+#ifndef vp9_subpix_eighttap4x4
+#define vp9_subpix_eighttap4x4 vp9_eighttap_predict_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap4x4);
+
+#ifndef vp9_subpix_eighttap_avg4x4
+#define vp9_subpix_eighttap_avg4x4 vp9_eighttap_predict_avg4x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg4x4);
+
+#ifndef vp9_subpix_eighttap16x16_sharp
+#define vp9_subpix_eighttap16x16_sharp vp9_eighttap_predict16x16_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap16x16_sharp);
+
+#ifndef vp9_subpix_eighttap8x8_sharp
+#define vp9_subpix_eighttap8x8_sharp vp9_eighttap_predict8x8_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap8x8_sharp);
+
+#ifndef vp9_subpix_eighttap_avg16x16_sharp
+#define vp9_subpix_eighttap_avg16x16_sharp vp9_eighttap_predict_avg16x16_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg16x16_sharp);
+
+#ifndef vp9_subpix_eighttap_avg8x8_sharp
+#define vp9_subpix_eighttap_avg8x8_sharp vp9_eighttap_predict_avg8x8_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg8x8_sharp);
+
+#ifndef vp9_subpix_eighttap8x4_sharp
+#define vp9_subpix_eighttap8x4_sharp vp9_eighttap_predict8x4_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap8x4_sharp);
+
+#ifndef vp9_subpix_eighttap4x4_sharp
+#define vp9_subpix_eighttap4x4_sharp vp9_eighttap_predict_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap4x4_sharp);
+
+#ifndef vp9_subpix_eighttap_avg4x4_sharp
+#define vp9_subpix_eighttap_avg4x4_sharp vp9_eighttap_predict_avg4x4_sharp_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_eighttap_avg4x4_sharp);
+
+#ifndef vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear16x16);
+
+#ifndef vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear8x8);
+
+#ifndef vp9_subpix_bilinear_avg16x16
+#define vp9_subpix_bilinear_avg16x16 vp9_bilinear_predict_avg16x16_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear_avg16x16);
+
+#ifndef vp9_subpix_bilinear_avg8x8
+#define vp9_subpix_bilinear_avg8x8 vp9_bilinear_predict_avg8x8_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear_avg8x8);
+
+#ifndef vp9_subpix_bilinear8x4
+#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear8x4);
+
+#ifndef vp9_subpix_bilinear4x4
+#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear4x4);
+
+#ifndef vp9_subpix_bilinear_avg4x4
+#define vp9_subpix_bilinear_avg4x4 vp9_bilinear_predict_avg4x4_c
+#endif
+extern prototype_subpixel_predict(vp9_subpix_bilinear_avg4x4);
+
+typedef prototype_subpixel_predict((*vp9_subpix_fn_t));
+typedef struct {
+  vp9_subpix_fn_t  eighttap16x16;
+  vp9_subpix_fn_t  eighttap8x8;
+  vp9_subpix_fn_t  eighttap_avg16x16;
+  vp9_subpix_fn_t  eighttap_avg8x8;
+  vp9_subpix_fn_t  eighttap_avg4x4;
+  vp9_subpix_fn_t  eighttap8x4;
+  vp9_subpix_fn_t  eighttap4x4;
+  vp9_subpix_fn_t  eighttap16x16_sharp;
+  vp9_subpix_fn_t  eighttap8x8_sharp;
+  vp9_subpix_fn_t  eighttap_avg16x16_sharp;
+  vp9_subpix_fn_t  eighttap_avg8x8_sharp;
+  vp9_subpix_fn_t  eighttap_avg4x4_sharp;
+  vp9_subpix_fn_t  eighttap8x4_sharp;
+  vp9_subpix_fn_t  eighttap4x4_sharp;
+  vp9_subpix_fn_t  sixtap16x16;
+  vp9_subpix_fn_t  sixtap8x8;
+  vp9_subpix_fn_t  sixtap_avg16x16;
+  vp9_subpix_fn_t  sixtap_avg8x8;
+  vp9_subpix_fn_t  sixtap8x4;
+  vp9_subpix_fn_t  sixtap4x4;
+  vp9_subpix_fn_t  sixtap_avg4x4;
+  vp9_subpix_fn_t  bilinear16x16;
+  vp9_subpix_fn_t  bilinear8x8;
+  vp9_subpix_fn_t  bilinear_avg16x16;
+  vp9_subpix_fn_t  bilinear_avg8x8;
+  vp9_subpix_fn_t  bilinear8x4;
+  vp9_subpix_fn_t  bilinear4x4;
+  vp9_subpix_fn_t  bilinear_avg4x4;
+} vp9_subpix_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define SUBPIX_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define SUBPIX_INVOKE(ctx,fn) vp9_subpix_##fn
+#endif
+
+#endif
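
With CONFIG_RUNTIME_CPU_DETECT the SUBPIX_INVOKE macro routes calls through the vtable above; without it, the call collapses at compile time to the vp9_subpix_* default bound by the #ifndef ladder. A usage sketch (the sub-pel offsets here are arbitrary, for illustration only):

    #include "vp9/common/subpixel.h"

    static void subpix_usage_sketch(vp9_subpix_rtcd_vtable_t *subpix,
                                    unsigned char *src, int src_pitch,
                                    unsigned char *dst, int dst_pitch) {
      /* Predict a 4x4 block at sub-pel offset (2, 3) with the 6-tap filter.
       * Without runtime CPU detection, subpix is unused. */
      SUBPIX_INVOKE(subpix, sixtap4x4)(src, src_pitch, 2, 3, dst, dst_pitch);
    }
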
--- /dev/null
+++ b/vp9/common/swapyv12buffer.c
@@ -1,0 +1,32 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "swapyv12buffer.h"
+
+void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
+                          YV12_BUFFER_CONFIG *last_frame) {
+  unsigned char *temp;
+
+  temp = last_frame->buffer_alloc;
+  last_frame->buffer_alloc = new_frame->buffer_alloc;
+  new_frame->buffer_alloc = temp;
+
+  temp = last_frame->y_buffer;
+  last_frame->y_buffer = new_frame->y_buffer;
+  new_frame->y_buffer = temp;
+
+  temp = last_frame->u_buffer;
+  last_frame->u_buffer = new_frame->u_buffer;
+  new_frame->u_buffer = temp;
+
+  temp = last_frame->v_buffer;
+  last_frame->v_buffer = new_frame->v_buffer;
+  new_frame->v_buffer = temp;
+}
--- /dev/null
+++ b/vp9/common/swapyv12buffer.h
@@ -1,0 +1,19 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __SWAPYV12_BUFFER_H
+#define __SWAPYV12_BUFFER_H
+
+#include "vpx_scale/yv12config.h"
+
+void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
+                          YV12_BUFFER_CONFIG *last_frame);
+
+#endif  // __SWAPYV12_BUFFER_H
--- /dev/null
+++ b/vp9/common/systemdependent.h
@@ -1,0 +1,21 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#if ARCH_X86 || ARCH_X86_64
+void vpx_reset_mmx_state(void);
+#define vp9_clear_system_state() vpx_reset_mmx_state()
+#else
+#define vp9_clear_system_state()
+#endif
+
+struct VP9Common;
+void vp9_machine_specific_config(struct VP9Common *);
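
vp9_clear_system_state exists because the MMX registers alias the x87 floating-point stack: after any MMX kernel runs, the FPU must be reset (via EMMS, wrapped by vpx_reset_mmx_state) before C floating-point code executes. On non-x86 targets the macro compiles away. A usage sketch (the rate computation is invented for illustration):

    #include "vp9/common/systemdependent.h"

    /* Compute a floating-point rate after SIMD kernels may have run.
     * The reset is a no-op on builds without x86 MMX. */
    static double bits_per_frame_sketch(double total_bits, double frames) {
      vp9_clear_system_state();
      return total_bits / frames;
    }
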
--- /dev/null
+++ b/vp9/common/tapify.py
@@ -1,0 +1,106 @@
+"""
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+"""
+#!/usr/bin/env python
+import sys,string,os,re,math,numpy
+scale = 2**16
+def dist(p1,p2):
+  x1,y1 = p1
+  x2,y2 = p2
+  if x1==x2 and y1==y2 :
+    return 1.0 
+  return 1/ math.sqrt((x1-x2)*(x1-x2)+(y1-y2)*(y1-y2))
+
+def gettaps(p):
+  def l(b):
+    return int(math.floor(b))
+  def h(b):
+    return int(math.ceil(b))
+  def t(b,p,s):
+    return int((scale*dist(b,p)+s/2)/s)
+  r,c = p
+  ul=[l(r),l(c)]
+  ur=[l(r),h(c)]
+  ll=[h(r),l(c)]
+  lr=[h(r),h(c)]
+  sum = dist(ul,p)+dist(ur,p)+dist(ll,p)+dist(lr,p)
+  t4 = scale - t(ul,p,sum) - t(ur,p,sum) - t(ll,p,sum);
+  return [[ul,t(ul,p,sum)],[ur,t(ur,p,sum)],
+          [ll,t(ll,p,sum)],[lr,t4]]
+
+def print_mb_taps(angle,blocksize):
+  theta = angle / 57.2957795  # degrees to radians
+  affine = [[math.cos(theta),-math.sin(theta)],
+            [math.sin(theta),math.cos(theta)]]
+  radius = (float(blocksize)-1)/2
+  print " // angle of",angle,"degrees"
+  for y in range(blocksize) :
+    for x in range(blocksize) :
+      r,c = numpy.dot(affine,[y-radius, x-radius])
+      tps = gettaps([r+radius,c+radius])
+      for t in tps :
+        p,t = t
+        tr,tc = p
+        print " %2d, %2d, %5d, " % (tr,tc,t,),
+      print " // %2d,%2d " % (y,x)
+
+i=float(sys.argv[1])
+while  i <= float(sys.argv[2]) :
+  print_mb_taps(i,float(sys.argv[4]))
+  i=i+float(sys.argv[3])
+"""
+
+taps = []
+pt=dict()
+ptr=dict()
+for y in range(16) :
+  for x in range(16) :
+    r,c = numpy.dot(affine,[y-7.5, x-7.5])
+    tps = gettaps([r+7.5,c+7.5])
+    j=0
+    for tp in tps : 
+      p,i = tp
+      r,c = p
+      pt[y,x,j]= [p,i]
+      try: 
+        ptr[r,j,c].append([y,x])
+      except:
+        ptr[r,j,c]=[[y,x]]
+      j = j+1 
+
+for key in sorted(pt.keys()) :
+  print key,pt[key]
+
+lr = -99
+lj = -99 
+lc = 0
+
+shuf=""
+mask=""
+for r,j,c in sorted(ptr.keys()) :
+  for y,x in ptr[r,j,c] :
+    if lr != r or lj != j :
+      print "shuf_"+str(lr)+"_"+str(lj)+"_"+shuf.ljust(16,"0"), lc
+      shuf=""
+      lc = 0
+    for i in range(lc,c-1) :
+      shuf = shuf +"0"
+    shuf = shuf + hex(x)[2]
+    lc =c
+    break
+  lr = r
+  lj = j
+#  print r,j,c,ptr[r,j,c]    
+#  print 
+
+for r,j,c in sorted(ptr.keys()) :
+  for y,x in ptr[r,j,c] :
+    print r,j,c,y,x 
+    break
+"""
--- /dev/null
+++ b/vp9/common/textblit.c
@@ -1,0 +1,116 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+
+void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) {
+  int letter_bitmap;
+  unsigned char *output_pos = address;
+  int colpos;
+  const int font[] = {
+    0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000,
+    0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110,
+    0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA,
+    0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20,
+    0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF,
+    0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F,
+    0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2,
+    0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731,
+    0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820
+  };
+  colpos = 0;
+
+  while (msg[colpos] != 0) {
+    char letter = msg[colpos];
+    int fontcol, fontrow;
+
+    if (letter <= 'Z' && letter >= ' ')
+      letter_bitmap = font[letter - ' '];
+    else if (letter <= 'z' && letter >= 'a')
+      letter_bitmap = font[letter - 'a' + 'A' - ' '];
+    else
+      letter_bitmap = font[0];
+
+    for (fontcol = 6; fontcol >= 0; fontcol--)
+      for (fontrow = 0; fontrow < 5; fontrow++)
+        output_pos[fontrow * pitch + fontcol] =
+          ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0);
+
+    output_pos += 7;
+    colpos++;
+  }
+}
+
+static void plot(const int x, const int y, unsigned char *image, const int pitch) {
+  image [x + y * pitch] ^= 255;
+}
+
+/* Bresenham line algorithm */
+void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch) {
+  int steep = abs(y1 - y0) > abs(x1 - x0);
+  int deltax, deltay;
+  int error, ystep, y, x;
+
+  if (steep) {
+    int t;
+    t = x0;
+    x0 = y0;
+    y0 = t;
+
+    t = x1;
+    x1 = y1;
+    y1 = t;
+  }
+
+  if (x0 > x1) {
+    int t;
+    t = x0;
+    x0 = x1;
+    x1 = t;
+
+    t = y0;
+    y0 = y1;
+    y1 = t;
+  }
+
+  deltax = x1 - x0;
+  deltay = abs(y1 - y0);
+  error  = deltax / 2;
+
+  y = y0;
+
+  if (y0 < y1)
+    ystep = 1;
+  else
+    ystep = -1;
+
+  if (steep) {
+    for (x = x0; x <= x1; x++) {
+      plot(y, x, image, pitch);
+
+      error = error - deltay;
+      if (error < 0) {
+        y = y + ystep;
+        error = error + deltax;
+      }
+    }
+  } else {
+    for (x = x0; x <= x1; x++) {
+      plot(x, y, image, pitch);
+
+      error = error - deltay;
+      if (error < 0) {
+        y = y + ystep;
+        error = error + deltax;
+      }
+    }
+  }
+}
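
Both helpers are debug-overlay utilities: vp9_blit_text stamps glyphs from a packed integer font table, and vp9_blit_line XOR-plots with Bresenham's algorithm, so drawing the same line twice erases it. Note the argument order is x0, x1, y0, y1. An illustrative caller (not from this patch):

    extern void vp9_blit_line(int x0, int x1, int y0, int y1,
                              unsigned char *image, const int pitch);

    /* Draw a motion vector from a block centre onto a debug luma plane. */
    static void draw_mv_sketch(unsigned char *y_plane, int y_stride,
                               int cx, int cy, int mv_x, int mv_y) {
      vp9_blit_line(cx, cx + mv_x, cy, cy + mv_y, y_plane, y_stride);
    }
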
--- /dev/null
+++ b/vp9/common/treecoder.c
@@ -1,0 +1,138 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+
+#if defined(CONFIG_DEBUG) && CONFIG_DEBUG
+#include <assert.h>
+#endif
+#include <stdio.h>
+
+#include "treecoder.h"
+
+static void tree2tok(
+  struct vp9_token_struct *const p,
+  vp9_tree t,
+  int i,
+  int v,
+  int L
+) {
+  v += v;
+  ++L;
+
+  do {
+    const vp9_tree_index j = t[i++];
+
+    if (j <= 0) {
+      p[-j].value = v;
+      p[-j].Len = L;
+    } else
+      tree2tok(p, t, j, v, L);
+  } while (++v & 1);
+}
+
+void vp9_tokens_from_tree(struct vp9_token_struct *p, vp9_tree t) {
+  tree2tok(p, t, 0, 0, 0);
+}
+
+void vp9_tokens_from_tree_offset(struct vp9_token_struct *p, vp9_tree t,
+                                 int offset) {
+  tree2tok(p - offset, t, 0, 0, 0);
+}
+
+static void branch_counts(
+  int n,                      /* n = size of alphabet */
+  vp9_token tok               [ /* n */ ],
+  vp9_tree tree,
+  unsigned int branch_ct       [ /* n-1 */ ] [2],
+  const unsigned int num_events[ /* n */ ]
+) {
+  const int tree_len = n - 1;
+  int t = 0;
+
+#if CONFIG_DEBUG
+  assert(tree_len);
+#endif
+
+  do {
+    branch_ct[t][0] = branch_ct[t][1] = 0;
+  } while (++t < tree_len);
+
+  t = 0;
+
+  do {
+    int L = tok[t].Len;
+    const int enc = tok[t].value;
+    const unsigned int ct = num_events[t];
+
+    vp9_tree_index i = 0;
+
+    do {
+      const int b = (enc >> --L) & 1;
+      const int j = i >> 1;
+#if CONFIG_DEBUG
+      assert(j < tree_len  &&  0 <= L);
+#endif
+
+      branch_ct [j] [b] += ct;
+      i = tree[ i + b];
+    } while (i > 0);
+
+#if CONFIG_DEBUG
+    assert(!L);
+#endif
+  } while (++t < n);
+
+}
+
+
+void vp9_tree_probs_from_distribution(
+  int n,                      /* n = size of alphabet */
+  vp9_token tok               [ /* n */ ],
+  vp9_tree tree,
+  vp9_prob probs          [ /* n-1 */ ],
+  unsigned int branch_ct       [ /* n-1 */ ] [2],
+  const unsigned int num_events[ /* n */ ],
+  unsigned int Pfac,
+  int rd
+) {
+  const int tree_len = n - 1;
+  int t = 0;
+
+  branch_counts(n, tok, tree, branch_ct, num_events);
+
+  do {
+    const unsigned int *const c = branch_ct[t];
+    const unsigned int tot = c[0] + c[1];
+
+#if CONFIG_DEBUG
+    assert(tot < (1 << 24));        /* no overflow below */
+#endif
+
+    if (tot) {
+      const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot;
+      probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
+    } else
+      probs[t] = vp9_prob_half;
+  } while (++t < tree_len);
+}
+
+vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]) {
+  int tot_count = counts[0] + counts[1];
+  vp9_prob prob;
+  if (tot_count) {
+    prob = (counts[0] * 255 + (tot_count >> 1)) / tot_count;
+    prob += !prob;
+  } else {
+    prob = 128;
+  }
+  return prob;
+}
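
A worked example of the compact tree representation described in treecoder.h (the three-symbol tree below is invented for illustration): each pair of array entries is a node, non-positive entries are leaves storing the negated symbol value, and positive entries index the next node.

    #include <stdio.h>
    #include "vp9/common/treecoder.h"

    /* Toy alphabet {0, 1, 2}: bit 0 -> symbol 0; bits 10 -> symbol 1;
     * bits 11 -> symbol 2.  Leaves are stored as -value (<= 0). */
    static const vp9_tree_index toy_tree[4] = { 0, 2, -1, -2 };

    int main(void) {
      struct vp9_token_struct tok[3];
      int i;
      vp9_tokens_from_tree(tok, toy_tree);
      for (i = 0; i < 3; i++)
        printf("symbol %d -> value %d, length %d\n",
               i, tok[i].value, tok[i].Len);
      /* Prints (value, length): (0, 1), (2, 2), (3, 2). */
      return 0;
    }
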
--- /dev/null
+++ b/vp9/common/treecoder.h
@@ -1,0 +1,75 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TREECODER_H
+#define __INC_TREECODER_H
+
+typedef unsigned char vp9_prob;
+
+#define vp9_prob_half ( (vp9_prob) 128)
+
+typedef signed char vp9_tree_index;
+struct bool_coder_spec;
+
+typedef struct bool_coder_spec bool_coder_spec;
+typedef struct bool_writer bool_writer;
+typedef struct bool_reader bool_reader;
+
+typedef const bool_coder_spec c_bool_coder_spec;
+typedef const bool_writer c_bool_writer;
+typedef const bool_reader c_bool_reader;
+
+
+
+#define vp9_complement(x) (255 - (x))
+
+
+/* We build coding trees compactly in arrays.
+   Each node of the tree is a pair of vp9_tree_indices.
+   Array index often references a corresponding probability table.
+   Index <= 0 means done encoding/decoding and value = -Index,
+   Index > 0 means need another bit, specification at index.
+   Nonnegative indices are always even;  processing begins at node 0. */
+
+typedef const vp9_tree_index vp9_tree[], *vp9_tree_p;
+
+
+typedef const struct vp9_token_struct {
+  int value;
+  int Len;
+} vp9_token;
+
+/* Construct encoding array from tree. */
+
+void vp9_tokens_from_tree(struct vp9_token_struct *, vp9_tree);
+void vp9_tokens_from_tree_offset(struct vp9_token_struct *, vp9_tree,
+                                 int offset);
+
+
+/* Convert array of token occurrence counts into a table of probabilities
+   for the associated binary encoding tree.  Also writes count of branches
+   taken for each node on the tree; this facilitates decisions as to
+   probability updates. */
+
+void vp9_tree_probs_from_distribution(
+  int n,                      /* n = size of alphabet */
+  vp9_token tok               [ /* n */ ],
+  vp9_tree tree,
+  vp9_prob probs          [ /* n-1 */ ],
+  unsigned int branch_ct       [ /* n-1 */ ] [2],
+  const unsigned int num_events[ /* n */ ],
+  unsigned int Pfactor,
+  int Round
+);
+
+vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]);
+
+#endif
--- /dev/null
+++ b/vp9/common/type_aliases.h
@@ -1,0 +1,120 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     type_aliases.h
+*
+*   Description  :     Standard type aliases
+*
+****************************************************************************/
+#ifndef __INC_TYPE_ALIASES_H
+#define __INC_TYPE_ALIASES_H
+
+/****************************************************************************
+* Macros
+****************************************************************************/
+#define EXPORT
+#define IMPORT          extern      /* Used to declare imported data & routines */
+#define PRIVATE         static      /* Used to declare & define module-local data */
+#define LOCAL           static      /* Used to define all persistent routine-local data */
+#define STD_IN_PATH     0           /* Standard input path */
+#define STD_OUT_PATH    1           /* Standard output path */
+#define STD_ERR_PATH    2           /* Standard error path */
+#define STD_IN_FILE     stdin       /* Standard input file pointer */
+#define STD_OUT_FILE    stdout      /* Standard output file pointer */
+#define STD_ERR_FILE    stderr      /* Standard error file pointer */
+#define max_int         0x7FFFFFFF
+
+#define __export
+#define _export
+
+#define CCONV
+
+#ifndef NULL
+#ifdef __cplusplus
+#define NULL    0
+#else
+#define NULL    ((void *)0)
+#endif
+#endif
+
+#ifndef FALSE
+#define FALSE   0
+#endif
+
+#ifndef TRUE
+#define TRUE    1
+#endif
+
+/****************************************************************************
+* Typedefs
+****************************************************************************/
+#ifndef TYPE_INT8
+#define TYPE_INT8
+typedef signed char     INT8;
+#endif
+
+#ifndef TYPE_INT16
+/*#define TYPE_INT16*/
+typedef signed short    INT16;
+#endif
+
+#ifndef TYPE_INT32
+/*#define TYPE_INT32*/
+typedef signed int      INT32;
+#endif
+
+#ifndef TYPE_UINT8
+/*#define TYPE_UINT8*/
+typedef unsigned char   UINT8;
+#endif
+
+#ifndef TYPE_UINT32
+/*#define TYPE_UINT32*/
+typedef unsigned int    UINT32;
+#endif
+
+#ifndef TYPE_UINT16
+/*#define TYPE_UINT16*/
+typedef unsigned short  UINT16;
+#endif
+
+#ifndef TYPE_BOOL
+/*#define TYPE_BOOL*/
+typedef int             BOOL;
+#endif
+
+typedef unsigned char   BOOLEAN;
+
+#ifdef _MSC_VER
+typedef __int64 INT64;
+#ifndef INT64_MAX
+#define INT64_MAX LLONG_MAX
+#endif
+#else
+
+#ifndef TYPE_INT64
+#ifdef _TMS320C6X
+/* for now we only have 40bits */
+typedef long INT64;
+#else
+typedef long long INT64;
+#endif
+#endif
+
+#endif
+
+/* Floating point */
+typedef  double         FLOAT64;
+typedef  float          FLOAT32;
+
+#endif
--- /dev/null
+++ b/vp9/common/x86/filter_sse2.c
@@ -1,0 +1,289 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h> // for alignment checks
+#include <emmintrin.h> // SSE2
+#include "vp9/common/filter.h"
+#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
+#include "vpx_rtcd.h"
+
+// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
+//           just a quick partial snapshot so that others can already use some
+//           of the speedup.
+// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
+//           filtering.
+// TODO(cd): Add some comments, better variable naming.
+// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no sum
+//           of positive above 128), or have higher precision filter
+//           coefficients.
+
+DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
+  VP9_FILTER_WEIGHT >> 1,
+  VP9_FILTER_WEIGHT >> 1,
+  VP9_FILTER_WEIGHT >> 1,
+  VP9_FILTER_WEIGHT >> 1,
+};
+
+// Creating a macro to do more than four pixels at once to hide instruction
+// latency is actually slower :-(
+#define DO_FOUR_PIXELS(result, src_ptr, offset)                                \
+  {                                                                            \
+  /* Do shifted loads to achieve the required shuffles through unpacking */    \
+  const __m128i src0  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
+  const __m128i src1  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
+  const __m128i src2  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
+  const __m128i src3  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
+  const __m128i src01 = _mm_unpacklo_epi8(src0, src1);                         \
+  const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero);                     \
+  const __m128i src23 = _mm_unpacklo_epi8(src2, src3);                         \
+  const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero);                     \
+  /* Shift by 4 bytes through shuffle to get additional shifted loads */       \
+  const __m128i src4  = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1));      \
+  const __m128i src5  = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1));      \
+  const __m128i src6  = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1));      \
+  const __m128i src7  = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1));      \
+  const __m128i src45 = _mm_unpacklo_epi8(src4, src5);                         \
+  const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero);                     \
+  const __m128i src67 = _mm_unpacklo_epi8(src6, src7);                         \
+  const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero);                     \
+  /* multiply accumulate them */                                               \
+  const __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                       \
+  const __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                       \
+  const __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                       \
+  const __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                       \
+  const __m128i mad0123 = _mm_add_epi32(mad01, mad23);                         \
+  const __m128i mad4567 = _mm_add_epi32(mad45, mad67);                         \
+  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \
+  mad_all = _mm_add_epi32(mad_all, rounding);                                  \
+  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);                          \
+  }
+
+void vp9_filter_block2d_4x4_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  __m128i intermediateA, intermediateB, intermediateC;
+
+  const int kInterp_Extend = 4;
+
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
+
+  // check alignment
+  assert(0 == ((long)HFilter_aligned16)%16);
+  assert(0 == ((long)VFilter_aligned16)%16);
+
+  {
+    __m128i transpose3_0;
+    __m128i transpose3_1;
+    __m128i transpose3_2;
+    __m128i transpose3_3;
+
+    // Horizontal pass (src -> intermediate).
+    {
+      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
+      // get first two columns filter coefficients
+      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
+      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
+      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
+      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
+      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+
+      {
+        __m128i mad_all0;
+        __m128i mad_all1;
+        __m128i mad_all2;
+        __m128i mad_all3;
+        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
+        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
+        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
+        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
+        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
+        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
+        // --
+        src_ptr += src_stride*4;
+        // --
+        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
+        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
+        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
+        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
+        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
+        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
+        // --
+        src_ptr += src_stride*4;
+        // --
+        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
+        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
+        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
+        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
+        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
+      }
+    }
+
+    // Transpose result (intermediate -> transpose3_x)
+    {
+      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
+      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
+      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
+      const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);
+      const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);
+      const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);
+      const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);
+      // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53
+      // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73
+      // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx
+      // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx
+      const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
+      const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
+      const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);
+      const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);
+      // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
+      // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
+      // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx
+      // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx
+      const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);
+      const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);
+      const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);
+      const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);
+      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+      // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx
+      // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx
+      transpose3_0 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
+                                           _mm_castsi128_ps(transpose2_2),
+                                           _MM_SHUFFLE(1, 0, 1, 0)));
+      transpose3_1 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
+                                           _mm_castsi128_ps(transpose2_2),
+                                           _MM_SHUFFLE(3, 2, 3, 2)));
+      transpose3_2 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
+                                           _mm_castsi128_ps(transpose2_3),
+                                           _MM_SHUFFLE(1, 0, 1, 0)));
+      transpose3_3 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
+                                           _mm_castsi128_ps(transpose2_3),
+                                           _MM_SHUFFLE(3, 2, 3, 2)));
+      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
+      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
+      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
+      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
+    }
+
+    // Vertical pass (transpose3_x -> dst).
+    {
+      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
+      // get first two columns filter coefficients
+      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
+      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
+      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
+      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
+      __m128i col0, col1, col2, col3;
+      DECLARE_ALIGNED(16, unsigned char, temp[32]);
+      {
+        _mm_store_si128((__m128i *)temp, transpose3_0);
+        DO_FOUR_PIXELS(col0, temp, 0);
+      }
+      {
+        _mm_store_si128((__m128i *)temp, transpose3_1);
+        DO_FOUR_PIXELS(col1, temp, 0);
+      }
+      {
+        _mm_store_si128((__m128i *)temp, transpose3_2);
+        DO_FOUR_PIXELS(col2, temp, 0);
+      }
+      {
+        _mm_store_si128((__m128i *)temp, transpose3_3);
+        DO_FOUR_PIXELS(col3, temp, 0);
+      }
+      // transpose
+      {
+        __m128i T0 = _mm_unpacklo_epi32(col0, col1);
+        __m128i T1 = _mm_unpacklo_epi32(col2, col3);
+        __m128i T2 = _mm_unpackhi_epi32(col0, col1);
+        __m128i T3 = _mm_unpackhi_epi32(col2, col3);
+        col0 = _mm_unpacklo_epi64(T0, T1);
+        col1 = _mm_unpackhi_epi64(T0, T1);
+        col2 = _mm_unpacklo_epi64(T2, T3);
+        col3 = _mm_unpackhi_epi64(T2, T3);
+      }
+      // saturate to 8 bit
+      {
+        col0 = _mm_packs_epi32(col0, col0);
+        col0 = _mm_packus_epi16(col0, col0);
+        col1 = _mm_packs_epi32(col1, col1);
+        col1 = _mm_packus_epi16(col1, col1);
+        col2 = _mm_packs_epi32 (col2, col2);
+        col2 = _mm_packus_epi16(col2, col2);
+        col3 = _mm_packs_epi32 (col3, col3);
+        col3 = _mm_packus_epi16(col3, col3);
+      }
+      // store
+      {
+        *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);
+        *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);
+        *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);
+        *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);
+      }
+    }
+  }
+}
+
+void vp9_filter_block2d_8x4_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  int j;
+  for (j = 0; j < 8; j += 4) {
+    vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,
+                                  HFilter_aligned16, VFilter_aligned16,
+                                  dst_ptr + j, dst_stride);
+  }
+}
+
+void vp9_filter_block2d_8x8_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  int i, j;
+  for (i = 0; i < 8; i += 4) {
+    for (j = 0; j < 8; j += 4) {
+      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i * src_stride, src_stride,
+                                    HFilter_aligned16, VFilter_aligned16,
+                                    dst_ptr + j + i * dst_stride, dst_stride);
+    }
+  }
+}
+
+void vp9_filter_block2d_16x16_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  int i, j;
+  for (i = 0; i < 16; i += 4) {
+    for (j = 0; j < 16; j += 4) {
+      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i * src_stride, src_stride,
+                                    HFilter_aligned16, VFilter_aligned16,
+                                    dst_ptr + j + i * dst_stride, dst_stride);
+    }
+  }
+}
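+
+// For reference, a minimal scalar sketch of what the 4x4 kernels above
+// compute: a separable 8-tap filter, run horizontally into an 11-row
+// intermediate (4 + 2 * kInterp_Extend - 1 rows) and then vertically.
+// Illustrative only and never called; it assumes VP9_FILTER_WEIGHT and
+// VP9_FILTER_SHIFT from "vp9/common/filter.h" are in scope here, as they
+// are in filter_sse4.c, and the function name is hypothetical.
+static void filter_block2d_4x4_8_c_sketch(
+    const unsigned char *src_ptr, const unsigned int src_stride,
+    const short *HFilter, const short *VFilter,
+    unsigned char *dst_ptr, unsigned int dst_stride) {
+  const int kInterp_Extend = 4;
+  int intermediate[4 + 7][4];  // 11 rows, saturated to 8 bits like packus
+  int i, j, k;
+  // Step back (kInterp_Extend - 1) rows and columns, as the SIMD code does.
+  src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+  // Horizontal pass.
+  for (i = 0; i < 4 + 7; i++) {
+    for (j = 0; j < 4; j++) {
+      int sum = VP9_FILTER_WEIGHT >> 1;  // rounding
+      for (k = 0; k < 8; k++)
+        sum += src_ptr[i * src_stride + j + k] * HFilter[k];
+      sum >>= VP9_FILTER_SHIFT;
+      intermediate[i][j] = sum < 0 ? 0 : (sum > 255 ? 255 : sum);
+    }
+  }
+  // Vertical pass over the intermediate columns.
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      int sum = VP9_FILTER_WEIGHT >> 1;
+      for (k = 0; k < 8; k++)
+        sum += intermediate[i + k][j] * VFilter[k];
+      sum >>= VP9_FILTER_SHIFT;
+      dst_ptr[i * dst_stride + j] =
+          (unsigned char)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
+    }
+  }
+}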
--- /dev/null
+++ b/vp9/common/x86/filter_sse4.c
@@ -1,0 +1,362 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h> // for alignment checks
+#include <smmintrin.h> // SSE4.1
+#include <stdint.h> // uintptr_t, for portable alignment checks below
+#include "vp9/common/filter.h"
+#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
+#include "vpx_rtcd.h"
+
+// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
+//           just a quick partial snapshot so that others can already use some
+//           of the speedup.
+// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
+//           filtering.
+// TODO(cd): Reduce source size by using macros instead of current code
+//           duplication.
+// TODO(cd): Add some comments, better variable naming.
+// TODO(cd): Maybe use _mm_maddubs_epi16 if the filter coefficients are small
+//           enough (no sum of positive coefficients above 128), or use higher
+//           precision filter coefficients.
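+// Note on the maddubs idea above: _mm_maddubs_epi16 multiplies unsigned
+// bytes by signed bytes and adds adjacent products with signed saturation,
+// so with 255-valued pixels the positive taps of a pair may sum to at most
+// 128 (255 * 128 = 32640 <= 32767), hence the restriction mentioned above.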
+
+DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {
+  0x00, 0x01,
+  0x01, 0x02,
+  0x02, 0x03,
+  0x03, 0x04,
+  0x02, 0x03,
+  0x03, 0x04,
+  0x04, 0x05,
+  0x05, 0x06,
+};
+DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = {
+  0x04, 0x05,
+  0x05, 0x06,
+  0x06, 0x07,
+  0x07, 0x08,
+  0x06, 0x07,
+  0x07, 0x08,
+  0x08, 0x09,
+  0x09, 0x0A,
+};
+DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
+  VP9_FILTER_WEIGHT >> 1,
+  VP9_FILTER_WEIGHT >> 1,
+  VP9_FILTER_WEIGHT >> 1,
+  VP9_FILTER_WEIGHT >> 1,
+};
+DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = {
+  0, 4,  8, 12,
+  1, 5,  9, 13,
+  2, 6, 10, 14,
+  3, 7, 11, 15
+};
+
+// Creating a macro to do more than four pixels at once to hide instruction
+// latency is actually slower :-(
+#define DO_FOUR_PIXELS(result, offset)                                         \
+  {                                                                            \
+  /* load pixels */                                                            \
+  __m128i src  = _mm_loadu_si128((const __m128i *)(src_ptr + offset));         \
+  /* extract the ones used for first column */                                 \
+  __m128i src0123 = _mm_shuffle_epi8(src, mask0123);                           \
+  __m128i src4567 = _mm_shuffle_epi8(src, mask4567);                           \
+  __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);                         \
+  __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);                         \
+  __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);                         \
+  __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);                         \
+  /* multiply accumulate them */                                               \
+  __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                             \
+  __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                             \
+  __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                             \
+  __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                             \
+  __m128i mad0123 = _mm_add_epi32(mad01, mad23);                               \
+  __m128i mad4567 = _mm_add_epi32(mad45, mad67);                               \
+  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \
+  mad_all = _mm_add_epi32(mad_all, rounding);                                  \
+  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);                          \
+  }
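+// In DO_FOUR_PIXELS, each filNM register holds one pair of adjacent filter
+// taps broadcast four times, so each _mm_madd_epi16 (pairwise 16x16->32
+// multiply-add) yields, in 32-bit lane j, src[j+N]*fil[N] + src[j+M]*fil[M];
+// summing the four madNM registers gives the full 8-tap dot product for four
+// adjacent output pixels at once.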
+
+void vp9_filter_block2d_4x4_8_sse4_1
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  __m128i intermediateA, intermediateB, intermediateC;
+
+  const int kInterp_Extend = 4;
+
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c);
+  const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c);
+  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
+  const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c);
+
+  // check alignment
+  assert(0 == ((uintptr_t)HFilter_aligned16) % 16);
+  assert(0 == ((uintptr_t)VFilter_aligned16) % 16);
+
+  {
+    __m128i transpose3_0;
+    __m128i transpose3_1;
+    __m128i transpose3_2;
+    __m128i transpose3_3;
+
+    // Horizontal pass (src -> intermediate).
+    {
+      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
+      // broadcast each pair of filter taps across a register
+      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
+      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
+      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
+      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
+      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+
+      {
+        __m128i mad_all0;
+        __m128i mad_all1;
+        __m128i mad_all2;
+        __m128i mad_all3;
+        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
+        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
+        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
+        DO_FOUR_PIXELS(mad_all3, 3*src_stride)
+        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
+        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
+        // --
+        src_ptr += src_stride * 4;
+        // --
+        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
+        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
+        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
+        DO_FOUR_PIXELS(mad_all3, 3*src_stride)
+        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
+        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
+        // --
+        src_ptr += src_stride * 4;
+        // --
+        DO_FOUR_PIXELS(mad_all0, 0*src_stride)
+        DO_FOUR_PIXELS(mad_all1, 1*src_stride)
+        DO_FOUR_PIXELS(mad_all2, 2*src_stride)
+        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
+        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
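+        // Only 11 intermediate rows are needed for 4 output rows with an
+        // 8-tap filter (4 + 2 * kInterp_Extend - 1), so the last pack
+        // duplicates row A to fill the register.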
+      }
+    }
+
+    // Transpose result (intermediate -> transpose3_x)
+    {
+      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
+      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
+      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
+      const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose);
+      const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose);
+      const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose);
+      // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+      // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+      // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx
+      const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1);
+      const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1);
+      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+      transpose3_0 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
+                                           _mm_castsi128_ps(transpose1_2),
+                                           _MM_SHUFFLE(0, 0, 1, 0)));
+      transpose3_1 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
+                                           _mm_castsi128_ps(transpose1_2),
+                                           _MM_SHUFFLE(1, 1, 3, 2)));
+      transpose3_2 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
+                                           _mm_castsi128_ps(transpose1_2),
+                                           _MM_SHUFFLE(2, 2, 1, 0)));
+      transpose3_3 = _mm_castps_si128(
+                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
+                                           _mm_castsi128_ps(transpose1_2),
+                                           _MM_SHUFFLE(3, 3, 3, 2)));
+      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
+      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
+      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
+      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
+    }
+
+    // Vertical pass (transpose3_x -> dst).
+    {
+      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
+      // broadcast each pair of filter taps across a register
+      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
+      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
+      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
+      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
+      __m128i col0, col1, col2, col3;
+      {
+        // load pixels
+        __m128i src  = transpose3_0;
+        // extract the ones used for first column
+        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
+        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
+        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
+        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
+        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
+        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
+        // multiply accumulate them
+        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
+        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
+        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
+        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
+        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
+        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
+        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
+        mad_all = _mm_add_epi32(mad_all, rounding);
+        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
+        mad_all = _mm_packs_epi32(mad_all, mad_all);
+        col0 = _mm_packus_epi16(mad_all, mad_all);
+      }
+      {
+        // load pixels
+        __m128i src  = transpose3_1;
+        // extract the ones used for first column
+        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
+        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
+        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
+        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
+        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
+        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
+        // multiply accumulate them
+        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
+        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
+        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
+        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
+        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
+        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
+        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
+        mad_all = _mm_add_epi32(mad_all, rounding);
+        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
+        mad_all = _mm_packs_epi32(mad_all, mad_all);
+        col1 = _mm_packus_epi16(mad_all, mad_all);
+      }
+      {
+        // load pixels
+        __m128i src  = transpose3_2;
+        // extract the ones used for first column
+        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
+        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
+        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
+        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
+        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
+        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
+        // multiply accumulate them
+        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
+        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
+        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
+        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
+        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
+        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
+        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
+        mad_all = _mm_add_epi32(mad_all, rounding);
+        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
+        mad_all = _mm_packs_epi32(mad_all, mad_all);
+        col2 = _mm_packus_epi16(mad_all, mad_all);
+      }
+      {
+        // load pixels
+        __m128i src  = transpose3_3;
+        // extract the ones used for first column
+        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
+        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
+        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
+        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
+        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
+        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
+        // multiply accumulate them
+        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
+        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
+        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
+        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
+        __m128i mad0123 = _mm_add_epi32(mad01, mad23);
+        __m128i mad4567 = _mm_add_epi32(mad45, mad67);
+        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
+        mad_all = _mm_add_epi32(mad_all, rounding);
+        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
+        mad_all = _mm_packs_epi32(mad_all, mad_all);
+        col3 = _mm_packus_epi16(mad_all, mad_all);
+      }
+      {
+        __m128i col01 = _mm_unpacklo_epi8(col0, col1);
+        __m128i col23 = _mm_unpacklo_epi8(col2, col3);
+        __m128i col0123 = _mm_unpacklo_epi16(col01, col23);
+        // TODO(cd): look into Ronald's comment:
+        //    Future suggestion: I believe here, too, you can merge the
+        //    packs_epi32() and packus_epi16() for the 4 cols above, so that
+        //    you get the data in a single register, and then use pshufb
+        //    (shuffle_epi8()) instead of the unpacks here. Should be
+        //    2+3+2 instructions faster. (See the sketch after this function.)
+        *((unsigned int *)&dst_ptr[dst_stride * 0]) =
+            _mm_extract_epi32(col0123, 0);
+        *((unsigned int *)&dst_ptr[dst_stride * 1]) =
+            _mm_extract_epi32(col0123, 1);
+        *((unsigned int *)&dst_ptr[dst_stride * 2]) =
+            _mm_extract_epi32(col0123, 2);
+        *((unsigned int *)&dst_ptr[dst_stride * 3]) =
+            _mm_extract_epi32(col0123, 3);
+      }
+    }
+  }
+}
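+
+// A minimal sketch of the merge suggested in the TODO above, assuming
+// col0..col3 are the four 32-bit column results *before* any packing: one
+// packs/packus sequence collects all 16 output bytes in a single register,
+// and the transpose_c pshufb mask defined at the top of this file puts
+// them into store order.  Illustrative only, never called; the function
+// name is hypothetical.
+static void store_4x4_merged_sketch(__m128i col0, __m128i col1,
+                                    __m128i col2, __m128i col3,
+                                    unsigned char *dst_ptr,
+                                    unsigned int dst_stride) {
+  const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c);
+  __m128i c01 = _mm_packs_epi32(col0, col1);    // 8 x int16
+  __m128i c23 = _mm_packs_epi32(col2, col3);
+  __m128i all = _mm_packus_epi16(c01, c23);     // 16 x uint8, column-major
+  all = _mm_shuffle_epi8(all, transpose);       // now row-major
+  *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_extract_epi32(all, 0);
+  *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_extract_epi32(all, 1);
+  *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_extract_epi32(all, 2);
+  *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_extract_epi32(all, 3);
+}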
+
+void vp9_filter_block2d_8x4_8_sse4_1
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  int j;
+  for (j = 0; j < 8; j += 4) {
+    vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride,
+                                    HFilter_aligned16, VFilter_aligned16,
+                                    dst_ptr + j, dst_stride);
+  }
+}
+
+void vp9_filter_block2d_8x8_8_sse4_1
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  int i, j;
+  for (i = 0; i < 8; i += 4) {
+    for (j = 0; j < 8; j += 4) {
+      vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i * src_stride, src_stride,
+                                      HFilter_aligned16, VFilter_aligned16,
+                                      dst_ptr + j + i * dst_stride, dst_stride);
+    }
+  }
+}
+
+void vp9_filter_block2d_16x16_8_sse4_1
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  int i, j;
+  for (i = 0; i < 16; i += 4) {
+    for (j = 0; j < 16; j += 4) {
+      vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i * src_stride, src_stride,
+                                      HFilter_aligned16, VFilter_aligned16,
+                                      dst_ptr + j + i * dst_stride, dst_stride);
+    }
+  }
+}
--- /dev/null
+++ b/vp9/common/x86/idct_x86.h
@@ -1,0 +1,64 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef IDCT_X86_H
+#define IDCT_X86_H
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code.
+ */
+
+#if HAVE_MMX
+extern prototype_idct(vp9_short_idct4x4llm_1_mmx);
+extern prototype_idct(vp9_short_idct4x4llm_mmx);
+extern prototype_idct_scalar_add(vp9_dc_only_idct_add_mmx);
+
+extern prototype_second_order(vp9_short_inv_walsh4x4_mmx);
+extern prototype_second_order(vp9_short_inv_walsh4x4_1_mmx);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_idct_idct1
+#define vp9_idct_idct1 vp9_short_idct4x4llm_1_mmx
+
+#undef  vp9_idct_idct16
+#define vp9_idct_idct16 vp9_short_idct4x4llm_mmx
+
+#undef  vp9_idct_idct1_scalar_add
+#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_mmx
+
+#undef vp9_idct_iwalsh16
+#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_mmx
+
+#undef vp9_idct_iwalsh1
+#define vp9_idct_iwalsh1 vp9_short_inv_walsh4x4_1_mmx
+
+#endif  /* !CONFIG_RUNTIME_CPU_DETECT */
+#endif  /* HAVE_MMX */
+
+#if HAVE_SSE2
+
+extern prototype_second_order(vp9_short_inv_walsh4x4_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp9_idct_iwalsh16
+#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_sse2
+
+#endif  /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif  /* HAVE_SSE2 */
+
+#endif  /* IDCT_X86_H */
--- /dev/null
+++ b/vp9/common/x86/idctllm_mmx.asm
@@ -1,0 +1,241 @@
+;
+;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+align 16
+x_s1sqr2:      times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1: times 4 dw 0x4E7B
+align 16
+pw_16:         times 4 dw 16
+
+SECTION .text
+
+
+; /****************************************************************************
+; * Notes:
+; *
+; * This implementation makes use of 16 bit fixed point version of two multiply
+; * constants:
+; *        1.   sqrt(2) * cos (pi/8)
+; *        2.   sqrt(2) * sin (pi/8)
+; * Because the first constant is bigger than 1, to maintain the same 16 bit
+; * fixed point precision as the second one, we use a trick of
+; *        x * a = x + x*(a-1)
+; * so
+; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+; *
+; * For the second constant, the 16 bit version is 35468, which is bigger
+; * than 32768; in a signed 16 bit multiply it becomes a negative number, so
+; *        (x * (unsigned)35468) >> 16 = ((x * (signed)35468) >> 16) + x.
+; *
+; **************************************************************************/
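+;
+; Worked numbers, for reference (Q16 derivation of x_s1sqr2 and
+; x_c1sqr2less1 above):
+;   sqrt(2) * cos(pi/8) - 1 = 0.30656...,  0.30656 * 65536 ~= 20091 = 0x4E7B
+;   sqrt(2) * sin(pi/8)     = 0.54120...,  0.54120 * 65536 ~= 35468 = 0x8A8C
+; pmulhw keeps the high 16 bits of the product, i.e. (x * c) >> 16.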
+
+INIT_MMX
+
+;void short_idct4x4llm_mmx(short *input, short *output, int pitch)
+cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit
+    mova            m0,     [inpq +0]
+    mova            m1,     [inpq +8]
+
+    mova            m2,     [inpq+16]
+    mova            m3,     [inpq+24]
+
+    psubw           m0,      m2             ; b1 = 0-2
+    paddw           m2,      m2             ;
+
+    mova            m5,      m1
+    paddw           m2,      m0             ; a1 = 0+2
+
+    pmulhw          m5,     [x_s1sqr2]       ;
+    paddw           m5,      m1             ; ip1 * sin(pi/8) * sqrt(2)
+
+    mova            m7,      m3             ;
+    pmulhw          m7,     [x_c1sqr2less1]   ;
+
+    paddw           m7,      m3             ; ip3 * cos(pi/8) * sqrt(2)
+    psubw           m7,      m5             ; c1
+
+    mova            m5,      m1
+    mova            m4,      m3
+
+    pmulhw          m5,     [x_c1sqr2less1]
+    paddw           m5,      m1
+
+    pmulhw          m3,     [x_s1sqr2]
+    paddw           m3,      m4
+
+    paddw           m3,      m5             ; d1
+    mova            m6,      m2             ; a1
+
+    mova            m4,      m0             ; b1
+    paddw           m2,      m3             ;0
+
+    paddw           m4,      m7             ;1
+    psubw           m0,      m7             ;2
+
+    psubw           m6,      m3             ;3
+
+    mova            m1,      m2             ; 03 02 01 00
+    mova            m3,      m4             ; 23 22 21 20
+
+    punpcklwd       m1,      m0             ; 11 01 10 00
+    punpckhwd       m2,      m0             ; 13 03 12 02
+
+    punpcklwd       m3,      m6             ; 31 21 30 20
+    punpckhwd       m4,      m6             ; 33 23 32 22
+
+    mova            m0,      m1             ; 11 01 10 00
+    mova            m5,      m2             ; 13 03 12 02
+
+    punpckldq       m0,      m3             ; 30 20 10 00
+    punpckhdq       m1,      m3             ; 31 21 11 01
+
+    punpckldq       m2,      m4             ; 32 22 12 02
+    punpckhdq       m5,      m4             ; 33 23 13 03
+
+    mova            m3,      m5             ; 33 23 13 03
+
+    psubw           m0,      m2             ; b1 = 0-2
+    paddw           m2,      m2             ;
+
+    mova            m5,      m1
+    paddw           m2,      m0             ; a1 = 0+2
+
+    pmulhw          m5,     [x_s1sqr2]        ;
+    paddw           m5,      m1             ; ip1 * sin(pi/8) * sqrt(2)
+
+    mova            m7,      m3             ;
+    pmulhw          m7,     [x_c1sqr2less1]   ;
+
+    paddw           m7,      m3             ; ip3 * cos(pi/8) * sqrt(2)
+    psubw           m7,      m5             ; c1
+
+    mova            m5,      m1
+    mova            m4,      m3
+
+    pmulhw          m5,     [x_c1sqr2less1]
+    paddw           m5,      m1
+
+    pmulhw          m3,     [x_s1sqr2]
+    paddw           m3,      m4
+
+    paddw           m3,      m5             ; d1
+    paddw           m0,     [pw_16]
+
+    paddw           m2,     [pw_16]
+    mova            m6,      m2             ; a1
+
+    mova            m4,      m0             ; b1
+    paddw           m2,      m3             ;0
+
+    paddw           m4,      m7             ;1
+    psubw           m0,      m7             ;2
+
+    psubw           m6,      m3             ;3
+    psraw           m2,      5
+
+    psraw           m0,      5
+    psraw           m4,      5
+
+    psraw           m6,      5
+
+    mova            m1,      m2             ; 03 02 01 00
+    mova            m3,      m4             ; 23 22 21 20
+
+    punpcklwd       m1,      m0             ; 11 01 10 00
+    punpckhwd       m2,      m0             ; 13 03 12 02
+
+    punpcklwd       m3,      m6             ; 31 21 30 20
+    punpckhwd       m4,      m6             ; 33 23 32 22
+
+    mova            m0,      m1             ; 11 01 10 00
+    mova            m5,      m2             ; 13 03 12 02
+
+    punpckldq       m0,      m3             ; 30 20 10 00
+    punpckhdq       m1,      m3             ; 31 21 11 01
+
+    punpckldq       m2,      m4             ; 32 22 12 02
+    punpckhdq       m5,      m4             ; 33 23 13 03
+
+    mova        [outq],      m0
+
+    mova   [outq+pitq],      m1
+    mova [outq+pitq*2],      m2
+
+    add           outq,      pitq
+    mova [outq+pitq*2],      m5
+    RET
+
+;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
+cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit
+    movh            m0,     [inpq]
+    paddw           m0,     [pw_16]
+    psraw           m0,      5
+    punpcklwd       m0,      m0
+    punpckldq       m0,      m0
+
+    mova        [outq],      m0
+    mova   [outq+pitq],      m0
+
+    mova [outq+pitq*2],      m0
+    add           outq,      pitq
+
+    mova [outq+pitq*2],      m0
+    RET
+
+
+;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
+cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride
+%if ARCH_X86_64
+    movsxd         strideq,      dword stridem
+%else
+    mov            strideq,      stridem
+%endif
+    pxor                m0,      m0
+
+    movh                m5,      in_dcq ; dc
+    paddw               m5,     [pw_16]
+
+    psraw               m5,      5
+
+    punpcklwd           m5,      m5
+    punpckldq           m5,      m5
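+    ; m5 = (dc + 16) >> 5, broadcast to all four words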
+
+    movh                m1,     [predq]
+    punpcklbw           m1,      m0
+    paddsw              m1,      m5
+    packuswb            m1,      m0              ; pack and unpack to saturate
+    movh            [dstq],      m1
+
+    movh                m2,     [predq+pitq]
+    punpcklbw           m2,      m0
+    paddsw              m2,      m5
+    packuswb            m2,      m0              ; pack and unpack to saturate
+    movh    [dstq+strideq],      m2
+
+    movh                m3,     [predq+2*pitq]
+    punpcklbw           m3,      m0
+    paddsw              m3,      m5
+    packuswb            m3,      m0              ; pack and unpack to saturate
+    movh  [dstq+2*strideq],      m3
+
+    add               dstq,      strideq
+    add              predq,      pitq
+    movh                m4,     [predq+2*pitq]
+    punpcklbw           m4,      m0
+    paddsw              m4,      m5
+    packuswb            m4,      m0              ; pack and unpack to saturate
+    movh  [dstq+2*strideq],      m4
+    RET
+
--- /dev/null
+++ b/vp9/common/x86/idctllm_sse2.asm
@@ -1,0 +1,712 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_idct_dequant_0_2x_sse2
+; (
+;   short *qcoeff       - 0
+;   short *dequant      - 1
+;   unsigned char *pre  - 2
+;   unsigned char *dst  - 3
+;   int dst_stride      - 4
+;   int blk_stride      - 5
+; )
+
+global sym(vp9_idct_dequant_0_2x_sse2)
+sym(vp9_idct_dequant_0_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    ; end prolog
+
+        mov         rdx,            arg(1) ; dequant
+        mov         rax,            arg(0) ; qcoeff
+
+        movd        xmm4,           [rax]
+        movd        xmm5,           [rdx]
+
+        pinsrw      xmm4,           [rax+32],   4
+        pinsrw      xmm5,           [rdx],      4
+
+        pmullw      xmm4,           xmm5
+
+    ; Zero out xmm5, for use in unpacking
+        pxor        xmm5,           xmm5
+
+    ; clear coeffs
+        movd        [rax],          xmm5
+        movd        [rax+32],       xmm5
+    ; splat the dc results across each 64-bit half (one block's dc per half)
+        pshuflw     xmm4,           xmm4,       00000000b
+        pshufhw     xmm4,           xmm4,       00000000b
+
+        mov         rax,            arg(2) ; pre
+        paddw       xmm4,           [GLOBAL(fours)]
+
+        movsxd      rcx,            dword ptr arg(5) ; blk_stride
+        psraw       xmm4,           3
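+    ; xmm4 = (dc * dequant + 4) >> 3 in every word: the inverse
+    ;   transform result of a dc-only block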
+
+        movq        xmm0,           [rax]
+        movq        xmm1,           [rax+rcx]
+        movq        xmm2,           [rax+2*rcx]
+        lea         rcx,            [3*rcx]
+        movq        xmm3,           [rax+rcx]
+
+        punpcklbw   xmm0,           xmm5
+        punpcklbw   xmm1,           xmm5
+        punpcklbw   xmm2,           xmm5
+        punpcklbw   xmm3,           xmm5
+
+        mov         rax,            arg(3) ; dst
+        movsxd      rdx,            dword ptr arg(4) ; dst_stride
+
+    ; Add to predict buffer
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm4
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm4
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm5
+        packuswb    xmm1,           xmm5
+        packuswb    xmm2,           xmm5
+        packuswb    xmm3,           xmm5
+
+    ; store blocks back out
+        movq        [rax],          xmm0
+        movq        [rax + rdx],    xmm1
+
+        lea         rax,            [rax + 2*rdx]
+
+        movq        [rax],          xmm2
+        movq        [rax + rdx],    xmm3
+
+    ; begin epilog
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
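+;void vp9_idct_dequant_full_2x_sse2
+; (
+;   short *qcoeff       - 0
+;   short *dequant      - 1
+;   unsigned char *pre  - 2
+;   unsigned char *dst  - 3
+;   int dst_stride      - 4
+;   int blk_stride      - 5
+; )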
+global sym(vp9_idct_dequant_full_2x_sse2)
+sym(vp9_idct_dequant_full_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; full case: both blocks carry more than one nonzero coefficient,
+    ; so the whole qcoeff buffer is loaded and dequantized below
+        mov         rax,            arg(0) ; qcoeff
+        mov         rsi,            arg(2) ; pre
+        mov         rdi,            arg(3) ; dst
+        movsxd      rcx,            dword ptr arg(5) ; blk_stride
+
+    ; Zero out xmm7, for use in unpacking
+        pxor        xmm7,           xmm7
+
+        mov         rdx,            arg(1)  ; dequant
+
+    ; note the transpose of xmm1 and xmm2, necessary for shuffle
+    ;   to spit out sensible data
+        movdqa      xmm0,           [rax]
+        movdqa      xmm2,           [rax+16]
+        movdqa      xmm1,           [rax+32]
+        movdqa      xmm3,           [rax+48]
+
+    ; Clear out coeffs
+        movdqa      [rax],          xmm7
+        movdqa      [rax+16],       xmm7
+        movdqa      [rax+32],       xmm7
+        movdqa      [rax+48],       xmm7
+
+    ; dequantize qcoeff buffer
+        pmullw      xmm0,           [rdx]
+        pmullw      xmm2,           [rdx+16]
+        pmullw      xmm1,           [rdx]
+        pmullw      xmm3,           [rdx+16]
+
+    ; repack so block 0 row x and block 1 row x are together
+        movdqa      xmm4,           xmm0
+        punpckldq   xmm0,           xmm1
+        punpckhdq   xmm4,           xmm1
+
+        pshufd      xmm0,           xmm0,       11011000b
+        pshufd      xmm1,           xmm4,       11011000b
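+    ; 11011000b selects source dwords 0,2,1,3, swapping the two middle
+    ;   dwords left interleaved by the punpck pair above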
+
+        movdqa      xmm4,           xmm2
+        punpckldq   xmm2,           xmm3
+        punpckhdq   xmm4,           xmm3
+
+        pshufd      xmm2,           xmm2,       11011000b
+        pshufd      xmm3,           xmm4,       11011000b
+
+    ; first pass
+        psubw       xmm0,           xmm2        ; b1 = 0-2
+        paddw       xmm2,           xmm2        ;
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0        ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5        ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5        ; d1
+        movdqa      xmm6,           xmm2        ; a1
+
+        movdqa      xmm4,           xmm0        ; b1
+        paddw       xmm2,           xmm3        ;0
+
+        paddw       xmm4,           xmm7        ;1
+        psubw       xmm0,           xmm7        ;2
+
+        psubw       xmm6,           xmm3        ;3
+
+    ; transpose for the second pass
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+    ; second pass
+        psubw       xmm0,           xmm2            ; b1 = 0-2
+        paddw       xmm2,           xmm2
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0            ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5            ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5            ; d1
+        paddw       xmm0,           [GLOBAL(fours)]
+
+        paddw       xmm2,           [GLOBAL(fours)]
+        movdqa      xmm6,           xmm2            ; a1
+
+        movdqa      xmm4,           xmm0            ; b1
+        paddw       xmm2,           xmm3            ;0
+
+        paddw       xmm4,           xmm7            ;1
+        psubw       xmm0,           xmm7            ;2
+
+        psubw       xmm6,           xmm3            ;3
+        psraw       xmm2,           3
+
+        psraw       xmm0,           3
+        psraw       xmm4,           3
+
+        psraw       xmm6,           3
+
+    ; transpose to save
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+        pxor        xmm7,           xmm7
+
+    ; Load up predict blocks
+        movq        xmm4,           [rsi]
+        movq        xmm5,           [rsi+rcx]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm5
+
+        movq        xmm4,           [rsi+2*rcx]
+        lea         rcx,            [3*rcx]
+        movq        xmm5,           [rsi+rcx]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm5
+
+.finish:
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm7
+        packuswb    xmm1,           xmm7
+        packuswb    xmm2,           xmm7
+        packuswb    xmm3,           xmm7
+
+    ; Load destination stride before writing out,
+    ;   doesn't need to persist
+        movsxd      rdx,            dword ptr arg(4) ; dst_stride
+
+    ; store blocks back out
+        movq        [rdi],          xmm0
+        movq        [rdi + rdx],    xmm1
+
+        lea         rdi,            [rdi + 2*rdx]
+
+        movq        [rdi],          xmm2
+        movq        [rdi + rdx],    xmm3
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_idct_dequant_dc_0_2x_sse2
+; (
+;   short *qcoeff       - 0
+;   short *dequant      - 1
+;   unsigned char *pre  - 2
+;   unsigned char *dst  - 3
+;   int dst_stride      - 4
+;   short *dc           - 5
+; )
+global sym(vp9_idct_dequant_dc_0_2x_sse2)
+sym(vp9_idct_dequant_dc_0_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; special case when 2 blocks have 0 or 1 coeffs
+    ; dc is set as first coeff, so no need to load qcoeff
+        mov         rax,            arg(0) ; qcoeff
+        mov         rsi,            arg(2) ; pre
+        mov         rdi,            arg(3) ; dst
+        mov         rdx,            arg(5) ; dc
+
+    ; Zero out xmm5, for use in unpacking
+        pxor        xmm5,           xmm5
+
+    ; load the two dc words (2 x 16 bits = one doubleword)
+        movd        xmm4,           [rdx]
+
+    ; Load up predict blocks
+        movq        xmm0,           [rsi]
+        movq        xmm1,           [rsi+16]
+        movq        xmm2,           [rsi+32]
+        movq        xmm3,           [rsi+48]
+
+    ; Duplicate and expand dc across
+        punpcklwd   xmm4,           xmm4
+        punpckldq   xmm4,           xmm4
+
+    ; Round the dc values and downshift: (dc + 4) >> 3
+        paddw       xmm4,           [GLOBAL(fours)]
+        psraw       xmm4,           3
+
+    ; Predict buffer needs to be expanded from bytes to words
+        punpcklbw   xmm0,           xmm5
+        punpcklbw   xmm1,           xmm5
+        punpcklbw   xmm2,           xmm5
+        punpcklbw   xmm3,           xmm5
+
+    ; Add to predict buffer
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm4
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm4
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm5
+        packuswb    xmm1,           xmm5
+        packuswb    xmm2,           xmm5
+        packuswb    xmm3,           xmm5
+
+    ; Load destination stride before writing out,
+    ;   doesn't need to persist
+        movsxd      rdx,            dword ptr arg(4) ; dst_stride
+
+    ; store blocks back out
+        movq        [rdi],          xmm0
+        movq        [rdi + rdx],    xmm1
+
+        lea         rdi,            [rdi + 2*rdx]
+
+        movq        [rdi],          xmm2
+        movq        [rdi + rdx],    xmm3
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
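+;void vp9_idct_dequant_dc_full_2x_sse2
+; (
+;   short *qcoeff       - 0
+;   short *dequant      - 1
+;   unsigned char *pre  - 2
+;   unsigned char *dst  - 3
+;   int dst_stride      - 4
+;   short *dc           - 5
+; )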
+global sym(vp9_idct_dequant_dc_full_2x_sse2)
+sym(vp9_idct_dequant_dc_full_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; full case: the whole qcoeff buffer is loaded and dequantized below;
+    ; the dc values arrive separately in arg(5) and are inserted afterwards
+        mov         rax,            arg(0) ; qcoeff
+        mov         rsi,            arg(2) ; pre
+        mov         rdi,            arg(3) ; dst
+
+    ; Zero out xmm7, for use in unpacking
+        pxor        xmm7,           xmm7
+
+        mov         rdx,            arg(1)  ; dequant
+
+    ; note the transpose of xmm1 and xmm2, necessary for shuffle
+    ;   to spit out sensible data
+        movdqa      xmm0,           [rax]
+        movdqa      xmm2,           [rax+16]
+        movdqa      xmm1,           [rax+32]
+        movdqa      xmm3,           [rax+48]
+
+    ; Clear out coeffs
+        movdqa      [rax],          xmm7
+        movdqa      [rax+16],       xmm7
+        movdqa      [rax+32],       xmm7
+        movdqa      [rax+48],       xmm7
+
+    ; dequantize qcoeff buffer
+        pmullw      xmm0,           [rdx]
+        pmullw      xmm2,           [rdx+16]
+        pmullw      xmm1,           [rdx]
+        pmullw      xmm3,           [rdx+16]
+
+    ; DC component
+        mov         rdx,            arg(5)
+
+    ; repack so block 0 row x and block 1 row x are together
+        movdqa      xmm4,           xmm0
+        punpckldq   xmm0,           xmm1
+        punpckhdq   xmm4,           xmm1
+
+        pshufd      xmm0,           xmm0,       11011000b
+        pshufd      xmm1,           xmm4,       11011000b
+
+        movdqa      xmm4,           xmm2
+        punpckldq   xmm2,           xmm3
+        punpckhdq   xmm4,           xmm3
+
+        pshufd      xmm2,           xmm2,       11011000b
+        pshufd      xmm3,           xmm4,       11011000b
+
+    ; insert DC component
+        pinsrw      xmm0,           [rdx],      0
+        pinsrw      xmm0,           [rdx+2],    4
+
+    ; first pass
+        psubw       xmm0,           xmm2        ; b1 = 0-2
+        paddw       xmm2,           xmm2        ;
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0        ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5        ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5        ; d1
+        movdqa      xmm6,           xmm2        ; a1
+
+        movdqa      xmm4,           xmm0        ; b1
+        paddw       xmm2,           xmm3        ;0
+
+        paddw       xmm4,           xmm7        ;1
+        psubw       xmm0,           xmm7        ;2
+
+        psubw       xmm6,           xmm3        ;3
+
+    ; transpose for the second pass
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+    ; second pass
+        psubw       xmm0,           xmm2            ; b1 = 0-2
+        paddw       xmm2,           xmm2
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0            ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5            ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5            ; d1
+        paddw       xmm0,           [GLOBAL(fours)]
+
+        paddw       xmm2,           [GLOBAL(fours)]
+        movdqa      xmm6,           xmm2            ; a1
+
+        movdqa      xmm4,           xmm0            ; b1
+        paddw       xmm2,           xmm3            ;0
+
+        paddw       xmm4,           xmm7            ;1
+        psubw       xmm0,           xmm7            ;2
+
+        psubw       xmm6,           xmm3            ;3
+        psraw       xmm2,           3
+
+        psraw       xmm0,           3
+        psraw       xmm4,           3
+
+        psraw       xmm6,           3
+
+    ; transpose to save
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+        pxor        xmm7,           xmm7
+
+    ; Load up predict blocks
+        movq        xmm4,           [rsi]
+        movq        xmm5,           [rsi+16]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm5
+
+        movq        xmm4,           [rsi+32]
+        movq        xmm5,           [rsi+48]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm5
+
+.finish:
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm7
+        packuswb    xmm1,           xmm7
+        packuswb    xmm2,           xmm7
+        packuswb    xmm3,           xmm7
+
+    ; Load destination stride before writing out,
+    ;   doesn't need to persist
+        movsxd      rdx,            dword ptr arg(4) ; dst_stride
+
+    ; store blocks back out
+        movq        [rdi],          xmm0
+        movq        [rdi + rdx],    xmm1
+
+        lea         rdi,            [rdi + 2*rdx]
+
+        movq        [rdi],          xmm2
+        movq        [rdi + rdx],    xmm3
+
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+fours:
+    times 8 dw 0x0004
+align 16
+x_s1sqr2:
+    times 8 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+    times 8 dw 0x4E7B
--- /dev/null
+++ b/vp9/common/x86/iwalsh_mmx.asm
@@ -1,0 +1,173 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_short_inv_walsh4x4_1_mmx(short *input, short *output)
+global sym(vp9_short_inv_walsh4x4_1_mmx)
+sym(vp9_short_inv_walsh4x4_1_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov     rsi, arg(0)
+    mov     rax, 3
+
+    mov     rdi, arg(1)
+    add     rax, [rsi]          ;input[0] + 3
+
+    movd    mm0, eax
+
+    punpcklwd mm0, mm0          ;x x val val
+
+    punpckldq mm0, mm0          ;val val val val
+
+    psraw   mm0, 3            ;(input[0] + 3) >> 3
+
+    movq  [rdi + 0], mm0
+    movq  [rdi + 8], mm0
+    movq  [rdi + 16], mm0
+    movq  [rdi + 24], mm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_short_inv_walsh4x4_mmx(short *input, short *output)
+global sym(vp9_short_inv_walsh4x4_mmx)
+sym(vp9_short_inv_walsh4x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov     rax, 3
+    mov     rsi, arg(0)
+    mov     rdi, arg(1)
+    shl     rax, 16
+
+    movq    mm0, [rsi + 0]        ;ip[0]
+    movq    mm1, [rsi + 8]        ;ip[4]
+    or      rax, 3            ;00030003h
+
+    movq    mm2, [rsi + 16]       ;ip[8]
+    movq    mm3, [rsi + 24]       ;ip[12]
+
+    movq    mm7, rax
+    movq    mm4, mm0
+
+    punpcklwd mm7, mm7          ;0003000300030003h
+    movq    mm5, mm1
+
+    paddw   mm4, mm3          ;ip[0] + ip[12] aka a1
+    paddw   mm5, mm2          ;ip[4] + ip[8] aka b1
+
+    movq    mm6, mm4          ;temp a1
+
+    paddw   mm4, mm5          ;a1 + b1
+    psubw   mm6, mm5          ;a1 - b1
+
+    psubw   mm0, mm3          ;ip[0] - ip[12] aka d1
+    psubw   mm1, mm2          ;ip[4] - ip[8] aka c1
+
+    movq    mm5, mm0          ;temp d1
+
+    paddw   mm0, mm1          ;d1 + c1
+    psubw   mm5, mm1          ;d1 - c1
+
+    ; 03 02 01 00
+    ; 13 12 11 10
+    ; 23 22 21 20
+    ; 33 32 31 30
+
+    movq    mm3, mm4          ; 03 02 01 00
+    punpcklwd mm4, mm0          ; 11 01 10 00
+    punpckhwd mm3, mm0          ; 13 03 12 02
+
+    movq    mm1, mm6          ; 23 22 21 20
+    punpcklwd mm6, mm5          ; 31 21 30 20
+    punpckhwd mm1, mm5          ; 33 23 32 22
+
+    movq    mm0, mm4          ; 11 01 10 00
+    movq    mm2, mm3          ; 13 03 12 02
+
+    punpckldq mm0, mm6          ; 30 20 10 00 aka ip[0]
+    punpckhdq mm4, mm6          ; 31 21 11 01 aka ip[4]
+
+    punpckldq mm2, mm1          ; 32 22 12 02 aka ip[8]
+    punpckhdq mm3, mm1          ; 33 23 13 03 aka ip[12]
+;~~~~~~~~~~~~~~~~~~~~~
+    movq    mm1, mm0
+    movq    mm5, mm4
+
+    paddw   mm1, mm3          ;ip[0] + ip[12] aka a1
+    paddw   mm5, mm2          ;ip[4] + ip[8] aka b1
+
+    movq    mm6, mm1          ;temp a1
+
+    paddw   mm1, mm5          ;a1 + b1
+    psubw   mm6, mm5          ;a1 - b1
+
+    psubw   mm0, mm3          ;ip[0] - ip[12] aka d1
+    psubw   mm4, mm2          ;ip[4] - ip[8] aka c1
+
+    movq    mm5, mm0          ;temp d1
+
+    paddw   mm0, mm4          ;d1 + c1
+    psubw   mm5, mm4          ;d1 - c1
+;~~~~~~~~~~~~~~~~~~~~~
+    movq    mm3, mm1          ; 03 02 01 00
+    punpcklwd mm1, mm0          ; 11 01 10 00
+    punpckhwd mm3, mm0          ; 13 03 12 02
+
+    movq    mm4, mm6          ; 23 22 21 20
+    punpcklwd mm6, mm5          ; 31 21 30 20
+    punpckhwd mm4, mm5          ; 33 23 32 22
+
+    movq    mm0, mm1          ; 11 01 10 00
+    movq    mm2, mm3          ; 13 03 12 02
+
+    punpckldq mm0, mm6          ; 30 20 10 00 aka ip[0]
+    punpckhdq mm1, mm6          ; 31 21 11 01 aka ip[4]
+
+    punpckldq mm2, mm4          ; 32 22 12 02 aka ip[8]
+    punpckhdq mm3, mm4          ; 33 23 13 03 aka ip[12]
+
+    paddw   mm0, mm7
+    paddw   mm1, mm7
+    paddw   mm2, mm7
+    paddw   mm3, mm7
+
+    psraw   mm0, 3
+    psraw   mm1, 3
+    psraw   mm2, 3
+    psraw   mm3, 3
+
+    movq  [rdi + 0], mm0
+    movq  [rdi + 8], mm1
+    movq  [rdi + 16], mm2
+    movq  [rdi + 24], mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
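+; Editor's note (hedged reading, not part of the original patch): the
+; routine above runs one 4-point Walsh butterfly per pass, transposing
+; between the two passes so rows and then columns are covered.  Per the
+; inline comments, each butterfly computes:
+;
+;   a1 = ip[0] + ip[12];   b1 = ip[4] + ip[8];
+;   c1 = ip[4] - ip[8];    d1 = ip[0] - ip[12];
+;   op[0] = a1 + b1;   op[4]  = d1 + c1;
+;   op[8] = a1 - b1;   op[12] = d1 - c1;
+;
+; with every lane rounded at the end as op[i] = (op[i] + 3) >> 3.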
--- /dev/null
+++ b/vp9/common/x86/iwalsh_sse2.asm
@@ -1,0 +1,119 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_short_inv_walsh4x4_sse2(short *input, short *output)
+global sym(vp9_short_inv_walsh4x4_sse2)
+sym(vp9_short_inv_walsh4x4_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    SAVE_XMM 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov     rsi, arg(0)
+    mov     rdi, arg(1)
+    mov     rax, 3
+
+    movdqa    xmm0, [rsi + 0]       ;ip[4] ip[0]
+    movdqa    xmm1, [rsi + 16]      ;ip[12] ip[8]
+
+    shl     rax, 16
+    or      rax, 3            ;00030003h
+
+    pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12]
+    movdqa    xmm3, xmm0          ;ip[4] ip[0]
+
+    paddw   xmm0, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+    psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+    movdqa    xmm4, xmm0
+    punpcklqdq  xmm0, xmm3          ;d1 a1
+    punpckhqdq  xmm4, xmm3          ;c1 b1
+    movd    xmm6, eax
+
+    movdqa    xmm1, xmm4          ;c1 b1
+    paddw   xmm4, xmm0          ;d1+c1 a1+b1 aka op[4] op[0]
+    psubw   xmm0, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
+
+;;;temp output
+;;  movdqu  [rdi + 0], xmm4
+;;  movdqu  [rdi + 16], xmm3
+
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    ; 13 12 11 10 03 02 01 00
+    ;
+    ; 33 32 31 30 23 22 21 20
+    ;
+    movdqa    xmm3, xmm4          ; 13 12 11 10 03 02 01 00
+    punpcklwd xmm4, xmm0          ; 23 03 22 02 21 01 20 00
+    punpckhwd xmm3, xmm0          ; 33 13 32 12 31 11 30 10
+    movdqa    xmm1, xmm4          ; 23 03 22 02 21 01 20 00
+    punpcklwd xmm4, xmm3          ; 31 21 11 01 30 20 10 00
+    punpckhwd xmm1, xmm3          ; 33 23 13 03 32 22 12 02
+    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12]
+    movdqa    xmm3, xmm4          ;ip[4] ip[0]
+
+    pshufd    xmm6, xmm6, 0       ;03 03 03 03 03 03 03 03
+
+    paddw   xmm4, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+    psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+    movdqa    xmm5, xmm4
+    punpcklqdq  xmm4, xmm3          ;d1 a1
+    punpckhqdq  xmm5, xmm3          ;c1 b1
+
+    movdqa    xmm1, xmm5          ;c1 b1
+    paddw   xmm5, xmm4          ;d1+c1 a1+b1 aka op[4] op[0]
+    psubw   xmm4, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    ; 13 12 11 10 03 02 01 00
+    ;
+    ; 33 32 31 30 23 22 21 20
+    ;
+    movdqa    xmm0, xmm5          ; 13 12 11 10 03 02 01 00
+    punpcklwd xmm5, xmm4          ; 23 03 22 02 21 01 20 00
+    punpckhwd xmm0, xmm4          ; 33 13 32 12 31 11 30 10
+    movdqa    xmm1, xmm5          ; 23 03 22 02 21 01 20 00
+    punpcklwd xmm5, xmm0          ; 31 21 11 01 30 20 10 00
+    punpckhwd xmm1, xmm0          ; 33 23 13 03 32 22 12 02
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    paddw   xmm5, xmm6
+    paddw   xmm1, xmm6
+
+    psraw   xmm5, 3
+    psraw   xmm1, 3
+
+    movdqa  [rdi + 0], xmm5
+    movdqa  [rdi + 16], xmm1
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+    times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+    times 4 dw 0x4E7B
+align 16
+fours:
+    times 4 dw 0x0004
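+; Editor's note (not part of the original patch): the SSE2 routine above
+; is the same two-pass butterfly, but it holds two 4x16-bit rows per xmm
+; register, so each pass needs roughly half the instructions of the MMX
+; version.  The x_s1sqr2 / x_c1sqr2less1 / fours constants do not appear
+; to be referenced in this file; they look like leftovers from the
+; companion iDCT source.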
--- /dev/null
+++ b/vp9/common/x86/loopfilter_mmx.asm
@@ -1,0 +1,969 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp9_loop_filter_horizontal_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int src_pixel_step,
+;    const char *blimit,
+;    const char *limit,
+;    const char *thresh,
+;    int  count
+;)
+global sym(vp9_loop_filter_horizontal_edge_mmx)
+sym(vp9_loop_filter_horizontal_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 32                         ; reserve 32 bytes
+    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; source pitch
+
+        movsxd      rcx, dword ptr arg(5) ;count
+.next8_h:
+        mov         rdx, arg(3) ;limit
+        movq        mm7, [rdx]
+        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
+        add         rdi, rax
+
+        ; calculate breakout conditions
+        movq        mm2, [rdi+2*rax]      ; q3
+        movq        mm1, [rsi+2*rax]      ; q2
+        movq        mm6, mm1              ; q2
+        psubusb     mm1, mm2              ; q2-=q3
+        psubusb     mm2, mm6              ; q3-=q2
+        por         mm1, mm2              ; abs(q3-q2)
+        psubusb     mm1, mm7              ;
+
+
+        movq        mm4, [rsi+rax]        ; q1
+        movq        mm3, mm4              ; q1
+        psubusb     mm4, mm6              ; q1-=q2
+        psubusb     mm6, mm3              ; q2-=q1
+        por         mm4, mm6              ; abs(q2-q1)
+
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+        movq        mm4, [rsi]            ; q0
+        movq        mm0, mm4              ; q0
+        psubusb     mm4, mm3              ; q0-=q1
+        psubusb     mm3, mm0              ; q1-=q0
+        por         mm4, mm3              ; abs(q0-q1)
+        movq        t0, mm4               ; save to t0
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+
+        neg         rax                   ; negate pitch to deal with above border
+
+        movq        mm2, [rsi+4*rax]      ; p3
+        movq        mm4, [rdi+4*rax]      ; p2
+        movq        mm5, mm4              ; p2
+        psubusb     mm4, mm2              ; p2-=p3
+        psubusb     mm2, mm5              ; p3-=p2
+        por         mm4, mm2              ; abs(p3 - p2)
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+
+        movq        mm4, [rsi+2*rax]      ; p1
+        movq        mm3, mm4              ; p1
+        psubusb     mm4, mm5              ; p1-=p2
+        psubusb     mm5, mm3              ; p2-=p1
+        por         mm4, mm5              ; abs(p2 - p1)
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+        movq        mm2, mm3              ; p1
+
+        movq        mm4, [rsi+rax]        ; p0
+        movq        mm5, mm4              ; p0
+        psubusb     mm4, mm3              ; p0-=p1
+        psubusb     mm3, mm5              ; p1-=p0
+        por         mm4, mm3              ; abs(p1 - p0)
+        movq        t1, mm4               ; save to t1
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+        movq        mm3, [rdi]            ; q1
+        movq        mm4, mm3              ; q1
+        psubusb     mm3, mm2              ; q1-=p1
+        psubusb     mm2, mm4              ; p1-=q1
+        por         mm2, mm3              ; abs(p1-q1)
+        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
+        psrlw       mm2, 1                ; abs(p1-q1)/2
+
+        movq        mm6, mm5              ; p0
+        movq        mm3, [rsi]            ; q0
+        psubusb     mm5, mm3              ; p0-=q0
+        psubusb     mm3, mm6              ; q0-=p0
+        por         mm5, mm3              ; abs(p0 - q0)
+        paddusb     mm5, mm5              ; abs(p0-q0)*2
+        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        mov         rdx, arg(2) ;blimit           ; get blimit
+        movq        mm7, [rdx]            ; blimit
+
+        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         mm1,    mm5
+        pxor        mm5,    mm5
+        pcmpeqb     mm1,    mm5           ; mask mm1
+
+        ; calculate high edge variance
+        mov         rdx, arg(4) ;thresh           ; get thresh
+        movq        mm7, [rdx]            ;
+        movq        mm4, t0               ; get abs (q1 - q0)
+        psubusb     mm4, mm7
+        movq        mm3, t1               ; get abs (p1 - p0)
+        psubusb     mm3, mm7
+        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+        pcmpeqb     mm4,        mm5
+
+        pcmpeqb     mm5,        mm5
+        pxor        mm4,        mm5
+
+
+        ; start work on filters
+        movq        mm2, [rsi+2*rax]      ; p1
+        movq        mm7, [rdi]            ; q1
+        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
+        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
+        psubsb      mm2, mm7              ; p1 - q1
+        pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
+        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
+        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
+        movq        mm3, mm0              ; q0
+        psubsb      mm0, mm6              ; q0 - p0
+        paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
+        pand        mm1, mm2                  ; mask filter values we don't care about
+        movq        mm2, mm1
+        paddsb      mm1, [GLOBAL(t4)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+        paddsb      mm2, [GLOBAL(t3)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+        pxor        mm0, mm0             ;
+        pxor        mm5, mm5
+        punpcklbw   mm0, mm2            ;
+        punpckhbw   mm5, mm2            ;
+        psraw       mm0, 11             ;
+        psraw       mm5, 11
+        packsswb    mm0, mm5
+        movq        mm2, mm0            ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+        pxor        mm0, mm0              ; 0
+        movq        mm5, mm1              ; abcdefgh
+        punpcklbw   mm0, mm1              ; e0f0g0h0
+        psraw       mm0, 11               ; sign extended shift right by 3
+        pxor        mm1, mm1              ; 0
+        punpckhbw   mm1, mm5              ; a0b0c0d0
+        psraw       mm1, 11               ; sign extended shift right by 3
+        movq        mm5, mm0              ; save results
+
+        packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+        paddsw      mm5, [GLOBAL(ones)]
+        paddsw      mm1, [GLOBAL(ones)]
+        psraw       mm5, 1                ; partial shifted one more time for 2nd tap
+        psraw       mm1, 1                ; partial shifted one more time for 2nd tap
+        packsswb    mm5, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+        pandn       mm4, mm5              ; high edge variance additive
+
+        paddsb      mm6, mm2              ; p0+= p0 add
+        pxor        mm6, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi+rax], mm6        ; write back
+
+        movq        mm6, [rsi+2*rax]      ; p1
+        pxor        mm6, [GLOBAL(t80)]    ; reoffset
+        paddsb      mm6, mm4              ; p1+= p1 add
+        pxor        mm6, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi+2*rax], mm6      ; write back
+
+        psubsb      mm3, mm0              ; q0-= q0 add
+        pxor        mm3, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi], mm3            ; write back
+
+        psubsb      mm7, mm4              ; q1-= q1 add
+        pxor        mm7, [GLOBAL(t80)]    ; unoffset
+        movq        [rdi], mm7            ; write back
+
+        add         rsi,8
+        neg         rax
+        dec         rcx
+        jnz         .next8_h
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
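+; Editor's note (hedged sketch, not part of the original patch): the
+; per-pixel math above, reconstructed from the inline comments.  The
+; ps*/qs* values are the pixels made signed by XORing with 0x80 (t80);
+; clamp() is the signed saturation that paddsb/psubsb provide:
+;
+;   mask = (abs(q3-q2) <= limit) & (abs(q2-q1) <= limit)
+;        & (abs(q1-q0) <= limit) & (abs(p3-p2) <= limit)
+;        & (abs(p2-p1) <= limit) & (abs(p1-p0) <= limit)
+;        & (abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit);
+;   hev  = (abs(p1-p0) > thresh) || (abs(q1-q0) > thresh);
+;
+;   f  = clamp((hev ? clamp(ps1 - qs1) : 0) + 3 * (qs0 - ps0)) & mask;
+;   f1 = clamp(f + 4) >> 3;    qs0 = clamp(qs0 - f1);
+;   f2 = clamp(f + 3) >> 3;    ps0 = clamp(ps0 + f2);
+;   u  = hev ? 0 : (f1 + 1) >> 1;
+;   qs1 = clamp(qs1 - u);      ps1 = clamp(ps1 + u);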
+
+;void vp9_loop_filter_vertical_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit,
+;    const char *limit,
+;    const char *thresh,
+;    int count
+;)
+global sym(vp9_loop_filter_vertical_edge_mmx)
+sym(vp9_loop_filter_vertical_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, 64      ; reserve 64 bytes
+    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
+    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];
+
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; source pitch
+
+        lea         rsi,        [rsi + rax*4 - 4]
+
+        movsxd      rcx,        dword ptr arg(5) ;count
+.next8_v:
+        mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
+        add         rdi,        rax
+
+
+        ;transpose
+        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
+        movq        mm7,        mm6                         ; 77 76 75 74 73 72 71 70
+
+        punpckhbw   mm7,        [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
+        punpcklbw   mm6,        [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60
+
+        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
+        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
+
+        punpckhbw   mm5,        [rsi+rax]                   ; 57 47 56 46 55 45 54 44
+        punpcklbw   mm4,        [rsi+rax]                   ; 53 43 52 42 51 41 50 40
+
+        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
+        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
+
+        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
+        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
+
+        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
+        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
+
+        neg         rax
+        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
+
+        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
+        punpckhbw   mm6,        [rsi+rax]                   ; 37 27 36 26 35 25 34 24
+
+        punpcklbw   mm1,        [rsi+rax]                   ; 33 23 32 22 31 21 30 20
+        movq        mm7,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
+
+        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
+        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
+
+        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
+        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
+
+        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
+        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
+
+        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
+
+        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
+        psubusb     mm5,        mm7                         ; q2-q3
+
+        psubusb     mm7,        mm6                         ; q3-q2
+        por         mm7,        mm5                         ; mm7=abs (q3-q2)
+
+        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
+        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
+
+        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 14 04 = q0
+        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
+
+        psubusb     mm3,        mm6                         ; q1-q2
+        psubusb     mm6,        mm5                         ; q2-q1
+
+        por         mm6,        mm3                         ; mm6=abs(q2-q1)
+        lea         rdx,        srct
+
+        movq        [rdx+24],   mm5                         ; save q1
+        movq        [rdx+16],   mm0                         ; save q0
+
+        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
+        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
+
+        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
+        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
+
+        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
+        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
+
+        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
+        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
+
+        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
+        psubusb     mm2,        mm0                         ; p2-p3
+
+        psubusb     mm0,        mm1                         ; p3-p2
+        por         mm0,        mm2                         ; mm0=abs(p3-p2)
+
+        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
+        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
+
+        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
+        movq        [rdx+8],    mm3                         ; save p0
+
+        movq        [rdx],      mm2                         ; save p1
+        movq        mm5,        mm2                         ; mm5 = p1
+
+        psubusb     mm2,        mm1                         ; p1-p2
+        psubusb     mm1,        mm5                         ; p2-p1
+
+        por         mm1,        mm2                         ; mm1=abs(p2-p1)
+        mov         rdx,        arg(3) ;limit
+
+        movq        mm4,        [rdx]                       ; mm4 = limit
+        psubusb     mm7,        mm4
+
+        psubusb     mm0,        mm4
+        psubusb     mm1,        mm4
+
+        psubusb     mm6,        mm4
+        por         mm7,        mm6
+
+        por         mm0,        mm1
+        por         mm0,        mm7                         ; abs(q3-q2) > limit || abs(p3-p2) > limit || abs(p2-p1) > limit || abs(q2-q1) > limit
+
+        movq        mm1,        mm5                         ; p1
+
+        movq        mm7,        mm3                         ; mm3=mm7=p0
+        psubusb     mm7,        mm5                         ; p0 - p1
+
+        psubusb     mm5,        mm3                         ; p1 - p0
+        por         mm5,        mm7                         ; abs(p1-p0)
+
+        movq        t0,         mm5                         ; save abs(p1-p0)
+        lea         rdx,        srct
+
+        psubusb     mm5,        mm4
+        por         mm0,        mm5                         ; mm0=mask
+
+        movq        mm5,        [rdx+16]                    ; mm5=q0
+        movq        mm7,        [rdx+24]                    ; mm7=q1
+
+        movq        mm6,        mm5                         ; mm6=q0
+        movq        mm2,        mm7                         ; q1
+        psubusb     mm5,        mm7                         ; q0-q1
+
+        psubusb     mm7,        mm6                         ; q1-q0
+        por         mm7,        mm5                         ; abs(q1-q0)
+
+        movq        t1,         mm7                         ; save abs(q1-q0)
+        psubusb     mm7,        mm4
+
+        por         mm0,        mm7                         ; mask
+
+        movq        mm5,        mm2                         ; q1
+        psubusb     mm5,        mm1                         ; q1-=p1
+        psubusb     mm1,        mm2                         ; p1-=q1
+        por         mm5,        mm1                         ; abs(p1-q1)
+        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
+        psrlw       mm5,        1                           ; abs(p1-q1)/2
+
+        mov         rdx,        arg(2) ;blimit                      ;
+
+        movq        mm4,        [rdx]                       ;blimit
+        movq        mm1,        mm3                         ; mm1=mm3=p0
+
+        movq        mm7,        mm6                         ; mm7=mm6=q0
+        psubusb     mm1,        mm7                         ; p0-q0
+
+        psubusb     mm7,        mm3                         ; q0-p0
+        por         mm1,        mm7                         ; abs(q0-p0)
+        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
+        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         mm1,        mm0                         ; mask
+
+        pxor        mm0,        mm0
+        pcmpeqb     mm1,        mm0
+
+        ; calculate high edge variance
+        mov         rdx,        arg(4) ;thresh            ; get thresh
+        movq        mm7,        [rdx]
+        ;
+        movq        mm4,        t0              ; get abs (q1 - q0)
+        psubusb     mm4,        mm7
+
+        movq        mm3,        t1              ; get abs (p1 - p0)
+        psubusb     mm3,        mm7
+
+        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+        pcmpeqb     mm4,        mm0
+
+        pcmpeqb     mm0,        mm0
+        pxor        mm4,        mm0
+
+
+
+        ; start work on filters
+        lea         rdx,        srct
+
+        movq        mm2,        [rdx]           ; p1
+        movq        mm7,        [rdx+24]        ; q1
+
+        movq        mm6,        [rdx+8]         ; p0
+        movq        mm0,        [rdx+16]        ; q0
+
+        pxor        mm2,        [GLOBAL(t80)]   ; p1 offset to convert to signed values
+        pxor        mm7,        [GLOBAL(t80)]   ; q1 offset to convert to signed values
+
+        psubsb      mm2,        mm7             ; p1 - q1
+        pand        mm2,        mm4             ; high var mask (hvm)(p1 - q1)
+
+        pxor        mm6,        [GLOBAL(t80)]   ; offset to convert to signed values
+        pxor        mm0,        [GLOBAL(t80)]   ; offset to convert to signed values
+
+        movq        mm3,        mm0             ; q0
+        psubsb      mm0,        mm6             ; q0 - p0
+
+        paddsb      mm2,        mm0             ; 1 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2,        mm0             ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+        paddsb      mm2,        mm0             ; 3 * (q0 - p0) + hvm(p1 - q1)
+        pand       mm1,        mm2              ; mask filter values we don't care about
+
+        movq        mm2,        mm1
+        paddsb      mm1,        [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+
+        paddsb      mm2,        [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+        pxor        mm0,        mm0          ;
+
+        pxor        mm5,        mm5
+        punpcklbw   mm0,        mm2         ;
+
+        punpckhbw   mm5,        mm2         ;
+        psraw       mm0,        11              ;
+
+        psraw       mm5,        11
+        packsswb    mm0,        mm5
+
+        movq        mm2,        mm0         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+        pxor        mm0,        mm0           ; 0
+        movq        mm5,        mm1           ; abcdefgh
+
+        punpcklbw   mm0,        mm1           ; e0f0g0h0
+        psraw       mm0,        11                ; sign extended shift right by 3
+
+        pxor        mm1,        mm1           ; 0
+        punpckhbw   mm1,        mm5           ; a0b0c0d0
+
+        psraw       mm1,        11                ; sign extended shift right by 3
+        movq        mm5,        mm0              ; save results
+
+        packsswb    mm0,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+        paddsw      mm5,        [GLOBAL(ones)]
+
+        paddsw      mm1,        [GLOBAL(ones)]
+        psraw       mm5,        1                 ; partial shifted one more time for 2nd tap
+
+        psraw       mm1,        1                 ; partial shifted one more time for 2nd tap
+        packsswb    mm5,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+
+        pandn       mm4,        mm5             ; high edge variance additive
+
+        paddsb      mm6,        mm2             ; p0+= p0 add
+        pxor        mm6,        [GLOBAL(t80)]   ; unoffset
+
+        ; mm6=p0                               ;
+        movq        mm1,        [rdx]           ; p1
+        pxor        mm1,        [GLOBAL(t80)]   ; reoffset
+
+        paddsb      mm1,        mm4                 ; p1+= p1 add
+        pxor        mm1,        [GLOBAL(t80)]       ; unoffset
+        ; mm6 = p0 mm1 = p1
+
+        psubsb      mm3,        mm0                 ; q0-= q0 add
+        pxor        mm3,        [GLOBAL(t80)]       ; unoffset
+
+        ; mm3 = q0
+        psubsb      mm7,        mm4                 ; q1-= q1 add
+        pxor        mm7,        [GLOBAL(t80)]       ; unoffset
+        ; mm7 = q1
+
+        ; transpose and write back
+        ; mm1 =    72 62 52 42 32 22 12 02
+        ; mm6 =    73 63 53 43 33 23 13 03
+        ; mm3 =    74 64 54 44 34 24 14 04
+        ; mm7 =    75 65 55 45 35 25 15 05
+
+        movq        mm2,        mm1             ; 72 62 52 42 32 22 12 02
+        punpcklbw   mm2,        mm6             ; 33 32 23 22 13 12 03 02
+
+        movq        mm4,        mm3             ; 74 64 54 44 34 24 14 04
+        punpckhbw   mm1,        mm6             ; 73 72 63 62 53 52 43 42
+
+        punpcklbw   mm4,        mm7             ; 35 34 25 24 15 14 05 04
+        punpckhbw   mm3,        mm7             ; 75 74 65 64 55 54 45 44
+
+        movq        mm6,        mm2             ; 33 32 23 22 13 12 03 02
+        punpcklwd   mm2,        mm4             ; 15 14 13 12 05 04 03 02
+
+        punpckhwd   mm6,        mm4             ; 35 34 33 32 25 24 23 22
+        movq        mm5,        mm1             ; 73 72 63 62 53 52 43 42
+
+        punpcklwd   mm1,        mm3             ; 55 54 53 52 45 44 43 42
+        punpckhwd   mm5,        mm3             ; 75 74 73 72 65 64 63 62
+
+
+        ; mm2 = 15 14 13 12 05 04 03 02
+        ; mm6 = 35 34 33 32 25 24 23 22
+        ; mm5 = 55 54 53 52 45 44 43 42
+        ; mm1 = 75 74 73 72 65 64 63 62
+
+
+
+        movd        [rsi+rax*4+2], mm2
+        psrlq       mm2,        32
+
+        movd        [rdi+rax*4+2], mm2
+        movd        [rsi+rax*2+2], mm6
+
+        psrlq       mm6,        32
+        movd        [rsi+rax+2],mm6
+
+        movd        [rsi+2],    mm1
+        psrlq       mm1,        32
+
+        movd        [rdi+2],    mm1
+        neg         rax
+
+        movd        [rdi+rax+2],mm5
+        psrlq       mm5,        32
+
+        movd        [rdi+rax*2+2], mm5
+
+        lea         rsi,        [rsi+rax*8]
+        dec         rcx
+        jnz         .next8_v
+
+    add rsp, 64
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
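+; Editor's note (not part of the original patch): the vertical-edge
+; routine above applies exactly the same filter as the horizontal one.
+; The punpck blocks at the top and bottom of the loop form an 8x8 byte
+; transpose: the columns straddling the vertical edge are rotated into
+; registers as rows, filtered, and the four modified rows (p1 p0 q0 q1)
+; are transposed back before the movd stores.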
+
+;void vp9_loop_filter_simple_horizontal_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit
+;)
+global sym(vp9_loop_filter_simple_horizontal_edge_mmx)
+sym(vp9_loop_filter_simple_horizontal_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; source pitch
+
+        mov         rcx, 2                ; count
+.nexts8_h:
+        mov         rdx, arg(2) ;blimit           ; get blimit
+        movq        mm3, [rdx]            ;
+
+        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
+        add         rdi, rax
+        neg         rax
+
+        ; calculate mask
+        movq        mm1, [rsi+2*rax]      ; p1
+        movq        mm0, [rdi]            ; q1
+        movq        mm2, mm1
+        movq        mm7, mm0
+        movq        mm4, mm0
+        psubusb     mm0, mm1              ; q1-=p1
+        psubusb     mm1, mm4              ; p1-=q1
+        por         mm1, mm0              ; abs(p1-q1)
+        pand        mm1, [GLOBAL(tfe)]    ; set lsb of each byte to zero
+        psrlw       mm1, 1                ; abs(p1-q1)/2
+
+        movq        mm5, [rsi+rax]        ; p0
+        movq        mm4, [rsi]            ; q0
+        movq        mm0, mm4              ; q0
+        movq        mm6, mm5              ; p0
+        psubusb     mm5, mm4              ; p0-=q0
+        psubusb     mm4, mm6              ; q0-=p0
+        por         mm5, mm4              ; abs(p0 - q0)
+        paddusb     mm5, mm5              ; abs(p0-q0)*2
+        paddusb     mm5, mm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        psubusb     mm5, mm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        pxor        mm3, mm3
+        pcmpeqb     mm5, mm3
+
+        ; start work on filters
+        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
+        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
+        psubsb      mm2, mm7              ; p1 - q1
+
+        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
+        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
+        movq        mm3, mm0              ; q0
+        psubsb      mm0, mm6              ; q0 - p0
+        paddsb      mm2, mm0              ; p1 - q1 + 1 * (q0 - p0)
+        paddsb      mm2, mm0              ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      mm2, mm0              ; p1 - q1 + 3 * (q0 - p0)
+        pand        mm5, mm2              ; mask filter values we don't care about
+
+        ; do + 4 side
+        paddsb      mm5, [GLOBAL(t4)]     ; 3* (q0 - p0) + (p1 - q1) + 4
+
+        movq        mm0, mm5              ; get a copy of filters
+        psllw       mm0, 8                ; shift left 8
+        psraw       mm0, 3                ; arithmetic shift right 3
+        psrlw       mm0, 8
+        movq        mm1, mm5              ; get a copy of filters
+        psraw       mm1, 11               ; arithmetic shift right 11
+        psllw       mm1, 8                ; shift left 8 to put it back
+
+        por         mm0, mm1              ; put the two together to get result
+
+        psubsb      mm3, mm0              ; q0-= q0 add
+        pxor        mm3, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi], mm3            ; write back
+
+
+        ; now do +3 side
+        psubsb      mm5, [GLOBAL(t1s)]     ; +3 instead of +4
+
+        movq        mm0, mm5              ; get a copy of filters
+        psllw       mm0, 8                ; shift left 8
+        psraw       mm0, 3                ; arithmetic shift right 3
+        psrlw       mm0, 8
+        psraw       mm5, 11               ; arithmetic shift right 11
+        psllw       mm5, 8                ; shift left 8 to put it back
+        por         mm0, mm5              ; put the two together to get result
+
+
+        paddsb      mm6, mm0              ; p0+= p0 add
+        pxor        mm6, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi+rax], mm6        ; write back
+
+        add         rsi,8
+        neg         rax
+        dec         rcx
+        jnz         .nexts8_h
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
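+; Editor's note (hedged sketch, not part of the original patch): the
+; "simple" filter above uses only blimit (no limit / thresh, no hev) and
+; adjusts just p0 and q0 (ps*/qs* are the signed, 0x80-offset values):
+;
+;   mask = (abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit);
+;   f    = clamp(clamp(ps1 - qs1) + 3 * (qs0 - ps0)) & mask;
+;   qs0  = clamp(qs0 - (clamp(f + 4) >> 3));
+;   ps0  = clamp(ps0 + (clamp(f + 3) >> 3));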
+
+;void vp9_loop_filter_simple_vertical_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit
+;)
+global sym(vp9_loop_filter_simple_vertical_edge_mmx)
+sym(vp9_loop_filter_simple_vertical_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, 32      ; reserve 32 bytes
+    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; source pitch
+
+        lea         rsi, [rsi + rax*4 - 2]
+        mov         rcx, 2                                      ; count
+.nexts8_v:
+
+        lea         rdi,        [rsi + rax];
+        movd        mm0,        [rdi + rax * 2]                 ; xx xx xx xx 73 72 71 70
+
+        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 63 62 61 60
+        punpcklbw   mm6,        mm0                             ; 73 63 72 62 71 61 70 60
+
+        movd        mm0,        [rsi + rax]                     ; xx xx xx xx 53 52 51 50
+        movd        mm4,        [rsi]                           ; xx xx xx xx 43 42 41 40
+
+        punpcklbw   mm4,        mm0                             ; 53 43 52 42 51 41 50 40
+        movq        mm5,        mm4                             ; 53 43 52 42 51 41 50 40
+
+        punpcklwd   mm4,        mm6                             ; 71 61 51 41 70 60 50 40
+        punpckhwd   mm5,        mm6                             ; 73 63 53 43 72 62 52 42
+
+        neg         rax
+
+        movd        mm7,        [rsi + rax]                     ; xx xx xx xx 33 32 31 30
+        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 23 22 21 20
+
+        punpcklbw   mm6,        mm7                             ; 33 23 32 22 31 21 30 20
+        movd        mm1,        [rdi + rax * 4]                 ; xx xx xx xx 13 12 11 10
+
+        movd        mm0,        [rsi + rax * 4]                 ; xx xx xx xx 03 02 01 00
+        punpcklbw   mm0,        mm1                             ; 13 03 12 02 11 01 10 00
+
+        movq        mm2,        mm0                             ; 13 03 12 02 11 01 10 00
+        punpcklwd   mm0,        mm6                             ; 31 21 11 01 30 20 10 00
+
+        punpckhwd   mm2,        mm6                             ; 33 23 13 03 32 22 12 02
+        movq        mm1,        mm0                             ; 13 03 12 02 11 01 10 00
+
+        punpckldq   mm0,        mm4                             ; 70 60 50 40 30 20 10 00       = p1
+        movq        mm3,        mm2                             ; 33 23 13 03 32 22 12 02
+
+        punpckhdq   mm1,        mm4                             ; 71 61 51 41 31 21 11 01       = p0
+        punpckldq   mm2,        mm5                             ; 72 62 52 42 32 22 12 02       = q0
+
+        punpckhdq   mm3,        mm5                             ; 73 63 53 43 33 23 13 03       = q1
+
+
+        ; calculate mask
+        movq        mm6,        mm0                             ; p1
+        movq        mm7,        mm3                             ; q1
+        psubusb     mm7,        mm6                             ; q1-=p1
+        psubusb     mm6,        mm3                             ; p1-=q1
+        por         mm6,        mm7                             ; abs(p1-q1)
+        pand        mm6,        [GLOBAL(tfe)]                   ; set lsb of each byte to zero
+        psrlw       mm6,        1                               ; abs(p1-q1)/2
+
+        movq        mm5,        mm1                             ; p0
+        movq        mm4,        mm2                             ; q0
+
+        psubusb     mm5,        mm2                             ; p0-=q0
+        psubusb     mm4,        mm1                             ; q0-=p0
+
+        por         mm5,        mm4                             ; abs(p0 - q0)
+        paddusb     mm5,        mm5                             ; abs(p0-q0)*2
+        paddusb     mm5,        mm6                             ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        mov         rdx,        arg(2) ;blimit                          ; get blimit
+        movq        mm7,        [rdx]
+
+        psubusb     mm5,        mm7                             ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        pxor        mm7,        mm7
+        pcmpeqb     mm5,        mm7                             ; mm5 = mask
+
+        ; start work on filters
+        movq        t0,         mm0
+        movq        t1,         mm3
+
+        pxor        mm0,        [GLOBAL(t80)]                   ; p1 offset to convert to signed values
+        pxor        mm3,        [GLOBAL(t80)]                   ; q1 offset to convert to signed values
+
+        psubsb      mm0,        mm3                             ; p1 - q1
+        movq        mm6,        mm1                             ; p0
+
+        movq        mm7,        mm2                             ; q0
+        pxor        mm6,        [GLOBAL(t80)]                   ; offset to convert to signed values
+
+        pxor        mm7,        [GLOBAL(t80)]                   ; offset to convert to signed values
+        movq        mm3,        mm7                             ; offset (signed) q0
+
+        psubsb      mm7,        mm6                             ; q0 - p0
+        paddsb      mm0,        mm7                             ; p1 - q1 + 1 * (q0 - p0)
+
+        paddsb      mm0,        mm7                             ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      mm0,        mm7                             ; p1 - q1 + 3 * (q0 - p0)
+
+        pand        mm5,        mm0                             ; mask filter values we don't care about
+
+        paddsb      mm5,        [GLOBAL(t4)]                    ;  3* (q0 - p0) + (p1 - q1) + 4
+
+        movq        mm0,        mm5                             ; get a copy of filters
+        psllw       mm0,        8                               ; shift left 8
+        psraw       mm0,        3                               ; arithmetic shift right 3
+        psrlw       mm0,        8
+
+        movq        mm7,        mm5                             ; get a copy of filters
+        psraw       mm7,        11                              ; arithmetic shift right 11
+        psllw       mm7,        8                               ; shift left 8 to put it back
+
+        por         mm0,        mm7                             ; put the two together to get result
+
+        psubsb      mm3,        mm0                             ; q0-= q0sz add
+        pxor        mm3,        [GLOBAL(t80)]                   ; unoffset
+
+        ; now do +3 side
+        psubsb      mm5, [GLOBAL(t1s)]                          ; +3 instead of +4
+
+        movq        mm0, mm5                                    ; get a copy of filters
+        psllw       mm0, 8                                      ; shift left 8
+        psraw       mm0, 3                                      ; arithmetic shift right 3
+        psrlw       mm0, 8
+
+        psraw       mm5, 11                                     ; arithmetic shift right 11
+        psllw       mm5, 8                                      ; shift left 8 to put it back
+        por         mm0, mm5                                    ; put the two together to get result
+
+        paddsb      mm6, mm0                                    ; p0+= p0 add
+        pxor        mm6, [GLOBAL(t80)]                          ; unoffset
+
+
+        movq        mm0,        t0
+        movq        mm4,        t1
+
+        ; mm0 = 70 60 50 40 30 20 10 00
+        ; mm6 = 71 61 51 41 31 21 11 01
+        ; mm3 = 72 62 52 42 32 22 12 02
+        ; mm4 = 73 63 53 43 33 23 13 03
+        ; transpose back to write out
+
+        movq        mm1,        mm0                         ;
+        punpcklbw   mm0,        mm6                         ; 31 30 21 20 11 10 01 00
+
+        punpckhbw   mm1,        mm6                         ; 71 70 61 60 51 50 41 40
+        movq        mm2,        mm3                         ;
+
+        punpcklbw   mm2,        mm4                         ; 33 32 23 22 13 12 03 02
+        movq        mm5,        mm1                         ; 71 70 61 60 51 50 41 40
+
+        punpckhbw   mm3,        mm4                         ; 73 72 63 62 53 52 43 42
+        movq        mm6,        mm0                         ; 31 30 21 20 11 10 01 00
+
+        punpcklwd   mm0,        mm2                         ; 13 12 11 10 03 02 01 00
+        punpckhwd   mm6,        mm2                         ; 33 32 31 30 23 22 21 20
+
+        movd        [rsi+rax*4], mm0                        ; write 03 02 01 00
+        punpcklwd   mm1,        mm3                         ; 53 52 51 50 43 42 41 40
+
+        psrlq       mm0,        32                          ; xx xx xx xx 13 12 11 10
+        punpckhwd   mm5,        mm3                         ; 73 72 71 70 63 62 61 60
+
+        movd        [rdi+rax*4], mm0                        ; write 13 12 11 10
+        movd        [rsi+rax*2], mm6                        ; write 23 22 21 20
+
+        psrlq       mm6,        32                          ; 33 32 31 30
+        movd        [rsi],      mm1                         ; write 43 42 41 40
+
+        movd        [rsi + rax], mm6                        ; write 33 32 31 30
+        neg         rax
+
+        movd        [rsi + rax*2], mm5                      ; write 63 62 61 60
+        psrlq       mm1,        32                          ; 53 52 51 50
+
+        movd        [rdi],      mm1                         ; write out 53 52 51 50
+        psrlq       mm5,        32                          ; 73 72 71 70
+
+        movd        [rdi + rax*2], mm5                      ; write 73 72 71 70
+
+        lea         rsi,        [rsi+rax*8]                 ; next 8
+
+        dec         rcx
+        jnz         .nexts8_v
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
+;                  int y_stride,
+;                  loop_filter_info *lfi)
+;{
+;
+;
+;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;}
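+; Editor's note: the commented-out C above appears to predate the
+; 3-argument prototype of vp9_loop_filter_simple_vertical_edge_mmx in
+; this file (it still passes lfi->lim, lfi->thr and a count), so it
+; documents intent rather than a callable signature.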
+
+SECTION_RODATA
+align 16
+tfe:
+    times 8 db 0xfe
+align 16
+t80:
+    times 8 db 0x80
+align 16
+t1s:
+    times 8 db 0x01
+align 16
+t3:
+    times 8 db 0x03
+align 16
+t4:
+    times 8 db 0x04
+align 16
+ones:
+    times 4 dw 0x0001
+align 16
+s27:
+    times 4 dw 0x1b00
+align 16
+s18:
+    times 4 dw 0x1200
+align 16
+s9:
+    times 4 dw 0x0900
+align 16
+s63:
+    times 4 dw 0x003f
--- /dev/null
+++ b/vp9/common/x86/loopfilter_sse2.asm
@@ -1,0 +1,1238 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; Use of pmaxub instead of psubusb to compute the filter mask follows
+; the approach seen in FFmpeg's VP8 decoder (ffvp8).
+
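+; Editor's note (hedged sketch, not part of the original patch): the
+; pmaxub trick keeps a running byte-wise maximum of all the absolute
+; differences and tests the limit once, instead of the psubusb + por
+; accumulation used in the MMX file:
+;
+;   m    = max(abs(q3-q2), abs(q2-q1), abs(q1-q0),
+;              abs(p3-p2), abs(p2-p1), abs(p1-p0));
+;   mask = (m <= limit) && (abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit);
+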
+%macro LFH_FILTER_AND_HEV_MASK 1
+%if %1
+        movdqa      xmm2,                   [rdi+2*rax]       ; q3
+        movdqa      xmm1,                   [rsi+2*rax]       ; q2
+        movdqa      xmm4,                   [rsi+rax]         ; q1
+        movdqa      xmm5,                   [rsi]             ; q0
+        neg         rax                     ; negate pitch to deal with above border
+%else
+        movlps      xmm2,                   [rsi + rcx*2]     ; q3
+        movlps      xmm1,                   [rsi + rcx]       ; q2
+        movlps      xmm4,                   [rsi]             ; q1
+        movlps      xmm5,                   [rsi + rax]       ; q0
+
+        movhps      xmm2,                   [rdi + rcx*2]
+        movhps      xmm1,                   [rdi + rcx]
+        movhps      xmm4,                   [rdi]
+        movhps      xmm5,                   [rdi + rax]
+
+        lea         rsi,                    [rsi + rax*4]
+        lea         rdi,                    [rdi + rax*4]
+
+        movdqa      XMMWORD PTR [rsp],      xmm1              ; store q2
+        movdqa      XMMWORD PTR [rsp + 16], xmm4              ; store q1
+%endif
+
+        movdqa      xmm6,                   xmm1              ; q2
+        movdqa      xmm3,                   xmm4              ; q1
+
+        psubusb     xmm1,                   xmm2              ; q2-=q3
+        psubusb     xmm2,                   xmm6              ; q3-=q2
+
+        psubusb     xmm4,                   xmm6              ; q1-=q2
+        psubusb     xmm6,                   xmm3              ; q2-=q1
+
+        por         xmm4,                   xmm6              ; abs(q2-q1)
+        por         xmm1,                   xmm2              ; abs(q3-q2)
+
+        movdqa      xmm0,                   xmm5              ; q0
+        pmaxub      xmm1,                   xmm4
+
+        psubusb     xmm5,                   xmm3              ; q0-=q1
+        psubusb     xmm3,                   xmm0              ; q1-=q0
+
+        por         xmm5,                   xmm3              ; abs(q0-q1)
+        movdqa      t0,                     xmm5              ; save to t0
+
+        pmaxub      xmm1,                   xmm5
+
+%if %1
+        movdqa      xmm2,                   [rsi+4*rax]       ; p3
+        movdqa      xmm4,                   [rdi+4*rax]       ; p2
+        movdqa      xmm6,                   [rsi+2*rax]       ; p1
+%else
+        movlps      xmm2,                   [rsi + rax]       ; p3
+        movlps      xmm4,                   [rsi]             ; p2
+        movlps      xmm6,                   [rsi + rcx]       ; p1
+
+        movhps      xmm2,                   [rdi + rax]
+        movhps      xmm4,                   [rdi]
+        movhps      xmm6,                   [rdi + rcx]
+
+        movdqa      XMMWORD PTR [rsp + 32], xmm4              ; store p2
+        movdqa      XMMWORD PTR [rsp + 48], xmm6              ; store p1
+%endif
+
+        movdqa      xmm5,                   xmm4              ; p2
+        movdqa      xmm3,                   xmm6              ; p1
+
+        psubusb     xmm4,                   xmm2              ; p2-=p3
+        psubusb     xmm2,                   xmm5              ; p3-=p2
+
+        psubusb     xmm3,                   xmm5              ; p1-=p2
+        pmaxub      xmm1,                   xmm4              ; abs(p3 - p2)
+
+        psubusb     xmm5,                   xmm6              ; p2-=p1
+        pmaxub      xmm1,                   xmm2              ; abs(p3 - p2)
+
+        pmaxub      xmm1,                   xmm5              ; abs(p2 - p1)
+        movdqa      xmm2,                   xmm6              ; p1
+
+        pmaxub      xmm1,                   xmm3              ; abs(p2 - p1)
+%if %1
+        movdqa      xmm4,                   [rsi+rax]         ; p0
+        movdqa      xmm3,                   [rdi]             ; q1
+%else
+        movlps      xmm4,                   [rsi + rcx*2]     ; p0
+        movhps      xmm4,                   [rdi + rcx*2]
+        movdqa      xmm3,                   q1                ; q1
+%endif
+
+        movdqa      xmm5,                   xmm4              ; p0
+        psubusb     xmm4,                   xmm6              ; p0-=p1
+
+        psubusb     xmm6,                   xmm5              ; p1-=p0
+
+        por         xmm6,                   xmm4              ; abs(p1 - p0)
+        mov         rdx,                    arg(2)            ; get blimit
+
+        movdqa        t1,                   xmm6              ; save to t1
+
+        movdqa      xmm4,                   xmm3              ; q1
+        pmaxub      xmm1,                   xmm6
+
+        psubusb     xmm3,                   xmm2              ; q1-=p1
+        psubusb     xmm2,                   xmm4              ; p1-=q1
+
+        psubusb     xmm1,                   xmm7
+        por         xmm2,                   xmm3              ; abs(p1-q1)
+
+        movdqa      xmm7,                   XMMWORD PTR [rdx] ; blimit
+
+        movdqa      xmm3,                   xmm0              ; q0
+        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero
+
+        mov         rdx,                    arg(4)            ; hev get thresh
+
+        movdqa      xmm6,                   xmm5              ; p0
+        psrlw       xmm2,                   1                 ; abs(p1-q1)/2
+
+        psubusb     xmm5,                   xmm3              ; p0-=q0
+
+        psubusb     xmm3,                   xmm6              ; q0-=p0
+        por         xmm5,                   xmm3              ; abs(p0 - q0)
+
+        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2
+
+        movdqa      xmm4,                   t0                ; hev get abs (q1 - q0)
+
+        movdqa      xmm3,                   t1                ; get abs (p1 - p0)
+
+        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        movdqa      xmm2,                   XMMWORD PTR [rdx] ; hev
+
+        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        psubusb     xmm4,                   xmm2              ; hev
+
+        psubusb     xmm3,                   xmm2              ; hev
+        por         xmm1,                   xmm5
+
+        pxor        xmm7,                   xmm7
+        paddb       xmm4,                   xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+        pcmpeqb     xmm4,                   xmm5              ; hev
+        pcmpeqb     xmm3,                   xmm3              ; hev
+
+        pcmpeqb     xmm1,                   xmm7              ; mask xmm1
+        pxor        xmm4,                   xmm3              ; hev
+%endmacro
+
+%macro B_FILTER 1
+%if %1 == 0
+        movdqa      xmm2,                   p1                ; p1
+        movdqa      xmm7,                   q1                ; q1
+%elif %1 == 1
+        movdqa      xmm2,                   [rsi+2*rax]       ; p1
+        movdqa      xmm7,                   [rdi]             ; q1
+%elif %1 == 2
+        lea         rdx,                    srct
+
+        movdqa      xmm2,                   [rdx]             ; p1
+        movdqa      xmm7,                   [rdx+48]          ; q1
+        movdqa      xmm6,                   [rdx+16]          ; p0
+        movdqa      xmm0,                   [rdx+32]          ; q0
+%endif
+
+        pxor        xmm2,                   [GLOBAL(t80)]     ; p1 offset to convert to signed values
+        pxor        xmm7,                   [GLOBAL(t80)]     ; q1 offset to convert to signed values
+
+        psubsb      xmm2,                   xmm7              ; p1 - q1
+        pxor        xmm6,                   [GLOBAL(t80)]     ; offset to convert to signed values
+
+        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
+        pxor        xmm0,                   [GLOBAL(t80)]     ; offset to convert to signed values
+
+        movdqa      xmm3,                   xmm0              ; q0
+        psubsb      xmm0,                   xmm6              ; q0 - p0
+
+        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
+
+        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
+
+        pand        xmm1,                   xmm2              ; mask filter values we don't care about
+
+        movdqa      xmm2,                   xmm1
+
+        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+        punpckhbw   xmm5,                   xmm2              ; axbxcxdx
+        punpcklbw   xmm2,                   xmm2              ; exfxgxhx
+
+        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
+        psraw       xmm5,                   11                ; sign extended shift right by 3
+
+        punpckhbw   xmm1,                   xmm1              ; axbxcxdx
+        psraw       xmm2,                   11                ; sign extended shift right by 3
+
+        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+        psraw       xmm0,                   11                ; sign extended shift right by 3
+
+        psraw       xmm1,                   11                ; sign extended shift right by 3
+        movdqa      xmm5,                   xmm0              ; save results
+
+        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+        paddsw      xmm5,                   [GLOBAL(ones)]
+
+        paddsw      xmm1,                   [GLOBAL(ones)]
+        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap
+
+        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap
+
+        paddsb      xmm6,                   xmm2              ; p0+= p0 add
+        packsswb    xmm5,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+
+%if %1 == 0
+        movdqa      xmm1,                   p1                ; p1
+%elif %1 == 1
+        movdqa      xmm1,                   [rsi+2*rax]       ; p1
+%elif %1 == 2
+        movdqa      xmm1,                   [rdx]             ; p1
+%endif
+        pandn       xmm4,                   xmm5              ; high edge variance additive
+        pxor        xmm6,                   [GLOBAL(t80)]     ; unoffset
+
+        pxor        xmm1,                   [GLOBAL(t80)]     ; reoffset
+        psubsb      xmm3,                   xmm0              ; q0-= q0 add
+
+        paddsb      xmm1,                   xmm4              ; p1+= p1 add
+        pxor        xmm3,                   [GLOBAL(t80)]     ; unoffset
+
+        pxor        xmm1,                   [GLOBAL(t80)]     ; unoffset
+        psubsb      xmm7,                   xmm4              ; q1-= q1 add
+
+        pxor        xmm7,                   [GLOBAL(t80)]     ; unoffset
+%if %1 == 0
+        lea         rsi,                    [rsi + rcx*2]
+        lea         rdi,                    [rdi + rcx*2]
+        movq        MMWORD PTR [rsi],       xmm6              ; p0
+        movhps      MMWORD PTR [rdi],       xmm6
+        movq        MMWORD PTR [rsi + rax], xmm1              ; p1
+        movhps      MMWORD PTR [rdi + rax], xmm1
+        movq        MMWORD PTR [rsi + rcx], xmm3              ; q0
+        movhps      MMWORD PTR [rdi + rcx], xmm3
+        movq        MMWORD PTR [rsi + rcx*2],xmm7             ; q1
+        movhps      MMWORD PTR [rdi + rcx*2],xmm7
+%elif %1 == 1
+        movdqa      [rsi+rax],              xmm6              ; write back
+        movdqa      [rsi+2*rax],            xmm1              ; write back
+        movdqa      [rsi],                  xmm3              ; write back
+        movdqa      [rdi],                  xmm7              ; write back
+%endif
+
+%endmacro
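+
+; Editorial note: a minimal scalar sketch of the per-pixel filter B_FILTER
+; implements, assuming the usual 4-tap loop filter; all arithmetic saturates
+; to signed bytes and runs in the t80-offset (signed) domain:
+;
+;   f  = hev ? clamp(p1 - q1) : 0;           // hvm(p1 - q1)
+;   f  = clamp(f + 3 * (q0 - p0)) & mask;
+;   Filter1 = clamp(f + 4) >> 3;  q0 -= Filter1;
+;   Filter2 = clamp(f + 3) >> 3;  p0 += Filter2;
+;   u  = (Filter1 + 1) >> 1;                 // outer taps, non-hev pixels only
+;   p1 += u & ~hev;  q1 -= u & ~hev;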
+
+
+;void vp9_loop_filter_horizontal_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;    int            count
+;)
+global sym(vp9_loop_filter_horizontal_edge_sse2)
+sym(vp9_loop_filter_horizontal_edge_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 32     ; reserve 32 bytes
+    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
+    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];
+
+        mov         rsi,                    arg(0)           ;src_ptr
+        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step
+
+        mov         rdx,                    arg(3)           ;limit
+        movdqa      xmm7,                   XMMWORD PTR [rdx]
+
+        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing
+
+        ; calculate breakout conditions and high edge variance
+        LFH_FILTER_AND_HEV_MASK 1
+        ; filter and write back the result
+        B_FILTER 1
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_loop_filter_horizontal_edge_uv_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;    int            count
+;)
+global sym(vp9_loop_filter_horizontal_edge_uv_sse2)
+sym(vp9_loop_filter_horizontal_edge_uv_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 96       ; reserve 96 bytes
+    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
+    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
+    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
+    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
+    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
+    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];
+
+        mov         rsi,                    arg(0)             ; u
+        mov         rdi,                    arg(5)             ; v
+        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
+        mov         rcx,                    rax
+        neg         rax                     ; negate pitch to deal with above border
+
+        mov         rdx,                    arg(3)             ;limit
+        movdqa      xmm7,                   XMMWORD PTR [rdx]
+
+        lea         rsi,                    [rsi + rcx]
+        lea         rdi,                    [rdi + rcx]
+
+        ; calculate breakout conditions and high edge variance
+        LFH_FILTER_AND_HEV_MASK 0
+        ; filter and write back the result
+        B_FILTER 0
+
+    add rsp, 96
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+%macro TRANSPOSE_16X8 2
+        movq        xmm4,               QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
+        movq        xmm1,               QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
+        movq        xmm0,               QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
+        movq        xmm7,               QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+        movq        xmm5,               QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
+        movq        xmm2,               QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
+
+        punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+
+        movq        xmm1,               QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
+
+        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+        punpcklbw   xmm0,               xmm7            ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
+
+        movq        xmm7,               QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+
+        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+%if %1
+        lea         rsi,                [rsi+rax*8]
+%else
+        mov         rsi,                arg(5)          ; v_ptr
+%endif
+
+        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
+
+        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
+%if %1
+        lea         rdi,                [rdi+rax*8]
+%else
+        lea         rsi,                [rsi - 4]
+%endif
+
+        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+%if %1
+        lea         rdx,                srct
+%else
+        lea         rdi,                [rsi + rax]     ; rdi points to row +1 for indirect addressing
+%endif
+
+        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+
+        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+
+        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+
+        movdqa      t0,                 xmm2            ; save to free XMM2
+        movq        xmm2,               QWORD PTR [rsi]       ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
+        movq        xmm6,               QWORD PTR [rdi]       ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
+        movq        xmm0,               QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
+        movq        xmm5,               QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
+        movq        xmm1,               QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
+
+        punpcklbw   xmm2,               xmm6            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+        movq        xmm6,               QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+
+        punpcklbw   xmm0,               xmm5                  ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
+
+        movq        xmm5,               QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+
+        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
+
+        movq        xmm6,               QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
+
+        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
+
+        movdqa      xmm6,               xmm1            ;
+        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
+
+        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+
+        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+        movdqa      xmm0,               xmm5
+        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+
+        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
+
+        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
+        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+
+        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
+%if %2
+        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+        movdqa      [rdx],              xmm2            ; save 2
+
+        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+
+        movdqa      [rdx+16],           xmm3            ; save 3
+
+        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+        movdqa      [rdx+32],           xmm4            ; save 4
+        movdqa      [rdx+48],           xmm5            ; save 5
+        movdqa      xmm1,               t0              ; reload rows 0-1 saved in t0
+
+        movdqa      xmm2,               xmm1            ;
+        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+
+        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+%else
+        movdqa      [rdx+112],          xmm7            ; save 7
+
+        movdqa      [rdx+96],           xmm6            ; save 6
+
+        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+        movdqa      [rdx+32],           xmm2            ; save 2
+
+        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+
+        movdqa      [rdx+48],           xmm3            ; save 3
+
+        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+        movdqa      [rdx+64],           xmm4            ; save 4
+        movdqa      [rdx+80],           xmm5            ; save 5
+        movdqa      xmm1,               t0              ; reload rows 0-1 saved in t0
+
+        movdqa      xmm2,               xmm1
+        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+
+        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+        movdqa      [rdx+16],           xmm1
+
+        movdqa      [rdx],              xmm2
+%endif
+%endmacro
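+
+; Editorial note: conceptually TRANSPOSE_16X8 performs, for a 16-row by
+; 8-column block of bytes (illustrative C):
+;
+;   for (row = 0; row < 16; row++)
+;     for (col = 0; col < 8; col++)
+;       out[col][row] = in[row][col];
+;
+; via the punpck interleave cascade above. %1 picks the addressing variant
+; (advance within the y plane vs. switching rsi to the v plane); %2 == 1
+; stores only the four middle rows (p1, p0, q0, q1) to the srct scratch
+; area, while %2 == 0 stores all eight transposed rows.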
+
+%macro LFV_FILTER_MASK_HEV_MASK 1
+        movdqa      xmm0,               xmm6            ; q2
+        psubusb     xmm0,               xmm7            ; q2-q3
+
+        psubusb     xmm7,               xmm6            ; q3-q2
+        movdqa      xmm4,               xmm5            ; q1
+
+        por         xmm7,               xmm0            ; abs (q3-q2)
+        psubusb     xmm4,               xmm6            ; q1-q2
+
+        movdqa      xmm0,               xmm1
+        psubusb     xmm6,               xmm5            ; q2-q1
+
+        por         xmm6,               xmm4            ; abs (q2-q1)
+        psubusb     xmm0,               xmm2            ; p2 - p3;
+
+        psubusb     xmm2,               xmm1            ; p3 - p2;
+        por         xmm0,               xmm2            ; abs(p2-p3)
+%if %1
+        movdqa      xmm2,               [rdx]           ; p1
+%else
+        movdqa      xmm2,               [rdx+32]        ; p1
+%endif
+        movdqa      xmm5,               xmm2            ; p1
+        pmaxub      xmm0,               xmm7
+
+        psubusb     xmm5,               xmm1            ; p1-p2
+        psubusb     xmm1,               xmm2            ; p2-p1
+
+        movdqa      xmm7,               xmm3            ; p0
+        psubusb     xmm7,               xmm2            ; p0-p1
+
+        por         xmm1,               xmm5            ; abs(p2-p1)
+        pmaxub      xmm0,               xmm6
+
+        pmaxub      xmm0,               xmm1
+        movdqa      xmm1,               xmm2            ; p1
+
+        psubusb     xmm2,               xmm3            ; p1-p0
+        lea         rdx,                srct
+
+        por         xmm2,               xmm7            ; abs(p1-p0)
+
+        movdqa      t0,                 xmm2            ; save abs(p1-p0)
+
+        pmaxub      xmm0,               xmm2
+
+%if %1
+        movdqa      xmm5,               [rdx+32]        ; q0
+        movdqa      xmm7,               [rdx+48]        ; q1
+%else
+        movdqa      xmm5,               [rdx+64]        ; q0
+        movdqa      xmm7,               [rdx+80]        ; q1
+%endif
+        mov         rdx,                arg(3)          ; limit
+
+        movdqa      xmm6,               xmm5            ; q0
+        movdqa      xmm2,               xmm7            ; q1
+
+        psubusb     xmm5,               xmm7            ; q0-q1
+        psubusb     xmm7,               xmm6            ; q1-q0
+
+        por         xmm7,               xmm5            ; abs(q1-q0)
+
+        movdqa      t1,                 xmm7            ; save abs(q1-q0)
+
+        movdqa      xmm4,               XMMWORD PTR [rdx]; limit
+
+        pmaxub      xmm0,               xmm7
+        mov         rdx,                arg(2)          ; blimit
+
+        psubusb     xmm0,               xmm4
+        movdqa      xmm5,               xmm2            ; q1
+
+        psubusb     xmm5,               xmm1            ; q1-=p1
+        psubusb     xmm1,               xmm2            ; p1-=q1
+
+        por         xmm5,               xmm1            ; abs(p1-q1)
+        movdqa      xmm1,               xmm3            ; p0
+
+        pand        xmm5,               [GLOBAL(tfe)]   ; set lsb of each byte to zero
+        psubusb     xmm1,               xmm6            ; p0-q0
+
+        psrlw       xmm5,               1               ; abs(p1-q1)/2
+        psubusb     xmm6,               xmm3            ; q0-p0
+
+        movdqa      xmm4,               XMMWORD PTR [rdx]; blimit
+
+        mov         rdx,                arg(4)          ; get thresh
+
+        por         xmm1,               xmm6            ; abs(q0-p0)
+
+        movdqa      xmm6,               t0              ; get abs (p1 - p0)
+
+        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2
+
+        movdqa      xmm3,               t1              ; get abs (q1 - q0)
+
+        movdqa      xmm7,               XMMWORD PTR [rdx] ; thresh
+
+        paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
+        psubusb     xmm6,               xmm7            ; abs(p1 - p0) > thresh
+
+        psubusb     xmm3,               xmm7            ; abs(q1 - q0) > thresh
+
+        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         xmm6,               xmm3            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+        por         xmm1,               xmm0            ; mask
+        pcmpeqb     xmm6,               xmm0
+
+        pxor        xmm0,               xmm0
+        pcmpeqb     xmm4,               xmm4
+
+        pcmpeqb     xmm1,               xmm0
+        pxor        xmm4,               xmm6
+%endmacro
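+
+; Editorial note: in scalar terms the macro above computes, per pixel
+; (illustrative):
+;
+;   mask = max(|p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2|) <= limit
+;       && |p0-q0| * 2 + |p1-q1| / 2 <= blimit;          // all-ones if true
+;   hev  = |p1-p0| > thresh || |q1-q0| > thresh;         // high edge variance
+;
+; leaving mask in xmm1 and hev in xmm4 for B_FILTER.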
+
+%macro BV_TRANSPOSE 0
+        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+        movdqa      xmm2,               xmm1            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        punpcklbw   xmm2,               xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+
+        movdqa      xmm4,               xmm3            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+        punpckhbw   xmm1,               xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+        punpcklbw   xmm4,               xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+        punpckhbw   xmm3,               xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+
+        movdqa      xmm6,               xmm2            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+        punpcklwd   xmm2,               xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+
+        punpckhwd   xmm6,               xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+        movdqa      xmm5,               xmm1            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+        punpcklwd   xmm1,               xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+
+        punpckhwd   xmm5,               xmm3            ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+%endmacro
+
+%macro BV_WRITEBACK 2
+        movd        [rsi+2],            %1
+        psrldq      %1,                 4
+
+        movd        [rdi+2],            %1
+        psrldq      %1,                 4
+
+        movd        [rsi+2*rax+2],      %1
+        psrldq      %1,                 4
+
+        movd        [rdi+2*rax+2],      %1
+
+        movd        [rsi+4*rax+2],      %2
+        psrldq      %2,                 4
+
+        movd        [rdi+4*rax+2],      %2
+        psrldq      %2,                 4
+
+        movd        [rsi+2*rcx+2],      %2
+        psrldq      %2,                 4
+
+        movd        [rdi+2*rcx+2],      %2
+%endmacro
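+
+; Editorial note: BV_WRITEBACK scatters the re-transposed filter output back
+; into the image: each register holds four bytes (p1 p0 q0 q1) per row for
+; four rows, stored as one dword per row at x offset +2 (rsi points at x-4,
+; so the filtered columns start at x-2).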
+
+
+;void vp9_loop_filter_vertical_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;    int            count
+;)
+global sym(vp9_loop_filter_vertical_edge_sse2)
+sym(vp9_loop_filter_vertical_edge_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub             rsp, 96      ; reserve 96 bytes
+    %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
+    %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
+    %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];
+
+        mov         rsi,        arg(0)                  ; src_ptr
+        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
+
+        lea         rsi,        [rsi - 4]
+        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
+        lea         rcx,        [rax*2+rax]
+
+        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+        TRANSPOSE_16X8 1, 1
+
+        ; calculate filter mask and high edge variance
+        LFV_FILTER_MASK_HEV_MASK 1
+
+        ; start work on filters
+        B_FILTER 2
+
+        ; transpose and write back - only works on q1, q0, p0, p1
+        BV_TRANSPOSE
+        ; store 16-line result
+
+        lea         rdx,        [rax]
+        neg         rdx
+
+        BV_WRITEBACK xmm1, xmm5
+
+        lea         rsi,        [rsi+rdx*8]
+        lea         rdi,        [rdi+rdx*8]
+        BV_WRITEBACK xmm2, xmm6
+
+    add rsp, 96
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_loop_filter_vertical_edge_uv_sse2
+;(
+;    unsigned char *u,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;    unsigned char *v
+;)
+global sym(vp9_loop_filter_vertical_edge_uv_sse2)
+sym(vp9_loop_filter_vertical_edge_uv_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub             rsp, 96      ; reserve 96 bytes
+    %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
+    %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
+    %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];
+
+        mov         rsi,        arg(0)                  ; u_ptr
+        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
+
+        lea         rsi,        [rsi - 4]
+        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
+        lea         rcx,        [rax+2*rax]
+
+        lea         rdx,        srct
+
+        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+        TRANSPOSE_16X8 0, 1
+
+        ; calculate filter mask and high edge variance
+        LFV_FILTER_MASK_HEV_MASK 1
+
+        ; start work on filters
+        B_FILTER 2
+
+        ; transpose and write back - only works on q1, q0, p0, p1
+        BV_TRANSPOSE
+
+        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
+
+        ; store 16-line result
+        BV_WRITEBACK xmm1, xmm5
+
+        mov         rsi,        arg(0)                  ; u_ptr
+        lea         rsi,        [rsi - 4]
+        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
+        BV_WRITEBACK xmm2, xmm6
+
+    add rsp, 96
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_loop_filter_simple_horizontal_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit
+;)
+global sym(vp9_loop_filter_simple_horizontal_edge_sse2)
+sym(vp9_loop_filter_simple_horizontal_edge_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi, arg(0)             ;src_ptr
+        movsxd      rax, dword ptr arg(1)   ;src_pixel_step
+        mov         rdx, arg(2)             ;blimit
+        movdqa      xmm3, XMMWORD PTR [rdx]
+
+        mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
+        add         rdi, rax
+        neg         rax
+
+        ; calculate mask
+        movdqa      xmm1, [rsi+2*rax]       ; p1
+        movdqa      xmm0, [rdi]             ; q1
+        movdqa      xmm2, xmm1
+        movdqa      xmm7, xmm0
+        movdqa      xmm4, xmm0
+        psubusb     xmm0, xmm1              ; q1-=p1
+        psubusb     xmm1, xmm4              ; p1-=q1
+        por         xmm1, xmm0              ; abs(p1-q1)
+        pand        xmm1, [GLOBAL(tfe)]     ; set lsb of each byte to zero
+        psrlw       xmm1, 1                 ; abs(p1-q1)/2
+
+        movdqa      xmm5, [rsi+rax]         ; p0
+        movdqa      xmm4, [rsi]             ; q0
+        movdqa      xmm0, xmm4              ; q0
+        movdqa      xmm6, xmm5              ; p0
+        psubusb     xmm5, xmm4              ; p0-=q0
+        psubusb     xmm4, xmm6              ; q0-=p0
+        por         xmm5, xmm4              ; abs(p0 - q0)
+        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
+        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        pxor        xmm3, xmm3
+        pcmpeqb     xmm5, xmm3
+
+        ; start work on filters
+        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
+        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
+        psubsb      xmm2, xmm7              ; p1 - q1
+
+        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
+        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values
+        movdqa      xmm3, xmm0              ; q0
+        psubsb      xmm0, xmm6              ; q0 - p0
+        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
+        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
+        pand        xmm5, xmm2              ; mask filter values we don't care about
+
+        ; do + 4 side
+        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4
+
+        movdqa      xmm0, xmm5              ; get a copy of filters
+        psllw       xmm0, 8                 ; shift left 8
+        psraw       xmm0, 3                 ; arithmetic shift right 3
+        psrlw       xmm0, 8
+        movdqa      xmm1, xmm5              ; get a copy of filters
+        psraw       xmm1, 11                ; arithmetic shift right 11
+        psllw       xmm1, 8                 ; shift left 8 to put it back
+
+        por         xmm0, xmm1              ; put the two together to get result
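+        ; Editorial note: SSE2 lacks a per-byte arithmetic shift, so the five
+        ; word ops above emulate one on the two signed bytes of each 16-bit
+        ; lane, roughly (illustrative C):
+        ;   lo_out = (int8_t)lo >> 3;   // psllw 8, psraw 3, psrlw 8
+        ;   hi_out = (int8_t)hi >> 3;   // psraw 11, psllw 8
+        ; so xmm0 now holds Filter1 = (f + 4) >> 3 for all 16 pixels.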
+
+        psubsb      xmm3, xmm0              ; q0-= q0 add
+        pxor        xmm3, [GLOBAL(t80)]     ; unoffset
+        movdqa      [rsi], xmm3             ; write back
+
+        ; now do +3 side
+        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4
+
+        movdqa      xmm0, xmm5              ; get a copy of filters
+        psllw       xmm0, 8                 ; shift left 8
+        psraw       xmm0, 3                 ; arithmetic shift right 3
+        psrlw       xmm0, 8
+        psraw       xmm5, 11                ; arithmetic shift right 11
+        psllw       xmm5, 8                 ; shift left 8 to put it back
+        por         xmm0, xmm5              ; put the two together to get result
+
+
+        paddsb      xmm6, xmm0              ; p0+= p0 add
+        pxor        xmm6, [GLOBAL(t80)]     ; unoffset
+        movdqa      [rsi+rax], xmm6         ; write back
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_loop_filter_simple_vertical_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit
+;)
+global sym(vp9_loop_filter_simple_vertical_edge_sse2)
+sym(vp9_loop_filter_simple_vertical_edge_sse2):
+    push        rbp         ; save old base pointer value.
+    mov         rbp, rsp    ; set new base pointer value.
+    SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM 7
+    GET_GOT     rbx         ; save callee-saved reg
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 32                         ; reserve 32 bytes
+    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
+    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step
+
+        lea         rsi,        [rsi - 2 ]
+        lea         rdi,        [rsi + rax]
+        lea         rdx,        [rsi + rax*4]
+        lea         rcx,        [rdx + rax]
+
+        movd        xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
+        movd        xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
+        movd        xmm2,       [rdi]                   ; 13 12 11 10
+        movd        xmm3,       [rcx]                   ; 53 52 51 50
+        punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
+        punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10
+
+        movd        xmm4,       [rsi + rax*2]           ; 23 22 21 20
+        movd        xmm5,       [rdx + rax*2]           ; 63 62 61 60
+        movd        xmm6,       [rdi + rax*2]           ; 33 32 31 30
+        movd        xmm7,       [rcx + rax*2]           ; 73 72 71 70
+        punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
+        punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30
+
+        punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+        punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+
+        movdqa      xmm1,       xmm0
+        punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+        punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+        movdqa      xmm2,       xmm0
+        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+        movdqa      t0,         xmm0                    ; save to t0
+        movdqa      t1,         xmm2                    ; save to t1
+
+        lea         rsi,        [rsi + rax*8]
+        lea         rdi,        [rsi + rax]
+        lea         rdx,        [rsi + rax*4]
+        lea         rcx,        [rdx + rax]
+
+        movd        xmm4,       [rsi]                   ; 83 82 81 80
+        movd        xmm1,       [rdx]                   ; c3 c2 c1 c0
+        movd        xmm6,       [rdi]                   ; 93 92 91 90
+        movd        xmm3,       [rcx]                   ; d3 d2 d1 d0
+        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
+        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90
+
+        movd        xmm0,       [rsi + rax*2]           ; a3 a2 a1 a0
+        movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
+        movd        xmm2,       [rdi + rax*2]           ; b3 b2 b1 b0
+        movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
+        punpckldq   xmm0,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
+        punpckldq   xmm2,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
+
+        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
+        punpcklbw   xmm0,       xmm2                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
+
+        movdqa      xmm1,       xmm4
+        punpcklwd   xmm4,       xmm0                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+        punpckhwd   xmm1,       xmm0                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+
+        movdqa      xmm6,       xmm4
+        punpckldq   xmm4,       xmm1                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+        punpckhdq   xmm6,       xmm1                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+
+        movdqa      xmm0,       t0                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+        movdqa      xmm2,       t1                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+        movdqa      xmm1,       xmm0
+        movdqa      xmm3,       xmm2
+
+        punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+        punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+        ; calculate mask
+        movdqa      xmm6,       xmm0                            ; p1
+        movdqa      xmm7,       xmm3                            ; q1
+        psubusb     xmm7,       xmm0                            ; q1-=p1
+        psubusb     xmm6,       xmm3                            ; p1-=q1
+        por         xmm6,       xmm7                            ; abs(p1-q1)
+        pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
+        psrlw       xmm6,       1                               ; abs(p1-q1)/2
+
+        movdqa      xmm5,       xmm1                            ; p0
+        movdqa      xmm4,       xmm2                            ; q0
+        psubusb     xmm5,       xmm2                            ; p0-=q0
+        psubusb     xmm4,       xmm1                            ; q0-=p0
+        por         xmm5,       xmm4                            ; abs(p0 - q0)
+        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
+        paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        mov         rdx,        arg(2)                          ;blimit
+        movdqa      xmm7, XMMWORD PTR [rdx]
+
+        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        pxor        xmm7,        xmm7
+        pcmpeqb     xmm5,        xmm7                           ; xmm5 = mask
+
+        ; start work on filters
+        movdqa        t0,        xmm0
+        movdqa        t1,        xmm3
+
+        pxor        xmm0,        [GLOBAL(t80)]                  ; p1 offset to convert to signed values
+        pxor        xmm3,        [GLOBAL(t80)]                  ; q1 offset to convert to signed values
+
+        psubsb      xmm0,        xmm3                           ; p1 - q1
+        movdqa      xmm6,        xmm1                           ; p0
+
+        movdqa      xmm7,        xmm2                           ; q0
+        pxor        xmm6,        [GLOBAL(t80)]                  ; offset to convert to signed values
+
+        pxor        xmm7,        [GLOBAL(t80)]                  ; offset to convert to signed values
+        movdqa      xmm3,        xmm7                           ; copy of offset q0
+
+        psubsb      xmm7,        xmm6                           ; q0 - p0
+        paddsb      xmm0,        xmm7                           ; p1 - q1 + 1 * (q0 - p0)
+
+        paddsb      xmm0,        xmm7                           ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      xmm0,        xmm7                           ; p1 - q1 + 3 * (q0 - p0)
+
+        pand        xmm5,        xmm0                           ; mask filter values we don't care about
+
+
+        paddsb      xmm5,        [GLOBAL(t4)]                   ;  3* (q0 - p0) + (p1 - q1) + 4
+
+        movdqa      xmm0,        xmm5                           ; get a copy of filters
+        psllw       xmm0,        8                              ; shift left 8
+
+        psraw       xmm0,        3                              ; arithmetic shift right 3
+        psrlw       xmm0,        8
+
+        movdqa      xmm7,        xmm5                           ; get a copy of filters
+        psraw       xmm7,        11                             ; arithmetic shift right 11
+
+        psllw       xmm7,        8                              ; shift left 8 to put it back
+        por         xmm0,        xmm7                           ; put the two together to get result
+
+        psubsb      xmm3,        xmm0                           ; q0-= q0sz add
+        pxor        xmm3,        [GLOBAL(t80)]                  ; unoffset   q0
+
+        ; now do +3 side
+        psubsb      xmm5,        [GLOBAL(t1s)]                  ; +3 instead of +4
+        movdqa      xmm0,        xmm5                           ; get a copy of filters
+
+        psllw       xmm0,        8                              ; shift left 8
+        psraw       xmm0,        3                              ; arithmetic shift right 3
+
+        psrlw       xmm0,        8
+        psraw       xmm5,        11                             ; arithmetic shift right 11
+
+        psllw       xmm5,        8                              ; shift left 8 to put it back
+        por         xmm0,        xmm5                           ; put the two together to get result
+
+        paddsb      xmm6,        xmm0                           ; p0+= p0 add
+        pxor        xmm6,        [GLOBAL(t80)]                  ; unoffset   p0
+
+        movdqa      xmm0,        t0                             ; p1
+        movdqa      xmm4,        t1                             ; q1
+
+        ; transpose back to write out
+        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+        movdqa      xmm1,       xmm0
+        punpcklbw   xmm0,       xmm6                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+        punpckhbw   xmm1,       xmm6                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+        movdqa      xmm5,       xmm3
+        punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+        punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+        movdqa      xmm2,       xmm0
+        punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+        punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+
+        movdqa      xmm3,       xmm1
+        punpcklwd   xmm1,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+        punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+        ; write out order: xmm0 xmm2 xmm1 xmm3
+        lea         rdx,        [rsi + rax*4]
+
+        movd        [rsi],      xmm1                               ; write the second 8-line result
+        psrldq      xmm1,       4
+        movd        [rdi],      xmm1
+        psrldq      xmm1,       4
+        movd        [rsi + rax*2], xmm1
+        psrldq      xmm1,       4
+        movd        [rdi + rax*2], xmm1
+
+        movd        [rdx],      xmm3
+        psrldq      xmm3,       4
+        movd        [rcx],      xmm3
+        psrldq      xmm3,       4
+        movd        [rdx + rax*2], xmm3
+        psrldq      xmm3,       4
+        movd        [rcx + rax*2], xmm3
+
+        neg         rax
+        lea         rsi,        [rsi + rax*8]
+        neg         rax
+        lea         rdi,        [rsi + rax]
+        lea         rdx,        [rsi + rax*4]
+        lea         rcx,        [rdx + rax]
+
+        movd        [rsi],      xmm0                                ; write the first 8-line result
+        psrldq      xmm0,       4
+        movd        [rdi],      xmm0
+        psrldq      xmm0,       4
+        movd        [rsi + rax*2], xmm0
+        psrldq      xmm0,       4
+        movd        [rdi + rax*2], xmm0
+
+        movd        [rdx],      xmm2
+        psrldq      xmm2,       4
+        movd        [rcx],      xmm2
+        psrldq      xmm2,       4
+        movd        [rdx + rax*2], xmm2
+        psrldq      xmm2,       4
+        movd        [rcx + rax*2], xmm2
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+tfe:
+    times 16 db 0xfe
+align 16
+t80:
+    times 16 db 0x80
+align 16
+t1s:
+    times 16 db 0x01
+align 16
+t3:
+    times 16 db 0x03
+align 16
+t4:
+    times 16 db 0x04
+align 16
+ones:
+    times 8 dw 0x0001
+align 16
+s9:
+    times 8 dw 0x0900
+align 16
+s63:
+    times 8 dw 0x003f
--- /dev/null
+++ b/vp9/common/x86/loopfilter_x86.c
@@ -1,0 +1,543 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+#include "vpx_config.h"
+#include "vp9/common/loopfilter.h"
+
+prototype_loopfilter(vp9_loop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp9_loop_filter_horizontal_edge_mmx);
+
+prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2);
+prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2);
+
+extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2;
+
+#if HAVE_MMX
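+/* Editorial note: the mbh/mbv/bh wrappers below are empty in this revision;
+ * the macroblock-edge MMX paths appear to have been retired in favor of the
+ * SSE2/C implementations, so these remain as no-op stubs. */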
+/* Horizontal MB filtering */
+void vp9_loop_filter_mbh_mmx(unsigned char *y_ptr,
+                             unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+}
+
+/* Vertical MB Filtering */
+void vp9_loop_filter_mbv_mmx(unsigned char *y_ptr,
+                             unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+}
+
+/* Horizontal B Filtering */
+void vp9_loop_filter_bh_mmx(unsigned char *y_ptr,
+                            unsigned char *u_ptr, unsigned char *v_ptr,
+                            int y_stride, int uv_stride,
+                            struct loop_filter_info *lfi) {
+
+}
+
+void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
+                             const unsigned char *blimit) {
+  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride,
+                                             y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride,
+                                             y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride,
+                                             y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
+                            unsigned char *u_ptr, unsigned char *v_ptr,
+                            int y_stride, int uv_stride,
+                            struct loop_filter_info *lfi) {
+  vp9_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride,
+                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride,
+                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
+                             const unsigned char *blimit) {
+  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
+  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
+  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
+}
+#endif
+
+#if HAVE_SSE2
+void vp9_mbloop_filter_horizontal_edge_c_sse2(unsigned char *s,
+                                              int p,
+                                              const unsigned char *_blimit,
+                                              const unsigned char *_limit,
+                                              const unsigned char *_thresh,
+                                              int count) {
+  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
+  __m128i mask, hev, flat;
+  __m128i thresh, limit, blimit;
+  const __m128i zero = _mm_set1_epi16(0);
+  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+
+  thresh = _mm_shuffle_epi32(_mm_cvtsi32_si128(_thresh[0] * 0x01010101), 0);
+  limit = _mm_shuffle_epi32(_mm_cvtsi32_si128(_limit[0] * 0x01010101), 0);
+  blimit = _mm_shuffle_epi32(_mm_cvtsi32_si128(_blimit[0] * 0x01010101), 0);
+
+  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
+  {
+    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+                                          _mm_subs_epu8(p0, p1));
+    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+                                          _mm_subs_epu8(q0, q1));
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+                                    _mm_subs_epu8(q0, p0));
+    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+                                    _mm_subs_epu8(q1, p1));
+    __m128i work;
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(flat, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
+                                     _mm_subs_epu8(p1, p2)),
+                         _mm_or_si128(_mm_subs_epu8(p3, p2),
+                                      _mm_subs_epu8(p2, p3)));
+    mask = _mm_max_epu8(work, mask);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
+                                     _mm_subs_epu8(q1, q2)),
+                         _mm_or_si128(_mm_subs_epu8(q3, q2),
+                                      _mm_subs_epu8(q2, q3)));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);
+
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
+                                     _mm_subs_epu8(p0, p2)),
+                         _mm_or_si128(_mm_subs_epu8(q2, q0),
+                                      _mm_subs_epu8(q0, q2)));
+    flat = _mm_max_epu8(work, flat);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
+                                     _mm_subs_epu8(p0, p3)),
+                         _mm_or_si128(_mm_subs_epu8(q3, q0),
+                                      _mm_subs_epu8(q0, q3)));
+    flat = _mm_max_epu8(work, flat);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
+                                     _mm_subs_epu8(p0, p4)),
+                         _mm_or_si128(_mm_subs_epu8(q4, q0),
+                                      _mm_subs_epu8(q0, q4)));
+    flat = _mm_max_epu8(work, flat);
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+  }
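+  // Editorial note: mask is now all-ones where the edge should be filtered
+  // (every neighbouring-tap difference <= limit and
+  // abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit); flat additionally requires all
+  // of p2..p4 and q2..q4 to lie within 1 of p0/q0, selecting the strong
+  // 7-tap smoothing below instead of the 4-tap filter.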
+  {
+    const __m128i four = _mm_set1_epi16(4);
+    unsigned char *src = s;
+    int i = 0;
+    do {
+      __m128i workp_a, workp_b, workp_shft;
+      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
+
+      workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));
+      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op2[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op1[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op0[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      src += 8;
+    } while (++i < count);
+  }
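+  // Editorial note: each flat_* row above is a rounded 8-tap average over a
+  // sliding 7-pixel window, e.g.
+  //   flat_op2 = (2*p4 + p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3;
+  // workp_a/workp_b hold running partial sums, so each later output just
+  // subtracts the tap leaving the window and adds the one entering it.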
+  // lp filter
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i te0 = _mm_set1_epi8(0xe0);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+
+    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
+                                      t80);
+    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
+                                      t80);
+    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
+                                      t80);
+    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
+                                      t80);
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+    work_a = _mm_subs_epi8(qs0, ps0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    /* Filter1 >> 3 */
+    work_a = _mm_cmpgt_epi8(zero, filter1);
+    filter1 = _mm_srli_epi16(filter1, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);
+    filter1 = _mm_or_si128(filter1, work_a);
+
+    /* Filter2 >> 3 */
+    work_a = _mm_cmpgt_epi8(zero, filter2);
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
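+    /* Editorial note: as in the assembly version, SSE2 lacks an 8-bit
+     * arithmetic shift, so Filter1/Filter2 are shifted as 16-bit lanes and
+     * the per-byte sign bits are patched back: t1f clears bits leaked in
+     * from the neighbouring byte and te0 re-inserts the sign extension. */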
+
+    /* filt >> 1 */
+    filt = _mm_adds_epi8(filter1, t1);
+    work_a = _mm_cmpgt_epi8(zero, filt);
+    filt = _mm_srli_epi16(filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    filt = _mm_and_si128(filt, t7f);
+    filt = _mm_or_si128(filt, work_a);
+
+    filt = _mm_andnot_si128(hev, filt);
+
+    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+    q0 = _mm_load_si128((__m128i *)flat_oq0);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q0 = _mm_and_si128(flat, q0);
+    q0 = _mm_or_si128(work_a, q0);
+
+    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+    q1 = _mm_load_si128((__m128i *)flat_oq1);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q1 = _mm_and_si128(flat, q1);
+    q1 = _mm_or_si128(work_a, q1);
+
+    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+    q2 = _mm_load_si128((__m128i *)flat_oq2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q2 = _mm_and_si128(flat, q2);
+    q2 = _mm_or_si128(work_a, q2);
+
+    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+    p0 = _mm_load_si128((__m128i *)flat_op0);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p0 = _mm_and_si128(flat, p0);
+    p0 = _mm_or_si128(work_a, p0);
+
+    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+    p1 = _mm_load_si128((__m128i *)flat_op1);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p1 = _mm_and_si128(flat, p1);
+    p1 = _mm_or_si128(work_a, p1);
+
+    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+    p2 = _mm_load_si128((__m128i *)flat_op2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p2 = _mm_and_si128(flat, p2);
+    p2 = _mm_or_si128(work_a, p2);
+
+    if (count == 1) {
+      _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+      _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
+      _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
+      _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
+      _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
+      _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+    } else {
+      _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+      _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+      _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+      _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+      _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+      _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+    }
+  }
+}
+
+static __inline void transpose(unsigned char *src[], int in_p,
+                               unsigned char *dst[], int out_p,
+                               int num_8x8_to_transpose) {
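+  // each 8x8 block is read as eight 8-byte rows and written back with rows
+  // and columns exchanged, via an unpack ladder that widens from bytes to
+  // words to dwords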
+  int idx8x8 = 0;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  do {
+    unsigned char *in = src[idx8x8];
+    unsigned char *out = dst[idx8x8];
+
+    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
+    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
+    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
+    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
+    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
+    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
+    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
+    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
+    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+    x0 = _mm_unpacklo_epi8(x0, x1);
+    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+    x1 = _mm_unpacklo_epi8(x2, x3);
+    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+    x2 = _mm_unpacklo_epi8(x4, x5);
+    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+    x3 = _mm_unpacklo_epi8(x6, x7);
+    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    x4 = _mm_unpacklo_epi16(x0, x1);
+    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+    x5 = _mm_unpacklo_epi16(x2, x3);
+    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+    x6 = _mm_unpacklo_epi32(x4, x5);
+    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+    x7 = _mm_unpackhi_epi32(x4, x5);
+
+    _mm_storel_pd((double *)(out + 0*out_p),
+                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
+    _mm_storeh_pd((double *)(out + 1*out_p),
+                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
+    _mm_storel_pd((double *)(out + 2*out_p),
+                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
+    _mm_storeh_pd((double *)(out + 3*out_p),
+                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
+
+    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+    x4 = _mm_unpackhi_epi16(x0, x1);
+    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+    x5 = _mm_unpackhi_epi16(x2, x3);
+    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+    x6 = _mm_unpacklo_epi32(x4, x5);
+    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+    x7 = _mm_unpackhi_epi32(x4, x5);
+
+    _mm_storel_pd((double *)(out + 4*out_p),
+                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
+    _mm_storeh_pd((double *)(out + 5*out_p),
+                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
+    _mm_storel_pd((double *)(out + 6*out_p),
+                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
+    _mm_storeh_pd((double *)(out + 7*out_p),
+                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
+  } while (++idx8x8 < num_8x8_to_transpose);
+}
+
+void vp9_mbloop_filter_vertical_edge_c_sse2(unsigned char *s,
+                                            int p,
+                                            const unsigned char *blimit,
+                                            const unsigned char *limit,
+                                            const unsigned char *thresh,
+                                            int count) {
+  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 16]);
+  unsigned char *src[4];
+  unsigned char *dst[4];
+
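+  // the vertical edge is filtered by transposing the pixels around it into
+  // t_dst, reusing the horizontal filter there, and transposing back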
+  src[0] = s - 5;
+  src[1] = s - 5 + 8;
+  src[2] = s - 5 + p*8;
+  src[3] = s - 5 + p*8 + 8;
+
+  dst[0] = t_dst;
+  dst[1] = t_dst + 16*8;
+  dst[2] = t_dst + 8;
+  dst[3] = t_dst + 16*8 + 8;
+
+  // 16x16->16x16 or 16x8->8x16
+  transpose(src, p, dst, 16, (1 << count));
+
+  vp9_mbloop_filter_horizontal_edge_c_sse2(t_dst + 5*16, 16, blimit, limit,
+                                           thresh, count);
+
+  dst[0] = s - 5;
+  dst[1] = s - 5 + p*8;
+
+  src[0] = t_dst;
+  src[1] = t_dst + 8;
+
+  // 16x8->8x16 or 8x8->8x8
+  transpose(src, 16, dst, p, (1 << (count - 1)));
+}
+
+/* Horizontal MB filtering */
+void vp9_loop_filter_mbh_sse2(unsigned char *y_ptr,
+                              unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride,
+                              struct loop_filter_info *lfi) {
+  vp9_mbloop_filter_horizontal_edge_c_sse2(y_ptr, y_stride, lfi->mblim,
+                                           lfi->lim, lfi->hev_thr, 2);
+
+  /* TODO: write sse2 version with u,v interleaved */
+  if (u_ptr)
+    vp9_mbloop_filter_horizontal_edge_c_sse2(u_ptr, uv_stride, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_mbloop_filter_horizontal_edge_c_sse2(v_ptr, uv_stride, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+                             unsigned char *v_ptr, int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+  vp9_mbloop_filter_horizontal_edge_c_sse2(
+    y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+}
+
+/* Vertical MB Filtering */
+void vp9_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+                              unsigned char *v_ptr, int y_stride, int uv_stride,
+                              struct loop_filter_info *lfi) {
+  vp9_mbloop_filter_vertical_edge_c_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim,
+                                         lfi->hev_thr, 2);
+
+  /* TODO: write sse2 version with u,v interleaved */
+  if (u_ptr)
+    vp9_mbloop_filter_vertical_edge_c_sse2(u_ptr, uv_stride, lfi->mblim,
+                                           lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_mbloop_filter_vertical_edge_c_sse2(v_ptr, uv_stride, lfi->mblim,
+                                           lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+                             unsigned char *v_ptr, int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+  vp9_mbloop_filter_vertical_edge_c_sse2(
+    y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+}
+
+/* Horizontal B Filtering */
+void vp9_loop_filter_bh_sse2(unsigned char *y_ptr,
+                             unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride,
+                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride,
+                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride,
+                                       lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
+                                            lfi->blim, lfi->lim, lfi->hev_thr,
+                                            v_ptr + 4 * uv_stride);
+}
+
+void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
+                              const unsigned char *blimit) {
+  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride,
+                                              y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride,
+                                              y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride,
+                                              y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
+                             unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+  vp9_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride,
+                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride,
+                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride,
+                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
+                                          lfi->blim, lfi->lim, lfi->hev_thr,
+                                          v_ptr + 4);
+}
+
+void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
+                              const unsigned char *blimit) {
+  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
+  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
+  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
+}
+
+#endif
--- /dev/null
+++ b/vp9/common/x86/loopfilter_x86.h
@@ -1,0 +1,43 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LOOPFILTER_X86_H
+#define LOOPFILTER_X86_H
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code.
+ */
+
+#if HAVE_MMX
+extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx);
+extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx);
+extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx);
+extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_mmx);
+extern prototype_simple_loopfilter(vp9_loop_filter_bvs_mmx);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_mmx);
+extern prototype_simple_loopfilter(vp9_loop_filter_bhs_mmx);
+#endif
+
+#if HAVE_SSE2
+extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2);
+extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2);
+extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2);
+extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_sse2);
+extern prototype_simple_loopfilter(vp9_loop_filter_bvs_sse2);
+extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_sse2);
+extern prototype_simple_loopfilter(vp9_loop_filter_bhs_sse2);
+#endif
+
+#endif  // LOOPFILTER_X86_H
--- /dev/null
+++ b/vp9/common/x86/mask_sse3.asm
@@ -1,0 +1,484 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_makemask_sse3(
+;    unsigned char *y,
+;    unsigned char *u,
+;    unsigned char *v,
+;    unsigned char *ym,
+;    unsigned char *uvm,
+;    int yp,
+;    int uvp,
+;    int ys,
+;    int us,
+;    int vs,
+;    int yt,
+;    int ut,
+;    int vt)
+global sym(vp8_makemask_sse3)
+sym(vp8_makemask_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 14
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;y
+        mov             rdi,        arg(1) ;u
+        mov             rcx,        arg(2) ;v
+        mov             rax,        arg(3) ;ym
+        movsxd          rbx,        dword arg(4) ;yp
+        movsxd          rdx,        dword arg(5) ;uvp
+
+        pxor            xmm0,xmm0
+
+        ;make 16 copies of the center y value
+        movd            xmm1, arg(6)
+        pshufb          xmm1, xmm0
+
+        ; make 16 copies of the center u value
+        movd            xmm2, arg(7)
+        pshufb          xmm2, xmm0
+
+        ; make 16 copies of the center v value
+        movd            xmm3, arg(8)
+        pshufb          xmm3, xmm0
+        unpcklpd        xmm2, xmm3
+
+        ;make 16 copies of the y tolerance
+        movd            xmm3, arg(9)
+        pshufb          xmm3, xmm0
+
+        ;make 16 copies of the u tolerance
+        movd            xmm4, arg(10)
+        pshufb          xmm4, xmm0
+
+        ;make 16 copies of the v tolerance
+        movd            xmm5, arg(11)
+        pshufb          xmm5, xmm0
+        unpckhpd        xmm4, xmm5
+
+        mov             r8,8
+
+NextPairOfRows:
+
+        ;grab the y source values
+        movdqu          xmm0, [rsi]
+
+        ;compute abs difference between source and y target
+        movdqa          xmm6, xmm1
+        movdqa          xmm7, xmm0
+        psubusb         xmm0, xmm1
+        psubusb         xmm6, xmm7
+        por             xmm0, xmm6
+
+        ;check whether the difference is < the y tolerance
+        movdqa          xmm6, xmm3
+        pcmpgtb         xmm6, xmm0
+
+        ;grab the y source values
+        add             rsi, rbx
+        movdqu          xmm0, [rsi]
+
+        ;compute abs difference between source and y target
+        movdqa          xmm11, xmm1
+        movdqa          xmm7, xmm0
+        psubusb         xmm0, xmm1
+        psubusb         xmm11, xmm7
+        por             xmm0, xmm11
+
+        ;check whether the difference is < the y tolerance
+        movdqa          xmm11, xmm3
+        pcmpgtb         xmm11, xmm0
+
+
+        ;grab the u and v source values
+        movdqu          xmm7, [rdi]
+        movdqu          xmm8, [rcx]
+        unpcklpd        xmm7, xmm8
+
+        ;compute abs difference between source and uv targets
+        movdqa          xmm9, xmm2
+        movdqa          xmm10, xmm7
+        psubusb         xmm7, xmm2
+        psubusb         xmm9, xmm10
+        por             xmm7, xmm9
+
+        ;check whether the number is < tolerance
+        movdqa          xmm0, xmm4
+        pcmpgtb         xmm0, xmm7
+
+        ;double  u and v masks
+        movdqa          xmm8, xmm0
+        punpckhbw       xmm0, xmm0
+        punpcklbw       xmm8, xmm8
+
+        ;mask row 0 and output
+        pand            xmm6, xmm8
+        pand            xmm6, xmm0
+        movdqa          [rax],xmm6
+
+        ;mask row 1 and output
+        pand            xmm11, xmm8
+        pand            xmm11, xmm0
+        movdqa          [rax+16],xmm11
+
+
+        ; to the next row or set of rows
+        add             rsi, rbx
+        add             rdi, rdx
+        add             rcx, rdx
+        add             rax,32
+        dec r8
+        jnz NextPairOfRows
+
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;GROW_HORIZ (register for result, source register or mem local)
+; takes source and shifts left and ors with source
+; then shifts right and ors with source
+%macro GROW_HORIZ 2
+    movdqa          %1, %2
+    movdqa          xmm14, %1
+    movdqa          xmm15, %1
+    pslldq          xmm14, 1
+    psrldq          xmm15, 1
+    por             %1,xmm14
+    por             %1,xmm15
+%endmacro
+;GROW_VERT (result, center row, above row, below row)
+%macro GROW_VERT 4
+    movdqa          %1,%2
+    por             %1,%3
+    por             %1,%4
+%endmacro
+
+;GROW_NEXTLINE (new line to grow, new source, line to write)
+%macro GROW_NEXTLINE 3
+    GROW_HORIZ %1, %2
+    GROW_VERT xmm3, xmm0, xmm1, xmm2
+    movdqa %3,xmm3
+%endmacro
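+; together these macros dilate the mask by one pixel in every direction
+; (a 3x3 grow), one 16-byte row at a time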
+
+
+;void vp8_growmaskmb_sse3(
+;    unsigned char *om,
+;    unsigned char *nm)
+global sym(vp8_growmaskmb_sse3)
+sym(vp8_growmaskmb_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;src
+    mov             rdi,        arg(1) ;dst
+
+    GROW_HORIZ xmm0, [rsi]
+    GROW_HORIZ xmm1, [rsi+16]
+    GROW_HORIZ xmm2, [rsi+32]
+
+    GROW_VERT xmm3, xmm0, xmm1, xmm2
+    por xmm0,xmm1
+    movdqa [rdi], xmm0
+    movdqa [rdi+16],xmm3
+
+    GROW_NEXTLINE xmm0,[rsi+48],[rdi+32]
+    GROW_NEXTLINE xmm1,[rsi+64],[rdi+48]
+    GROW_NEXTLINE xmm2,[rsi+80],[rdi+64]
+    GROW_NEXTLINE xmm0,[rsi+96],[rdi+80]
+    GROW_NEXTLINE xmm1,[rsi+112],[rdi+96]
+    GROW_NEXTLINE xmm2,[rsi+128],[rdi+112]
+    GROW_NEXTLINE xmm0,[rsi+144],[rdi+128]
+    GROW_NEXTLINE xmm1,[rsi+160],[rdi+144]
+    GROW_NEXTLINE xmm2,[rsi+176],[rdi+160]
+    GROW_NEXTLINE xmm0,[rsi+192],[rdi+176]
+    GROW_NEXTLINE xmm1,[rsi+208],[rdi+192]
+    GROW_NEXTLINE xmm2,[rsi+224],[rdi+208]
+    GROW_NEXTLINE xmm0,[rsi+240],[rdi+224]
+
+    por xmm0,xmm2
+    movdqa [rdi+240], xmm0
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+;unsigned int vp8_sad16x16_masked_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned char *mask)
+global sym(vp8_sad16x16_masked_wmt)
+sym(vp8_sad16x16_masked_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+    mov             rsi,        arg(0) ;src_ptr
+    mov             rdi,        arg(2) ;ref_ptr
+
+    mov             rbx,        arg(4) ;mask
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+    mov             rcx,        16
+
+    pxor            xmm3,       xmm3
+
+NextSadRow:
+    movdqu          xmm0,       [rsi]
+    movdqu          xmm1,       [rdi]
+    movdqu          xmm2,       [rbx]
+    pand            xmm0,       xmm2
+    pand            xmm1,       xmm2
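+    ; assuming 0x00/0xff mask bytes, this zeroes pixels outside the mask in
+    ; both blocks so psadbw only accumulates differences inside the mask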
+
+    psadbw          xmm0,       xmm1
+    paddw           xmm3,       xmm0
+
+    add             rsi, rax
+    add             rdi, rdx
+    add             rbx,  16
+
+    dec rcx
+    jnz NextSadRow
+
+    movdqa          xmm4,       xmm3
+    psrldq          xmm4,       8
+    paddw           xmm3,       xmm4
+    movq            rax,        xmm3
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp8_sad16x16_unmasked_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned char *mask)
+global sym(vp8_sad16x16_unmasked_wmt)
+sym(vp8_sad16x16_unmasked_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+    mov             rsi,        arg(0) ;src_ptr
+    mov             rdi,        arg(2) ;ref_ptr
+
+    mov             rbx,        arg(4) ;mask
+    movsxd          rax,        dword ptr arg(1) ;src_stride
+    movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+    mov             rcx,        16
+
+    pxor            xmm3,       xmm3
+
+next_vp8_sad16x16_unmasked_wmt:
+    movdqu          xmm0,       [rsi]
+    movdqu          xmm1,       [rdi]
+    movdqu          xmm2,       [rbx]
+    por             xmm0,       xmm2
+    por             xmm1,       xmm2
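+    ; assuming 0x00/0xff mask bytes, this forces masked pixels to 0xff in
+    ; both blocks so they cancel in psadbw and only the unmasked region
+    ; contributes to the SAD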
+
+    psadbw          xmm0,       xmm1
+    paddw           xmm3,       xmm0
+
+    add             rsi, rax
+    add             rdi, rdx
+    add             rbx,  16
+
+    dec rcx
+    jnz next_vp8_sad16x16_unmasked_wmt
+
+    movdqa          xmm4,       xmm3
+    psrldq          xmm4,       8
+    paddw           xmm3,       xmm4
+    movq            rax,        xmm3
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp8_masked_predictor_wmt(
+;    unsigned char *masked,
+;    unsigned char *unmasked,
+;    int  src_stride,
+;    unsigned char *dst_ptr,
+;    int  dst_stride,
+;    unsigned char *mask)
+global sym(vp8_masked_predictor_wmt)
+sym(vp8_masked_predictor_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+    mov             rsi,        arg(0) ;src_ptr
+    mov             rdi,        arg(1) ;ref_ptr
+
+    mov             rbx,        arg(5) ;mask
+    movsxd          rax,        dword ptr arg(2) ;src_stride
+    mov             r11,        arg(3) ; destination
+    movsxd          rdx,        dword ptr arg(4) ;dst_stride
+
+    mov             rcx,        16
+
+    pxor            xmm3,       xmm3
+
+next_vp8_masked_predictor_wmt:
+    movdqu          xmm0,       [rsi]
+    movdqu          xmm1,       [rdi]
+    movdqu          xmm2,       [rbx]
+
+    pand            xmm0,       xmm2
+    pandn           xmm2,       xmm1
+    por             xmm0,       xmm2
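+    ; pand/pandn/por form a bitwise select: the masked predictor where the
+    ; mask is set, the unmasked predictor elsewhere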
+    movdqu          [r11],      xmm0
+
+    add             r11, rdx
+    add             rsi, rax
+    add             rdi, rdx
+    add             rbx,  16
+
+    dec rcx
+    jnz next_vp8_masked_predictor_wmt
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;unsigned int vp8_masked_predictor_uv_wmt(
+;    unsigned char *masked,
+;    unsigned char *unmasked,
+;    int  src_stride,
+;    unsigned char *dst_ptr,
+;    int  dst_stride,
+;    unsigned char *mask)
+global sym(vp8_masked_predictor_uv_wmt)
+sym(vp8_masked_predictor_uv_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+    mov             rsi,        arg(0) ;src_ptr
+    mov             rdi,        arg(1) ;ref_ptr
+
+    mov             rbx,        arg(5) ;mask
+    movsxd          rax,        dword ptr arg(2) ;src_stride
+    mov             r11,        arg(3) ; destination
+    movsxd          rdx,        dword ptr arg(4) ;dst_stride
+
+    mov             rcx,        8
+
+    pxor            xmm3,       xmm3
+
+next_vp8_masked_predictor_uv_wmt:
+    movq            xmm0,       [rsi]
+    movq            xmm1,       [rdi]
+    movq            xmm2,       [rbx]
+
+    pand            xmm0,       xmm2
+    pandn           xmm2,       xmm1
+    por             xmm0,       xmm2
+    movq            [r11],      xmm0
+
+    add             r11, rdx
+    add             rsi, rax
+    add             rdi, rax
+    add             rbx,  8
+
+    dec rcx
+    jnz next_vp8_masked_predictor_uv_wmt
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp8_uv_from_y_mask(
+;    unsigned char *ymask,
+;    unsigned char *uvmask)
+global sym(vp8_uv_from_y_mask)
+sym(vp8_uv_from_y_mask):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push        rsi
+    push        rdi
+    ; end prolog
+    mov             rsi,        arg(0) ;src_ptr
+    mov             rdi,        arg(1) ;dst_ptr
+
+
+    mov             rcx,        8
+
+    pxor            xmm3,       xmm3
+
+next_vp8_uv_from_y_mask:
+    movdqu          xmm0,       [rsi]
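+    ; shuf1b keeps every second byte, and rsi advances two 16-byte rows per
+    ; iteration, so the y mask is 2x2-subsampled down to a uv mask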
+    pshufb          xmm0, [shuf1b] ;[GLOBAL(shuf1b)]
+    movq            [rdi],xmm0
+    add             rdi, 8
+    add             rsi,32
+
+    dec rcx
+    jnz next_vp8_uv_from_y_mask
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+shuf1b:
+    db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
+
--- /dev/null
+++ b/vp9/common/x86/postproc_mmx.asm
@@ -1,0 +1,534 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define VP9_FILTER_WEIGHT 128
+%define VP9_FILTER_SHIFT  7
+
+;void vp9_post_proc_down_and_across_mmx
+;(
+;    unsigned char *src_ptr,
+;    unsigned char *dst_ptr,
+;    int src_pixels_per_line,
+;    int dst_pixels_per_line,
+;    int rows,
+;    int cols,
+;    int flimit
+;)
+global sym(vp9_post_proc_down_and_across_mmx)
+sym(vp9_post_proc_down_and_across_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+    ; move the global rd onto the stack, since we don't have enough registers
+    ; to do PIC addressing
+    movq        mm0, [GLOBAL(rd)]
+    sub         rsp, 8
+    movq        [rsp], mm0
+%define RD [rsp]
+%else
+%define RD [GLOBAL(rd)]
+%endif
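+
+; each output pixel is a 5-tap vertical blur with weights 16,16,64,16,16
+; from the Blur table (a (1,1,4,1,1)/8 kernel after the rd rounding add and
+; the VP9_FILTER_SHIFT); wherever a tap differs from the center row by more
+; than flimit, the thresholding below keeps the source pixel instead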
+
+        push        rbx
+        lea         rbx, [GLOBAL(Blur)]
+        movd        mm2, dword ptr arg(6) ;flimit
+        punpcklwd   mm2, mm2
+        punpckldq   mm2, mm2
+
+        mov         rsi,        arg(0) ;src_ptr
+        mov         rdi,        arg(1) ;dst_ptr
+
+        movsxd      rcx, DWORD PTR arg(4) ;rows
+        movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line ; source pitch
+        pxor        mm0, mm0              ; mm0 = 00000000
+
+.nextrow:
+
+        xor         rdx,        rdx       ; clear out rdx for use as loop counter
+.nextcol:
+
+        pxor        mm7, mm7              ; mm7 = 00000000
+        movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps
+        movq        mm3, [rsi]            ; mm3 = r0 p0..p7
+        punpcklbw   mm3, mm0              ; mm3 = p0..p3
+        movq        mm1, mm3              ; mm1 = p0..p3
+        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
+
+        movq        mm6, [rbx + 48]       ; mm6 = kernel 3 taps
+        movq        mm5, [rsi + rax]      ; mm5 = r1 p0..p7
+        punpcklbw   mm5, mm0              ; mm5 = r1 p0..p3
+        pmullw      mm6, mm5              ; mm6 *= p0..p3 * kernel 3 modifiers
+        paddusw     mm3, mm6              ; mm3 += mm6
+
+        ; thresholding
+        movq        mm7, mm1              ; mm7 = r0 p0..p3
+        psubusw     mm7, mm5              ; mm7 = r0 p0..p3 - r1 p0..p3
+        psubusw     mm5, mm1              ; mm5 = r1 p0..p3 - r0 p0..p3
+        paddusw     mm7, mm5              ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
+        pcmpgtw     mm7, mm2
+
+        movq        mm6, [rbx + 64 ]      ; mm6 = kernel 4 modifiers
+        movq        mm5, [rsi + 2*rax]    ; mm5 = r2 p0..p7
+        punpcklbw   mm5, mm0              ; mm5 = r2 p0..p3
+        pmullw      mm6, mm5              ; mm6 = r2 p0..p3 * kernel 4 modifiers
+        paddusw     mm3, mm6              ; mm3 += mm6
+
+        ; thresholding
+        movq        mm6, mm1              ; mm6 = r0 p0..p3
+        psubusw     mm6, mm5              ; mm6 = r0 p0..p3 - r2 p0..p3
+        psubusw     mm5, mm1              ; mm5 = r2 p0..p3 - r0 p0..p3
+        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
+        pcmpgtw     mm6, mm2
+        por         mm7, mm6              ; accumulate thresholds
+
+
+        neg         rax
+        movq        mm6, [rbx ]           ; kernel 0 taps
+        movq        mm5, [rsi+2*rax]      ; mm5 = r-2 p0..p7
+        punpcklbw   mm5, mm0              ; mm5 = r-2 p0..p3
+        pmullw      mm6, mm5              ; mm6 = r-2 p0..p3 * kernel 0 modifiers
+        paddusw     mm3, mm6              ; mm3 += mm6
+
+        ; thresholding
+        movq        mm6, mm1              ; mm6 = r0 p0..p3
+        psubusw     mm6, mm5              ; mm6 = p0..p3 - r-2 p0..p3
+        psubusw     mm5, mm1              ; mm5 = r-2 p0..p3 - p0..p3
+        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
+        pcmpgtw     mm6, mm2
+        por         mm7, mm6              ; accumulate thresholds
+
+        movq        mm6, [rbx + 16]       ; kernel 1 taps
+        movq        mm4, [rsi+rax]        ; mm4 = r-1 p0..p7
+        punpcklbw   mm4, mm0              ; mm4 = r-1 p0..p3
+        pmullw      mm6, mm4              ; mm6 = r-1 p0..p3 * kernel 1 modifiers
+        paddusw     mm3, mm6              ; mm3 += mm6
+
+        ; thresholding
+        movq        mm6, mm1              ; mm6 = r0 p0..p3
+        psubusw     mm6, mm4              ; mm6 = r0 p0..p3 - r-1 p0..p3
+        psubusw     mm4, mm1              ; mm4 = r-1 p0..p3 - r0 p0..p3
+        paddusw     mm6, mm4              ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
+        pcmpgtw     mm6, mm2
+        por         mm7, mm6              ; accumulate thresholds
+
+
+        paddusw     mm3, RD               ; mm3 += round value
+        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
+
+        pand        mm1, mm7              ; mm1 select vals > thresh from source
+        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
+        paddusw     mm1, mm7              ; combination
+
+        packuswb    mm1, mm0              ; pack to bytes
+
+        movd        [rdi], mm1            ;
+        neg         rax                   ; pitch is positive
+
+
+        add         rsi, 4
+        add         rdi, 4
+        add         rdx, 4
+
+        cmp         edx, dword ptr arg(5) ;cols
+        jl          .nextcol
+        ; done with all the cols, start the across filtering in place
+        sub         rsi, rdx
+        sub         rdi, rdx
+
+
+        push        rax
+        xor         rdx,    rdx
+        mov         rax,    [rdi-4];
+
+.acrossnextcol:
+        pxor        mm7, mm7              ; mm7 = 00000000
+        movq        mm6, [rbx + 32 ]      ;
+        movq        mm4, [rdi+rdx]        ; mm4 = p0..p7
+        movq        mm3, mm4              ; mm3 = p0..p7
+        punpcklbw   mm3, mm0              ; mm3 = p0..p3
+        movq        mm1, mm3              ; mm1 = p0..p3
+        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
+
+        movq        mm6, [rbx + 48]
+        psrlq       mm4, 8                ; mm4 = p1..p7
+        movq        mm5, mm4              ; mm5 = p1..p7
+        punpcklbw   mm5, mm0              ; mm5 = p1..p4
+        pmullw      mm6, mm5              ; mm6 *= p1..p4 * kernel 3 modifiers
+        paddusw     mm3, mm6              ; mm3 += mm6
+
+        ; thresholding
+        movq        mm7, mm1              ; mm7 = p0..p3
+        psubusw     mm7, mm5              ; mm7 = p0..p3 - p1..p4
+        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
+        paddusw     mm7, mm5              ; mm7 = abs(p0..p3 - p1..p4)
+        pcmpgtw     mm7, mm2
+
+        movq        mm6, [rbx + 64 ]
+        psrlq       mm4, 8                ; mm4 = p2..p7
+        movq        mm5, mm4              ; mm5 = p2..p7
+        punpcklbw   mm5, mm0              ; mm5 = p2..p5
+        pmullw      mm6, mm5              ; mm6 = p2..p5 * kernel 4 modifiers
+        paddusw     mm3, mm6              ; mm3 += mm6
+
+        ; thresholding
+        movq        mm6, mm1              ; mm6 = p0..p3
+        psubusw     mm6, mm5              ; mm6 = p0..p3 - p2..p5
+        psubusw     mm5, mm1              ; mm5 = p2..p5 - p0..p3
+        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p2..p5)
+        pcmpgtw     mm6, mm2
+        por         mm7, mm6              ; accumulate thresholds
+
+
+        movq        mm6, [rbx ]
+        movq        mm4, [rdi+rdx-2]      ; mm4 = p-2..p5
+        movq        mm5, mm4              ; mm5 = p-2..p5
+        punpcklbw   mm5, mm0              ; mm5 = p-2..p1
+        pmullw      mm6, mm5              ; mm6 = p-2..p1 * kernel 0 modifiers
+        paddusw     mm3, mm6              ; mm3 += mm6
+
+        ; thresholding
+        movq        mm6, mm1              ; mm6 = p0..p3
+        psubusw     mm6, mm5              ; mm6 = p0..p3 - p-2..p1
+        psubusw     mm5, mm1              ; mm5 = p-2..p1 - p0..p3
+        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p-2..p1)
+        pcmpgtw     mm6, mm2
+        por         mm7, mm6              ; accumulate thresholds
+
+        movq        mm6, [rbx + 16]
+        psrlq       mm4, 8                ; mm4 = p-1..p5
+        punpcklbw   mm4, mm0              ; mm4 = p-1..p2
+        pmullw      mm6, mm4              ; mm6 = p-1..p2 * kernel 1 modifiers
+        paddusw     mm3, mm6              ; mm3 += mm6
+
+        ; thresholding
+        movq        mm6, mm1              ; mm6 = p0..p3
+        psubusw     mm6, mm4              ; mm6 = p0..p3 - p-1..p2
+        psubusw     mm4, mm1              ; mm4 = p-1..p2 - p0..p3
+        paddusw     mm6, mm4              ; mm6 = abs(p0..p3 - p-1..p2)
+        pcmpgtw     mm6, mm2
+        por         mm7, mm6              ; accumulate thresholds
+
+        paddusw     mm3, RD               ; mm3 += round value
+        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
+
+        pand        mm1, mm7              ; mm1 select vals > thresh from source
+        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
+        paddusw     mm1, mm7              ; combination
+
+        packuswb    mm1, mm0              ; pack to bytes
+        mov         DWORD PTR [rdi+rdx-4],  eax   ; store previous four bytes
+        movd        eax,    mm1
+
+        add         rdx, 4
+        cmp         edx, dword ptr arg(5) ;cols
+        jl          .acrossnextcol;
+
+        mov         DWORD PTR [rdi+rdx-4],  eax
+        pop         rax
+
+        ; done with this row
+        add         rsi,rax               ; next line
+        movsxd      rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch
+        add         rdi,rax               ; next destination
+        movsxd      rax, dword ptr arg(2) ;src_pixels_per_line ; source pitch
+
+        dec         rcx                   ; decrement count
+        jnz         .nextrow               ; next row
+        pop         rbx
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef RD
+
+
+;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
+;                             int pitch, int rows, int cols,int flimit)
+extern sym(vp9_rv)
+global sym(vp9_mbpost_proc_down_mmx)
+sym(vp9_mbpost_proc_down_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 136
+
+    ; unsigned char d[16][8] at [rsp]
+    ; create flimit2 at [rsp+128]
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp+128], eax
+    mov         [rsp+128+4], eax
+%define flimit2 [rsp+128]
+
+%if ABI_IS_32BIT=0
+    lea         r8,       [GLOBAL(sym(vp9_rv))]
+%endif
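+
+    ; per column, a running vertical sum (mm5) and sum of squares (mm6/mm7)
+    ; are kept over a sliding window; where 15*sum_sq - sum^2 (a scaled
+    ; variance) falls below flimit, the pixel is replaced by the
+    ; vp9_rv-dithered window mean ((sum + x + rv) >> 4)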
+
+    ;rows +=8;
+    add         dword ptr arg(2), 8
+
+    ;for(c=0; c<cols; c+=4)
+.loop_col:
+            mov         rsi,        arg(0)  ;s
+            pxor        mm0,        mm0     ;
+
+            movsxd      rax,        dword ptr arg(1) ;pitch       ;
+            neg         rax                                     ; rax = -pitch
+
+            lea         rsi,        [rsi + rax*8];              ; rsi = s[-pitch*8]
+            neg         rax
+
+
+            pxor        mm5,        mm5
+            pxor        mm6,        mm6     ;
+
+            pxor        mm7,        mm7     ;
+            mov         rdi,        rsi
+
+            mov         rcx,        15          ;
+
+.loop_initvar:
+            movd        mm1,        DWORD PTR [rdi];
+            punpcklbw   mm1,        mm0     ;
+
+            paddw       mm5,        mm1     ;
+            pmullw      mm1,        mm1     ;
+
+            movq        mm2,        mm1     ;
+            punpcklwd   mm1,        mm0     ;
+
+            punpckhwd   mm2,        mm0     ;
+            paddd       mm6,        mm1     ;
+
+            paddd       mm7,        mm2     ;
+            lea         rdi,        [rdi+rax]   ;
+
+            dec         rcx
+            jne         .loop_initvar
+            ;save the var and sum
+            xor         rdx,        rdx
+.loop_row:
+            movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]
+            movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]
+
+            punpcklbw   mm1,        mm0
+            punpcklbw   mm2,        mm0
+
+            paddw       mm5,        mm2
+            psubw       mm5,        mm1
+
+            pmullw      mm2,        mm2
+            movq        mm4,        mm2
+
+            punpcklwd   mm2,        mm0
+            punpckhwd   mm4,        mm0
+
+            paddd       mm6,        mm2
+            paddd       mm7,        mm4
+
+            pmullw      mm1,        mm1
+            movq        mm2,        mm1
+
+            punpcklwd   mm1,        mm0
+            psubd       mm6,        mm1
+
+            punpckhwd   mm2,        mm0
+            psubd       mm7,        mm2
+
+
+            movq        mm3,        mm6
+            pslld       mm3,        4
+
+            psubd       mm3,        mm6
+            movq        mm1,        mm5
+
+            movq        mm4,        mm5
+            pmullw      mm1,        mm1
+
+            pmulhw      mm4,        mm4
+            movq        mm2,        mm1
+
+            punpcklwd   mm1,        mm4
+            punpckhwd   mm2,        mm4
+
+            movq        mm4,        mm7
+            pslld       mm4,        4
+
+            psubd       mm4,        mm7
+
+            psubd       mm3,        mm1
+            psubd       mm4,        mm2
+
+            psubd       mm3,        flimit2
+            psubd       mm4,        flimit2
+
+            psrad       mm3,        31
+            psrad       mm4,        31
+
+            packssdw    mm3,        mm4
+            packsswb    mm3,        mm0
+
+            movd        mm1,        DWORD PTR [rsi+rax*8]
+
+            movq        mm2,        mm1
+            punpcklbw   mm1,        mm0
+
+            paddw       mm1,        mm5
+            mov         rcx,        rdx
+
+            and         rcx,        127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+            push        rax
+            lea         rax,        [GLOBAL(sym(vp9_rv))]
+            movq        mm4,        [rax + rcx*2] ;vp9_rv[rcx*2]
+            pop         rax
+%elif ABI_IS_32BIT=0
+            movq        mm4,        [r8 + rcx*2] ;vp9_rv[rcx*2]
+%else
+            movq        mm4,        [sym(vp9_rv) + rcx*2]
+%endif
+            paddw       mm1,        mm4
+            ;paddw     xmm1,       eight8s
+            psraw       mm1,        4
+
+            packuswb    mm1,        mm0
+            pand        mm1,        mm3
+
+            pandn       mm3,        mm2
+            por         mm1,        mm3
+
+            and         rcx,        15
+            movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[rcx*4]
+
+            mov         rcx,        rdx
+            sub         rcx,        8
+
+            and         rcx,        15
+            movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4]
+
+            movd        [rsi],      mm1
+            lea         rsi,        [rsi+rax]
+
+            lea         rdi,        [rdi+rax]
+            add         rdx,        1
+
+            cmp         edx,        dword arg(2) ;rows
+            jl          .loop_row
+
+
+        add         dword arg(0), 4 ; s += 4
+        sub         dword arg(3), 4 ; cols -= 4
+        cmp         dword arg(3), 0
+        jg          .loop_col
+
+    add         rsp, 136
+    pop         rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit2
+
+
+;void vp9_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
+;                            unsigned char blackclamp[16],
+;                            unsigned char whiteclamp[16],
+;                            unsigned char bothclamp[16],
+;                            unsigned int Width, unsigned int Height, int Pitch)
+extern sym(rand)
+global sym(vp9_plane_add_noise_mmx)
+sym(vp9_plane_add_noise_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+.addnoise_loop:
+    call sym(rand) WRT_PLT
+    mov     rcx, arg(1) ;noise
+    and     rax, 0xff
+    add     rcx, rax
+
+    ; we rely on the fact that the clamping vectors are stored contiguously
+    ; in black/white/both order. Note that we have to reload this here because
+    ; rdx could be trashed by rand()
+    mov     rdx, arg(2) ; blackclamp
+
+
+            mov     rdi, rcx
+            movsxd  rcx, dword arg(5) ;[Width]
+            mov     rsi, arg(0) ;Pos
+            xor         rax,rax
+
+.addnoise_nextset:
+            movq        mm1,[rsi+rax]         ; get the source
+
+            psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
+            paddusb     mm1, [rdx+32] ;bothclamp
+            psubusb     mm1, [rdx+16] ;whiteclamp
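+            ; mm1 is now confined to [blackclamp, 255 - whiteclamp], leaving
+            ; headroom so the paddb of the noise below does not overflow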
+
+            movq        mm2,[rdi+rax]         ; get the noise for this line
+            paddb       mm1,mm2              ; add it in
+            movq        [rsi+rax],mm1         ; store the result
+
+            add         rax,8                 ; move to the next line
+
+            cmp         rax, rcx
+            jl          .addnoise_nextset
+
+    movsxd  rax, dword arg(7) ; Pitch
+    add     arg(0), rax ; Start += Pitch
+    sub     dword arg(6), 1   ; Height -= 1
+    jg      .addnoise_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+Blur:
+    times 16 dw 16
+    times  8 dw 64
+    times 16 dw 16
+    times  8 dw  0
+
+rd:
+    times 4 dw 0x40
--- /dev/null
+++ b/vp9/common/x86/postproc_sse2.asm
@@ -1,0 +1,695 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_post_proc_down_and_across_xmm
+;(
+;    unsigned char *src_ptr,
+;    unsigned char *dst_ptr,
+;    int src_pixels_per_line,
+;    int dst_pixels_per_line,
+;    int rows,
+;    int cols,
+;    int flimit
+;)
+global sym(vp9_post_proc_down_and_across_xmm)
+sym(vp9_post_proc_down_and_across_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+    ALIGN_STACK 16, rax
+    ; move the global rd onto the stack, since we don't have enough registers
+    ; to do PIC addressing
+    movdqa      xmm0, [GLOBAL(rd42)]
+    sub         rsp, 16
+    movdqa      [rsp], xmm0
+%define RD42 [rsp]
+%else
+%define RD42 [GLOBAL(rd42)]
+%endif
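+
+; same (1,1,4,1,1)/8 blur kernel as the mmx version, computed without
+; multiplies: psllw by 2 scales the center row by four, the four
+; neighbouring taps are added, RD42 rounds, and psraw by 3 divides by 8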
+
+
+        movd        xmm2,       dword ptr arg(6) ;flimit
+        punpcklwd   xmm2,       xmm2
+        punpckldq   xmm2,       xmm2
+        punpcklqdq  xmm2,       xmm2
+
+        mov         rsi,        arg(0) ;src_ptr
+        mov         rdi,        arg(1) ;dst_ptr
+
+        movsxd      rcx,        DWORD PTR arg(4) ;rows
+        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line ; source pitch
+        pxor        xmm0,       xmm0              ; mm0 = 00000000
+
+.nextrow:
+
+        xor         rdx,        rdx       ; clear out rdx for use as loop counter
+.nextcol:
+        movq        xmm3,       QWORD PTR [rsi]         ; mm3 = r0 p0..p7
+        punpcklbw   xmm3,       xmm0                    ; mm3 = p0..p3
+        movdqa      xmm1,       xmm3                    ; mm1 = p0..p3
+        psllw       xmm3,       2                       ;
+
+        movq        xmm5,       QWORD PTR [rsi + rax]   ; mm5 = r1 p0..p7
+        punpcklbw   xmm5,       xmm0                    ; mm5 = r1 p0..p3
+        paddusw     xmm3,       xmm5                    ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm7,       xmm1                    ; mm7 = r0 p0..p3
+        psubusw     xmm7,       xmm5                    ; mm7 = r0 p0..p3 - r1 p0..p3
+        psubusw     xmm5,       xmm1                    ; mm5 = r1 p0..p3 - r0 p0..p3
+        paddusw     xmm7,       xmm5                    ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
+        pcmpgtw     xmm7,       xmm2
+
+        movq        xmm5,       QWORD PTR [rsi + 2*rax] ; mm5 = r2 p0..p7
+        punpcklbw   xmm5,       xmm0                    ; mm5 = r2 p0..p3
+        paddusw     xmm3,       xmm5                    ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
+        psubusw     xmm6,       xmm5                    ; mm6 = r0 p0..p3 - r2 p0..p3
+        psubusw     xmm5,       xmm1                    ; mm5 = r2 p0..p3 - r0 p0..p3
+        paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6                    ; accumulate thresholds
+
+
+        neg         rax
+        movq        xmm5,       QWORD PTR [rsi+2*rax]   ; mm5 = r-2 p0..p7
+        punpcklbw   xmm5,       xmm0                    ; mm5 = r-2 p0..p3
+        paddusw     xmm3,       xmm5                    ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
+        psubusw     xmm6,       xmm5                    ; mm6 = p0..p3 - r-2 p0..p3
+        psubusw     xmm5,       xmm1                    ; mm5 = r-2 p0..p3 - p0..p3
+        paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6                    ; accumulate thresholds
+
+        movq        xmm4,       QWORD PTR [rsi+rax]     ; mm4 = r-1 p0..p7
+        punpcklbw   xmm4,       xmm0                    ; mm4 = r-1 p0..p3
+        paddusw     xmm3,       xmm4                    ; mm3 += mm4
+
+        ; thresholding
+        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
+        psubusw     xmm6,       xmm4                    ; mm6 = p0..p3 - r-1 p0..p3
+        psubusw     xmm4,       xmm1                    ; mm4 = r-1 p0..p3 - p0..p3
+        paddusw     xmm6,       xmm4                    ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6                    ; accumulate thresholds
+
+
+        paddusw     xmm3,       RD42                    ; mm3 += round value
+        psraw       xmm3,       3                       ; mm3 /= 8
+
+        pand        xmm1,       xmm7                    ; mm1 select vals > thresh from source
+        pandn       xmm7,       xmm3                    ; mm7 select vals < thresh from blurred result
+        paddusw     xmm1,       xmm7                    ; combination
+
+        packuswb    xmm1,       xmm0                    ; pack to bytes
+        movq        QWORD PTR [rdi], xmm1             ;
+
+        neg         rax                   ; pitch is positive
+        add         rsi,        8
+        add         rdi,        8
+
+        add         rdx,        8
+        cmp         edx,        dword arg(5) ;cols
+
+        jl          .nextcol
+
+        ; done with all the cols, start the across filtering in place
+        sub         rsi,        rdx
+        sub         rdi,        rdx
+
+        xor         rdx,        rdx
+        movq        mm0,        QWORD PTR [rdi-8];
+
+.acrossnextcol:
+        movq        xmm7,       QWORD PTR [rdi +rdx -2]
+        movd        xmm4,       DWORD PTR [rdi +rdx +6]
+
+        pslldq      xmm4,       8
+        por         xmm4,       xmm7
+
+        movdqa      xmm3,       xmm4
+        psrldq      xmm3,       2
+        punpcklbw   xmm3,       xmm0              ; mm3 = p0..p3
+        movdqa      xmm1,       xmm3              ; mm1 = p0..p3
+        psllw       xmm3,       2
+
+
+        movdqa      xmm5,       xmm4
+        psrldq      xmm5,       3
+        punpcklbw   xmm5,       xmm0              ; mm5 = p1..p4
+        paddusw     xmm3,       xmm5              ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm7,       xmm1              ; mm7 = p0..p3
+        psubusw     xmm7,       xmm5              ; mm7 = p0..p3 - p1..p4
+        psubusw     xmm5,       xmm1              ; mm5 = p1..p4 - p0..p3
+        paddusw     xmm7,       xmm5              ; mm7 = abs(p0..p3 - p1..p4)
+        pcmpgtw     xmm7,       xmm2
+
+        movdqa      xmm5,       xmm4
+        psrldq      xmm5,       4
+        punpcklbw   xmm5,       xmm0              ; mm5 = p2..p5
+        paddusw     xmm3,       xmm5              ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
+        psubusw     xmm6,       xmm5              ; mm6 = p0..p3 - p2..p5
+        psubusw     xmm5,       xmm1              ; mm5 = p2..p5 - p0..p3
+        paddusw     xmm6,       xmm5              ; mm6 = abs(p0..p3 - p2..p5)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6              ; accumulate thresholds
+
+
+        movdqa      xmm5,       xmm4              ; mm5 = p-2..p5
+        punpcklbw   xmm5,       xmm0              ; mm5 = p-2..p1
+        paddusw     xmm3,       xmm5              ; mm3 += mm5
+
+        ; thresholding
+        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
+        psubusw     xmm6,       xmm5              ; mm6 = p0..p3 - p-2..p1
+        psubusw     xmm5,       xmm1              ; mm5 = p-2..p1 - p0..p3
+        paddusw     xmm6,       xmm5              ; mm6 = abs(p0..p3 - p-2..p1)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6              ; accumulate thresholds
+
+        psrldq      xmm4,       1                   ; mm4 = p-1..p5
+        punpcklbw   xmm4,       xmm0              ; mm4 = p-1..p2
+        paddusw     xmm3,       xmm4              ; mm3 += mm4
+
+        ; thresholding
+        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
+        psubusw     xmm6,       xmm4              ; mm6 = p0..p3 - p-1..p2
+        psubusw     xmm4,       xmm1              ; mm4 = p-1..p2 - p0..p3
+        paddusw     xmm6,       xmm4              ; mm6 = abs(p0..p3 - p-1..p2)
+        pcmpgtw     xmm6,       xmm2
+        por         xmm7,       xmm6              ; accumulate thresholds
+
+        paddusw     xmm3,       RD42              ; mm3 += round value
+        psraw       xmm3,       3                 ; mm3 /= 8
+
+        pand        xmm1,       xmm7              ; mm1 select vals > thresh from source
+        pandn       xmm7,       xmm3              ; mm7 select vals < thresh from blurred result
+        paddusw     xmm1,       xmm7              ; combination
+
+        packuswb    xmm1,       xmm0              ; pack to bytes
+        movq        QWORD PTR [rdi+rdx-8],  mm0   ; store previous eight bytes
+        movdq2q     mm0,        xmm1
+
+        add         rdx,        8
+        cmp         edx,        dword arg(5) ;cols
+        jl          .acrossnextcol;
+
+        ; last 8 pixels
+        movq        QWORD PTR [rdi+rdx-8],  mm0
+
+        ; done with this row
+        add         rsi,rax               ; next line
+        mov         eax, dword arg(3) ;dst_pixels_per_line ; destination pitch
+        add         rdi,rax               ; next destination
+        mov         eax, dword arg(2) ;src_pixels_per_line ; source pitch
+
+        dec         rcx                   ; decrement count
+        jnz         .nextrow              ; next row
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+    add rsp,16
+    pop rsp
+%endif
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef RD42
+
+
+;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
+;                            int pitch, int rows, int cols,int flimit)
+extern sym(vp9_rv)
+global sym(vp9_mbpost_proc_down_xmm)
+sym(vp9_mbpost_proc_down_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 128+16
+
+    ; unsigned char d[16][8] at [rsp]
+    ; create flimit2 at [rsp+128]
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp+128], eax
+    mov         [rsp+128+4], eax
+    mov         [rsp+128+8], eax
+    mov         [rsp+128+12], eax
+%define flimit4 [rsp+128]
+
+%if ABI_IS_32BIT=0
+    lea         r8,       [GLOBAL(sym(vp9_rv))]
+%endif
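+
+    ; same sliding-window mean/variance filter as the mmx version above,
+    ; but processing eight columns per pass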
+
+    ;rows +=8;
+    add         dword arg(2), 8
+
+    ;for(c=0; c<cols; c+=8)
+.loop_col:
+            mov         rsi,        arg(0) ; s
+            pxor        xmm0,       xmm0        ;
+
+            movsxd      rax,        dword ptr arg(1) ;pitch       ;
+            neg         rax                                     ; rax = -pitch
+
+            lea         rsi,        [rsi + rax*8];              ; rsi = s[-pitch*8]
+            neg         rax
+
+
+            pxor        xmm5,       xmm5
+            pxor        xmm6,       xmm6        ;
+
+            pxor        xmm7,       xmm7        ;
+            mov         rdi,        rsi
+
+            mov         rcx,        15          ;
+
+.loop_initvar:
+            movq        xmm1,       QWORD PTR [rdi];
+            punpcklbw   xmm1,       xmm0        ;
+
+            paddw       xmm5,       xmm1        ;
+            pmullw      xmm1,       xmm1        ;
+
+            movdqa      xmm2,       xmm1        ;
+            punpcklwd   xmm1,       xmm0        ;
+
+            punpckhwd   xmm2,       xmm0        ;
+            paddd       xmm6,       xmm1        ;
+
+            paddd       xmm7,       xmm2        ;
+            lea         rdi,        [rdi+rax]   ;
+
+            dec         rcx
+            jne         .loop_initvar
+            ;save the var and sum
+            xor         rdx,        rdx
+.loop_row:
+            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
+            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
+
+            punpcklbw   xmm1,       xmm0
+            punpcklbw   xmm2,       xmm0
+
+            paddw       xmm5,       xmm2
+            psubw       xmm5,       xmm1
+
+            pmullw      xmm2,       xmm2
+            movdqa      xmm4,       xmm2
+
+            punpcklwd   xmm2,       xmm0
+            punpckhwd   xmm4,       xmm0
+
+            paddd       xmm6,       xmm2
+            paddd       xmm7,       xmm4
+
+            pmullw      xmm1,       xmm1
+            movdqa      xmm2,       xmm1
+
+            punpcklwd   xmm1,       xmm0
+            psubd       xmm6,       xmm1
+
+            punpckhwd   xmm2,       xmm0
+            psubd       xmm7,       xmm2
+
+
+            movdqa      xmm3,       xmm6
+            pslld       xmm3,       4
+
+            psubd       xmm3,       xmm6
+            movdqa      xmm1,       xmm5
+
+            movdqa      xmm4,       xmm5
+            pmullw      xmm1,       xmm1
+
+            pmulhw      xmm4,       xmm4
+            movdqa      xmm2,       xmm1
+
+            punpcklwd   xmm1,       xmm4
+            punpckhwd   xmm2,       xmm4
+
+            movdqa      xmm4,       xmm7
+            pslld       xmm4,       4
+
+            psubd       xmm4,       xmm7
+
+            psubd       xmm3,       xmm1
+            psubd       xmm4,       xmm2
+
+            psubd       xmm3,       flimit4
+            psubd       xmm4,       flimit4
+
+            psrad       xmm3,       31
+            psrad       xmm4,       31
+
+            packssdw    xmm3,       xmm4
+            packsswb    xmm3,       xmm0
+
+            movq        xmm1,       QWORD PTR [rsi+rax*8]
+
+            movq        xmm2,       xmm1
+            punpcklbw   xmm1,       xmm0
+
+            paddw       xmm1,       xmm5
+            mov         rcx,        rdx
+
+            and         rcx,        127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+            push        rax
+            lea         rax,        [GLOBAL(sym(vp9_rv))]
+            movdqu      xmm4,       [rax + rcx*2] ;vp9_rv[rcx]
+            pop         rax
+%elif ABI_IS_32BIT=0
+            movdqu      xmm4,       [r8 + rcx*2] ;vp9_rv[rcx]
+%else
+            movdqu      xmm4,       [sym(vp9_rv) + rcx*2]
+%endif
+
+            paddw       xmm1,       xmm4
+            ;paddw     xmm1,       eight8s
+            psraw       xmm1,       4
+
+            packuswb    xmm1,       xmm0
+            pand        xmm1,       xmm3
+
+            pandn       xmm3,       xmm2
+            por         xmm1,       xmm3
+
+            and         rcx,        15
+            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
+
+            mov         rcx,        rdx
+            sub         rcx,        8
+
+            and         rcx,        15
+            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
+
+            movq        [rsi],      mm0
+            lea         rsi,        [rsi+rax]
+
+            lea         rdi,        [rdi+rax]
+            add         rdx,        1
+
+            cmp         edx,        dword arg(2) ;rows
+            jl          .loop_row
+
+        add         dword arg(0), 8 ; s += 8
+        sub         dword arg(3), 8 ; cols -= 8
+        cmp         dword arg(3), 0
+        jg          .loop_col
+
+    add         rsp, 128+16
+    pop         rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit4
+
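A hedged scalar model of the column filter above (illustrative, not the project's reference C code): a 16-row sliding window of sum and sum-of-squares feeds a variance test, and pixels in low-variance columns are replaced by the dithered window mean. It assumes 8 readable border rows above and below, as the asm does, and that rv is a 128-entry dither table like vp9_rv:

static void mbpost_proc_down_model(unsigned char *dst, int pitch, int rows,
                                   int cols, int flimit, const short *rv) {
  int r, c, i;
  for (c = 0; c < cols; c++) {
    unsigned char *s = dst + c;
    unsigned char d[16];
    int sumsq = 0, sum = 0;

    for (i = -8; i <= 6; i++) {             /* prime the window, 15 rows */
      sumsq += s[i * pitch] * s[i * pitch];
      sum += s[i * pitch];
    }

    for (r = 0; r < rows + 8; r++) {        /* +8 flushes delayed writes */
      sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
      sum += s[7 * pitch] - s[-8 * pitch];

      d[r & 15] = s[0];
      if (sumsq * 15 - sum * sum < flimit)  /* low variance: filter */
        d[r & 15] = (unsigned char)((rv[r & 127] + sum + s[0]) >> 4);

      if (r >= 8)                           /* 8-row delayed write-back */
        s[-8 * pitch] = d[(r - 8) & 15];
      s += pitch;
    }
  }
}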
+
+;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
+;                                int pitch, int rows, int cols, int flimit)
+global sym(vp9_mbpost_proc_across_ip_xmm)
+sym(vp9_mbpost_proc_across_ip_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16
+
+    ; create flimit4 at [rsp]
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp], eax
+    mov         [rsp+4], eax
+    mov         [rsp+8], eax
+    mov         [rsp+12], eax
+%define flimit4 [rsp]
+
+
+    ;for(r=0;r<rows;r++)
+.ip_row_loop:
+
+        xor         rdx,    rdx ;sumsq=0;
+        xor         rcx,    rcx ;sum=0;
+        mov         rsi,    arg(0); s
+        mov         rdi,    -8
+.ip_var_loop:
+        ;for(i=-8;i<=6;i++)
+        ;{
+        ;    sumsq += s[i]*s[i];
+        ;    sum   += s[i];
+        ;}
+        movzx       eax, byte [rsi+rdi]
+        add         ecx, eax
+        mul         al
+        add         edx, eax
+        add         rdi, 1
+        cmp         rdi, 6
+        jle         .ip_var_loop
+
+
+            ;mov         rax,    sumsq
+            ;movd        xmm7,   rax
+            movd        xmm7,   edx
+
+            ;mov         rax,    sum
+            ;movd        xmm6,   rax
+            movd        xmm6,   ecx
+
+            mov         rsi,    arg(0) ;s
+            xor         rcx,    rcx
+
+            movsxd      rdx,    dword arg(3) ;cols
+            add         rdx,    8
+            pxor        mm0,    mm0
+            pxor        mm1,    mm1
+
+            pxor        xmm0,   xmm0
+.nextcol4:
+
+            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
+            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
+
+            punpcklbw   xmm1,   xmm0                    ; expanding
+            punpcklbw   xmm2,   xmm0                    ; expanding
+
+            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
+            punpcklwd   xmm2,   xmm0                    ; expanding to dwords
+
+            psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
+            paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
+
+            paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
+            pmaddwd     xmm1,   xmm2                    ; squared of 7+-8   8+-7   9+-6 10+-5
+
+            paddd       xmm6,   xmm2
+            paddd       xmm7,   xmm1
+
+            pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
+            pshufd      xmm7,   xmm7,   0               ; duplicate the last ones
+
+            psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
+            psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
+
+            pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
+            pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7 squared
+
+            paddd       xmm6,   xmm4
+            paddd       xmm7,   xmm3
+
+            pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
+            pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6 squared
+
+            paddd       xmm7,   xmm3
+            paddd       xmm6,   xmm4
+
+            pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   8--7   8--7 squared
+            pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   8--7   8--7 squared
+
+            paddd       xmm7,   xmm3
+            paddd       xmm6,   xmm4
+
+            movdqa      xmm3,   xmm6
+            pmaddwd     xmm3,   xmm3
+
+            movdqa      xmm5,   xmm7
+            pslld       xmm5,   4
+
+            psubd       xmm5,   xmm7
+            psubd       xmm5,   xmm3
+
+            psubd       xmm5,   flimit4
+            psrad       xmm5,   31
+
+            packssdw    xmm5,   xmm0
+            packsswb    xmm5,   xmm0
+
+            movd        xmm1,   DWORD PTR [rsi+rcx]
+            movq        xmm2,   xmm1
+
+            punpcklbw   xmm1,   xmm0
+            punpcklwd   xmm1,   xmm0
+
+            paddd       xmm1,   xmm6
+            paddd       xmm1,   [GLOBAL(four8s)]
+
+            psrad       xmm1,   4
+            packssdw    xmm1,   xmm0
+
+            packuswb    xmm1,   xmm0
+            pand        xmm1,   xmm5
+
+            pandn       xmm5,   xmm2
+            por         xmm5,   xmm1
+
+            movd        [rsi+rcx-8],  mm0
+            movq        mm0,    mm1
+
+            movdq2q     mm1,    xmm5
+            psrldq      xmm7,   12
+
+            psrldq      xmm6,   12
+            add         rcx,    4
+
+            cmp         rcx,    rdx
+            jl          .nextcol4
+
+        ;s+=pitch;
+        movsxd rax, dword arg(1)
+        add    arg(0), rax
+
+        sub dword arg(2), 1 ;rows-=1
+        cmp dword arg(2), 0
+        jg .ip_row_loop
+
+    add         rsp, 16
+    pop         rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%undef flimit4
+
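The horizontal in-place variant follows the same pattern. A hedged scalar sketch (illustrative names; assumes 8 readable border pixels on each side of the row, as the asm does): a 15-tap sliding window of sum and sum-of-squares drives the variance test, and output is written 8 pixels behind the window so the window reads still see unfiltered data:

static void mbpost_proc_across_ip_model(unsigned char *src, int pitch,
                                        int rows, int cols, int flimit) {
  int r, c, i;
  for (r = 0; r < rows; r++) {
    unsigned char *s = &src[r * pitch];
    unsigned char d[16];
    int sumsq = 0, sum = 0;

    for (i = -8; i <= 6; i++) {             /* prime the window */
      sumsq += s[i] * s[i];
      sum += s[i];
    }

    for (c = 0; c < cols + 8; c++) {        /* +8 flushes delayed writes */
      sumsq += s[c + 7] * s[c + 7] - s[c - 8] * s[c - 8];
      sum += s[c + 7] - s[c - 8];

      d[c & 15] = s[c];
      if (sumsq * 15 - sum * sum < flimit)  /* 8 = rounding, as four8s */
        d[c & 15] = (unsigned char)((8 + sum + s[c]) >> 4);

      if (c >= 8)                           /* delayed write-back */
        s[c - 8] = d[(c - 8) & 15];
    }
  }
}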
+
+;void vp9_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
+;                            unsigned char blackclamp[16],
+;                            unsigned char whiteclamp[16],
+;                            unsigned char bothclamp[16],
+;                            unsigned int Width, unsigned int Height, int Pitch)
+extern sym(rand)
+global sym(vp9_plane_add_noise_wmt)
+sym(vp9_plane_add_noise_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+.addnoise_loop:
+    call sym(rand) WRT_PLT
+    mov     rcx, arg(1) ;noise
+    and     rax, 0xff
+    add     rcx, rax
+
+    ; we rely on the fact that the clamping vectors are stored contiguously
+    ; in black/white/both order. Note that we have to reload this here because
+    ; rdx could be trashed by rand()
+    mov     rdx, arg(2) ; blackclamp
+
+
+            mov     rdi, rcx
+            movsxd  rcx, dword arg(5) ;[Width]
+            mov     rsi, arg(0) ;Pos
+            xor         rax,rax
+
+.addnoise_nextset:
+            movdqu      xmm1,[rsi+rax]         ; get the source
+
+            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
+            paddusb     xmm1, [rdx+32] ;bothclamp
+            psubusb     xmm1, [rdx+16] ;whiteclamp
+
+            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
+            paddb       xmm1,xmm2              ; add it in
+            movdqu      [rsi+rax],xmm1         ; store the result
+
+            add         rax,16                 ; move to the next line
+
+            cmp         rax, rcx
+            jl          .addnoise_nextset
+
+    movsxd  rax, dword arg(7) ; Pitch
+    add     arg(0), rax ; Start += Pitch
+    sub     dword arg(6), 1   ; Height -= 1
+    jg      .addnoise_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
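A hedged C sketch of the add-noise loop above: clamp each pixel into the headroom left by the clamp vectors via saturating ops, then add a noise byte selected by rand(). Names are illustrative, and it assumes (as the caller arranges) that the noise values stay within the clamped headroom:

#include <stdlib.h>

static void plane_add_noise_model(unsigned char *start, unsigned char *noise,
                                  unsigned char blackclamp[16],
                                  unsigned char whiteclamp[16],
                                  unsigned char bothclamp[16],
                                  unsigned int width, unsigned int height,
                                  int pitch) {
  unsigned int i, j;
  for (i = 0; i < height; i++) {
    unsigned char *pos = start + i * pitch;
    unsigned char *ref = noise + (rand() & 0xff);  /* per-row noise offset */

    for (j = 0; j < width; j++) {
      int v = pos[j];
      v = (v < blackclamp[0]) ? 0 : v - blackclamp[0];       /* psubusb */
      v = (v + bothclamp[0] > 255) ? 255 : v + bothclamp[0]; /* paddusb */
      v = (v < whiteclamp[0]) ? 0 : v - whiteclamp[0];       /* psubusb */
      pos[j] = (unsigned char)(v + (signed char)ref[j]);     /* paddb */
    }
  }
}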
+SECTION_RODATA
+align 16
+rd42:
+    times 8 dw 0x04
+four8s:
+    times 4 dd 8
--- /dev/null
+++ b/vp9/common/x86/postproc_x86.h
@@ -1,0 +1,64 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef POSTPROC_X86_H
+#define POSTPROC_X86_H
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+
+#if HAVE_MMX
+extern prototype_postproc_inplace(vp9_mbpost_proc_down_mmx);
+extern prototype_postproc(vp9_post_proc_down_and_across_mmx);
+extern prototype_postproc_addnoise(vp9_plane_add_noise_mmx);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_postproc_down
+#define vp9_postproc_down vp9_mbpost_proc_down_mmx
+
+#undef  vp9_postproc_downacross
+#define vp9_postproc_downacross vp9_post_proc_down_and_across_mmx
+
+#undef  vp9_postproc_addnoise
+#define vp9_postproc_addnoise vp9_plane_add_noise_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_postproc_inplace(vp9_mbpost_proc_down_xmm);
+extern prototype_postproc_inplace(vp9_mbpost_proc_across_ip_xmm);
+extern prototype_postproc(vp9_post_proc_down_and_across_xmm);
+extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_postproc_down
+#define vp9_postproc_down vp9_mbpost_proc_down_xmm
+
+#undef  vp9_postproc_across
+#define vp9_postproc_across vp9_mbpost_proc_across_ip_xmm
+
+#undef  vp9_postproc_downacross
+#define vp9_postproc_downacross vp9_post_proc_down_and_across_xmm
+
+#undef  vp9_postproc_addnoise
+#define vp9_postproc_addnoise vp9_plane_add_noise_wmt
+
+
+#endif
+#endif
+
+#endif
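The note at the top of this header refers to the function-pointer initialization used when CONFIG_RUNTIME_CPU_DETECT is enabled: instead of the compile-time #define mapping above, an init routine patches pointers from detected CPU flags. A hedged, self-contained sketch of that pattern (all names here are illustrative, not the real RTCD symbols):

typedef void (*postproc_down_fn)(unsigned char *dst, int pitch, int rows,
                                 int cols, int flimit);

void mbpost_proc_down_c_stub(unsigned char *d, int p, int r, int c, int f);
void mbpost_proc_down_xmm_stub(unsigned char *d, int p, int r, int c, int f);

/* default to the generic C version */
static postproc_down_fn postproc_down = mbpost_proc_down_c_stub;

static void postproc_rtcd_init(int has_sse2) {
  if (has_sse2)
    postproc_down = mbpost_proc_down_xmm_stub;  /* patch at runtime */
}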
--- /dev/null
+++ b/vp9/common/x86/recon_mmx.asm
@@ -1,0 +1,321 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+;void vp9_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
+global sym(vp9_recon_b_mmx)
+sym(vp9_recon_b_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov       rsi, arg(0) ;s
+        mov       rdi, arg(2) ;d
+        mov       rdx, arg(1) ;q
+        movsxd    rax, dword ptr arg(3) ;stride
+        pxor      mm0, mm0
+
+        movd      mm1, [rsi]
+        punpcklbw mm1, mm0
+        paddsw    mm1, [rdx]
+        packuswb  mm1,  mm0              ; pack and unpack to saturate
+        movd      [rdi], mm1
+
+        movd      mm2, [rsi+16]
+        punpcklbw mm2, mm0
+        paddsw    mm2, [rdx+32]
+        packuswb  mm2, mm0              ; pack and unpack to saturate
+        movd      [rdi+rax], mm2
+
+        movd      mm3, [rsi+32]
+        punpcklbw mm3, mm0
+        paddsw    mm3, [rdx+64]
+        packuswb  mm3,  mm0              ; pack and unpack to saturate
+        movd      [rdi+2*rax], mm3
+
+        add       rdi, rax
+        movd      mm4, [rsi+48]
+        punpcklbw mm4, mm0
+        paddsw    mm4, [rdx+96]
+        packuswb  mm4, mm0              ; pack and unpack to saturate
+        movd      [rdi+2*rax], mm4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
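A scalar equivalent of the reconstruction step above: add the dequantized residual to the predictor and saturate to 8 bits over a 4x4 block. This is a sketch under the same layout assumptions as the asm (predictor rows 16 bytes apart, residual rows 16 shorts apart):

static unsigned char clamp255(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void recon_b_model(const unsigned char *s, const short *q,
                          unsigned char *d, int stride) {
  int r, c;
  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++)
      d[c] = clamp255(s[c] + q[c]);
    s += 16;   /* predictor pitch, per the [rsi+16] steps above */
    q += 16;   /* residual pitch, per the 32-byte [rdx] steps above */
    d += stride;
  }
}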
+
+;void vp9_copy_mem8x8_mmx(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+global sym(vp9_copy_mem8x8_mmx)
+sym(vp9_copy_mem8x8_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src;
+        movq        mm0,        [rsi]
+
+        movsxd      rax,        dword ptr arg(1) ;src_stride;
+        mov         rdi,        arg(2) ;dst;
+
+        movq        mm1,        [rsi+rax]
+        movq        mm2,        [rsi+rax*2]
+
+        movsxd      rcx,        dword ptr arg(3) ;dst_stride
+        lea         rsi,        [rsi+rax*2]
+
+        movq        [rdi],      mm0
+        add         rsi,        rax
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx*2],    mm2
+
+
+        lea         rdi,        [rdi+rcx*2]
+        movq        mm3,        [rsi]
+
+        add         rdi,        rcx
+        movq        mm4,        [rsi+rax]
+
+        movq        mm5,        [rsi+rax*2]
+        movq        [rdi],      mm3
+
+        lea         rsi,        [rsi+rax*2]
+        movq        [rdi+rcx],  mm4
+
+        movq        [rdi+rcx*2],    mm5
+        lea         rdi,        [rdi+rcx*2]
+
+        movq        mm0,        [rsi+rax]
+        movq        mm1,        [rsi+rax*2]
+
+        movq        [rdi+rcx],  mm0
+        movq        [rdi+rcx*2],mm1
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_copy_mem8x4_mmx(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+global sym(vp9_copy_mem8x4_mmx)
+sym(vp9_copy_mem8x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src;
+        movq        mm0,        [rsi]
+
+        movsxd      rax,        dword ptr arg(1) ;src_stride;
+        mov         rdi,        arg(2) ;dst;
+
+        movq        mm1,        [rsi+rax]
+        movq        mm2,        [rsi+rax*2]
+
+        movsxd      rcx,        dword ptr arg(3) ;dst_stride
+        lea         rsi,        [rsi+rax*2]
+
+        movq        [rdi],      mm0
+        movq        [rdi+rcx],      mm1
+
+        movq        [rdi+rcx*2],    mm2
+        lea         rdi,        [rdi+rcx*2]
+
+        movq        mm3,        [rsi+rax]
+        movq        [rdi+rcx],      mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_copy_mem16x16_mmx(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+global sym(vp9_copy_mem16x16_mmx)
+sym(vp9_copy_mem16x16_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src;
+        movsxd      rax,        dword ptr arg(1) ;src_stride;
+
+        mov         rdi,        arg(2) ;dst;
+        movsxd      rcx,        dword ptr arg(3) ;dst_stride
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
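The copy kernels in this file are plain strided block copies; the unrolled MMX/SSE2 bodies only batch the loads and stores. A scalar equivalent for the 16x16 case:

#include <string.h>

static void copy_mem16x16_model(const unsigned char *src, int src_stride,
                                unsigned char *dst, int dst_stride) {
  int r;
  for (r = 0; r < 16; r++) {
    memcpy(dst, src, 16);  /* one 16-byte row per iteration */
    src += src_stride;
    dst += dst_stride;
  }
}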
--- /dev/null
+++ b/vp9/common/x86/recon_sse2.asm
@@ -1,0 +1,688 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+;void vp9_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
+global sym(vp9_recon2b_sse2)
+sym(vp9_recon2b_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;s
+        mov         rdi,        arg(2) ;d
+        mov         rdx,        arg(1) ;q
+        movsxd      rax,        dword ptr arg(3) ;stride
+        pxor        xmm0,       xmm0
+
+        movq        xmm1,       MMWORD PTR [rsi]
+        punpcklbw   xmm1,       xmm0
+        paddsw      xmm1,       XMMWORD PTR [rdx]
+        packuswb    xmm1,       xmm0              ; pack and unpack to saturate
+        movq        MMWORD PTR [rdi],   xmm1
+
+
+        movq        xmm2,       MMWORD PTR [rsi+8]
+        punpcklbw   xmm2,       xmm0
+        paddsw      xmm2,       XMMWORD PTR [rdx+16]
+        packuswb    xmm2,       xmm0              ; pack and unpack to saturate
+        movq        MMWORD PTR [rdi+rax],   xmm2
+
+
+        movq        xmm3,       MMWORD PTR [rsi+16]
+        punpcklbw   xmm3,       xmm0
+        paddsw      xmm3,       XMMWORD PTR [rdx+32]
+        packuswb    xmm3,       xmm0              ; pack and unpack to saturate
+        movq        MMWORD PTR [rdi+rax*2], xmm3
+
+        add         rdi, rax
+        movq        xmm4,       MMWORD PTR [rsi+24]
+        punpcklbw   xmm4,       xmm0
+        paddsw      xmm4,       XMMWORD PTR [rdx+48]
+        packuswb    xmm4,       xmm0              ; pack and unpack to saturate
+        movq        MMWORD PTR [rdi+rax*2], xmm4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
+global sym(vp9_recon4b_sse2)
+sym(vp9_recon4b_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;s
+        mov         rdi,        arg(2) ;d
+        mov         rdx,        arg(1) ;q
+        movsxd      rax,        dword ptr arg(3) ;stride
+        pxor        xmm0,       xmm0
+
+        movdqa      xmm1,       XMMWORD PTR [rsi]
+        movdqa      xmm5,       xmm1
+        punpcklbw   xmm1,       xmm0
+        punpckhbw   xmm5,       xmm0
+        paddsw      xmm1,       XMMWORD PTR [rdx]
+        paddsw      xmm5,       XMMWORD PTR [rdx+16]
+        packuswb    xmm1,       xmm5              ; pack and unpack to saturate
+        movdqa      XMMWORD PTR [rdi],  xmm1
+
+
+        movdqa      xmm2,       XMMWORD PTR [rsi+16]
+        movdqa      xmm6,       xmm2
+        punpcklbw   xmm2,       xmm0
+        punpckhbw   xmm6,       xmm0
+        paddsw      xmm2,       XMMWORD PTR [rdx+32]
+        paddsw      xmm6,       XMMWORD PTR [rdx+48]
+        packuswb    xmm2,       xmm6              ; pack and unpack to saturate
+        movdqa      XMMWORD PTR [rdi+rax],  xmm2
+
+
+        movdqa      xmm3,       XMMWORD PTR [rsi+32]
+        movdqa      xmm7,       xmm3
+        punpcklbw   xmm3,       xmm0
+        punpckhbw   xmm7,       xmm0
+        paddsw      xmm3,       XMMWORD PTR [rdx+64]
+        paddsw      xmm7,       XMMWORD PTR [rdx+80]
+        packuswb    xmm3,       xmm7              ; pack and unpack to saturate
+        movdqa      XMMWORD PTR [rdi+rax*2],    xmm3
+
+        add       rdi, rax
+        movdqa      xmm4,       XMMWORD PTR [rsi+48]
+        movdqa      xmm5,       xmm4
+        punpcklbw   xmm4,       xmm0
+        punpckhbw   xmm5,       xmm0
+        paddsw      xmm4,       XMMWORD PTR [rdx+96]
+        paddsw      xmm5,       XMMWORD PTR [rdx+112]
+        packuswb    xmm4,       xmm5              ; pack and unpack to saturate
+        movdqa      XMMWORD PTR [rdi+rax*2],    xmm4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_copy_mem16x16_sse2(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+global sym(vp9_copy_mem16x16_sse2)
+sym(vp9_copy_mem16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src;
+        movdqu      xmm0,       [rsi]
+
+        movsxd      rax,        dword ptr arg(1) ;src_stride;
+        mov         rdi,        arg(2) ;dst;
+
+        movdqu      xmm1,       [rsi+rax]
+        movdqu      xmm2,       [rsi+rax*2]
+
+        movsxd      rcx,        dword ptr arg(3) ;dst_stride
+        lea         rsi,        [rsi+rax*2]
+
+        movdqa      [rdi],      xmm0
+        add         rsi,        rax
+
+        movdqa      [rdi+rcx],  xmm1
+        movdqa      [rdi+rcx*2],xmm2
+
+        lea         rdi,        [rdi+rcx*2]
+        movdqu      xmm3,       [rsi]
+
+        add         rdi,        rcx
+        movdqu      xmm4,       [rsi+rax]
+
+        movdqu      xmm5,       [rsi+rax*2]
+        lea         rsi,        [rsi+rax*2]
+
+        movdqa      [rdi],  xmm3
+        add         rsi,        rax
+
+        movdqa      [rdi+rcx],  xmm4
+        movdqa      [rdi+rcx*2],xmm5
+
+        lea         rdi,        [rdi+rcx*2]
+        movdqu      xmm0,       [rsi]
+
+        add         rdi,        rcx
+        movdqu      xmm1,       [rsi+rax]
+
+        movdqu      xmm2,       [rsi+rax*2]
+        lea         rsi,        [rsi+rax*2]
+
+        movdqa      [rdi],      xmm0
+        add         rsi,        rax
+
+        movdqa      [rdi+rcx],  xmm1
+
+        movdqa      [rdi+rcx*2],    xmm2
+        movdqu      xmm3,       [rsi]
+
+        movdqu      xmm4,       [rsi+rax]
+        lea         rdi,        [rdi+rcx*2]
+
+        add         rdi,        rcx
+        movdqu      xmm5,       [rsi+rax*2]
+
+        lea         rsi,        [rsi+rax*2]
+        movdqa      [rdi],  xmm3
+
+        add         rsi,        rax
+        movdqa      [rdi+rcx],  xmm4
+
+        movdqa      [rdi+rcx*2],xmm5
+        movdqu      xmm0,       [rsi]
+
+        lea         rdi,        [rdi+rcx*2]
+        movdqu      xmm1,       [rsi+rax]
+
+        add         rdi,        rcx
+        movdqu      xmm2,       [rsi+rax*2]
+
+        lea         rsi,        [rsi+rax*2]
+        movdqa      [rdi],      xmm0
+
+        movdqa      [rdi+rcx],  xmm1
+        movdqa      [rdi+rcx*2],xmm2
+
+        movdqu      xmm3,       [rsi+rax]
+        lea         rdi,        [rdi+rcx*2]
+
+        movdqa      [rdi+rcx],  xmm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_intra_pred_uv_dc_mmx2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+global sym(vp9_intra_pred_uv_dc_mmx2)
+sym(vp9_intra_pred_uv_dc_mmx2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; from top
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rsi,        rax
+    pxor        mm0,        mm0
+    movq        mm1,        [rsi]
+    psadbw      mm1,        mm0
+
+    ; from left
+    dec         rsi
+    lea         rdi,        [rax*3]
+    movzx       ecx,        byte [rsi+rax]
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*4]
+    add         ecx,        edx
+
+    ; add up
+    pextrw      edx,        mm1, 0x0
+    lea         edx,        [edx+ecx+8]
+    sar         edx,        4
+    movd        mm1,        edx
+    pshufw      mm1,        mm1, 0x0
+    packuswb    mm1,        mm1
+
+    ; write out
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]
+
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+    lea         rdi,        [rdi+rcx*4]
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
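A hedged scalar sketch of the 8x8 chroma DC predictor above (illustrative names): sum the 8 pixels above and the 8 to the left, round, divide by 16, and fill the block with that value:

static void intra_pred_uv_dc_model(unsigned char *dst, int dst_stride,
                                   const unsigned char *src, int src_stride) {
  const unsigned char *above = src - src_stride;
  int i, r, sum = 0;
  unsigned char dc;

  for (i = 0; i < 8; i++)
    sum += above[i];                     /* 8 pixels from the row above */
  for (i = 0; i < 8; i++)
    sum += src[i * src_stride - 1];      /* 8 pixels from the left column */

  dc = (unsigned char)((sum + 8) >> 4);  /* round and divide by 16 */
  for (r = 0; r < 8; r++)
    for (i = 0; i < 8; i++)
      dst[r * dst_stride + i] = dc;
}

The dctop and dcleft variants below use only one of the two sums with (sum + 4) >> 3, and the dc128 variant skips the sums entirely and writes the constant 128.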
+;void vp9_intra_pred_uv_dctop_mmx2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+global sym(vp9_intra_pred_uv_dctop_mmx2)
+sym(vp9_intra_pred_uv_dctop_mmx2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; from top
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rsi,        rax
+    pxor        mm0,        mm0
+    movq        mm1,        [rsi]
+    psadbw      mm1,        mm0
+
+    ; add up
+    paddw       mm1,        [GLOBAL(dc_4)]
+    psraw       mm1,        3
+    pshufw      mm1,        mm1, 0x0
+    packuswb    mm1,        mm1
+
+    ; write out
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]
+
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+    lea         rdi,        [rdi+rcx*4]
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_intra_pred_uv_dcleft_mmx2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+global sym(vp9_intra_pred_uv_dcleft_mmx2)
+sym(vp9_intra_pred_uv_dcleft_mmx2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; from left
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    dec         rsi
+    lea         rdi,        [rax*3]
+    movzx       ecx,        byte [rsi]
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    add         ecx,        edx
+    lea         rsi,        [rsi+rax*4]
+    movzx       edx,        byte [rsi]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rax*2]
+    add         ecx,        edx
+    movzx       edx,        byte [rsi+rdi]
+    lea         edx,        [ecx+edx+4]
+
+    ; add up
+    shr         edx,        3
+    movd        mm1,        edx
+    pshufw      mm1,        mm1, 0x0
+    packuswb    mm1,        mm1
+
+    ; write out
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+    lea         rax,        [rcx*3]
+
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+    lea         rdi,        [rdi+rcx*4]
+    movq [rdi      ],       mm1
+    movq [rdi+rcx  ],       mm1
+    movq [rdi+rcx*2],       mm1
+    movq [rdi+rax  ],       mm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_intra_pred_uv_dc128_mmx(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+global sym(vp9_intra_pred_uv_dc128_mmx)
+sym(vp9_intra_pred_uv_dc128_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    GET_GOT     rbx
+    ; end prolog
+
+    ; write out
+    movq        mm1,        [GLOBAL(dc_128)]
+    mov         rax,        arg(0) ;dst;
+    movsxd      rdx,        dword ptr arg(1) ;dst_stride
+    lea         rcx,        [rdx*3]
+
+    movq [rax      ],       mm1
+    movq [rax+rdx  ],       mm1
+    movq [rax+rdx*2],       mm1
+    movq [rax+rcx  ],       mm1
+    lea         rax,        [rax+rdx*4]
+    movq [rax      ],       mm1
+    movq [rax+rdx  ],       mm1
+    movq [rax+rdx*2],       mm1
+    movq [rax+rcx  ],       mm1
+
+    ; begin epilog
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_intra_pred_uv_tm_sse2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+%macro vp9_intra_pred_uv_tm 1
+global sym(vp9_intra_pred_uv_tm_%1)
+sym(vp9_intra_pred_uv_tm_%1):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; read top row
+    mov         edx,        4
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    sub         rsi,        rax
+    pxor        xmm0,       xmm0
+%ifidn %1, ssse3
+    movdqa      xmm2,       [GLOBAL(dc_1024)]
+%endif
+    movq        xmm1,       [rsi]
+    punpcklbw   xmm1,       xmm0
+
+    ; set up left ptrs and subtract topleft
+    movd        xmm3,       [rsi-1]
+    lea         rsi,        [rsi+rax-1]
+%ifidn %1, sse2
+    punpcklbw   xmm3,       xmm0
+    pshuflw     xmm3,       xmm3, 0x0
+    punpcklqdq  xmm3,       xmm3
+%else
+    pshufb      xmm3,       xmm2
+%endif
+    psubw       xmm1,       xmm3
+
+    ; set up dest ptrs
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+
+.vp9_intra_pred_uv_tm_%1_loop:
+    movd        xmm3,       [rsi]
+    movd        xmm5,       [rsi+rax]
+%ifidn %1, sse2
+    punpcklbw   xmm3,       xmm0
+    punpcklbw   xmm5,       xmm0
+    pshuflw     xmm3,       xmm3, 0x0
+    pshuflw     xmm5,       xmm5, 0x0
+    punpcklqdq  xmm3,       xmm3
+    punpcklqdq  xmm5,       xmm5
+%else
+    pshufb      xmm3,       xmm2
+    pshufb      xmm5,       xmm2
+%endif
+    paddw       xmm3,       xmm1
+    paddw       xmm5,       xmm1
+    packuswb    xmm3,       xmm5
+    movq  [rdi    ],        xmm3
+    movhps [rdi+rcx],       xmm3
+    lea         rsi,        [rsi+rax*2]
+    lea         rdi,        [rdi+rcx*2]
+    dec         edx
+    jnz .vp9_intra_pred_uv_tm_%1_loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endmacro
+
+vp9_intra_pred_uv_tm sse2
+vp9_intra_pred_uv_tm ssse3
+
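A hedged scalar sketch of the TrueMotion (TM) predictor implemented by the macro above: pred[r][c] = clamp(left[r] + top[c] - topleft), which is exactly what the subtract-topleft setup plus per-row add and packuswb saturation compute. Names are illustrative:

static unsigned char clamp_byte(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void intra_pred_uv_tm_model(unsigned char *dst, int dst_stride,
                                   const unsigned char *src, int src_stride) {
  const unsigned char *above = src - src_stride;
  int topleft = above[-1];
  int r, c;
  for (r = 0; r < 8; r++) {
    int left = src[r * src_stride - 1];
    for (c = 0; c < 8; c++)
      dst[r * dst_stride + c] = clamp_byte(left + above[c] - topleft);
  }
}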
+;void vp9_intra_pred_uv_ve_mmx(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+global sym(vp9_intra_pred_uv_ve_mmx)
+sym(vp9_intra_pred_uv_ve_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    ; end prolog
+
+    ; read from top
+    mov         rax,        arg(2) ;src;
+    movsxd      rdx,        dword ptr arg(3) ;src_stride;
+    sub         rax,        rdx
+    movq        mm1,        [rax]
+
+    ; write out
+    mov         rax,        arg(0) ;dst;
+    movsxd      rdx,        dword ptr arg(1) ;dst_stride
+    lea         rcx,        [rdx*3]
+
+    movq [rax      ],       mm1
+    movq [rax+rdx  ],       mm1
+    movq [rax+rdx*2],       mm1
+    movq [rax+rcx  ],       mm1
+    lea         rax,        [rax+rdx*4]
+    movq [rax      ],       mm1
+    movq [rax+rdx  ],       mm1
+    movq [rax+rdx*2],       mm1
+    movq [rax+rcx  ],       mm1
+
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_intra_pred_uv_ho_mmx2(
+;    unsigned char *dst,
+;    int dst_stride,
+;    unsigned char *src,
+;    int src_stride
+;    )
+%macro vp9_intra_pred_uv_ho 1
+global sym(vp9_intra_pred_uv_ho_%1)
+sym(vp9_intra_pred_uv_ho_%1):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+%ifidn %1, ssse3
+%ifndef GET_GOT_SAVE_ARG
+    push        rbx
+%endif
+    GET_GOT     rbx
+%endif
+    ; end prolog
+
+    ; read from left and write out
+%ifidn %1, mmx2
+    mov         edx,        4
+%endif
+    mov         rsi,        arg(2) ;src;
+    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    mov         rdi,        arg(0) ;dst;
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
+%ifidn %1, ssse3
+    lea         rdx,        [rcx*3]
+    movdqa      xmm2,       [GLOBAL(dc_00001111)]
+    lea         rbx,        [rax*3]
+%endif
+    dec         rsi
+%ifidn %1, mmx2
+.vp9_intra_pred_uv_ho_%1_loop:
+    movd        mm0,        [rsi]
+    movd        mm1,        [rsi+rax]
+    punpcklbw   mm0,        mm0
+    punpcklbw   mm1,        mm1
+    pshufw      mm0,        mm0, 0x0
+    pshufw      mm1,        mm1, 0x0
+    movq  [rdi    ],        mm0
+    movq  [rdi+rcx],        mm1
+    lea         rsi,        [rsi+rax*2]
+    lea         rdi,        [rdi+rcx*2]
+    dec         edx
+    jnz .vp9_intra_pred_uv_ho_%1_loop
+%else
+    movd        xmm0,       [rsi]
+    movd        xmm3,       [rsi+rax]
+    movd        xmm1,       [rsi+rax*2]
+    movd        xmm4,       [rsi+rbx]
+    punpcklbw   xmm0,       xmm3
+    punpcklbw   xmm1,       xmm4
+    pshufb      xmm0,       xmm2
+    pshufb      xmm1,       xmm2
+    movq   [rdi    ],       xmm0
+    movhps [rdi+rcx],       xmm0
+    movq [rdi+rcx*2],       xmm1
+    movhps [rdi+rdx],       xmm1
+    lea         rsi,        [rsi+rax*4]
+    lea         rdi,        [rdi+rcx*4]
+    movd        xmm0,       [rsi]
+    movd        xmm3,       [rsi+rax]
+    movd        xmm1,       [rsi+rax*2]
+    movd        xmm4,       [rsi+rbx]
+    punpcklbw   xmm0,       xmm3
+    punpcklbw   xmm1,       xmm4
+    pshufb      xmm0,       xmm2
+    pshufb      xmm1,       xmm2
+    movq   [rdi    ],       xmm0
+    movhps [rdi+rcx],       xmm0
+    movq [rdi+rcx*2],       xmm1
+    movhps [rdi+rdx],       xmm1
+%endif
+
+    ; begin epilog
+%ifidn %1, ssse3
+    RESTORE_GOT
+%ifndef GET_GOT_SAVE_ARG
+    pop         rbx
+%endif
+%endif
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+%endmacro
+
+vp9_intra_pred_uv_ho mmx2
+vp9_intra_pred_uv_ho ssse3
+
+SECTION_RODATA
+dc_128:
+    times 8 db 128
+dc_4:
+    times 4 dw 4
+align 16
+dc_1024:
+    times 8 dw 0x400
+align 16
+dc_00001111:
+    times 8 db 0
+    times 8 db 1
--- /dev/null
+++ b/vp9/common/x86/recon_wrapper_sse2.c
@@ -1,0 +1,101 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/blockd.h"
+
+#define build_intra_predictors_mbuv_prototype(sym) \
+  void sym(unsigned char *dst, int dst_stride, \
+           const unsigned char *src, int src_stride)
+typedef build_intra_predictors_mbuv_prototype((*build_intra_pred_mbuv_fn_t));
+
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dctop_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dcleft_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc128_mmx);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_ssse3);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ve_mmx);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_sse2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_ssse3);
+
+static void build_intra_predictors_mbuv_x86(MACROBLOCKD *xd,
+                                            unsigned char *dst_u,
+                                            unsigned char *dst_v,
+                                            int dst_stride,
+                                            build_intra_pred_mbuv_fn_t tm_fn,
+                                            build_intra_pred_mbuv_fn_t ho_fn) {
+  int mode = xd->mode_info_context->mbmi.uv_mode;
+  build_intra_pred_mbuv_fn_t fn;
+  int src_stride = xd->dst.uv_stride;
+
+  switch (mode) {
+    case  V_PRED:
+      fn = vp9_intra_pred_uv_ve_mmx;
+      break;
+    case  H_PRED:
+      fn = ho_fn;
+      break;
+    case TM_PRED:
+      fn = tm_fn;
+      break;
+    case DC_PRED:
+      if (xd->up_available) {
+        if (xd->left_available) {
+          fn = vp9_intra_pred_uv_dc_mmx2;
+          break;
+        } else {
+          fn = vp9_intra_pred_uv_dctop_mmx2;
+          break;
+        }
+      } else if (xd->left_available) {
+        fn = vp9_intra_pred_uv_dcleft_mmx2;
+        break;
+      } else {
+        fn = vp9_intra_pred_uv_dc128_mmx;
+        break;
+      }
+      break;
+    default:
+      return;
+  }
+
+  fn(dst_u, dst_stride, xd->dst.u_buffer, src_stride);
+  fn(dst_v, dst_stride, xd->dst.v_buffer, src_stride);
+}
+
+void vp9_build_intra_predictors_mbuv_sse2(MACROBLOCKD *xd) {
+  build_intra_predictors_mbuv_x86(xd, &xd->predictor[256],
+                                  &xd->predictor[320], 8,
+                                  vp9_intra_pred_uv_tm_sse2,
+                                  vp9_intra_pred_uv_ho_mmx2);
+}
+
+void vp9_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *xd) {
+  build_intra_predictors_mbuv_x86(xd, &xd->predictor[256],
+                                  &xd->predictor[320], 8,
+                                  vp9_intra_pred_uv_tm_ssse3,
+                                  vp9_intra_pred_uv_ho_ssse3);
+}
+
+void vp9_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *xd) {
+  build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer,
+                                  xd->dst.v_buffer, xd->dst.uv_stride,
+                                  vp9_intra_pred_uv_tm_sse2,
+                                  vp9_intra_pred_uv_ho_mmx2);
+}
+
+void vp9_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *xd) {
+  build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer,
+                                  xd->dst.v_buffer, xd->dst.uv_stride,
+                                  vp9_intra_pred_uv_tm_ssse3,
+                                  vp9_intra_pred_uv_ho_ssse3);
+}
--- /dev/null
+++ b/vp9/common/x86/sadmxn_x86.c
@@ -1,0 +1,92 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+#include "./vpx_config.h"
+#include "./vpx_rtcd.h"
+
+
+#if CONFIG_NEWBESTREFMV
+
+
+#if HAVE_SSE2
+unsigned int vp9_sad16x3_sse2(
+  const unsigned char *src_ptr,
+  int  src_stride,
+  const unsigned char *ref_ptr,
+  int  ref_stride,
+  int max_sad) {
+  __m128i s0, s1, s2;
+  __m128i r0, r1, r2;
+  __m128i sad;
+
+  (void)max_sad;
+
+  s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride));
+  s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride));
+  s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride));
+
+  r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * ref_stride));
+  r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * ref_stride));
+  r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride));
+
+  sad = _mm_sad_epu8(s0, r0);
+  sad = _mm_add_epi16(sad,  _mm_sad_epu8(s1, r1));
+  sad = _mm_add_epi16(sad,  _mm_sad_epu8(s2, r2));
+  sad = _mm_add_epi16(sad,  _mm_srli_si128(sad, 8));
+
+  return _mm_cvtsi128_si32(sad);
+}
+
+unsigned int vp9_sad3x16_sse2(
+  const unsigned char *src_ptr,
+  int  src_stride,
+  const unsigned char *ref_ptr,
+  int  ref_stride,
+  int max_sad) {
+  int r;
+  __m128i s0, s1, s2, s3;
+  __m128i r0, r1, r2, r3;
+  __m128i sad = _mm_set1_epi16(0);
+  for (r = 0; r < 16; r += 4) {
+    s0 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 0 * src_stride));
+    s1 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride));
+    s2 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride));
+    s3 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride));
+    r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * ref_stride));
+    r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * ref_stride));
+    r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * ref_stride));
+    r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * ref_stride));
+
+    s0 = _mm_unpacklo_epi8(s0, s1);
+    r0 = _mm_unpacklo_epi8(r0, r1);
+    s2 = _mm_unpacklo_epi8(s2, s3);
+    r2 = _mm_unpacklo_epi8(r2, r3);
+    s0 = _mm_unpacklo_epi64(s0, s2);
+    r0 = _mm_unpacklo_epi64(r0, r2);
+
+    // throw out byte 3
+    s0 = _mm_slli_epi64(s0, 16);
+    r0 = _mm_slli_epi64(r0, 16);
+
+    sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0));
+
+    src_ptr += src_stride*4;
+    ref_ptr += ref_stride*4;
+  }
+
+  sad = _mm_add_epi16(sad,  _mm_srli_si128(sad, 8));
+  return _mm_cvtsi128_si32(sad);
+}
+
+#endif
+
+
+#endif  // CONFIG_NEWBESTREFMV
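Both intrinsics above compute a plain sum of absolute differences over an odd-shaped block (16 wide by 3 tall, and 3 wide by 16 tall). A hedged scalar reference with illustrative names, stepping src and ref by their own strides:

static unsigned int sad_mxn_model(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  int m, int n) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < n; r++) {
    for (c = 0; c < m; c++) {
      int d = src[c] - ref[c];
      sad += (unsigned int)(d < 0 ? -d : d);
    }
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}

/* e.g. sad_mxn_model(src, ss, ref, rs, 16, 3) models vp9_sad16x3_sse2 */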
--- /dev/null
+++ b/vp9/common/x86/subpixel_8t_ssse3.asm
@@ -1,0 +1,550 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;/************************************************************************************
+; Notes: the filter_block1d*_h8 routines apply an 8 tap filter horizontally to the
+; input pixels, and the *_v8 routines apply it vertically. The input pixel array has
+; output_height rows. Each routine handles 8 (or 16) pixels per row, calculating one
+; row per iteration to take advantage of the 128 bit operations.
+;
+; This is an implementation of some of the SSE optimizations first seen in ffvp8
+;
+;*************************************************************************************/
+
+;void vp9_filter_block1d8_v8_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d8_v8_ssse3)
+sym(vp9_filter_block1d8_v8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm4, [rdx]                 ;load filters
+    movd        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm0, xmm4, 0b              ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5
+
+    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+    add         rax, rdx
+
+    lea         rbx, [rdx + rdx*4]
+    add         rbx, rdx                    ;pitch * 6
+
+.vp9_filter_block1d8_v8_ssse3_loop:
+    movq        xmm0, [rsi]                 ;A
+    movq        xmm1, [rsi + rdx]           ;B
+    movq        xmm2, [rsi + rdx * 2]       ;C
+    movq        xmm3, [rax + rdx * 2]       ;D
+    movq        xmm4, [rsi + rdx * 4]       ;E
+    movq        xmm5, [rax + rdx * 4]       ;F
+
+    punpcklbw   xmm0, xmm1                  ;A B
+    punpcklbw   xmm2, xmm3                  ;C D
+    punpcklbw   xmm4, xmm5                  ;E F
+
+    movq        xmm6, [rsi + rbx]           ;G
+    movq        xmm7, [rax + rbx]           ;H
+
+    pmaddubsw   xmm0, k0k1
+    pmaddubsw   xmm2, k2k3
+    punpcklbw   xmm6, xmm7                  ;G H
+    pmaddubsw   xmm4, k4k5
+    pmaddubsw   xmm6, k6k7
+
+    paddsw      xmm0, xmm2
+    paddsw      xmm0, krd
+    paddsw      xmm4, xmm6
+    paddsw      xmm0, xmm4
+
+    psraw       xmm0, 7
+    packuswb    xmm0, xmm0
+
+    add         rsi,  rdx
+    add         rax,  rdx
+
+    movq        [rdi], xmm0
+
+%if ABI_IS_32BIT
+    add         rdi, DWORD PTR arg(3)       ;out_pitch
+%else
+    add         rdi, r8
+%endif
+    dec         rcx
+    jnz         .vp9_filter_block1d8_v8_ssse3_loop
+
+    add rsp, 16*5
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
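A hedged scalar model of the vertical 8-tap kernel above: an 8-tap convolution with round-to-nearest (add 64, as in krd, then shift right by 7) and byte clamping. Names are illustrative; src points at the first of the eight source rows, as the asm's src_ptr does, and the sketch ignores the asm's intermediate 16-bit saturation (paddsw), which can differ at extreme inputs:

static void filter_block1d8_v8_model(const unsigned char *src,
                                     unsigned int pitch, unsigned char *out,
                                     unsigned int out_pitch,
                                     unsigned int height,
                                     const short *filter) {
  unsigned int h;
  int i, k;
  for (h = 0; h < height; h++) {
    for (i = 0; i < 8; i++) {            /* 8 output pixels per row */
      int sum = 64;                      /* rounding constant (krd) */
      for (k = 0; k < 8; k++)
        sum += filter[k] * src[k * pitch + i];
      sum >>= 7;
      out[i] = (unsigned char)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
    src += pitch;
    out += out_pitch;
  }
}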
+;void vp9_filter_block1d16_v8_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d16_v8_ssse3)
+sym(vp9_filter_block1d16_v8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm4, [rdx]                 ;load filters
+    movd        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm0, xmm4, 0b              ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5
+
+    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+    add         rax, rdx
+
+    lea         rbx, [rdx + rdx*4]
+    add         rbx, rdx                    ;pitch * 6
+
+.vp9_filter_block1d16_v8_ssse3_loop:
+    movq        xmm0, [rsi]                 ;A
+    movq        xmm1, [rsi + rdx]           ;B
+    movq        xmm2, [rsi + rdx * 2]       ;C
+    movq        xmm3, [rax + rdx * 2]       ;D
+    movq        xmm4, [rsi + rdx * 4]       ;E
+    movq        xmm5, [rax + rdx * 4]       ;F
+
+    punpcklbw   xmm0, xmm1                  ;A B
+    punpcklbw   xmm2, xmm3                  ;C D
+    punpcklbw   xmm4, xmm5                  ;E F
+
+    movq        xmm6, [rsi + rbx]           ;G
+    movq        xmm7, [rax + rbx]           ;H
+
+    pmaddubsw   xmm0, k0k1
+    pmaddubsw   xmm2, k2k3
+    punpcklbw   xmm6, xmm7                  ;G H
+    pmaddubsw   xmm4, k4k5
+    pmaddubsw   xmm6, k6k7
+
+    paddsw      xmm0, xmm2
+    paddsw      xmm0, krd
+    paddsw      xmm4, xmm6
+    paddsw      xmm0, xmm4
+
+    psraw       xmm0, 7
+    packuswb    xmm0, xmm0
+
+    movq        [rdi], xmm0
+
+    movq        xmm0, [rsi + 8]             ;A
+    movq        xmm1, [rsi + rdx + 8]       ;B
+    movq        xmm2, [rsi + rdx * 2 + 8]   ;C
+    movq        xmm3, [rax + rdx * 2 + 8]   ;D
+    movq        xmm4, [rsi + rdx * 4 + 8]   ;E
+    movq        xmm5, [rax + rdx * 4 + 8]   ;F
+
+    punpcklbw   xmm0, xmm1                  ;A B
+    punpcklbw   xmm2, xmm3                  ;C D
+    punpcklbw   xmm4, xmm5                  ;E F
+
+
+    movq        xmm6, [rsi + rbx + 8]       ;G
+    movq        xmm7, [rax + rbx + 8]       ;H
+    punpcklbw   xmm6, xmm7                  ;G H
+
+
+    pmaddubsw   xmm0, k0k1
+    pmaddubsw   xmm2, k2k3
+    pmaddubsw   xmm4, k4k5
+    pmaddubsw   xmm6, k6k7
+
+    paddsw      xmm0, xmm2
+    paddsw      xmm4, xmm6
+    paddsw      xmm0, krd
+    paddsw      xmm0, xmm4
+
+    psraw       xmm0, 7
+    packuswb    xmm0, xmm0
+
+    add         rsi,  rdx
+    add         rax,  rdx
+
+    movq        [rdi+8], xmm0
+
+%if ABI_IS_32BIT
+    add         rdi, DWORD PTR arg(3)       ;out_pitch
+%else
+    add         rdi, r8
+%endif
+    dec         rcx
+    jnz         .vp9_filter_block1d16_v8_ssse3_loop
+
+    add rsp, 16*5
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
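+; For reference, a scalar C sketch of what the vertical 8-tap loops above
+; compute per output byte (names here are illustrative, and the plain int
+; accumulator ignores the 16-bit saturation that paddsw applies):
+;    static unsigned char filter8_v(const unsigned char *src, int pitch,
+;                                   const signed char k[8]) {
+;        int i, sum = 64;                   /* the krd rounding constant */
+;        for (i = 0; i < 8; i++)
+;            sum += src[i * pitch] * k[i];  /* pmaddubsw pairs + paddsw  */
+;        sum >>= 7;                         /* psraw xmm0, 7             */
+;        return sum < 0 ? 0 : sum > 255 ? 255 : (unsigned char)sum; /* packuswb */
+;    }
+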
+;void vp9_filter_block1d8_h8_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d8_h8_ssse3)
+sym(vp9_filter_block1d8_h8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x00400040             ;rounding: two words of 0x0040 (64)
+
+    movdqa      xmm4, [rdx]                 ;load filters
+    movd        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm0, xmm4, 0b              ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+;    movdqa      krd, xmm5                  ; not stored here: xmm5 stays live and is used directly as the round value below
+
+    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
+    movsxd      rdx, dword ptr arg(3)       ;output_pitch
+    movsxd      rcx, dword ptr arg(4)       ;output_height
+
+.filter_block1d8_h8_rowloop_ssse3:
+    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
+
+;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11
+    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
+;note: if we create a k0_k7 filter, we can save a pshufb
+;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
+    punpcklqdq  xmm0,   xmm3
+
+    movdqa      xmm1,   xmm0
+    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
+    pmaddubsw   xmm0,   k0k1
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
+    pmaddubsw   xmm1,   k2k3
+
+    movdqa      xmm4,   xmm2
+    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
+    pmaddubsw   xmm2,   k4k5
+
+    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
+    pmaddubsw   xmm4,   k6k7
+
+    paddsw      xmm0,   xmm1
+    paddsw      xmm0,   xmm2
+    paddsw      xmm0,   xmm5
+    paddsw      xmm0,   xmm4
+    psraw       xmm0,   7
+    packuswb    xmm0,   xmm0
+
+    lea         rsi,    [rsi + rax]
+    movq        [rdi],  xmm0
+
+    lea         rdi,    [rdi + rdx]
+    dec         rcx
+    jnz         .filter_block1d8_h8_rowloop_ssse3
+
+    add rsp, 16*5
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d16_h8_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d16_h8_ssse3)
+sym(vp9_filter_block1d16_h8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16*5
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    %define krd [rsp + 16*4]
+
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x00400040             ;rounding: two words of 0x0040 (64)
+
+    movdqa      xmm4, [rdx]                 ;load filters
+    movd        xmm5, rcx
+    packsswb    xmm4, xmm4
+    pshuflw     xmm0, xmm4, 0b              ;k0_k1
+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
+
+    punpcklqdq  xmm0, xmm0
+    punpcklqdq  xmm1, xmm1
+    punpcklqdq  xmm2, xmm2
+    punpcklqdq  xmm3, xmm3
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm1
+    pshufd      xmm5, xmm5, 0
+    movdqa      k4k5, xmm2
+    movdqa      k6k7, xmm3
+    movdqa      krd, xmm5
+
+    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
+    movsxd      rdx, dword ptr arg(3)       ;output_pitch
+    movsxd      rcx, dword ptr arg(4)       ;output_height
+
+.filter_block1d16_h8_rowloop_ssse3:
+    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4
+
+;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11
+    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12
+;note: if we create a k0_k7 filter, we can save a pshufb
+;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
+    punpcklqdq  xmm0,   xmm3
+
+    movdqa      xmm1,   xmm0
+    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
+    pmaddubsw   xmm0,   k0k1
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
+    pmaddubsw   xmm1,   k2k3
+
+    movdqa      xmm4,   xmm2
+    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
+    pmaddubsw   xmm2,   k4k5
+
+    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
+    pmaddubsw   xmm4,   k6k7
+
+    paddsw      xmm0,   xmm1
+    paddsw      xmm0,   xmm4
+    paddsw      xmm0,   xmm2
+    paddsw      xmm0,   krd
+    psraw       xmm0,   7
+    packuswb    xmm0,   xmm0
+
+
+    movq        xmm3,   [rsi +  5]
+;    movq        xmm7,   [rsi + 12]
+    movq        xmm7,   [rsi + 13]
+;note: same as above
+;    punpcklbw   xmm3,   xmm7
+    punpcklqdq  xmm3,   xmm7
+
+    movdqa      xmm1,   xmm3
+    pshufb      xmm3,   [GLOBAL(shuf_t0t1)]
+    pmaddubsw   xmm3,   k0k1
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
+    pmaddubsw   xmm1,   k2k3
+
+    movdqa      xmm4,   xmm2
+    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
+    pmaddubsw   xmm2,   k4k5
+
+    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]
+    pmaddubsw   xmm4,   k6k7
+
+    paddsw      xmm3,   xmm1
+    paddsw      xmm3,   xmm2
+    paddsw      xmm3,   krd
+    paddsw      xmm3,   xmm4
+    psraw       xmm3,   7
+    packuswb    xmm3,   xmm3
+    punpcklqdq  xmm0,   xmm3
+
+    lea         rsi,    [rsi + rax]
+    movdqa      [rdi],  xmm0
+
+    lea         rdi,    [rdi + rdx]
+    dec         rcx
+    jnz         .filter_block1d16_h8_rowloop_ssse3
+
+    add rsp, 16*5
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+shuf_t0t1:
+    db  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+align 16
+shuf_t2t3:
+    db  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+align 16
+shuf_t4t5:
+    db  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+align 16
+shuf_t6t7:
+    db  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
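+
+; The four shuffle tables above place each source byte next to its right
+; neighbour so that a single pshufb + pmaddubsw applies two adjacent taps at
+; once: shuf_t0t1 builds the (x-3, x-2) pairs for k0/k1, shuf_t2t3 the
+; (x-1, x) pairs for k2/k3, and so on.  A scalar sketch of the horizontal
+; filter this implements (illustrative names, ignoring 16-bit saturation):
+;    static unsigned char filter8_h(const unsigned char *s,
+;                                   const signed char k[8]) {
+;        int i, sum = 64;                   /* krd */
+;        for (i = 0; i < 8; i++)
+;            sum += s[i - 3] * k[i];
+;        sum >>= 7;
+;        return sum < 0 ? 0 : sum > 255 ? 255 : (unsigned char)sum;
+;    }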
--- /dev/null
+++ b/vp9/common/x86/subpixel_mmx.asm
@@ -1,0 +1,727 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define vp9_filter_weight 128
+%define VP9_FILTER_SHIFT  7
+
+
+;void vp9_filter_block1d_h6_mmx
+;(
+;    unsigned char   *src_ptr,
+;    unsigned short  *output_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned int    pixel_step,
+;    unsigned int    output_height,
+;    unsigned int    output_width,
+;    short           * vp9_filter
+;)
+global sym(vp9_filter_block1d_h6_mmx)
+sym(vp9_filter_block1d_h6_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rdx,    arg(6) ;vp9_filter
+
+        movq        mm1,    [rdx + 16]             ; do both the negative taps first!!!
+        movq        mm2,    [rdx + 32]         ;
+        movq        mm6,    [rdx + 48]        ;
+        movq        mm7,    [rdx + 64]        ;
+
+        mov         rdi,    arg(1) ;output_ptr
+        mov         rsi,    arg(0) ;src_ptr
+        movsxd      rcx,    dword ptr arg(4) ;output_height
+        movsxd      rax,    dword ptr arg(5) ;output_width      ; destination pitch?
+        pxor        mm0,    mm0              ; mm0 = 00000000
+
+.nextrow:
+        movq        mm3,    [rsi-2]          ; mm3 = p-2..p5
+        movq        mm4,    mm3              ; mm4 = p-2..p5
+        psrlq       mm3,    8                ; mm3 = p-1..p5
+        punpcklbw   mm3,    mm0              ; mm3 = p-1..p2
+        pmullw      mm3,    mm1              ; mm3 *= kernel 1 modifiers.
+
+        movq        mm5,    mm4              ; mm5 = p-2..p5
+        punpckhbw   mm4,    mm0              ; mm4 = p2..p5
+        pmullw      mm4,    mm7              ; mm4 *= kernel 4 modifiers
+        paddsw      mm3,    mm4              ; mm3 += mm4
+
+        movq        mm4,    mm5              ; mm4 = p-2..p5;
+        psrlq       mm5,    16               ; mm5 = p0..p5;
+        punpcklbw   mm5,    mm0              ; mm5 = p0..p3
+        pmullw      mm5,    mm2              ; mm5 *= kernel 2 modifiers
+        paddsw      mm3,    mm5              ; mm3 += mm5
+
+        movq        mm5,    mm4              ; mm5 = p-2..p5
+        psrlq       mm4,    24               ; mm4 = p1..p5
+        punpcklbw   mm4,    mm0              ; mm4 = p1..p4
+        pmullw      mm4,    mm6              ; mm4 *= kernel 3 modifiers
+        paddsw      mm3,    mm4              ; mm3 += mm4
+
+        ; do outer positive taps
+        movd        mm4,    [rsi+3]
+        punpcklbw   mm4,    mm0              ; mm4 = p3..p6
+        pmullw      mm4,    [rdx+80]         ; mm4 *= kernel 5 modifiers
+        paddsw      mm3,    mm4              ; mm3 += mm4
+
+        punpcklbw   mm5,    mm0              ; mm5 = p-2..p1
+        pmullw      mm5,    [rdx]            ; mm5 *= kernel 0 modifiers
+        paddsw      mm3,    mm5              ; mm3 += mm5
+
+        paddsw      mm3,    [GLOBAL(rd)]              ; mm3 += round value
+        psraw       mm3,    VP9_FILTER_SHIFT     ; mm3 /= 128
+        packuswb    mm3,    mm0              ; pack and unpack to saturate
+        punpcklbw   mm3,    mm0              ;
+
+        movq        [rdi],  mm3              ; store the results in the destination
+
+%if ABI_IS_32BIT
+        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line
+        add         rdi,    rax;
+%else
+        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
+        add         rdi,    rax;
+
+        add         rsi,    r8               ; next line
+%endif
+
+        dec         rcx                      ; decrement count
+        jnz         .nextrow                 ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
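+; A first-pass reference for the routine above (a sketch with illustrative
+; names): the 6-tap kernel runs over src[-2..3], is rounded, clamped to
+; 0..255 by the packuswb/punpcklbw pair, and stored as 16 bits per pixel for
+; the second pass.  Each tap in vp9_six_tap_mmx is replicated 8 times, hence
+; the stride of 8 words between taps:
+;    static unsigned short filter6_h(const unsigned char *s, const short *k) {
+;        int i, sum = 64;                   /* GLOBAL(rd)       */
+;        for (i = 0; i < 6; i++)
+;            sum += s[i - 2] * k[i * 8];
+;        sum >>= 7;                         /* VP9_FILTER_SHIFT */
+;        return sum < 0 ? 0 : sum > 255 ? 255 : (unsigned short)sum;
+;    }
+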
+;void vp9_filter_block1dc_v6_mmx
+;(
+;   short *src_ptr,
+;   unsigned char *output_ptr,
+;    int output_pitch,
+;   unsigned int pixels_per_line,
+;   unsigned int pixel_step,
+;   unsigned int output_height,
+;   unsigned int output_width,
+;   short * vp9_filter
+;)
+global sym(vp9_filter_block1dc_v6_mmx)
+sym(vp9_filter_block1dc_v6_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        movq      mm5, [GLOBAL(rd)]
+        push        rbx
+        mov         rbx, arg(7) ;vp9_filter
+        movq      mm1, [rbx + 16]             ; do both the negative taps first!!!
+        movq      mm2, [rbx + 32]         ;
+        movq      mm6, [rbx + 48]        ;
+        movq      mm7, [rbx + 64]        ;
+
+        movsxd      rdx, dword ptr arg(3) ;pixels_per_line
+        mov         rdi, arg(1) ;output_ptr
+        mov         rsi, arg(0) ;src_ptr
+        sub         rsi, rdx
+        sub         rsi, rdx
+        movsxd      rcx, DWORD PTR arg(5) ;output_height
+        movsxd      rax, DWORD PTR arg(2) ;output_pitch      ; destination pitch?
+        pxor        mm0, mm0              ; mm0 = 00000000
+
+
+.nextrow_cv:
+        movq        mm3, [rsi+rdx]        ; mm3 = p0..p3  = row -1
+        pmullw      mm3, mm1              ; mm3 *= kernel 1 modifiers.
+
+
+        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 2
+        pmullw      mm4, mm7              ; mm4 *= kernel 4 modifiers.
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+        movq        mm4, [rsi + 2*rdx]           ; mm4 = p0..p3  = row 0
+        pmullw      mm4, mm2              ; mm4 *= kernel 2 modifiers.
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+        movq        mm4, [rsi]            ; mm4 = p0..p3  = row -2
+        pmullw      mm4, [rbx]            ; mm4 *= kernel 0 modifiers.
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+
+        add         rsi, rdx              ; move source forward 1 line to avoid 3 * pitch
+        movq        mm4, [rsi + 2*rdx]     ; mm4 = p0..p3  = row 1
+        pmullw      mm4, mm6              ; mm4 *= kernel 3 modifiers.
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 3
+        pmullw      mm4, [rbx +80]        ; mm4 *= kernel 5 modifiers.
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+
+        paddsw      mm3, mm5               ; mm3 += round value
+        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
+        packuswb    mm3, mm0              ; pack and saturate
+
+        movd        [rdi],mm3             ; store the results in the destination
+        ; the subsequent iterations repeat 3 out of 4 of these reads.  Since the
+        ; recon block should be in cache, this shouldn't cost much.  It's obviously
+        ; avoidable!!!
+        lea         rdi,  [rdi+rax] ;
+        dec         rcx                   ; decrement count
+        jnz         .nextrow_cv           ; next row
+
+        pop         rbx
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
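+; A second-pass reference for the routine above (again a sketch): six rows of
+; the 16-bit first-pass output are combined with the vertical kernel and
+; packed back down to bytes; pitch here is in 16-bit elements:
+;    static unsigned char filter6_v(const short *s, int pitch, const short *k) {
+;        int i, sum = 64;
+;        for (i = 0; i < 6; i++)
+;            sum += s[i * pitch] * k[i * 8];
+;        sum >>= 7;
+;        return sum < 0 ? 0 : sum > 255 ? 255 : (unsigned char)sum;
+;    }
+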
+;void vp9_bilinear_predict8x8_mmx
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+global sym(vp9_bilinear_predict8x8_mmx)
+sym(vp9_bilinear_predict8x8_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ;const short *HFilter = bilinear_filters_mmx[xoffset];
+    ;const short *VFilter = bilinear_filters_mmx[yoffset];
+
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+        mov         rdi,        arg(4) ;dst_ptr           ;
+
+        shl         rax,        5 ; offset * 32
+        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
+
+        add         rax,        rcx ; HFilter
+        mov         rsi,        arg(0) ;src_ptr              ;
+
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+        movq        mm1,        [rax]               ;
+
+        movq        mm2,        [rax+16]            ;
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+
+        pxor        mm0,        mm0                 ;
+
+        shl         rax,        5 ; offset*32
+        add         rax,        rcx ; VFilter
+
+        lea         rcx,        [rdi+rdx*8]          ;
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
+
+
+
+        ; get the first horizontal line done       ;
+        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movq        mm4,        mm3                 ; make a copy of current line
+
+        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   mm4,        mm0                 ;
+
+        pmullw      mm3,        mm1                 ;
+        pmullw      mm4,        mm1                 ;
+
+        movq        mm5,        [rsi+1]             ;
+        movq        mm6,        mm5                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0                 ;
+
+        pmullw      mm5,        mm2                 ;
+        pmullw      mm6,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ;
+        psraw       mm4,        VP9_FILTER_SHIFT        ;
+
+        movq        mm7,        mm3                 ;
+        packuswb    mm7,        mm4                 ;
+
+        add         rsi,        rdx                 ; next line
+.next_row_8x8:
+        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movq        mm4,        mm3                 ; make a copy of current line
+
+        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   mm4,        mm0                 ;
+
+        pmullw      mm3,        mm1                 ;
+        pmullw      mm4,        mm1                 ;
+
+        movq        mm5,        [rsi+1]             ;
+        movq        mm6,        mm5                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0                 ;
+
+        pmullw      mm5,        mm2                 ;
+        pmullw      mm6,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+        movq        mm5,        mm7                 ;
+        movq        mm6,        mm7                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0
+
+        pmullw      mm5,        [rax]               ;
+        pmullw      mm6,        [rax]               ;
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ;
+        psraw       mm4,        VP9_FILTER_SHIFT        ;
+
+        movq        mm7,        mm3                 ;
+        packuswb    mm7,        mm4                 ;
+
+
+        pmullw      mm3,        [rax+16]            ;
+        pmullw      mm4,        [rax+16]            ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ;
+        psraw       mm4,        VP9_FILTER_SHIFT        ;
+
+        packuswb    mm3,        mm4
+
+        movq        [rdi],      mm3                 ; store the results in the destination
+
+%if ABI_IS_32BIT
+        add         rsi,        rdx                 ; next line
+        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
+%else
+        movsxd      r8,         dword ptr arg(5) ;dst_pitch
+        add         rsi,        rdx                 ; next line
+        add         rdi,        r8                  ;dst_pitch
+%endif
+        cmp         rdi,        rcx                 ;
+        jne         .next_row_8x8
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
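+; A bilinear reference for the routine above (a sketch, illustrative names):
+; two 2-tap passes whose weight pairs each sum to 128.  The previous filtered
+; row is carried packed in mm7, so every source row is filtered horizontally
+; only once.  The second weight lives 16 bytes (8 words) after the first:
+;    static unsigned char bilin(unsigned char a, unsigned char b,
+;                               const short *f) {
+;        return (unsigned char)((a * f[0] + b * f[8] + 64) >> 7);
+;    }
+;    /* first pass:  h[x]   = bilin(src[x], src[x + 1], HFilter);
+;       second pass: out[x] = bilin(prev_h[x], h[x], VFilter);   */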
+
+;void vp9_bilinear_predict8x4_mmx
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+global sym(vp9_bilinear_predict8x4_mmx)
+sym(vp9_bilinear_predict8x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ;const short *HFilter = bilinear_filters_mmx[xoffset];
+    ;const short *VFilter = bilinear_filters_mmx[yoffset];
+
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+        mov         rdi,        arg(4) ;dst_ptr           ;
+
+        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
+        shl         rax,        5
+
+        mov         rsi,        arg(0) ;src_ptr              ;
+        add         rax,        rcx
+
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+        movq        mm1,        [rax]               ;
+
+        movq        mm2,        [rax+16]            ;
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+
+        pxor        mm0,        mm0                 ;
+        shl         rax,        5
+
+        add         rax,        rcx
+        lea         rcx,        [rdi+rdx*4]          ;
+
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
+
+        ; get the first horizontal line done       ;
+        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movq        mm4,        mm3                 ; make a copy of current line
+
+        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   mm4,        mm0                 ;
+
+        pmullw      mm3,        mm1                 ;
+        pmullw      mm4,        mm1                 ;
+
+        movq        mm5,        [rsi+1]             ;
+        movq        mm6,        mm5                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0                 ;
+
+        pmullw      mm5,        mm2                 ;
+        pmullw      mm6,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ;
+        psraw       mm4,        VP9_FILTER_SHIFT        ;
+
+        movq        mm7,        mm3                 ;
+        packuswb    mm7,        mm4                 ;
+
+        add         rsi,        rdx                 ; next line
+.next_row_8x4:
+        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movq        mm4,        mm3                 ; make a copy of current line
+
+        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   mm4,        mm0                 ;
+
+        pmullw      mm3,        mm1                 ;
+        pmullw      mm4,        mm1                 ;
+
+        movq        mm5,        [rsi+1]             ;
+        movq        mm6,        mm5                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0                 ;
+
+        pmullw      mm5,        mm2                 ;
+        pmullw      mm6,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+        movq        mm5,        mm7                 ;
+        movq        mm6,        mm7                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0
+
+        pmullw      mm5,        [rax]               ;
+        pmullw      mm6,        [rax]               ;
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ;
+        psraw       mm4,        VP9_FILTER_SHIFT        ;
+
+        movq        mm7,        mm3                 ;
+        packuswb    mm7,        mm4                 ;
+
+
+        pmullw      mm3,        [rax+16]            ;
+        pmullw      mm4,        [rax+16]            ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ;
+        psraw       mm4,        VP9_FILTER_SHIFT        ;
+
+        packuswb    mm3,        mm4
+
+        movq        [rdi],      mm3                 ; store the results in the destination
+
+%if ABI_IS_32BIT
+        add         rsi,        rdx                 ; next line
+        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
+%else
+        movsxd      r8,         dword ptr arg(5) ;dst_pitch
+        add         rsi,        rdx                 ; next line
+        add         rdi,        r8
+%endif
+        cmp         rdi,        rcx                 ;
+        jne         .next_row_8x4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_bilinear_predict4x4_mmx
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+global sym(vp9_bilinear_predict4x4_mmx)
+sym(vp9_bilinear_predict4x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ;const short *HFilter = bilinear_filters_mmx[xoffset];
+    ;const short *VFilter = bilinear_filters_mmx[yoffset];
+
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+        mov         rdi,        arg(4) ;dst_ptr           ;
+
+        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
+        shl         rax,        5
+
+        add         rax,        rcx ; HFilter
+        mov         rsi,        arg(0) ;src_ptr              ;
+
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+        movq        mm1,        [rax]               ;
+
+        movq        mm2,        [rax+16]            ;
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+
+        pxor        mm0,        mm0                 ;
+        shl         rax,        5
+
+        add         rax,        rcx
+        lea         rcx,        [rdi+rdx*4]          ;
+
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
+
+        ; get the first horizontal line done       ;
+        movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
+
+        pmullw      mm3,        mm1                 ;
+        movd        mm5,        [rsi+1]             ;
+
+        punpcklbw   mm5,        mm0                 ;
+        pmullw      mm5,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+
+        movq        mm7,        mm3                 ;
+        packuswb    mm7,        mm0                 ;
+
+        add         rsi,        rdx                 ; next line
+.next_row_4x4:
+        movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
+
+        pmullw      mm3,        mm1                 ;
+        movd        mm5,        [rsi+1]             ;
+
+        punpcklbw   mm5,        mm0                 ;
+        pmullw      mm5,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+
+        movq        mm5,        mm7                 ;
+        punpcklbw   mm5,        mm0                 ;
+
+        pmullw      mm5,        [rax]               ;
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+        movq        mm7,        mm3                 ;
+
+        packuswb    mm7,        mm0                 ;
+
+        pmullw      mm3,        [rax+16]            ;
+        paddw       mm3,        mm5                 ;
+
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP9_FILTER_SHIFT        ; mm3 /= 128
+
+        packuswb    mm3,        mm0
+        movd        [rdi],      mm3                 ; store the results in the destination
+
+%if ABI_IS_32BIT
+        add         rsi,        rdx                 ; next line
+        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
+%else
+        movsxd      r8,         dword ptr arg(5) ;dst_pitch                   ;
+        add         rsi,        rdx                 ; next line
+        add         rdi,        r8
+%endif
+
+        cmp         rdi,        rcx                 ;
+        jne         .next_row_4x4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+SECTION_RODATA
+align 16
+rd:
+    times 4 dw 0x40
+
+align 16
+global HIDDEN_DATA(sym(vp9_six_tap_mmx))
+sym(vp9_six_tap_mmx):
+    times 8 dw 0
+    times 8 dw 0
+    times 8 dw 128
+    times 8 dw 0
+    times 8 dw 0
+    times 8 dw 0
+
+    times 8 dw 0
+    times 8 dw -6
+    times 8 dw 123
+    times 8 dw 12
+    times 8 dw -1
+    times 8 dw 0
+
+    times 8 dw 2
+    times 8 dw -11
+    times 8 dw 108
+    times 8 dw 36
+    times 8 dw -8
+    times 8 dw 1
+
+    times 8 dw 0
+    times 8 dw -9
+    times 8 dw 93
+    times 8 dw 50
+    times 8 dw -6
+    times 8 dw 0
+
+    times 8 dw 3
+    times 8 dw -16
+    times 8 dw 77
+    times 8 dw 77
+    times 8 dw -16
+    times 8 dw 3
+
+    times 8 dw 0
+    times 8 dw -6
+    times 8 dw 50
+    times 8 dw 93
+    times 8 dw -9
+    times 8 dw 0
+
+    times 8 dw 1
+    times 8 dw -8
+    times 8 dw 36
+    times 8 dw 108
+    times 8 dw -11
+    times 8 dw 2
+
+    times 8 dw 0
+    times 8 dw -1
+    times 8 dw 12
+    times 8 dw 123
+    times 8 dw -6
+    times 8 dw 0
+
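+; Each 6-tap row above sums to 128 (the filter weight), so the filters are
+; unity-gain; e.g. for the half-pel entry: 3 - 16 + 77 + 77 - 16 + 3 = 128.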
+
+align 16
+global HIDDEN_DATA(sym(vp9_bilinear_filters_8x_mmx))
+sym(vp9_bilinear_filters_8x_mmx):
+    times 8 dw 128
+    times 8 dw 0
+
+    times 8 dw 112
+    times 8 dw 16
+
+    times 8 dw 96
+    times 8 dw 32
+
+    times 8 dw 80
+    times 8 dw 48
+
+    times 8 dw 64
+    times 8 dw 64
+
+    times 8 dw 48
+    times 8 dw 80
+
+    times 8 dw 32
+    times 8 dw 96
+
+    times 8 dw 16
+    times 8 dw 112
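+
+; As with the six-tap set, each bilinear pair above sums to 128, so both
+; passes are unity-gain weighted averages; e.g. xoffset 2 selects the
+; (96, 32) pair, giving out = (96 * a + 32 * b + 64) >> 7.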
--- /dev/null
+++ b/vp9/common/x86/subpixel_sse2.asm
@@ -1,0 +1,1372 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP9_FILTER_WEIGHT 128
+%define VP9_FILTER_SHIFT  7
+
+
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
+; row per iteration to take advantage of the 128-bit operations.
+;*************************************************************************************/
+;void vp9_filter_block1d8_h6_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned short *output_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned int    pixel_step,
+;    unsigned int    output_height,
+;    unsigned int    output_width,
+;    short           *vp9_filter
+;)
+global sym(vp9_filter_block1d8_h6_sse2)
+sym(vp9_filter_block1d8_h6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rdx,        arg(6) ;vp9_filter
+        mov         rsi,        arg(0) ;src_ptr
+
+        mov         rdi,        arg(1) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height
+        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(5) ;output_width
+%endif
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+
+.filter_block1d8_h6_rowloop:
+        movq        xmm3,       MMWORD PTR [rsi - 2]
+        movq        xmm1,       MMWORD PTR [rsi + 6]
+
+        prefetcht2  [rsi+rax-2]
+
+        pslldq      xmm1,       8
+        por         xmm1,       xmm3
+
+        movdqa      xmm4,       xmm1
+        movdqa      xmm5,       xmm1
+
+        movdqa      xmm6,       xmm1
+        movdqa      xmm7,       xmm1
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm1
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0
+        punpcklbw   xmm4,       xmm0
+
+        movdqa      XMMWORD Ptr [rdi],         xmm4
+        lea         rsi,        [rsi + rax]
+
+%if ABI_IS_32BIT
+        add         rdi,        DWORD Ptr arg(5) ;[output_width]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx
+
+        jnz         .filter_block1d8_h6_rowloop                ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
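+; Unlike the SSSE3 path, which gathers tap phases with pshufb, this
+; pre-SSSE3 routine derives the six phases by byte-shifting one wide row.
+; A sketch of the idea with SSE2 intrinsics (illustrative only; src is an
+; assumed unsigned char pointer):
+;    #include <emmintrin.h>
+;    __m128i row  = _mm_loadu_si128((const __m128i *)(src - 2));
+;    __m128i zero = _mm_setzero_si128();
+;    __m128i p0   = _mm_unpacklo_epi8(row, zero);                     /* x-2 */
+;    __m128i p1   = _mm_unpacklo_epi8(_mm_srli_si128(row, 1), zero);  /* x-1 */
+;    /* shifts 2..5 produce the phases for taps x, x+1, x+2 and x+3 */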
+
+;void vp9_filter_block1d16_h6_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned short *output_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned int    pixel_step,
+;    unsigned int    output_height,
+;    unsigned int    output_width,
+;    short           *vp9_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 16 pixels in the horizontal direction, calculating ONE
+; row per iteration to take advantage of the 128-bit operations.
+;*************************************************************************************/
+global sym(vp9_filter_block1d16_h6_sse2)
+sym(vp9_filter_block1d16_h6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rdx,        arg(6) ;vp9_filter
+        mov         rsi,        arg(0) ;src_ptr
+
+        mov         rdi,        arg(1) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height
+        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(5) ;output_width
+%endif
+
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+
+.filter_block1d16_h6_sse2_rowloop:
+        movq        xmm3,       MMWORD PTR [rsi - 2]
+        movq        xmm1,       MMWORD PTR [rsi + 6]
+
+        movq        xmm2,       MMWORD PTR [rsi +14]
+        pslldq      xmm2,       8
+
+        por         xmm2,       xmm1
+        prefetcht2  [rsi+rax-2]
+
+        pslldq      xmm1,       8
+        por         xmm1,       xmm3
+
+        movdqa      xmm4,       xmm1
+        movdqa      xmm5,       xmm1
+
+        movdqa      xmm6,       xmm1
+        movdqa      xmm7,       xmm1
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm1
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0
+        punpcklbw   xmm4,       xmm0
+
+        movdqa      XMMWORD Ptr [rdi],         xmm4
+
+        movdqa      xmm3,       xmm2
+        movdqa      xmm4,       xmm2
+
+        movdqa      xmm5,       xmm2
+        movdqa      xmm6,       xmm2
+
+        movdqa      xmm7,       xmm2
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm2
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0
+        punpcklbw   xmm4,       xmm0
+
+        movdqa      XMMWORD Ptr [rdi+16],      xmm4
+
+        lea         rsi,        [rsi + rax]
+%if ABI_IS_32BIT
+        add         rdi,        DWORD Ptr arg(5) ;[output_width]
+%else
+        add         rdi,        r8
+%endif
+
+        dec         rcx
+        jnz         .filter_block1d16_h6_sse2_rowloop                ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_filter_block1d8_v6_sse2
+;(
+;    short *src_ptr,
+;    unsigned char *output_ptr,
+;    int dst_pitch,
+;    unsigned int pixels_per_line,
+;    unsigned int pixel_step,
+;    unsigned int output_height,
+;    unsigned int output_width,
+;    short * vp9_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
+; input pixel array has output_height rows.
+;*************************************************************************************/
+global sym(vp9_filter_block1d8_v6_sse2)
+sym(vp9_filter_block1d8_v6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rax,        arg(7) ;vp9_filter
+        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
+
+        mov         rdi,        arg(1) ;output_ptr
+        mov         rsi,        arg(0) ;src_ptr
+
+        sub         rsi,        rdx
+        sub         rsi,        rdx
+
+        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
+        pxor        xmm0,       xmm0                        ; clear xmm0
+
+        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp9_filter_block1d8_v6_sse2_loop:
+        movdqa      xmm1,       XMMWORD PTR [rsi]
+        pmullw      xmm1,       [rax]
+
+        movdqa      xmm2,       XMMWORD PTR [rsi + rdx]
+        pmullw      xmm2,       [rax + 16]
+
+        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]
+        pmullw      xmm3,       [rax + 32]
+
+        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]
+        pmullw      xmm5,       [rax + 64]
+
+        add         rsi,        rdx
+        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2]
+
+        pmullw      xmm4,       [rax + 48]
+        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4]
+
+        pmullw      xmm6,       [rax + 80]
+
+        paddsw      xmm2,       xmm5
+        paddsw      xmm2,       xmm3
+
+        paddsw      xmm2,       xmm1
+        paddsw      xmm2,       xmm4
+
+        paddsw      xmm2,       xmm6
+        paddsw      xmm2,       xmm7
+
+        psraw       xmm2,       7
+        packuswb    xmm2,       xmm0              ; pack and saturate
+
+        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx         ; decrement count
+        jnz         .vp9_filter_block1d8_v6_sse2_loop               ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_filter_block1d16_v6_sse2
+;(
+;    unsigned short *src_ptr,
+;    unsigned char *output_ptr,
+;    int dst_pitch,
+;    unsigned int pixels_per_line,
+;    unsigned int pixel_step,
+;    unsigned int output_height,
+;    unsigned int output_width,
+;    const short    *vp9_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
+; input pixel array has output_height rows.
+;*************************************************************************************/
+global sym(vp9_filter_block1d16_v6_sse2)
+sym(vp9_filter_block1d16_v6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rax,        arg(7) ;vp9_filter
+        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
+
+        mov         rdi,        arg(1) ;output_ptr
+        mov         rsi,        arg(0) ;src_ptr
+
+        sub         rsi,        rdx
+        sub         rsi,        rdx
+
+        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp9_filter_block1d16_v6_sse2_loop:
+; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
+        movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
+        movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
+        pmullw      xmm1,       [rax + 16]
+        pmullw      xmm2,       [rax + 16]
+
+        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
+        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
+        pmullw      xmm3,       [rax + 64]
+        pmullw      xmm4,       [rax + 64]
+
+        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
+        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
+        pmullw      xmm5,       [rax + 32]
+        pmullw      xmm6,       [rax + 32]
+
+        movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
+        movdqa      xmm0,       XMMWORD PTR [rsi + 16]
+        pmullw      xmm7,       [rax]
+        pmullw      xmm0,       [rax]
+
+        paddsw      xmm1,       xmm3
+        paddsw      xmm2,       xmm4
+        paddsw      xmm1,       xmm5
+        paddsw      xmm2,       xmm6
+        paddsw      xmm1,       xmm7
+        paddsw      xmm2,       xmm0
+
+        add         rsi,        rdx
+
+        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
+        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
+        pmullw      xmm3,       [rax + 48]
+        pmullw      xmm4,       [rax + 48]
+
+        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
+        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
+        pmullw      xmm5,       [rax + 80]
+        pmullw      xmm6,       [rax + 80]
+
+        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
+        pxor        xmm0,       xmm0                        ; clear xmm0
+
+        paddsw      xmm1,       xmm3
+        paddsw      xmm2,       xmm4
+        paddsw      xmm1,       xmm5
+        paddsw      xmm2,       xmm6
+
+        paddsw      xmm1,       xmm7
+        paddsw      xmm2,       xmm7
+
+        psraw       xmm1,       7
+        psraw       xmm2,       7
+
+        packuswb    xmm1,       xmm2              ; pack and saturate
+        movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx         ; decrement count
+        jnz         .vp9_filter_block1d16_v6_sse2_loop              ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_filter_block1d8_h6_only_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    int dst_pitch,
+;    unsigned int    output_height,
+;    const short    *vp9_filter
+;)
+; First-pass filter only when yoffset==0
+global sym(vp9_filter_block1d8_h6_only_sse2)
+sym(vp9_filter_block1d8_h6_only_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rdx,        arg(5) ;vp9_filter
+        mov         rsi,        arg(0) ;src_ptr
+
+        mov         rdi,        arg(2) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height
+        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(3) ;dst_pitch
+%endif
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+
+.filter_block1d8_h6_only_rowloop:
+        movq        xmm3,       MMWORD PTR [rsi - 2]
+        movq        xmm1,       MMWORD PTR [rsi + 6]
+
+        prefetcht2  [rsi+rax-2]
+
+        pslldq      xmm1,       8
+        por         xmm1,       xmm3
+
+        movdqa      xmm4,       xmm1
+        movdqa      xmm5,       xmm1
+
+        movdqa      xmm6,       xmm1
+        movdqa      xmm7,       xmm1
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm1
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0
+
+        movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
+        lea         rsi,        [rsi + rax]
+
+%if ABI_IS_32BIT
+        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx
+
+        jnz         .filter_block1d8_h6_only_rowloop               ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_filter_block1d16_h6_only_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    int dst_pitch,
+;    unsigned int    output_height,
+;    const short    *vp9_filter
+;)
+; First-pass filter only when yoffset==0
+global sym(vp9_filter_block1d16_h6_only_sse2)
+sym(vp9_filter_block1d16_h6_only_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rdx,        arg(5) ;vp9_filter
+        mov         rsi,        arg(0) ;src_ptr
+
+        mov         rdi,        arg(2) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height
+        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(3) ;dst_pitch
+%endif
+
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+
+.filter_block1d16_h6_only_sse2_rowloop:
+        movq        xmm3,       MMWORD PTR [rsi - 2]
+        movq        xmm1,       MMWORD PTR [rsi + 6]
+
+        movq        xmm2,       MMWORD PTR [rsi +14]
+        pslldq      xmm2,       8
+
+        por         xmm2,       xmm1
+        prefetcht2  [rsi+rax-2]
+
+        pslldq      xmm1,       8
+        por         xmm1,       xmm3
+
+        movdqa      xmm4,       xmm1
+        movdqa      xmm5,       xmm1
+
+        movdqa      xmm6,       xmm1
+        movdqa      xmm7,       xmm1
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm1
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0                        ; lower 8 bytes
+
+        movq        QWORD PTR [rdi],         xmm4           ; store the results in the destination
+
+        movdqa      xmm3,       xmm2
+        movdqa      xmm4,       xmm2
+
+        movdqa      xmm5,       xmm2
+        movdqa      xmm6,       xmm2
+
+        movdqa      xmm7,       xmm2
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm2
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0                        ; higher 8 bytes
+
+        movq        QWORD PTR [rdi+8],      xmm4            ; store the results in the destination
+
+        lea         rsi,        [rsi + rax]
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(3) ;dst_pitch
+%else
+        add         rdi,        r8
+%endif
+
+        dec         rcx
+        jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_filter_block1d8_v6_only_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char *output_ptr,
+;    int dst_pitch,
+;    unsigned int output_height,
+;    const short    *vp9_filter
+;)
+; Second-pass filter only when xoffset==0
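+;
+; A scalar sketch of this pass (p = src_pixels_per_line; tap k's 8-word row
+; sits at [rax + k*16]):
+;     out[i] = clip8((sum of src[i + k*p] * tap[k] for k = 0..5, + 64) >> 7)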
+global sym(vp9_filter_block1d8_v6_only_sse2)
+sym(vp9_filter_block1d8_v6_only_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src_ptr
+        mov         rdi,        arg(2) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+        mov         rax,        arg(5) ;vp9_filter
+
+        pxor        xmm0,       xmm0                        ; clear xmm0
+
+        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(3) ; dst_pitch
+%endif
+
+.vp9_filter_block1d8_v6_only_sse2_loop:
+        movq        xmm1,       MMWORD PTR [rsi]
+        movq        xmm2,       MMWORD PTR [rsi + rdx]
+        movq        xmm3,       MMWORD PTR [rsi + rdx * 2]
+        movq        xmm5,       MMWORD PTR [rsi + rdx * 4]
+        add         rsi,        rdx
+        movq        xmm4,       MMWORD PTR [rsi + rdx * 2]
+        movq        xmm6,       MMWORD PTR [rsi + rdx * 4]
+
+        punpcklbw   xmm1,       xmm0
+        pmullw      xmm1,       [rax]
+
+        punpcklbw   xmm2,       xmm0
+        pmullw      xmm2,       [rax + 16]
+
+        punpcklbw   xmm3,       xmm0
+        pmullw      xmm3,       [rax + 32]
+
+        punpcklbw   xmm5,       xmm0
+        pmullw      xmm5,       [rax + 64]
+
+        punpcklbw   xmm4,       xmm0
+        pmullw      xmm4,       [rax + 48]
+
+        punpcklbw   xmm6,       xmm0
+        pmullw      xmm6,       [rax + 80]
+
+        paddsw      xmm2,       xmm5
+        paddsw      xmm2,       xmm3
+
+        paddsw      xmm2,       xmm1
+        paddsw      xmm2,       xmm4
+
+        paddsw      xmm2,       xmm6
+        paddsw      xmm2,       xmm7
+
+        psraw       xmm2,       7
+        packuswb    xmm2,       xmm0              ; pack and saturate
+
+        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(3) ;[dst_pitch]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx         ; decrement count
+        jnz         .vp9_filter_block1d8_v6_only_sse2_loop              ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_unpack_block1d16_h6_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned short *output_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned int    output_height,
+;    unsigned int    output_width
+;)
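+; No filtering happens here: each row of 16 source bytes is widened to 16
+; unsigned 16-bit words (punpcklbw against zero), roughly
+;     out[i] = (unsigned short)src[i]   for i = 0..15 per row,
+; so a later pass can work on words directly.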
+global sym(vp9_unpack_block1d16_h6_sse2)
+sym(vp9_unpack_block1d16_h6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src_ptr
+        mov         rdi,        arg(1) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(3) ;output_height
+        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
+
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(4) ;output_width             ; pitch for destination
+%endif
+
+.unpack_block1d16_h6_sse2_rowloop:
+        movq        xmm1,       MMWORD PTR [rsi]            ; 07 06 05 04 03 02 01 00
+        movq        xmm3,       MMWORD PTR [rsi+8]          ; 0f 0e 0d 0c 0b 0a 09 08
+
+        punpcklbw   xmm3,       xmm0                        ; xx0f xx0e xx0d xx0c xx0b xx0a xx09 xx08
+        punpcklbw   xmm1,       xmm0
+
+        movdqa      XMMWORD PTR [rdi],         xmm1
+        movdqa      XMMWORD PTR [rdi + 16],    xmm3
+
+        lea         rsi,        [rsi + rax]
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(4) ;[output_width]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx
+        jnz         .unpack_block1d16_h6_sse2_rowloop               ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_bilinear_predict16x16_sse2
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
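+; Bilinear prediction is a separable 2-tap filter. A scalar sketch, using the
+; same rounding as the code below (rd == 64, VP9_FILTER_SHIFT == 7):
+;     first pass:  t[r][c]   = (src[r][c]*HF[0] + src[r][c+1]*HF[1] + 64) >> 7
+;     second pass: out[r][c] = (t[r][c]*VF[0]   + t[r+1][c]*VF[1]   + 64) >> 7
+; with HFilter/VFilter selected from vp9_bilinear_filters_mmx by xoffset and
+; yoffset; either pass is skipped when its offset is zero.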
+extern sym(vp9_bilinear_filters_mmx)
+global sym(vp9_bilinear_predict16x16_sse2)
+sym(vp9_bilinear_predict16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ;const short *HFilter = bilinear_filters_mmx[xoffset]
+    ;const short *VFilter = bilinear_filters_mmx[yoffset]
+
+        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_mmx))]
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+
+        cmp         rax,        0      ;skip first_pass filter if xoffset=0
+        je          .b16x16_sp_only
+
+        shl         rax,        5
+        add         rax,        rcx    ;HFilter
+
+        mov         rdi,        arg(4) ;dst_ptr
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+
+        movdqa      xmm1,       [rax]
+        movdqa      xmm2,       [rax+16]
+
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+
+        cmp         rax,        0      ;skip second_pass filter if yoffset=0
+        je          .b16x16_fp_only
+
+        shl         rax,        5
+        add         rax,        rcx    ;VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+        pxor        xmm0,       xmm0
+
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(5) ;dst_pitch
+%endif
+        ; get the first horizontal line done
+        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+
+        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   xmm4,       xmm0
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm1
+
+        movdqu      xmm5,       [rsi+1]
+        movdqa      xmm6,       xmm5
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+
+        pmullw      xmm5,       xmm2
+        pmullw      xmm6,       xmm2
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP9_FILTER_SHIFT
+
+        movdqa      xmm7,       xmm3
+        packuswb    xmm7,       xmm4
+
+        add         rsi,        rdx                 ; next line
+.next_row:
+        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+
+        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   xmm4,       xmm0
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm1
+
+        movdqu      xmm5,       [rsi+1]
+        movdqa      xmm6,       xmm5
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+
+        pmullw      xmm5,       xmm2
+        pmullw      xmm6,       xmm2
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        movdqa      xmm5,       xmm7
+        movdqa      xmm6,       xmm7
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+
+        pmullw      xmm5,       [rax]
+        pmullw      xmm6,       [rax]
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP9_FILTER_SHIFT
+
+        movdqa      xmm7,       xmm3
+        packuswb    xmm7,       xmm4
+
+        pmullw      xmm3,       [rax+16]
+        pmullw      xmm4,       [rax+16]
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP9_FILTER_SHIFT
+
+        packuswb    xmm3,       xmm4
+        movdqa      [rdi],      xmm3                 ; store the results in the destination
+
+        add         rsi,        rdx                 ; next line
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(5) ;dst_pitch
+%else
+        add         rdi,        r8
+%endif
+
+        cmp         rdi,        rcx
+        jne         .next_row
+
+        jmp         .done
+
+.b16x16_sp_only:
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+        shl         rax,        5
+        add         rax,        rcx    ;VFilter
+
+        mov         rdi,        arg(4) ;dst_ptr
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+
+        movdqa      xmm1,       [rax]
+        movdqa      xmm2,       [rax+16]
+
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
+
+        pxor        xmm0,       xmm0
+
+        ; get the first horizontal line done
+        movdqu      xmm7,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+        add         rsi,        rax                 ; next line
+.next_row_spo:
+        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+        movdqa      xmm5,       xmm7
+        movdqa      xmm6,       xmm7
+
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+        movdqa      xmm7,       xmm3
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   xmm4,       xmm0
+
+        pmullw      xmm5,       xmm1
+        pmullw      xmm6,       xmm1
+        pmullw      xmm3,       xmm2
+        pmullw      xmm4,       xmm2
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP9_FILTER_SHIFT
+
+        packuswb    xmm3,       xmm4
+        movdqa      [rdi],      xmm3                 ; store the results in the destination
+
+        add         rsi,        rax                 ; next line
+        add         rdi,        rdx                 ;dst_pitch
+        cmp         rdi,        rcx
+        jne         .next_row_spo
+
+        jmp         .done
+
+.b16x16_fp_only:
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
+        pxor        xmm0,       xmm0
+
+.next_row_fpo:
+        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+
+        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   xmm4,       xmm0
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm1
+
+        movdqu      xmm5,       [rsi+1]
+        movdqa      xmm6,       xmm5
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+
+        pmullw      xmm5,       xmm2
+        pmullw      xmm6,       xmm2
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP9_FILTER_SHIFT
+
+        packuswb    xmm3,       xmm4
+        movdqa      [rdi],      xmm3                 ; store the results in the destination
+
+        add         rsi,        rax                 ; next line
+        add         rdi,        rdx                 ; dst_pitch
+        cmp         rdi,        rcx
+        jne         .next_row_fpo
+
+.done:
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_bilinear_predict8x8_sse2
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+extern sym(vp9_bilinear_filters_mmx)
+global sym(vp9_bilinear_predict8x8_sse2)
+sym(vp9_bilinear_predict8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 144                         ; reserve 144 bytes
+
+    ;const short *HFilter = bilinear_filters_mmx[xoffset]
+    ;const short *VFilter = bilinear_filters_mmx[yoffset]
+        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_mmx))]
+
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+    ;Read in 9 lines of unaligned data and put them on the stack. This gives a
+    ;big performance boost.
+        movdqu      xmm0,       [rsi]
+        lea         rax,        [rdx + rdx*2]
+        movdqu      xmm1,       [rsi+rdx]
+        movdqu      xmm2,       [rsi+rdx*2]
+        add         rsi,        rax
+        movdqu      xmm3,       [rsi]
+        movdqu      xmm4,       [rsi+rdx]
+        movdqu      xmm5,       [rsi+rdx*2]
+        add         rsi,        rax
+        movdqu      xmm6,       [rsi]
+        movdqu      xmm7,       [rsi+rdx]
+
+        movdqa      XMMWORD PTR [rsp],            xmm0
+
+        movdqu      xmm0,       [rsi+rdx*2]
+
+        movdqa      XMMWORD PTR [rsp+16],         xmm1
+        movdqa      XMMWORD PTR [rsp+32],         xmm2
+        movdqa      XMMWORD PTR [rsp+48],         xmm3
+        movdqa      XMMWORD PTR [rsp+64],         xmm4
+        movdqa      XMMWORD PTR [rsp+80],         xmm5
+        movdqa      XMMWORD PTR [rsp+96],         xmm6
+        movdqa      XMMWORD PTR [rsp+112],        xmm7
+        movdqa      XMMWORD PTR [rsp+128],        xmm0
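+
+    ;Each of the 9 rows now sits in an aligned 16-byte slot, so the filter
+    ;loop below can load with movdqa and step to the next line via
+    ;"add rsp, 16"; the saved stack pointer is restored by "pop rsp" before
+    ;the epilog.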
+
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+        shl         rax,        5
+        add         rax,        rcx    ;HFilter
+
+        mov         rdi,        arg(4) ;dst_ptr
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+
+        movdqa      xmm1,       [rax]
+        movdqa      xmm2,       [rax+16]
+
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+        shl         rax,        5
+        add         rax,        rcx    ;VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+
+        movdqa      xmm5,       [rax]
+        movdqa      xmm6,       [rax+16]
+
+        pxor        xmm0,       xmm0
+
+        ; get the first horizontal line done
+        movdqa      xmm3,       XMMWORD PTR [rsp]
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+        psrldq      xmm4,       1
+
+        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
+        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm2
+
+        paddw       xmm3,       xmm4
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
+
+        movdqa      xmm7,       xmm3
+        add         rsp,        16                 ; next line
+.next_row8x8:
+        movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+        psrldq      xmm4,       1
+
+        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
+        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm2
+
+        paddw       xmm3,       xmm4
+        pmullw      xmm7,       xmm5
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
+
+        movdqa      xmm4,       xmm3
+
+        pmullw      xmm3,       xmm6
+        paddw       xmm3,       xmm7
+
+        movdqa      xmm7,       xmm4
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128
+
+        packuswb    xmm3,       xmm0
+        movq        [rdi],      xmm3                 ; store the results in the destination
+
+        add         rsp,        16                 ; next line
+        add         rdi,        rdx
+
+        cmp         rdi,        rcx
+        jne         .next_row8x8
+
+    ;add rsp, 144
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+rd:
+    times 8 dw 0x40
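+
+; rd holds eight copies of 0x40 = 64 = 1 << (VP9_FILTER_SHIFT - 1); adding it
+; before each "psraw ..., 7" above implements round-to-nearest division by 128.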
--- /dev/null
+++ b/vp9/common/x86/subpixel_ssse3.asm
@@ -1,0 +1,1515 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP9_FILTER_WEIGHT 128
+%define VP9_FILTER_SHIFT  7
+
+
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 8 pixels in the horizontal direction, calculating one
+; row per iteration to take advantage of the 128-bit operations.
+;
+; This is an implementation of some of the SSE optimizations first seen in ffvp8
+;
+;*************************************************************************************/
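+
+;/************************************************************************************
+; Unlike the SSE2 code above, which widens bytes (punpcklbw) and multiplies with
+; pmullw, the routines here lean on pmaddubsw: source bytes are interleaved so a
+; single pmaddubsw multiplies two taps and sums each pair at once, roughly
+;     tmp[i] = src[i-2]*k0 + src[i+3]*k5      ; for the k0_k5 table row
+; The k0_k5 / k1_k3 / k2_k4 tables are assumed to hold the signed 6-tap
+; coefficients pre-paired for this layout; a zero first entry in k0_k5 routes
+; execution to the cheaper 4-tap path.
+;*************************************************************************************/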
+;void vp9_filter_block1d8_h6_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    unsigned int    vp9_filter_index
+;)
+global sym(vp9_filter_block1d8_h6_ssse3)
+sym(vp9_filter_block1d8_h6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi
+    shl         rdx, 4
+
+    movdqa      xmm7, [GLOBAL(rd)]
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+    mov         rdi, arg(2)             ;output_ptr
+
+    cmp         esi, DWORD PTR [rax]
+    je          vp9_filter_block1d8_h4_ssse3
+
+    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+    sub         rdi, rdx
+;xmm3 free
+.filter_block1d8_h6_rowloop_ssse3:
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
+
+    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
+
+    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
+
+    movdqa      xmm1,   xmm0
+    pmaddubsw   xmm0,   xmm4
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
+
+    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
+    pmaddubsw   xmm1,   xmm5
+
+    lea         rdi,    [rdi + rdx]
+    pmaddubsw   xmm2,   xmm6
+
+    lea         rsi,    [rsi + rax]
+    dec         rcx
+
+    paddsw      xmm0,   xmm1
+    paddsw      xmm2,   xmm7
+
+    paddsw      xmm0,   xmm2
+
+    psraw       xmm0,   7
+
+    packuswb    xmm0,   xmm0
+
+    movq        MMWORD PTR [rdi], xmm0
+    jnz         .filter_block1d8_h6_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+vp9_filter_block1d8_h4_ssse3:
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
+    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+    sub         rdi, rdx
+
+.filter_block1d8_h4_rowloop_ssse3:
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
+
+    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
+
+    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
+
+    movdqa      xmm2,   xmm0
+    pshufb      xmm0,   xmm3
+
+    pshufb      xmm2,   xmm4
+    pmaddubsw   xmm0,   xmm5
+
+    lea         rdi,    [rdi + rdx]
+    pmaddubsw   xmm2,   xmm6
+
+    lea         rsi,    [rsi + rax]
+    dec         rcx
+
+    paddsw      xmm0,   xmm7
+
+    paddsw      xmm0,   xmm2
+
+    psraw       xmm0,   7
+
+    packuswb    xmm0,   xmm0
+
+    movq        MMWORD PTR [rdi], xmm0
+
+    jnz         .filter_block1d8_h4_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;void vp9_filter_block1d16_h6_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    unsigned int    vp9_filter_index
+;)
+global sym(vp9_filter_block1d16_h6_ssse3)
+sym(vp9_filter_block1d16_h6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)           ;table index
+    xor         rsi, rsi
+    shl         rdx, 4      ;
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+
+    mov         rdi, arg(2)                     ;output_ptr
+
+    mov         rsi, arg(0)                     ;src_ptr
+
+    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)           ;output_height
+    movsxd      rdx, dword ptr arg(3)           ;output_pitch
+
+.filter_block1d16_h6_rowloop_ssse3:
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
+
+    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
+
+    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
+
+    movdqa      xmm1,   xmm0
+    pmaddubsw   xmm0,   xmm4
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
+
+    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
+    movq        xmm3,   MMWORD PTR [rsi +  6]
+
+    pmaddubsw   xmm1,   xmm5
+    movq        xmm7,   MMWORD PTR [rsi + 11]
+
+    pmaddubsw   xmm2,   xmm6
+    punpcklbw   xmm3,   xmm7
+
+    paddsw      xmm0,   xmm1
+    movdqa      xmm1,   xmm3
+
+    pmaddubsw   xmm3,   xmm4
+    paddsw      xmm0,   xmm2
+
+    movdqa      xmm2,   xmm1
+    paddsw      xmm0,   [GLOBAL(rd)]
+
+    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
+    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
+
+    psraw       xmm0,   7
+    pmaddubsw   xmm1,   xmm5
+
+    pmaddubsw   xmm2,   xmm6
+    packuswb    xmm0,   xmm0
+
+    lea         rsi,    [rsi + rax]
+    paddsw      xmm3,   xmm1
+
+    paddsw      xmm3,   xmm2
+
+    paddsw      xmm3,   [GLOBAL(rd)]
+
+    psraw       xmm3,   7
+
+    packuswb    xmm3,   xmm3
+
+    punpcklqdq  xmm0,   xmm3
+
+    movdqa      XMMWORD PTR [rdi], xmm0
+
+    lea         rdi,    [rdi + rdx]
+    dec         rcx
+    jnz         .filter_block1d16_h6_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d4_h6_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    unsigned int    vp9_filter_index
+;)
+global sym(vp9_filter_block1d4_h6_ssse3)
+sym(vp9_filter_block1d4_h6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi
+    shl         rdx, 4      ;
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+    movdqa      xmm7, [GLOBAL(rd)]
+
+    cmp         esi, DWORD PTR [rax]
+    je          .vp9_filter_block1d4_h4_ssse3
+
+    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+    mov         rdi, arg(2)             ;output_ptr
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+;xmm3 free
+.filter_block1d4_h6_rowloop_ssse3:
+    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
+
+    movdqa      xmm1, xmm0
+    pshufb      xmm0, [GLOBAL(shuf1b)]
+
+    movdqa      xmm2, xmm1
+    pshufb      xmm1, [GLOBAL(shuf2b)]
+    pmaddubsw   xmm0, xmm4
+    pshufb      xmm2, [GLOBAL(shuf3b)]
+    pmaddubsw   xmm1, xmm5
+
+;--
+    pmaddubsw   xmm2, xmm6
+
+    lea         rsi,    [rsi + rax]
+;--
+    paddsw      xmm0, xmm1
+    paddsw      xmm0, xmm7
+    pxor        xmm1, xmm1
+    paddsw      xmm0, xmm2
+    psraw       xmm0, 7
+    packuswb    xmm0, xmm0
+
+    movd        DWORD PTR [rdi], xmm0
+
+    add         rdi, rdx
+    dec         rcx
+    jnz         .filter_block1d4_h6_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+.vp9_filter_block1d4_h4_ssse3:
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
+    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
+
+    mov         rsi, arg(0)             ;src_ptr
+    mov         rdi, arg(2)             ;output_ptr
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+.filter_block1d4_h4_rowloop_ssse3:
+    movdqu      xmm1,   XMMWORD PTR [rsi - 2]
+
+    movdqa      xmm2, xmm1
+    pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]
+    pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]
+    pmaddubsw   xmm1, xmm5
+
+;--
+    pmaddubsw   xmm2, xmm6
+
+    lea         rsi,    [rsi + rax]
+;--
+    paddsw      xmm1, xmm7
+    paddsw      xmm1, xmm2
+    psraw       xmm1, 7
+    packuswb    xmm1, xmm1
+
+    movd        DWORD PTR [rdi], xmm1
+
+    add         rdi, rdx
+    dec         rcx
+    jnz         .filter_block1d4_h4_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+;void vp9_filter_block1d16_v6_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    unsigned int   vp9_filter_index
+;)
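+; Vertical 6-tap via pmaddubsw: source rows are paired (A,F), (B,D), (C,E)
+; with punpcklbw to match the k0_k5 / k1_k3 / k2_k4 coefficient rows, so three
+; multiply-adds yield the full 6-tap sum for 8 pixels at a time.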
+global sym(vp9_filter_block1d16_v6_ssse3)
+sym(vp9_filter_block1d16_v6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi
+    shl         rdx, 4      ;
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+
+    cmp         esi, DWORD PTR [rax]
+    je          .vp9_filter_block1d16_v4_ssse3
+
+    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
+    mov         rdi, arg(2)             ;output_ptr
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)   ;output_height
+    add         rax, rdx
+
+
+.vp9_filter_block1d16_v6_ssse3_loop:
+    movq        xmm1, MMWORD PTR [rsi]                  ;A
+    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
+
+    pmaddubsw   xmm3, xmm6
+    punpcklbw   xmm1, xmm0                  ;A F
+    pmaddubsw   xmm2, xmm7
+    pmaddubsw   xmm1, xmm5
+
+    paddsw      xmm2, xmm3
+    paddsw      xmm2, xmm1
+    paddsw      xmm2, [GLOBAL(rd)]
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    movq        MMWORD PTR [rdi], xmm2          ;store the results
+
+    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
+    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
+    pmaddubsw   xmm3, xmm6
+    punpcklbw   xmm1, xmm0                  ;A F
+    pmaddubsw   xmm2, xmm7
+    pmaddubsw   xmm1, xmm5
+
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      xmm2, xmm3
+    paddsw      xmm2, xmm1
+    paddsw      xmm2, [GLOBAL(rd)]
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    movq        MMWORD PTR [rdi+8], xmm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;out_pitch
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp9_filter_block1d16_v6_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+.vp9_filter_block1d16_v4_ssse3:
+    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
+    mov         rdi, arg(2)             ;output_ptr
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)   ;output_height
+    add         rax, rdx
+
+.vp9_filter_block1d16_v4_ssse3_loop:
+    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    pmaddubsw   xmm3, xmm6
+    pmaddubsw   xmm2, xmm7
+    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
+    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
+
+    paddsw      xmm2, [GLOBAL(rd)]
+    paddsw      xmm2, xmm3
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    punpcklbw   xmm5, xmm4                  ;B D
+    punpcklbw   xmm1, xmm0                  ;C E
+
+    pmaddubsw   xmm1, xmm6
+    pmaddubsw   xmm5, xmm7
+
+    movdqa      xmm4, [GLOBAL(rd)]
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      xmm5, xmm1
+    paddsw      xmm5, xmm4
+    psraw       xmm5, 7
+    packuswb    xmm5, xmm5
+
+    punpcklqdq  xmm2, xmm5
+
+    movdqa       XMMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;out_pitch
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp9_filter_block1d16_v4_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d8_v6_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    unsigned int   vp9_filter_index
+;)
+global sym(vp9_filter_block1d8_v6_ssse3)
+sym(vp9_filter_block1d8_v6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi
+    shl         rdx, 4      ;
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+
+    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
+    mov         rdi, arg(2)             ;output_ptr
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
+%endif
+    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
+
+    cmp         esi, DWORD PTR [rax]
+    je          .vp9_filter_block1d8_v4_ssse3
+
+    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    mov         rax, rsi
+    add         rax, rdx
+
+.vp9_filter_block1d8_v6_ssse3_loop:
+    movq        xmm1, MMWORD PTR [rsi]                  ;A
+    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
+    movdqa      xmm4, [GLOBAL(rd)]
+
+    pmaddubsw   xmm3, xmm6
+    punpcklbw   xmm1, xmm0                  ;A F
+    pmaddubsw   xmm2, xmm7
+    pmaddubsw   xmm1, xmm5
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      xmm2, xmm3
+    paddsw      xmm2, xmm1
+    paddsw      xmm2, xmm4
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    movq        MMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp9_filter_block1d8_v6_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+.vp9_filter_block1d8_v4_ssse3:
+    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
+    movdqa      xmm5, [GLOBAL(rd)]
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    mov         rax, rsi
+    add         rax, rdx
+
+.vp9_filter_block1d8_v4_ssse3_loop:
+    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    pmaddubsw   xmm3, xmm6
+    pmaddubsw   xmm2, xmm7
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      xmm2, xmm3
+    paddsw      xmm2, xmm5
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    movq        MMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp9_filter_block1d8_v4_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;void vp9_filter_block1d4_v6_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    unsigned int   vp9_filter_index
+;)
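+; The 4-pixel-wide path below uses the MMX forms (movd/movq on mm0-mm7):
+; 64-bit lanes are enough for four outputs, and the same (A,F)/(B,D)/(C,E)
+; pmaddubsw pairing is applied at half width.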
+global sym(vp9_filter_block1d4_v6_ssse3)
+sym(vp9_filter_block1d4_v6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi
+    shl         rdx, 4      ;
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+
+    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
+    mov         rdi, arg(2)             ;output_ptr
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
+%endif
+    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
+
+    cmp         esi, DWORD PTR [rax]
+    je          .vp9_filter_block1d4_v4_ssse3
+
+    movq        mm5, MMWORD PTR [rax]         ;k0_k5
+    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
+    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    mov         rax, rsi
+    add         rax, rdx
+
+.vp9_filter_block1d4_v6_ssse3_loop:
+    movd        mm1, DWORD PTR [rsi]                  ;A
+    movd        mm2, DWORD PTR [rsi + rdx]            ;B
+    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
+    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
+    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   mm2, mm4                  ;B D
+    punpcklbw   mm3, mm0                  ;C E
+
+    movd        mm0, DWORD PTR [rax + rdx * 4]        ;F
+
+    movq        mm4, [GLOBAL(rd)]
+
+    pmaddubsw   mm3, mm6
+    punpcklbw   mm1, mm0                  ;A F
+    pmaddubsw   mm2, mm7
+    pmaddubsw   mm1, mm5
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      mm2, mm3
+    paddsw      mm2, mm1
+    paddsw      mm2, mm4
+    psraw       mm2, 7
+    packuswb    mm2, mm2
+
+    movd        DWORD PTR [rdi], mm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp9_filter_block1d4_v6_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+.vp9_filter_block1d4_v4_ssse3:
+    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
+    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
+    movq        mm5, MMWORD PTR [GLOBAL(rd)]
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    mov         rax, rsi
+    add         rax, rdx
+
+.vp9_filter_block1d4_v4_ssse3_loop:
+    movd        mm2, DWORD PTR [rsi + rdx]            ;B
+    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
+    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
+    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   mm2, mm4                  ;B D
+    punpcklbw   mm3, mm0                  ;C E
+
+    pmaddubsw   mm3, mm6
+    pmaddubsw   mm2, mm7
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      mm2, mm3
+    paddsw      mm2, mm5
+    psraw       mm2, 7
+    packuswb    mm2, mm2
+
+    movd        DWORD PTR [rdi], mm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp9_filter_block1d4_v4_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_bilinear_predict16x16_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
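+; Same separable 2-tap scheme as the SSE2 version, but adjacent source bytes
+; are interleaved (punpcklbw of [rsi] with [rsi+1]) so a single pmaddubsw
+; yields src[c]*F[0] + src[c+1]*F[1] directly; bilinear_filters_ssse3 is
+; assumed to repeat each (F[0], F[1]) byte pair across a 16-byte row.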
+global sym(vp9_bilinear_predict16x16_ssse3)
+sym(vp9_bilinear_predict16x16_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        lea         rcx,        [GLOBAL(bilinear_filters_ssse3)]
+        movsxd      rax,        dword ptr arg(2)    ; xoffset
+
+        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
+        je          .b16x16_sp_only
+
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; HFilter
+
+        mov         rdi,        arg(4)              ; dst_ptr
+        mov         rsi,        arg(0)              ; src_ptr
+        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
+
+        movdqa      xmm1,       [rax]
+
+        movsxd      rax,        dword ptr arg(3)    ; yoffset
+
+        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
+        je          .b16x16_fp_only
+
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line
+
+        movdqa      xmm2,       [rax]
+
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(5)    ; dst_pitch
+%endif
+        movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
+
+        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
+
+        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
+
+        lea         rsi,        [rsi + rdx]         ; next line
+
+        pmaddubsw   xmm3,       xmm1                ; 00 02 04 06 08 10 12 14
+
+        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
+        pmaddubsw   xmm4,       xmm1                ; 01 03 05 07 09 11 13 15
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT    ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
+        psraw       xmm4,       VP9_FILTER_SHIFT    ; xmm4 /= 128
+
+        movdqa      xmm7,       xmm3
+        packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+.next_row:
+        movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
+
+        punpcklbw   xmm6,       xmm5
+        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
+
+        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
+        lea         rsi,        [rsi + rdx]         ; next line
+
+        pmaddubsw   xmm6,       xmm1
+
+        punpcklbw   xmm4,       xmm5
+        pmaddubsw   xmm4,       xmm1
+
+        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
+        psraw       xmm6,       VP9_FILTER_SHIFT    ; xmm6 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
+        psraw       xmm4,       VP9_FILTER_SHIFT    ; xmm4 /= 128
+
+        packuswb    xmm6,       xmm4
+        movdqa      xmm5,       xmm7
+
+        punpcklbw   xmm5,       xmm6
+        pmaddubsw   xmm5,       xmm2
+
+        punpckhbw   xmm7,       xmm6
+        pmaddubsw   xmm7,       xmm2
+
+        paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
+        psraw       xmm5,       VP9_FILTER_SHIFT    ; xmm5 /= 128
+
+        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
+        psraw       xmm7,       VP9_FILTER_SHIFT    ; xmm7 /= 128
+
+        packuswb    xmm5,       xmm7
+        movdqa      xmm7,       xmm6
+
+        movdqa      [rdi],      xmm5                ; store the results in the destination
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(5)    ; dst_pitch
+%else
+        add         rdi,        r8
+%endif
+
+        cmp         rdi,        rcx
+        jne         .next_row
+
+        jmp         .done
+
+.b16x16_sp_only:
+        movsxd      rax,        dword ptr arg(3)    ; yoffset
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; VFilter
+
+        mov         rdi,        arg(4)              ; dst_ptr
+        mov         rsi,        arg(0)              ; src_ptr
+        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
+
+        movdqa      xmm1,       [rax]               ; VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
+
+        ; get the first horizontal line done
+        movq        xmm4,       [rsi]               ; load row 0
+        movq        xmm2,       [rsi + 8]           ; load row 0
+
+        lea         rsi,        [rsi + rax]         ; next line
+.next_row_sp:
+        movq        xmm3,       [rsi]               ; load row + 1
+        movq        xmm5,       [rsi + 8]           ; load row + 1
+
+        punpcklbw   xmm4,       xmm3
+        punpcklbw   xmm2,       xmm5
+
+        pmaddubsw   xmm4,       xmm1
+        movq        xmm7,       [rsi + rax]         ; load row + 2
+
+        pmaddubsw   xmm2,       xmm1
+        movq        xmm6,       [rsi + rax + 8]     ; load row + 2
+
+        punpcklbw   xmm3,       xmm7
+        punpcklbw   xmm5,       xmm6
+
+        pmaddubsw   xmm3,       xmm1
+        paddw       xmm4,       [GLOBAL(rd)]
+
+        pmaddubsw   xmm5,       xmm1
+        paddw       xmm2,       [GLOBAL(rd)]
+
+        psraw       xmm4,       VP9_FILTER_SHIFT
+        psraw       xmm2,       VP9_FILTER_SHIFT
+
+        packuswb    xmm4,       xmm2
+        paddw       xmm3,       [GLOBAL(rd)]
+
+        movdqa      [rdi],      xmm4                ; store row 0
+        paddw       xmm5,       [GLOBAL(rd)]
+
+        psraw       xmm3,       VP9_FILTER_SHIFT
+        psraw       xmm5,       VP9_FILTER_SHIFT
+
+        packuswb    xmm3,       xmm5
+        movdqa      xmm4,       xmm7
+
+        movdqa      [rdi + rdx],xmm3                ; store row 1
+        lea         rsi,        [rsi + 2*rax]
+
+        movdqa      xmm2,       xmm6
+        lea         rdi,        [rdi + 2*rdx]
+
+        cmp         rdi,        rcx
+        jne         .next_row_sp
+
+        jmp         .done
+
+.b16x16_fp_only:
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
+
+.next_row_fp:
+        movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
+
+        punpcklbw   xmm2,       xmm4
+        movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15
+
+        pmaddubsw   xmm2,       xmm1
+        movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16
+
+        lea         rsi,        [rsi + rax]         ; next line
+        punpcklbw   xmm3,       xmm4
+
+        pmaddubsw   xmm3,       xmm1
+        movq        xmm5,       [rsi]
+
+        paddw       xmm2,       [GLOBAL(rd)]
+        movq        xmm7,       [rsi+1]
+
+        movq        xmm6,       [rsi+8]
+        psraw       xmm2,       VP9_FILTER_SHIFT
+
+        punpcklbw   xmm5,       xmm7
+        movq        xmm7,       [rsi+9]
+
+        paddw       xmm3,       [GLOBAL(rd)]
+        pmaddubsw   xmm5,       xmm1
+
+        psraw       xmm3,       VP9_FILTER_SHIFT
+        punpcklbw   xmm6,       xmm7
+
+        packuswb    xmm2,       xmm3
+        pmaddubsw   xmm6,       xmm1
+
+        movdqa      [rdi],      xmm2                ; store the results in the destination
+        paddw       xmm5,       [GLOBAL(rd)]
+
+        lea         rdi,        [rdi + rdx]         ; dst_pitch
+        psraw       xmm5,       VP9_FILTER_SHIFT
+
+        paddw       xmm6,       [GLOBAL(rd)]
+        psraw       xmm6,       VP9_FILTER_SHIFT
+
+        packuswb    xmm5,       xmm6
+        lea         rsi,        [rsi + rax]         ; next line
+
+        movdqa      [rdi],      xmm5                ; store the results in the destination
+        lea         rdi,        [rdi + rdx]         ; dst_pitch
+
+        cmp         rdi,        rcx
+
+        jne         .next_row_fp
+
+.done:
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_bilinear_predict8x8_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+global sym(vp9_bilinear_predict8x8_ssse3)
+sym(vp9_bilinear_predict8x8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 144                         ; reserve 144 bytes
+
+        lea         rcx,        [GLOBAL(bilinear_filters_ssse3)]
+
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+    ;Read in 9 lines of unaligned data and put them on the stack. This gives a
+    ;big performance boost.
+        movdqu      xmm0,       [rsi]
+        lea         rax,        [rdx + rdx*2]
+        movdqu      xmm1,       [rsi+rdx]
+        movdqu      xmm2,       [rsi+rdx*2]
+        add         rsi,        rax
+        movdqu      xmm3,       [rsi]
+        movdqu      xmm4,       [rsi+rdx]
+        movdqu      xmm5,       [rsi+rdx*2]
+        add         rsi,        rax
+        movdqu      xmm6,       [rsi]
+        movdqu      xmm7,       [rsi+rdx]
+
+        movdqa      XMMWORD PTR [rsp],            xmm0
+
+        movdqu      xmm0,       [rsi+rdx*2]
+
+        movdqa      XMMWORD PTR [rsp+16],         xmm1
+        movdqa      XMMWORD PTR [rsp+32],         xmm2
+        movdqa      XMMWORD PTR [rsp+48],         xmm3
+        movdqa      XMMWORD PTR [rsp+64],         xmm4
+        movdqa      XMMWORD PTR [rsp+80],         xmm5
+        movdqa      XMMWORD PTR [rsp+96],         xmm6
+        movdqa      XMMWORD PTR [rsp+112],        xmm7
+        movdqa      XMMWORD PTR [rsp+128],        xmm0
+
+        movsxd      rax,        dword ptr arg(2)    ; xoffset
+        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
+        je          .b8x8_sp_only
+
+        shl         rax,        4
+        add         rax,        rcx                 ; HFilter
+
+        mov         rdi,        arg(4)              ; dst_ptr
+        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
+
+        movdqa      xmm0,       [rax]
+
+        movsxd      rax,        dword ptr arg(3)    ; yoffset
+        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
+        je          .b8x8_fp_only
+
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+
+        movdqa      xmm1,       [rax]
+
+        ; get the first horizontal line done
+        movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+        movdqa      xmm5,       xmm3                ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
+
+        psrldq      xmm5,       1
+        lea         rsp,        [rsp + 16]          ; next line
+
+        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+        pmaddubsw   xmm3,       xmm0                ; 00 02 04 06 08 10 12 14
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP9_FILTER_SHIFT    ; xmm3 /= 128
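+        ; each output pixel is (p0*f0 + p1*f1 + 64) >> 7; the tap pairs in
+        ; bilinear_filters_ssse3 sum to 128, so this is a weighted average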
+
+        movdqa      xmm7,       xmm3
+        packuswb    xmm7,       xmm7                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+.next_row:
+        movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+        lea         rsp,        [rsp + 16]          ; next line
+
+        movdqa      xmm5,       xmm6
+
+        psrldq      xmm5,       1
+
+        punpcklbw   xmm6,       xmm5
+        pmaddubsw   xmm6,       xmm0
+
+        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
+        psraw       xmm6,       VP9_FILTER_SHIFT    ; xmm6 /= 128
+
+        packuswb    xmm6,       xmm6
+
+        punpcklbw   xmm7,       xmm6
+        pmaddubsw   xmm7,       xmm1
+
+        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
+        psraw       xmm7,       VP9_FILTER_SHIFT    ; xmm7 /= 128
+
+        packuswb    xmm7,       xmm7
+
+        movq        [rdi],      xmm7                ; store the results in the destination
+        lea         rdi,        [rdi + rdx]
+
+        movdqa      xmm7,       xmm6
+
+        cmp         rdi,        rcx
+        jne         .next_row
+
+        jmp         .done8x8
+
+.b8x8_sp_only:
+        movsxd      rax,        dword ptr arg(3)    ; yoffset
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; VFilter
+
+        mov         rdi,        arg(4) ;dst_ptr
+        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
+
+        movdqa      xmm0,       [rax]               ; VFilter
+
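+        ; second-pass only: vertically filter the nine stacked rows in one
+        ; fully unrolled sequence (no loop is needed for the 8 output rows)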
+        movq        xmm1,       XMMWORD PTR [rsp]
+        movq        xmm2,       XMMWORD PTR [rsp+16]
+
+        movq        xmm3,       XMMWORD PTR [rsp+32]
+        punpcklbw   xmm1,       xmm2
+
+        movq        xmm4,       XMMWORD PTR [rsp+48]
+        punpcklbw   xmm2,       xmm3
+
+        movq        xmm5,       XMMWORD PTR [rsp+64]
+        punpcklbw   xmm3,       xmm4
+
+        movq        xmm6,       XMMWORD PTR [rsp+80]
+        punpcklbw   xmm4,       xmm5
+
+        movq        xmm7,       XMMWORD PTR [rsp+96]
+        punpcklbw   xmm5,       xmm6
+
+        pmaddubsw   xmm1,       xmm0
+        pmaddubsw   xmm2,       xmm0
+
+        pmaddubsw   xmm3,       xmm0
+        pmaddubsw   xmm4,       xmm0
+
+        pmaddubsw   xmm5,       xmm0
+        punpcklbw   xmm6,       xmm7
+
+        pmaddubsw   xmm6,       xmm0
+        paddw       xmm1,       [GLOBAL(rd)]
+
+        paddw       xmm2,       [GLOBAL(rd)]
+        psraw       xmm1,       VP9_FILTER_SHIFT
+
+        paddw       xmm3,       [GLOBAL(rd)]
+        psraw       xmm2,       VP9_FILTER_SHIFT
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm3,       VP9_FILTER_SHIFT
+
+        paddw       xmm5,       [GLOBAL(rd)]
+        psraw       xmm4,       VP9_FILTER_SHIFT
+
+        paddw       xmm6,       [GLOBAL(rd)]
+        psraw       xmm5,       VP9_FILTER_SHIFT
+
+        psraw       xmm6,       VP9_FILTER_SHIFT
+        packuswb    xmm1,       xmm1
+
+        packuswb    xmm2,       xmm2
+        movq        [rdi],      xmm1
+
+        packuswb    xmm3,       xmm3
+        movq        [rdi+rdx],  xmm2
+
+        packuswb    xmm4,       xmm4
+        movq        xmm1,       XMMWORD PTR [rsp+112]
+
+        lea         rdi,        [rdi + 2*rdx]
+        movq        xmm2,       XMMWORD PTR [rsp+128]
+
+        packuswb    xmm5,       xmm5
+        movq        [rdi],      xmm3
+
+        packuswb    xmm6,       xmm6
+        movq        [rdi+rdx],  xmm4
+
+        lea         rdi,        [rdi + 2*rdx]
+        punpcklbw   xmm7,       xmm1
+
+        movq        [rdi],      xmm5
+        pmaddubsw   xmm7,       xmm0
+
+        movq        [rdi+rdx],  xmm6
+        punpcklbw   xmm1,       xmm2
+
+        pmaddubsw   xmm1,       xmm0
+        paddw       xmm7,       [GLOBAL(rd)]
+
+        psraw       xmm7,       VP9_FILTER_SHIFT
+        paddw       xmm1,       [GLOBAL(rd)]
+
+        psraw       xmm1,       VP9_FILTER_SHIFT
+        packuswb    xmm7,       xmm7
+
+        packuswb    xmm1,       xmm1
+        lea         rdi,        [rdi + 2*rdx]
+
+        movq        [rdi],      xmm7
+
+        movq        [rdi+rdx],  xmm1
+        lea         rsp,        [rsp + 144]
+
+        jmp         .done8x8
+
+.b8x8_fp_only:
+        lea         rcx,        [rdi+rdx*8]
+
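+        ; first-pass only: horizontally filter the stacked rows, four rows
+        ; per trip through the loop (two trips for the 8-row block)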
+.next_row_fp:
+        movdqa      xmm1,       XMMWORD PTR [rsp]
+        movdqa      xmm3,       XMMWORD PTR [rsp+16]
+
+        movdqa      xmm2,       xmm1
+        movdqa      xmm5,       XMMWORD PTR [rsp+32]
+
+        psrldq      xmm2,       1
+        movdqa      xmm7,       XMMWORD PTR [rsp+48]
+
+        movdqa      xmm4,       xmm3
+        psrldq      xmm4,       1
+
+        movdqa      xmm6,       xmm5
+        psrldq      xmm6,       1
+
+        punpcklbw   xmm1,       xmm2
+        pmaddubsw   xmm1,       xmm0
+
+        punpcklbw   xmm3,       xmm4
+        pmaddubsw   xmm3,       xmm0
+
+        punpcklbw   xmm5,       xmm6
+        pmaddubsw   xmm5,       xmm0
+
+        movdqa      xmm2,       xmm7
+        psrldq      xmm2,       1
+
+        punpcklbw   xmm7,       xmm2
+        pmaddubsw   xmm7,       xmm0
+
+        paddw       xmm1,       [GLOBAL(rd)]
+        psraw       xmm1,       VP9_FILTER_SHIFT
+
+        paddw       xmm3,       [GLOBAL(rd)]
+        psraw       xmm3,       VP9_FILTER_SHIFT
+
+        paddw       xmm5,       [GLOBAL(rd)]
+        psraw       xmm5,       VP9_FILTER_SHIFT
+
+        paddw       xmm7,       [GLOBAL(rd)]
+        psraw       xmm7,       VP9_FILTER_SHIFT
+
+        packuswb    xmm1,       xmm1
+        packuswb    xmm3,       xmm3
+
+        packuswb    xmm5,       xmm5
+        movq        [rdi],      xmm1
+
+        packuswb    xmm7,       xmm7
+        movq        [rdi+rdx],  xmm3
+
+        lea         rdi,        [rdi + 2*rdx]
+        movq        [rdi],      xmm5
+
+        lea         rsp,        [rsp + 4*16]
+        movq        [rdi+rdx],  xmm7
+
+        lea         rdi,        [rdi + 2*rdx]
+        cmp         rdi,        rcx
+
+        jne         .next_row_fp
+
+        lea         rsp,        [rsp + 16]
+
+.done8x8:
+    ;add rsp, 144
+    pop         rsp
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+shuf1b:
+    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
+shuf2b:
+    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
+shuf3b:
+    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
+
+align 16
+shuf2bfrom1:
+    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
+align 16
+shuf3bfrom1:
+    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
+
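+; rounding constant: 64 adds one half in the 7-bit fixed-point domain
+; before the arithmetic shift right by VP9_FILTER_SHIFT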
+align 16
+rd:
+    times 8 dw 0x40
+
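+; six-tap filter taps paired for pmaddubsw: k0_k5 holds taps 0 and 5,
+; k1_k3 taps 1 and 3, k2_k4 taps 2 and 4; one 16-byte row per subpel position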
+align 16
+k0_k5:
+    times 8 db 0, 0             ;placeholder
+    times 8 db 0, 0
+    times 8 db 2, 1
+    times 8 db 0, 0
+    times 8 db 3, 3
+    times 8 db 0, 0
+    times 8 db 1, 2
+    times 8 db 0, 0
+k1_k3:
+    times 8 db  0,    0         ;placeholder
+    times 8 db  -6,  12
+    times 8 db -11,  36
+    times 8 db  -9,  50
+    times 8 db -16,  77
+    times 8 db  -6,  93
+    times 8 db  -8, 108
+    times 8 db  -1, 123
+k2_k4:
+    times 8 db 128,    0        ;placeholder
+    times 8 db 123,   -1
+    times 8 db 108,   -8
+    times 8 db  93,   -6
+    times 8 db  77,  -16
+    times 8 db  50,   -9
+    times 8 db  36,  -11
+    times 8 db  12,   -6
+align 16
+bilinear_filters_ssse3:
+    times 8 db 128, 0
+    times 8 db 120, 8
+    times 8 db 112, 16
+    times 8 db 104, 24
+    times 8 db 96,  32
+    times 8 db 88,  40
+    times 8 db 80,  48
+    times 8 db 72,  56
+    times 8 db 64,  64
+    times 8 db 56,  72
+    times 8 db 48,  80
+    times 8 db 40,  88
+    times 8 db 32,  96
+    times 8 db 24,  104
+    times 8 db 16,  112
+    times 8 db 8,   120
+
--- /dev/null
+++ b/vp9/common/x86/subpixel_x86.h
@@ -1,0 +1,122 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef SUBPIXEL_X86_H
+#define SUBPIXEL_X86_H
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code in x86_systemdependent.c.
+ */
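+
+/* For example, when CONFIG_RUNTIME_CPU_DETECT is disabled the #define blocks
+ * below rebind names such as vp9_subpix_bilinear8x8 statically; later blocks
+ * (SSE2, SSSE3) override earlier ones, so the fastest compiled-in version
+ * wins. */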
+
+#if HAVE_MMX
+extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);
+extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);
+extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x4_mmx);
+extern prototype_subpixel_predict(vp9_bilinear_predict4x4_mmx);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx
+
+#undef  vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx
+
+#undef  vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx
+
+#undef  vp9_subpix_sixtap4x4
+#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx
+
+#undef  vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx
+
+#undef  vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_mmx
+
+#undef  vp9_subpix_bilinear8x4
+#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_mmx
+
+#undef  vp9_subpix_bilinear4x4
+#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2);
+extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2
+
+#undef  vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2
+
+#undef  vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2
+
+#undef  vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2
+
+#undef  vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2
+
+#endif
+#endif
+
+#if HAVE_SSSE3
+extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3);
+extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3);
+extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3);
+extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3);
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_subpix_sixtap16x16
+#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3
+
+#undef  vp9_subpix_sixtap8x8
+#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3
+
+#undef  vp9_subpix_sixtap8x4
+#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3
+
+#undef  vp9_subpix_sixtap4x4
+#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3
+
+
+#undef  vp9_subpix_bilinear16x16
+#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3
+
+#undef  vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3
+
+#endif
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/common/x86/vp8_asm_stubs.c
@@ -1,0 +1,602 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/subpixel.h"
+
+extern const short vp9_six_tap_mmx[16][6 * 8];
+
+extern const short vp9_bilinear_filters_8x_mmx[16][2 * 8];
+
+extern void vp9_filter_block1d_h6_mmx(unsigned char   *src_ptr,
+                                      unsigned short  *output_ptr,
+                                      unsigned int     src_pixels_per_line,
+                                      unsigned int     pixel_step,
+                                      unsigned int     output_height,
+                                      unsigned int     output_width,
+                                      const short     *vp9_filter);
+
+extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr,
+                                       unsigned char  *output_ptr,
+                                       int             output_pitch,
+                                       unsigned int    pixels_per_line,
+                                       unsigned int    pixel_step,
+                                       unsigned int    output_height,
+                                       unsigned int    output_width,
+                                       const short    *vp9_filter);
+
+extern void vp9_filter_block1d8_h6_sse2(unsigned char  *src_ptr,
+                                        unsigned short *output_ptr,
+                                        unsigned int    src_pixels_per_line,
+                                        unsigned int    pixel_step,
+                                        unsigned int    output_height,
+                                        unsigned int    output_width,
+                                        const short    *vp9_filter);
+
+extern void vp9_filter_block1d16_h6_sse2(unsigned char  *src_ptr,
+                                         unsigned short *output_ptr,
+                                         unsigned int    src_pixels_per_line,
+                                         unsigned int    pixel_step,
+                                         unsigned int    output_height,
+                                         unsigned int    output_width,
+                                         const short    *vp9_filter);
+
+extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr,
+                                        unsigned char *output_ptr,
+                                        int dst_pitch,
+                                        unsigned int pixels_per_line,
+                                        unsigned int pixel_step,
+                                        unsigned int output_height,
+                                        unsigned int output_width,
+                                        const short    *vp9_filter);
+
+extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr,
+                                         unsigned char *output_ptr,
+                                         int dst_pitch,
+                                         unsigned int pixels_per_line,
+                                         unsigned int pixel_step,
+                                         unsigned int output_height,
+                                         unsigned int output_width,
+                                         const short    *vp9_filter);
+
+extern void vp9_unpack_block1d16_h6_sse2(unsigned char  *src_ptr,
+                                         unsigned short *output_ptr,
+                                         unsigned int    src_pixels_per_line,
+                                         unsigned int    output_height,
+                                         unsigned int    output_width);
+
+extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
+                                             unsigned int   src_pixels_per_line,
+                                             unsigned char *output_ptr,
+                                             int            dst_pitch,
+                                             unsigned int   output_height,
+                                             const short   *vp9_filter);
+
+extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
+                                              unsigned int   src_pixels_per_line,
+                                              unsigned char *output_ptr,
+                                              int            dst_pitch,
+                                              unsigned int   output_height,
+                                              const short   *vp9_filter);
+
+extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
+                                             unsigned int   src_pixels_per_line,
+                                             unsigned char *output_ptr,
+                                             int            dst_pitch,
+                                             unsigned int   output_height,
+                                             const short   *vp9_filter);
+
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
+
+#if HAVE_MMX
+void vp9_sixtap_predict4x4_mmx(unsigned char  *src_ptr,
+                               int  src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int  dst_pitch) {
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16);
+  const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict4x4_mmx\n");
+#endif
+  hfilter = vp9_six_tap_mmx[xoffset];
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2,
+                            src_pixels_per_line, 1, 9, 8, hfilter);
+  vfilter = vp9_six_tap_mmx[yoffset];
+  vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch,
+                             8, 4, 4, 4, vfilter);
+}
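+
+/* All six-tap stubs in this file share the same two-pass scheme: the
+ * horizontal pass starts two rows above the block and filters height+5 rows
+ * into the intermediate buffer (9 rows for a 4-row block, 13 for 8, 21 for
+ * 16), since the six-tap vertical pass needs two rows of context above and
+ * three below each output row. */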
+
+void vp9_sixtap_predict16x16_mmx(unsigned char  *src_ptr,
+                                 int  src_pixels_per_line,
+                                 int  xoffset,
+                                 int  yoffset,
+                                 unsigned char *dst_ptr,
+                                 int dst_pitch) {
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
+  const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict16x16_mmx\n");
+#endif
+
+  hfilter = vp9_six_tap_mmx[xoffset];
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
+                            fdata2,   src_pixels_per_line, 1, 21, 32,
+                            hfilter);
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
+                            fdata2 + 4, src_pixels_per_line, 1, 21, 32,
+                            hfilter);
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,
+                            fdata2 + 8, src_pixels_per_line, 1, 21, 32,
+                            hfilter);
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,
+                            fdata2 + 12, src_pixels_per_line, 1, 21, 32,
+                            hfilter);
+
+  vfilter = vp9_six_tap_mmx[yoffset];
+  vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr,      dst_pitch,
+                             32, 16, 16, 16, vfilter);
+  vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4,  dst_pitch,
+                             32, 16, 16, 16, vfilter);
+  vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8,  dst_pitch,
+                             32, 16, 16, 16, vfilter);
+  vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch,
+                             32, 16, 16, 16, vfilter);
+}
+
+void vp9_sixtap_predict8x8_mmx(unsigned char  *src_ptr,
+                               int  src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int  dst_pitch) {
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+  const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict8x8_mmx\n");
+#endif
+
+  hfilter = vp9_six_tap_mmx[xoffset];
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
+                            fdata2,   src_pixels_per_line, 1, 13, 16,
+                            hfilter);
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
+                            fdata2 + 4, src_pixels_per_line, 1, 13, 16,
+                            hfilter);
+
+  vfilter = vp9_six_tap_mmx[yoffset];
+  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,
+                             16, 8, 8, 8, vfilter);
+  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
+                             16, 8, 8, 8, vfilter);
+}
+
+void vp9_sixtap_predict8x4_mmx(unsigned char  *src_ptr,
+                               int  src_pixels_per_line,
+                               int  xoffset,
+                               int  yoffset,
+                               unsigned char *dst_ptr,
+                               int  dst_pitch) {
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+  const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict8x4_mmx\n");
+#endif
+
+  hfilter = vp9_six_tap_mmx[xoffset];
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
+                            fdata2,   src_pixels_per_line, 1, 9, 16, hfilter);
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
+                            fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter);
+
+  vfilter = vp9_six_tap_mmx[yoffset];
+  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,
+                             16, 8, 4, 8, vfilter);
+  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
+                             16, 8, 4, 8, vfilter);
+}
+
+void vp9_bilinear_predict16x16_mmx(unsigned char  *src_ptr,
+                                   int  src_pixels_per_line,
+                                   int  xoffset,
+                                   int  yoffset,
+                                   unsigned char *dst_ptr,
+                                   int  dst_pitch) {
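+  /* composed of four 8x8 bilinear calls, one per quadrant, offsetting the
+   * source and destination by 8 pixels and 8 rows */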
+  vp9_bilinear_predict8x8_mmx(src_ptr,
+                              src_pixels_per_line, xoffset, yoffset,
+                              dst_ptr, dst_pitch);
+  vp9_bilinear_predict8x8_mmx(src_ptr + 8,
+                              src_pixels_per_line, xoffset, yoffset,
+                              dst_ptr + 8, dst_pitch);
+  vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line,
+                              src_pixels_per_line, xoffset, yoffset,
+                              dst_ptr + dst_pitch * 8, dst_pitch);
+  vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8,
+                              src_pixels_per_line, xoffset, yoffset,
+                              dst_ptr + dst_pitch * 8 + 8, dst_pitch);
+}
+#endif
+
+#if HAVE_SSE2
+void vp9_sixtap_predict16x16_sse2(unsigned char  *src_ptr,
+                                  int  src_pixels_per_line,
+                                  int  xoffset,
+                                  int  yoffset,
+                                  unsigned char *dst_ptr,
+                                  int  dst_pitch) {
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
+  const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict16x16_sse2\n");
+#endif
+
+  if (xoffset) {
+    if (yoffset) {
+      hfilter = vp9_six_tap_mmx[xoffset];
+      vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+                                   src_pixels_per_line, 1, 21, 32, hfilter);
+      vfilter = vp9_six_tap_mmx[yoffset];
+      vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
+                                   32, 16, 16, dst_pitch, vfilter);
+    } else {
+      /* First-pass only */
+      hfilter = vp9_six_tap_mmx[xoffset];
+      vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line,
+                                        dst_ptr, dst_pitch, 16, hfilter);
+    }
+  } else {
+    /* Second-pass only */
+    vfilter = vp9_six_tap_mmx[yoffset];
+    vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+                                 src_pixels_per_line, 21, 32);
+    vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
+                                 32, 16, 16, dst_pitch, vfilter);
+  }
+}
+
+void vp9_sixtap_predict8x8_sse2(unsigned char  *src_ptr,
+                                int  src_pixels_per_line,
+                                int  xoffset,
+                                int  yoffset,
+                                unsigned char *dst_ptr,
+                                int  dst_pitch) {
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+  const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict8x8_sse2\n");
+#endif
+
+  if (xoffset) {
+    if (yoffset) {
+      hfilter = vp9_six_tap_mmx[xoffset];
+      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+                                  src_pixels_per_line, 1, 13, 16, hfilter);
+      vfilter = vp9_six_tap_mmx[yoffset];
+      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
+                                  16, 8, 8, dst_pitch, vfilter);
+    } else {
+      /* First-pass only */
+      hfilter = vp9_six_tap_mmx[xoffset];
+      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
+                                       dst_ptr, dst_pitch, 8, hfilter);
+    }
+  } else {
+    /* Second-pass only */
+    vfilter = vp9_six_tap_mmx[yoffset];
+    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
+                                     src_pixels_per_line,
+                                     dst_ptr, dst_pitch, 8, vfilter);
+  }
+}
+
+void vp9_sixtap_predict8x4_sse2(unsigned char  *src_ptr,
+                                int  src_pixels_per_line,
+                                int  xoffset,
+                                int  yoffset,
+                                unsigned char *dst_ptr,
+                                int  dst_pitch) {
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+  const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict8x4_sse2\n");
+#endif
+
+  if (xoffset) {
+    if (yoffset) {
+      hfilter = vp9_six_tap_mmx[xoffset];
+      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+                                  src_pixels_per_line, 1, 9, 16, hfilter);
+      vfilter = vp9_six_tap_mmx[yoffset];
+      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
+                                  16, 8, 4, dst_pitch, vfilter);
+    } else {
+      /* First-pass only */
+      hfilter = vp9_six_tap_mmx[xoffset];
+      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
+                                       dst_ptr, dst_pitch, 4, hfilter);
+    }
+  } else {
+    /* Second-pass only */
+    vfilter = vp9_six_tap_mmx[yoffset];
+    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
+                                     src_pixels_per_line,
+                                     dst_ptr, dst_pitch, 4, vfilter);
+  }
+}
+#endif
+
+#if HAVE_SSSE3
+extern void vp9_filter_block1d8_h6_ssse3(unsigned char  *src_ptr,
+                                         unsigned int    src_pixels_per_line,
+                                         unsigned char  *output_ptr,
+                                         unsigned int    output_pitch,
+                                         unsigned int    output_height,
+                                         unsigned int    vp9_filter_index);
+
+extern void vp9_filter_block1d16_h6_ssse3(unsigned char  *src_ptr,
+                                          unsigned int    src_pixels_per_line,
+                                          unsigned char  *output_ptr,
+                                          unsigned int    output_pitch,
+                                          unsigned int    output_height,
+                                          unsigned int    vp9_filter_index);
+
+extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
+                                          unsigned int   src_pitch,
+                                          unsigned char *output_ptr,
+                                          unsigned int   out_pitch,
+                                          unsigned int   output_height,
+                                          unsigned int   vp9_filter_index);
+
+extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
+                                         unsigned int   src_pitch,
+                                         unsigned char *output_ptr,
+                                         unsigned int   out_pitch,
+                                         unsigned int   output_height,
+                                         unsigned int   vp9_filter_index);
+
+extern void vp9_filter_block1d4_h6_ssse3(unsigned char  *src_ptr,
+                                         unsigned int    src_pixels_per_line,
+                                         unsigned char  *output_ptr,
+                                         unsigned int    output_pitch,
+                                         unsigned int    output_height,
+                                         unsigned int    vp9_filter_index);
+
+extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
+                                         unsigned int   src_pitch,
+                                         unsigned char *output_ptr,
+                                         unsigned int   out_pitch,
+                                         unsigned int   output_height,
+                                         unsigned int   vp9_filter_index);
+
+void vp9_sixtap_predict16x16_ssse3(unsigned char  *src_ptr,
+                                   int  src_pixels_per_line,
+                                   int  xoffset,
+                                   int  yoffset,
+                                   unsigned char *dst_ptr,
+                                   int  dst_pitch) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24);
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict16x16_ssse3\n");
+#endif
+
+  if (xoffset) {
+    if (yoffset) {
+      vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                    src_pixels_per_line,
+                                    fdata2, 16, 21, xoffset);
+      vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch,
+                                    16, yoffset);
+    } else {
+      /* First-pass only */
+      vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
+                                    dst_ptr, dst_pitch, 16, xoffset);
+    }
+  } else {
+    /* Second-pass only */
+    vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                  src_pixels_per_line,
+                                  dst_ptr, dst_pitch, 16, yoffset);
+  }
+}
+
+void vp9_sixtap_predict8x8_ssse3(unsigned char  *src_ptr,
+                                 int  src_pixels_per_line,
+                                 int  xoffset,
+                                 int  yoffset,
+                                 unsigned char *dst_ptr,
+                                 int  dst_pitch) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict8x8_ssse3\n");
+#endif
+
+  if (xoffset) {
+    if (yoffset) {
+      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                   src_pixels_per_line, fdata2, 8, 13, xoffset);
+      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset);
+    } else {
+      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+                                   dst_ptr, dst_pitch, 8, xoffset);
+    }
+  } else {
+    /* Second-pass only */
+    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                 src_pixels_per_line,
+                                 dst_ptr, dst_pitch, 8, yoffset);
+  }
+}
+
+void vp9_sixtap_predict8x4_ssse3(unsigned char  *src_ptr,
+                                 int  src_pixels_per_line,
+                                 int  xoffset,
+                                 int  yoffset,
+                                 unsigned char *dst_ptr,
+                                 int  dst_pitch) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict8x4_ssse3\n");
+#endif
+
+  if (xoffset) {
+    if (yoffset) {
+      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                   src_pixels_per_line, fdata2, 8, 9, xoffset);
+      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset);
+    } else {
+      /* First-pass only */
+      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+                                   dst_ptr, dst_pitch, 4, xoffset);
+    }
+  } else {
+    /* Second-pass only */
+    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                 src_pixels_per_line,
+                                 dst_ptr, dst_pitch, 4, yoffset);
+  }
+}
+
+void vp9_sixtap_predict4x4_ssse3(unsigned char  *src_ptr,
+                                 int   src_pixels_per_line,
+                                 int  xoffset,
+                                 int  yoffset,
+                                 unsigned char *dst_ptr,
+                                 int dst_pitch) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9);
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict4x4_ssse3\n");
+#endif
+
+  if (xoffset) {
+    if (yoffset) {
+      vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                   src_pixels_per_line, fdata2, 4, 9, xoffset);
+      vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset);
+    } else {
+      vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
+                                   dst_ptr, dst_pitch, 4, xoffset);
+    }
+  } else {
+    vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                 src_pixels_per_line,
+                                 dst_ptr, dst_pitch, 4, yoffset);
+  }
+}
+
+void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
+                                   const unsigned int src_pitch,
+                                   unsigned char *output_ptr,
+                                   unsigned int out_pitch,
+                                   unsigned int output_height,
+                                   const short *filter);
+
+void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
+                                   const unsigned int src_pitch,
+                                   unsigned char *output_ptr,
+                                   unsigned int out_pitch,
+                                   unsigned int output_height,
+                                   const short *filter);
+
+void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr,
+                                      const unsigned int src_stride,
+                                      const short *hfilter_aligned16,
+                                      const short *vfilter_aligned16,
+                                      unsigned char *dst_ptr,
+                                      unsigned int dst_stride) {
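+  /* a tap-3 value of 128 marks the full-pel (pass-through) filter, so the
+   * corresponding pass can be skipped */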
+  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
+    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+
+    vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
+                                  fdata2, 16, 23, hfilter_aligned16);
+    vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16,
+                                  vfilter_aligned16);
+  } else {
+    if (hfilter_aligned16[3] != 128) {
+      vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride,
+                                    16, hfilter_aligned16);
+    } else {
+      vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
+                                    dst_ptr, dst_stride, 16, vfilter_aligned16);
+    }
+  }
+}
+
+void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
+                                  const unsigned int src_pitch,
+                                  unsigned char *output_ptr,
+                                  unsigned int out_pitch,
+                                  unsigned int output_height,
+                                  const short *filter);
+
+void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
+                                  const unsigned int src_pitch,
+                                  unsigned char *output_ptr,
+                                  unsigned int out_pitch,
+                                  unsigned int output_height,
+                                  const short *filter);
+
+void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr,
+                                    const unsigned int src_stride,
+                                    const short *hfilter_aligned16,
+                                    const short *vfilter_aligned16,
+                                    unsigned char *dst_ptr,
+                                    unsigned int dst_stride) {
+  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
+    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+
+    vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
+                                 fdata2, 16, 15, hfilter_aligned16);
+    vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8,
+                                 vfilter_aligned16);
+  } else {
+    if (hfilter_aligned16[3] != 128) {
+      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8,
+                                   hfilter_aligned16);
+    } else {
+      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
+                                   dst_ptr, dst_stride, 8, vfilter_aligned16);
+    }
+  }
+}
+
+void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr,
+                                    const unsigned int src_stride,
+                                    const short *hfilter_aligned16,
+                                    const short *vfilter_aligned16,
+                                    unsigned char *dst_ptr,
+                                    unsigned int dst_stride) {
+  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
+    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+
+    vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
+                                 fdata2, 16, 11, hfilter_aligned16);
+    vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4,
+                                 vfilter_aligned16);
+  } else {
+    if (hfilter_aligned16[3] != 128) {
+      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4,
+                                   hfilter_aligned16);
+    } else {
+      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
+                                   dst_ptr, dst_stride, 4, vfilter_aligned16);
+    }
+  }
+}
+#endif
--- /dev/null
+++ b/vp9/common/x86/x86_systemdependent.c
@@ -1,0 +1,108 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_ports/x86.h"
+#include "vp9/common/subpixel.h"
+#include "vp9/common/loopfilter.h"
+#include "vp9/common/idct.h"
+#include "vp9/common/pragmas.h"
+#include "vp9/common/onyxc_int.h"
+
+void vp9_arch_x86_common_init(VP9_COMMON *ctx) {
+#if CONFIG_RUNTIME_CPU_DETECT
+  VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
+  int flags = x86_simd_caps();
+
+  /* Note:
+   *
+   * This platform can be built without runtime CPU detection as well. If
+   * you modify any of the function mappings present in this file, be sure
+   * to also update them in the static mappings (<arch>/filename_<arch>.h).
+   */
+
+  /* Override default functions with fastest ones for this CPU. */
+#if HAVE_MMX
+// The commented-out functions below need to be rewritten for vpx.
+  if (flags & HAS_MMX) {
+    rtcd->idct.idct1        = vp9_short_idct4x4llm_1_mmx;
+    rtcd->idct.idct16       = vp9_short_idct4x4llm_mmx;
+    rtcd->idct.idct1_scalar_add = vp9_dc_only_idct_add_mmx;
+    // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_mmx;
+    // rtcd->idct.iwalsh1     = vp9_short_inv_walsh4x4_1_mmx;
+
+    /* Disabled due to unsupported enhanced interpolation/high_prec mv
+    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_mmx;
+    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_mmx;
+    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_mmx;
+    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict4x4_mmx;
+    */
+    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_mmx;
+    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_mmx;
+    rtcd->subpix.bilinear8x4   = vp9_bilinear_predict8x4_mmx;
+    rtcd->subpix.bilinear4x4   = vp9_bilinear_predict4x4_mmx;
+
+#if CONFIG_POSTPROC
+    rtcd->postproc.down        = vp9_mbpost_proc_down_mmx;
+    /*rtcd->postproc.across      = vp9_mbpost_proc_across_ip_c;*/
+    rtcd->postproc.downacross  = vp9_post_proc_down_and_across_mmx;
+    rtcd->postproc.addnoise    = vp9_plane_add_noise_mmx;
+#endif
+  }
+
+#endif
+#if HAVE_SSE2
+
+  if (flags & HAS_SSE2) {
+    // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_sse2;
+
+    /* Disabled due to unsupported enhanced interpolation/high_prec mv
+    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_sse2;
+    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_sse2;
+    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_sse2;
+    */
+    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_sse2;
+    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_sse2;
+
+#if CONFIG_POSTPROC
+    rtcd->postproc.down        = vp9_mbpost_proc_down_xmm;
+    rtcd->postproc.across      = vp9_mbpost_proc_across_ip_xmm;
+    rtcd->postproc.downacross  = vp9_post_proc_down_and_across_xmm;
+    rtcd->postproc.addnoise    = vp9_plane_add_noise_wmt;
+#endif
+  }
+
+#endif
+
+#if HAVE_SSSE3
+
+  if (flags & HAS_SSSE3) {
+    /* Disabled due to unsupported enhanced interpolation/high_prec mv
+    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_ssse3;
+    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_ssse3;
+    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_ssse3;
+    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict4x4_ssse3;
+    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_ssse3;
+    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_ssse3;
+    */
+
+    /* these are disabled because of unsupported diagonal pred modes
+    rtcd->recon.build_intra_predictors_mbuv =
+      vp9_build_intra_predictors_mbuv_ssse3;
+    rtcd->recon.build_intra_predictors_mbuv_s =
+      vp9_build_intra_predictors_mbuv_s_ssse3;
+    */
+  }
+#endif
+
+#endif
+}
--- /dev/null
+++ b/vp9/decoder/arm/armv6/dequant_dc_idct_v6.asm
@@ -1,0 +1,218 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_dequant_dc_idct_add_v6|
+
+    AREA |.text|, CODE, READONLY
+
+;void vp8_dequant_dc_idct_add_v6(short *input, short *dq, unsigned char *pred,
+;                                unsigned char *dest, int pitch, int stride, int Dc)
+; r0 = input
+; r1 = dq
+; r2 = pred
+; r3 = dest
+; sp + 36 = pitch   (sp + 40 after the extra "sub sp, sp, #4" below)
+; sp + 40 = stride  (sp + 44 after)
+; sp + 44 = Dc      (sp + 48 after)
+
+
+|vp8_dequant_dc_idct_add_v6| PROC
+    stmdb   sp!, {r4-r11, lr}
+
+    ldr     r6, [sp, #44]
+
+    ldr     r4, [r0]                ;input
+    ldr     r5, [r1], #4            ;dq
+
+    sub     sp, sp, #4
+    str     r3, [sp]
+
+    smultt  r7, r4, r5
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    mov     r12, #3
+
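+; four coefficients are handled above (the DC term is replaced by the Dc
+; argument rather than dequantized); each trip through the loop below
+; dequantizes four more, covering all 16 coefficients of the 4x4 block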
+vp8_dequant_dc_add_loop
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    subs    r12, r12, #1
+
+    ldrne   r4, [r0, #4]
+    ldrne   r5, [r1], #4
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    bne     vp8_dequant_dc_add_loop
+
+    sub     r0, r0, #32
+    mov     r1, r0
+
+; short_idct4x4llm_v6_dual
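+; dual 4x4 inverse DCT: loop1 transforms two columns per pass, loop2
+; transforms two rows per pass, adds the prediction and stores to dest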
+    ldr     r3, cospi8sqrt2minus1
+    ldr     r4, sinpi8sqrt2
+    ldr     r6, [r0, #8]
+    mov     r5, #2
+vp8_dequant_dc_idct_loop1_v6
+    ldr     r12, [r0, #24]
+    ldr     r14, [r0, #16]
+    smulwt  r9, r3, r6
+    smulwb  r7, r3, r6
+    smulwt  r10, r4, r6
+    smulwb  r8, r4, r6
+    pkhbt   r7, r7, r9, lsl #16
+    smulwt  r11, r3, r12
+    pkhbt   r8, r8, r10, lsl #16
+    uadd16  r6, r6, r7
+    smulwt  r7, r4, r12
+    smulwb  r9, r3, r12
+    smulwb  r10, r4, r12
+    subs    r5, r5, #1
+    pkhbt   r9, r9, r11, lsl #16
+    ldr     r11, [r0], #4
+    pkhbt   r10, r10, r7, lsl #16
+    uadd16  r7, r12, r9
+    usub16  r7, r8, r7
+    uadd16  r6, r6, r10
+    uadd16  r10, r11, r14
+    usub16  r8, r11, r14
+    uadd16  r9, r10, r6
+    usub16  r10, r10, r6
+    uadd16  r6, r8, r7
+    usub16  r7, r8, r7
+    str     r6, [r1, #8]
+    ldrne   r6, [r0, #8]
+    str     r7, [r1, #16]
+    str     r10, [r1, #24]
+    str     r9, [r1], #4
+    bne     vp8_dequant_dc_idct_loop1_v6
+
+    mov     r5, #2
+    sub     r0, r1, #8
+vp8_dequant_dc_idct_loop2_v6
+    ldr     r6, [r0], #4
+    ldr     r7, [r0], #4
+    ldr     r8, [r0], #4
+    ldr     r9, [r0], #4
+    smulwt  r1, r3, r6
+    smulwt  r12, r4, r6
+    smulwt  lr, r3, r8
+    smulwt  r10, r4, r8
+    pkhbt   r11, r8, r6, lsl #16
+    pkhbt   r1, lr, r1, lsl #16
+    pkhbt   r12, r10, r12, lsl #16
+    pkhtb   r6, r6, r8, asr #16
+    uadd16  r6, r1, r6
+    pkhbt   lr, r9, r7, lsl #16
+    uadd16  r10, r11, lr
+    usub16  lr, r11, lr
+    pkhtb   r8, r7, r9, asr #16
+    subs    r5, r5, #1
+    smulwt  r1, r3, r8
+    smulwb  r7, r3, r8
+    smulwt  r11, r4, r8
+    smulwb  r9, r4, r8
+    pkhbt   r1, r7, r1, lsl #16
+    uadd16  r8, r1, r8
+    pkhbt   r11, r9, r11, lsl #16
+    usub16  r1, r12, r8
+    uadd16  r8, r11, r6
+    ldr     r9, c0x00040004
+    ldr     r12, [sp, #40]
+    uadd16  r6, r10, r8
+    usub16  r7, r10, r8
+    uadd16  r7, r7, r9
+    uadd16  r6, r6, r9
+    uadd16  r10, r14, r1
+    usub16  r1, r14, r1
+    uadd16  r10, r10, r9
+    uadd16  r1, r1, r9
+    ldr     r11, [r2], r12
+    mov     r8, r7, asr #3
+    pkhtb   r9, r8, r10, asr #19
+    mov     r8, r1, asr #3
+    pkhtb   r8, r8, r6, asr #19
+    uxtb16  lr, r11, ror #8
+    qadd16  r9, r9, lr
+    uxtb16  lr, r11
+    qadd16  r8, r8, lr
+    usat16  r9, #8, r9
+    usat16  r8, #8, r8
+    orr     r9, r8, r9, lsl #8
+    ldr     r11, [r2], r12
+    ldr     lr, [sp]
+    ldr     r12, [sp, #44]
+    mov     r7, r7, lsl #16
+    mov     r1, r1, lsl #16
+    mov     r10, r10, lsl #16
+    mov     r6, r6, lsl #16
+    mov     r7, r7, asr #3
+    pkhtb   r7, r7, r10, asr #19
+    mov     r1, r1, asr #3
+    pkhtb   r1, r1, r6, asr #19
+    uxtb16  r8, r11, ror #8
+    qadd16  r7, r7, r8
+    uxtb16  r8, r11
+    qadd16  r1, r1, r8
+    usat16  r7, #8, r7
+    usat16  r1, #8, r1
+    orr     r1, r1, r7, lsl #8
+    str     r9, [lr], r12
+    str     r1, [lr], r12
+    str     lr, [sp]
+    bne     vp8_dequant_dc_idct_loop2_v6
+
+; vpx_memset
+    sub     r0, r0, #32
+    add     sp, sp, #4
+
+    mov     r12, #0
+    str     r12, [r0]
+    str     r12, [r0, #4]
+    str     r12, [r0, #8]
+    str     r12, [r0, #12]
+    str     r12, [r0, #16]
+    str     r12, [r0, #20]
+    str     r12, [r0, #24]
+    str     r12, [r0, #28]
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ; |vp8_dequant_dc_idct_add_v6|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x00004E7B
+sinpi8sqrt2       DCD 0x00008A8C
+c0x00040004       DCD 0x00040004
+
+    END
--- /dev/null
+++ b/vp9/decoder/arm/armv6/dequant_idct_v6.asm
@@ -1,0 +1,196 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+    EXPORT |vp8_dequant_idct_add_v6|
+
+    AREA |.text|, CODE, READONLY
+;void vp8_dequant_idct_add_v6(short *input, short *dq, unsigned char *pred,
+;                             unsigned char *dest, int pitch, int stride)
+; r0 = input
+; r1 = dq
+; r2 = pred
+; r3 = dest
+; sp + 36 = pitch   (sp + 40 after the extra "sub sp, sp, #4" below)
+; sp + 40 = stride  (sp + 44 after)
+
+
+|vp8_dequant_idct_add_v6| PROC
+    stmdb   sp!, {r4-r11, lr}
+
+    ldr     r4, [r0]                ;input
+    ldr     r5, [r1], #4            ;dq
+
+    sub     sp, sp, #4
+    str     r3, [sp]
+
+    mov     r12, #4
+
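+; each trip through the loop dequantizes four coefficients; four trips
+; cover the 16 coefficients of the 4x4 block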
+vp8_dequant_add_loop
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    ldr     r4, [r0, #4]            ;input
+    ldr     r5, [r1], #4            ;dq
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    smulbb  r6, r4, r5
+    smultt  r7, r4, r5
+
+    subs    r12, r12, #1
+
+    ldrne   r4, [r0, #4]
+    ldrne   r5, [r1], #4
+
+    strh    r6, [r0], #2
+    strh    r7, [r0], #2
+
+    bne     vp8_dequant_add_loop
+
+    sub     r0, r0, #32
+    mov     r1, r0
+
+; short_idct4x4llm_v6_dual
+    ldr     r3, cospi8sqrt2minus1
+    ldr     r4, sinpi8sqrt2
+    ldr     r6, [r0, #8]
+    mov     r5, #2
+vp8_dequant_idct_loop1_v6
+    ldr     r12, [r0, #24]
+    ldr     r14, [r0, #16]
+    smulwt  r9, r3, r6
+    smulwb  r7, r3, r6
+    smulwt  r10, r4, r6
+    smulwb  r8, r4, r6
+    pkhbt   r7, r7, r9, lsl #16
+    smulwt  r11, r3, r12
+    pkhbt   r8, r8, r10, lsl #16
+    uadd16  r6, r6, r7
+    smulwt  r7, r4, r12
+    smulwb  r9, r3, r12
+    smulwb  r10, r4, r12
+    subs    r5, r5, #1
+    pkhbt   r9, r9, r11, lsl #16
+    ldr     r11, [r0], #4
+    pkhbt   r10, r10, r7, lsl #16
+    uadd16  r7, r12, r9
+    usub16  r7, r8, r7
+    uadd16  r6, r6, r10
+    uadd16  r10, r11, r14
+    usub16  r8, r11, r14
+    uadd16  r9, r10, r6
+    usub16  r10, r10, r6
+    uadd16  r6, r8, r7
+    usub16  r7, r8, r7
+    str     r6, [r1, #8]
+    ldrne   r6, [r0, #8]
+    str     r7, [r1, #16]
+    str     r10, [r1, #24]
+    str     r9, [r1], #4
+    bne     vp8_dequant_idct_loop1_v6
+
+    mov     r5, #2
+    sub     r0, r1, #8
+vp8_dequant_idct_loop2_v6
+    ldr     r6, [r0], #4
+    ldr     r7, [r0], #4
+    ldr     r8, [r0], #4
+    ldr     r9, [r0], #4
+    smulwt  r1, r3, r6
+    smulwt  r12, r4, r6
+    smulwt  lr, r3, r8
+    smulwt  r10, r4, r8
+    pkhbt   r11, r8, r6, lsl #16
+    pkhbt   r1, lr, r1, lsl #16
+    pkhbt   r12, r10, r12, lsl #16
+    pkhtb   r6, r6, r8, asr #16
+    uadd16  r6, r1, r6
+    pkhbt   lr, r9, r7, lsl #16
+    uadd16  r10, r11, lr
+    usub16  lr, r11, lr
+    pkhtb   r8, r7, r9, asr #16
+    subs    r5, r5, #1
+    smulwt  r1, r3, r8
+    smulwb  r7, r3, r8
+    smulwt  r11, r4, r8
+    smulwb  r9, r4, r8
+    pkhbt   r1, r7, r1, lsl #16
+    uadd16  r8, r1, r8
+    pkhbt   r11, r9, r11, lsl #16
+    usub16  r1, r12, r8
+    uadd16  r8, r11, r6
+    ldr     r9, c0x00040004
+    ldr     r12, [sp, #40]
+    uadd16  r6, r10, r8
+    usub16  r7, r10, r8
+    uadd16  r7, r7, r9
+    uadd16  r6, r6, r9
+    uadd16  r10, r14, r1
+    usub16  r1, r14, r1
+    uadd16  r10, r10, r9
+    uadd16  r1, r1, r9
+    ldr     r11, [r2], r12
+    mov     r8, r7, asr #3
+    pkhtb   r9, r8, r10, asr #19
+    mov     r8, r1, asr #3
+    pkhtb   r8, r8, r6, asr #19
+    uxtb16  lr, r11, ror #8
+    qadd16  r9, r9, lr
+    uxtb16  lr, r11
+    qadd16  r8, r8, lr
+    usat16  r9, #8, r9
+    usat16  r8, #8, r8
+    orr     r9, r8, r9, lsl #8
+    ldr     r11, [r2], r12
+    ldr     lr, [sp]
+    ldr     r12, [sp, #44]
+    mov     r7, r7, lsl #16
+    mov     r1, r1, lsl #16
+    mov     r10, r10, lsl #16
+    mov     r6, r6, lsl #16
+    mov     r7, r7, asr #3
+    pkhtb   r7, r7, r10, asr #19
+    mov     r1, r1, asr #3
+    pkhtb   r1, r1, r6, asr #19
+    uxtb16  r8, r11, ror #8
+    qadd16  r7, r7, r8
+    uxtb16  r8, r11
+    qadd16  r1, r1, r8
+    usat16  r7, #8, r7
+    usat16  r1, #8, r1
+    orr     r1, r1, r7, lsl #8
+    str     r9, [lr], r12
+    str     r1, [lr], r12
+    str     lr, [sp]
+    bne     vp8_dequant_idct_loop2_v6
+
+; vpx_memset(input, 0, 32) -- clear the coefficient block
+    sub     r0, r0, #32
+    add     sp, sp, #4
+
+    mov     r12, #0
+    str     r12, [r0]
+    str     r12, [r0, #4]
+    str     r12, [r0, #8]
+    str     r12, [r0, #12]
+    str     r12, [r0, #16]
+    str     r12, [r0, #20]
+    str     r12, [r0, #24]
+    str     r12, [r0, #28]
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP    ; |vp8_dequant_idct_add_v6|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x00004E7B
+sinpi8sqrt2       DCD 0x00008A8C
+c0x00040004       DCD 0x00040004
+
+    END
--- /dev/null
+++ b/vp9/decoder/arm/armv6/dequantize_v6.asm
@@ -1,0 +1,69 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dequantize_b_loop_v6|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+;-------------------------------
+;void   vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+; r0    short *Q,
+; r1    short *DQC
+; r2    short *DQ
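+;
+; computes DQ[i] = Q[i] * DQC[i] for all 16 coefficients, eight per loop
+; iteration.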
+|vp8_dequantize_b_loop_v6| PROC
+    stmdb   sp!, {r4-r9, lr}
+
+    ldr     r3, [r0]                ;load Q
+    ldr     r4, [r1]                ;load DQC
+    ldr     r5, [r0, #4]
+    ldr     r6, [r1, #4]
+
+    mov     r12, #2                 ;loop counter
+
+dequant_loop
+    smulbb  r7, r3, r4              ;multiply
+    smultt  r8, r3, r4
+    smulbb  r9, r5, r6
+    smultt  lr, r5, r6
+
+    ldr     r3, [r0, #8]
+    ldr     r4, [r1, #8]
+    ldr     r5, [r0, #12]
+    ldr     r6, [r1, #12]
+
+    strh    r7, [r2], #2            ;store result
+    smulbb  r7, r3, r4              ;multiply
+    strh    r8, [r2], #2
+    smultt  r8, r3, r4
+    strh    r9, [r2], #2
+    smulbb  r9, r5, r6
+    strh    lr, [r2], #2
+    smultt  lr, r5, r6
+
+    subs    r12, r12, #1
+
+    add     r0, r0, #16
+    add     r1, r1, #16
+
+    ldrne       r3, [r0]
+    strh    r7, [r2], #2            ;store result
+    ldrne       r4, [r1]
+    strh    r8, [r2], #2
+    ldrne       r5, [r0, #4]
+    strh    r9, [r2], #2
+    ldrne       r6, [r1, #4]
+    strh    lr, [r2], #2
+
+    bne     dequant_loop
+
+    ldmia   sp!, {r4-r9, pc}
+    ENDP    ;|vp8_dequantize_b_loop_v6|
+
+    END
--- /dev/null
+++ b/vp9/decoder/arm/armv6/idct_blk_v6.c
@@ -1,0 +1,136 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/decoder/dequantize.h"
+
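+/* An eob (end of block) value of 0 or 1 means that at most the DC
+ * coefficient of the 4x4 block is non-zero, so the cheaper DC-only
+ * IDCT path can be taken.
+ */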
+void vp8_dequant_dc_idct_add_y_block_v6
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dst, int stride, char *eobs, short *dc) {
+  int i;
+
+  for (i = 0; i < 4; i++) {
+    if (eobs[0] > 1)
+      vp8_dequant_dc_idct_add_v6(q, dq, pre, dst, 16, stride, dc[0]);
+    else
+      vp8_dc_only_idct_add_v6(dc[0], pre, dst, 16, stride);
+
+    if (eobs[1] > 1)
+      vp8_dequant_dc_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride, dc[1]);
+    else
+      vp8_dc_only_idct_add_v6(dc[1], pre + 4, dst + 4, 16, stride);
+
+    if (eobs[2] > 1)
+      vp8_dequant_dc_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride, dc[2]);
+    else
+      vp8_dc_only_idct_add_v6(dc[2], pre + 8, dst + 8, 16, stride);
+
+    if (eobs[3] > 1)
+      vp8_dequant_dc_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride, dc[3]);
+    else
+      vp8_dc_only_idct_add_v6(dc[3], pre + 12, dst + 12, 16, stride);
+
+    q    += 64;
+    dc   += 4;
+    pre  += 64;
+    dst  += 4 * stride;
+    eobs += 4;
+  }
+}
+
+void vp8_dequant_idct_add_y_block_v6
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dst, int stride, char *eobs) {
+  int i;
+
+  for (i = 0; i < 4; i++) {
+    if (eobs[0] > 1)
+      vp8_dequant_idct_add_v6(q, dq, pre, dst, 16, stride);
+    else {
+      vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dst, 16, stride);
+      ((int *)q)[0] = 0;
+    }
+
+    if (eobs[1] > 1)
+      vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride);
+    else {
+      vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dst + 4, 16, stride);
+      ((int *)(q + 16))[0] = 0;
+    }
+
+    if (eobs[2] > 1)
+      vp8_dequant_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride);
+    else {
+      vp8_dc_only_idct_add_v6(q[32]*dq[0], pre + 8, dst + 8, 16, stride);
+      ((int *)(q + 32))[0] = 0;
+    }
+
+    if (eobs[3] > 1)
+      vp8_dequant_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride);
+    else {
+      vp8_dc_only_idct_add_v6(q[48]*dq[0], pre + 12, dst + 12, 16, stride);
+      ((int *)(q + 48))[0] = 0;
+    }
+
+    q    += 64;
+    pre  += 64;
+    dst  += 4 * stride;
+    eobs += 4;
+  }
+}
+
+void vp8_dequant_idct_add_uv_block_v6
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) {
+  int i;
+
+  for (i = 0; i < 2; i++) {
+    if (eobs[0] > 1)
+      vp8_dequant_idct_add_v6(q, dq, pre, dstu, 8, stride);
+    else {
+      vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstu, 8, stride);
+      ((int *)q)[0] = 0;
+    }
+
+    if (eobs[1] > 1)
+      vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstu + 4, 8, stride);
+    else {
+      vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstu + 4, 8, stride);
+      ((int *)(q + 16))[0] = 0;
+    }
+
+    q    += 32;
+    pre  += 32;
+    dstu += 4 * stride;
+    eobs += 2;
+  }
+
+  for (i = 0; i < 2; i++) {
+    if (eobs[0] > 1)
+      vp8_dequant_idct_add_v6(q, dq, pre, dstv, 8, stride);
+    else {
+      vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstv, 8, stride);
+      ((int *)q)[0] = 0;
+    }
+
+    if (eobs[1] > 1)
+      vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstv + 4, 8, stride);
+    else {
+      vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstv + 4, 8, stride);
+      ((int *)(q + 16))[0] = 0;
+    }
+
+    q    += 32;
+    pre  += 32;
+    dstv += 4 * stride;
+    eobs += 2;
+  }
+}
--- /dev/null
+++ b/vp9/decoder/arm/dequantize_arm.c
@@ -1,0 +1,44 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vp9/decoder/dequantize.h"
+#include "vp9/common/idct.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_ARMV7
+extern void vp9_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
+#endif
+
+#if HAVE_ARMV6
+extern void vp9_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+#endif
+
+#if HAVE_ARMV7
+
+void vp9_dequantize_b_neon(BLOCKD *d) {
+  short *DQ  = d->dqcoeff;
+  short *Q   = d->qcoeff;
+  short *DQC = d->dequant;
+
+  vp9_dequantize_b_loop_neon(Q, DQC, DQ);
+}
+#endif
+
+#if HAVE_ARMV6
+void vp9_dequantize_b_v6(BLOCKD *d) {
+  short *DQ  = d->dqcoeff;
+  short *Q   = d->qcoeff;
+  short *DQC = d->dequant;
+
+  vp9_dequantize_b_loop_v6(Q, DQC, DQ);
+}
+#endif
--- /dev/null
+++ b/vp9/decoder/arm/neon/dequant_idct_neon.asm
@@ -1,0 +1,129 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dequant_idct_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *pred,
+;                               unsigned char *dest, int pitch, int stride)
+; r0    short *input,
+; r1    short *dq,
+; r2    unsigned char *pred
+; r3    unsigned char *dest
+; sp    int pitch
+; sp+4  int stride
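+;
+; dequantizes the 4x4 coefficient block, performs the inverse DCT, adds
+; the 4x4 prediction (read with pitch) and writes the result to dest
+; (written with stride); the coefficient block is zeroed along the way.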
+
+|vp8_dequant_idct_add_neon| PROC
+    vld1.16         {q3, q4}, [r0]
+    vld1.16         {q5, q6}, [r1]
+    ldr             r1, [sp]                ; pitch
+    vld1.32         {d14[0]}, [r2], r1
+    vld1.32         {d14[1]}, [r2], r1
+    vld1.32         {d15[0]}, [r2], r1
+    vld1.32         {d15[1]}, [r2]
+
+    ldr             r1, [sp, #4]            ; stride
+
+    adr             r12, cospi8sqrt2minus1  ; pointer to the first constant
+
+    vmul.i16        q1, q3, q5              ;input for short_idct4x4llm_neon
+    vmul.i16        q2, q4, q6
+
+;|short_idct4x4llm_neon| PROC
+    vld1.16         {d0}, [r12]
+    vswp            d3, d4                  ;q2(vp[4] vp[12])
+
+    vqdmulh.s16     q3, q2, d0[2]
+    vqdmulh.s16     q4, q2, d0[0]
+
+    vqadd.s16       d12, d2, d3             ;a1
+    vqsub.s16       d13, d2, d3             ;b1
+
+    vshr.s16        q3, q3, #1
+    vshr.s16        q4, q4, #1
+
+    vqadd.s16       q3, q3, q2
+    vqadd.s16       q4, q4, q2
+
+    vqsub.s16       d10, d6, d9             ;c1
+    vqadd.s16       d11, d7, d8             ;d1
+
+    vqadd.s16       d2, d12, d11
+    vqadd.s16       d3, d13, d10
+    vqsub.s16       d4, d13, d10
+    vqsub.s16       d5, d12, d11
+
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+; memset(input, 0, 32) -- 32 bytes
+    vmov.i16        q14, #0
+
+    vswp            d3, d4
+    vqdmulh.s16     q3, q2, d0[2]
+    vqdmulh.s16     q4, q2, d0[0]
+
+    vqadd.s16       d12, d2, d3             ;a1
+    vqsub.s16       d13, d2, d3             ;b1
+
+    vmov            q15, q14
+
+    vshr.s16        q3, q3, #1
+    vshr.s16        q4, q4, #1
+
+    vqadd.s16       q3, q3, q2
+    vqadd.s16       q4, q4, q2
+
+    vqsub.s16       d10, d6, d9             ;c1
+    vqadd.s16       d11, d7, d8             ;d1
+
+    vqadd.s16       d2, d12, d11
+    vqadd.s16       d3, d13, d10
+    vqsub.s16       d4, d13, d10
+    vqsub.s16       d5, d12, d11
+
+    vst1.16         {q14, q15}, [r0]
+
+    vrshr.s16       d2, d2, #3
+    vrshr.s16       d3, d3, #3
+    vrshr.s16       d4, d4, #3
+    vrshr.s16       d5, d5, #3
+
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vaddw.u8        q1, q1, d14
+    vaddw.u8        q2, q2, d15
+
+    vqmovun.s16     d0, q1
+    vqmovun.s16     d1, q2
+
+    vst1.32         {d0[0]}, [r3], r1
+    vst1.32         {d0[1]}, [r3], r1
+    vst1.32         {d1[0]}, [r3], r1
+    vst1.32         {d1[1]}, [r3]
+
+    bx             lr
+
+    ENDP           ; |vp8_dequant_idct_add_neon|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x4e7b4e7b
+sinpi8sqrt2       DCD 0x8a8c8a8c
+
+    END
--- /dev/null
+++ b/vp9/decoder/arm/neon/dequantizeb_neon.asm
@@ -1,0 +1,34 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_dequantize_b_loop_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    short *Q,
+; r1    short *DQC
+; r2    short *DQ
+|vp8_dequantize_b_loop_neon| PROC
+    vld1.16         {q0, q1}, [r0]
+    vld1.16         {q2, q3}, [r1]
+
+    vmul.i16        q4, q0, q2
+    vmul.i16        q5, q1, q3
+
+    vst1.16         {q4, q5}, [r2]
+
+    bx             lr
+
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/decoder/arm/neon/idct_blk_neon.c
@@ -1,0 +1,110 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/decoder/dequantize.h"
+
+/* place these declarations here because we don't want to maintain them
+ * outside of this scope
+ */
+void idct_dequant_dc_full_2x_neon
+(short *input, short *dq, unsigned char *pre, unsigned char *dst,
+ int stride, short *dc);
+void idct_dequant_dc_0_2x_neon
+(short *dc, unsigned char *pre, unsigned char *dst, int stride);
+void idct_dequant_full_2x_neon
+(short *q, short *dq, unsigned char *pre, unsigned char *dst,
+ int pitch, int stride);
+void idct_dequant_0_2x_neon
+(short *q, short dq, unsigned char *pre, int pitch,
+ unsigned char *dst, int stride);
+
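+/* Each short read from eobs covers two horizontally adjacent 4x4 blocks.
+ * eob values of 0 and 1 only set bit 0 of their byte, so a non-zero
+ * (eobs & 0xfefe) means at least one block of the pair needs the full
+ * dequant+IDCT; otherwise both take the DC-only path.
+ */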
+void vp8_dequant_dc_idct_add_y_block_neon
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dst, int stride, char *eobs, short *dc) {
+  int i;
+
+  for (i = 0; i < 4; i++) {
+    if (((short *)eobs)[0] & 0xfefe)
+      idct_dequant_dc_full_2x_neon(q, dq, pre, dst, stride, dc);
+    else
+      idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);
+
+    if (((short *)eobs)[1] & 0xfefe)
+      idct_dequant_dc_full_2x_neon(q + 32, dq, pre + 8, dst + 8, stride, dc + 2);
+    else
+      idct_dequant_dc_0_2x_neon(dc + 2, pre + 8, dst + 8, stride);
+
+    q    += 64;
+    dc   += 4;
+    pre  += 64;
+    dst  += 4 * stride;
+    eobs += 4;
+  }
+}
+
+void vp8_dequant_idct_add_y_block_neon
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dst, int stride, char *eobs) {
+  int i;
+
+  for (i = 0; i < 4; i++) {
+    if (((short *)eobs)[0] & 0xfefe)
+      idct_dequant_full_2x_neon(q, dq, pre, dst, 16, stride);
+    else
+      idct_dequant_0_2x_neon(q, dq[0], pre, 16, dst, stride);
+
+    if (((short *)eobs)[1] & 0xfefe)
+      idct_dequant_full_2x_neon(q + 32, dq, pre + 8, dst + 8, 16, stride);
+    else
+      idct_dequant_0_2x_neon(q + 32, dq[0], pre + 8, 16, dst + 8, stride);
+
+    q    += 64;
+    pre  += 64;
+    dst  += 4 * stride;
+    eobs += 4;
+  }
+}
+
+void vp8_dequant_idct_add_uv_block_neon
+(short *q, short *dq, unsigned char *pre,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) {
+  if (((short *)eobs)[0] & 0xfefe)
+    idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride);
+  else
+    idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride);
+
+  q    += 32;
+  pre  += 32;
+  dstu += 4 * stride;
+
+  if (((short *)eobs)[1] & 0xfefe)
+    idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride);
+  else
+    idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride);
+
+  q += 32;
+  pre += 32;
+
+  if (((short *)eobs)[2] & 0xfefe)
+    idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride);
+  else
+    idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride);
+
+  q    += 32;
+  pre  += 32;
+  dstv += 4 * stride;
+
+  if (((short *)eobs)[3] & 0xfefe)
+    idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride);
+  else
+    idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride);
+}
--- /dev/null
+++ b/vp9/decoder/arm/neon/idct_dequant_0_2x_neon.asm
@@ -1,0 +1,79 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_0_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
+;                            int pitch, unsigned char *dst, int stride);
+; r0   *q
+; r1   dq
+; r2   *pre
+; r3   pitch
+; sp   *dst
+; sp+4 stride
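+;
+; handles two adjacent 4x4 blocks whose only non-zero coefficient is the
+; DC: each block adds ((q[0] * dq + 4) >> 3) to its 4x4 prediction, and
+; the two coefficients are cleared afterwards.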
+|idct_dequant_0_2x_neon| PROC
+    add             r12, r2, #4
+    vld1.32         {d2[0]}, [r2], r3
+    vld1.32         {d2[1]}, [r2], r3
+    vld1.32         {d4[0]}, [r2], r3
+    vld1.32         {d4[1]}, [r2]
+    vld1.32         {d8[0]}, [r12], r3
+    vld1.32         {d8[1]}, [r12], r3
+    vld1.32         {d10[0]}, [r12], r3
+    vld1.32         {d10[1]}, [r12]
+
+    ldrh            r12, [r0]               ; lo q
+    ldrh            r2, [r0, #32]           ; hi q
+    mov             r3, #0
+    strh            r3, [r0]
+    strh            r3, [r0, #32]
+
+    sxth            r12, r12                ; lo
+    mul             r0, r12, r1
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    vdup.16         q0, r0
+    sxth            r2, r2                  ; hi
+    mul             r0, r2, r1
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    vdup.16         q3, r0
+
+    vaddw.u8        q1, q0, d2              ; lo
+    vaddw.u8        q2, q0, d4
+    vaddw.u8        q4, q3, d8              ; hi
+    vaddw.u8        q5, q3, d10
+
+    ldr             r2, [sp]                ; dst
+    ldr             r3, [sp, #4]            ; stride
+
+    vqmovun.s16     d2, q1                  ; lo
+    vqmovun.s16     d4, q2
+    vqmovun.s16     d8, q4                  ; hi
+    vqmovun.s16     d10, q5
+
+    add             r0, r2, #4
+    vst1.32         {d2[0]}, [r2], r3       ; lo
+    vst1.32         {d2[1]}, [r2], r3
+    vst1.32         {d4[0]}, [r2], r3
+    vst1.32         {d4[1]}, [r2]
+    vst1.32         {d8[0]}, [r0], r3       ; hi
+    vst1.32         {d8[1]}, [r0], r3
+    vst1.32         {d10[0]}, [r0], r3
+    vst1.32         {d10[1]}, [r0]
+
+    bx             lr
+
+    ENDP           ; |idct_dequant_0_2x_neon|
+    END
--- /dev/null
+++ b/vp9/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
@@ -1,0 +1,69 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_dc_0_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
+;                               unsigned char *dst, int stride);
+; r0  *dc
+; r1  *pre
+; r2  *dst
+; r3  stride
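+;
+; like idct_dequant_0_2x_neon, but the two DC values arrive already
+; dequantized in *dc; each is rounded ((dc + 4) >> 3) and added to the
+; prediction.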
+|idct_dequant_dc_0_2x_neon| PROC
+    ldr             r0, [r0]                ; *dc
+    mov             r12, #16
+
+    vld1.32         {d2[0]}, [r1], r12      ; lo
+    vld1.32         {d2[1]}, [r1], r12
+    vld1.32         {d4[0]}, [r1], r12
+    vld1.32         {d4[1]}, [r1]
+    sub             r1, r1, #44
+    vld1.32         {d8[0]}, [r1], r12      ; hi
+    vld1.32         {d8[1]}, [r1], r12
+    vld1.32         {d10[0]}, [r1], r12
+    vld1.32         {d10[1]}, [r1]
+
+    sxth            r1, r0                  ; lo *dc
+    add             r1, r1, #4
+    asr             r1, r1, #3
+    vdup.16         q0, r1
+    sxth            r0, r0, ror #16         ; hi *dc
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    vdup.16         q3, r0
+
+    vaddw.u8        q1, q0, d2              ; lo
+    vaddw.u8        q2, q0, d4
+    vaddw.u8        q4, q3, d8              ; hi
+    vaddw.u8        q5, q3, d10
+
+    vqmovun.s16     d2, q1                  ; lo
+    vqmovun.s16     d4, q2
+    vqmovun.s16     d8, q4                  ; hi
+    vqmovun.s16     d10, q5
+
+    add             r0, r2, #4
+    vst1.32         {d2[0]}, [r2], r3       ; lo
+    vst1.32         {d2[1]}, [r2], r3
+    vst1.32         {d4[0]}, [r2], r3
+    vst1.32         {d4[1]}, [r2]
+    vst1.32         {d8[0]}, [r0], r3       ; hi
+    vst1.32         {d8[1]}, [r0], r3
+    vst1.32         {d10[0]}, [r0], r3
+    vst1.32         {d10[1]}, [r0]
+
+    bx             lr
+
+    ENDP           ;|idct_dequant_dc_0_2x_neon|
+    END
--- /dev/null
+++ b/vp9/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
@@ -1,0 +1,205 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_dc_full_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
+;                                  unsigned char *dst, int stride, short *dc);
+; r0    *q,
+; r1    *dq,
+; r2    *pre
+; r3    *dst
+; sp    stride
+; sp+4  *dc
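+;
+; processes two horizontally adjacent 4x4 blocks ("l" and "r") at once,
+; one in each half of the q registers; *dc supplies the two already
+; dequantized DC coefficients, which overwrite element 0 of each block.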
+|idct_dequant_dc_full_2x_neon| PROC
+    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
+    vld1.16         {q2, q3}, [r0]          ; l q
+    mov             r1, #16                 ; pitch
+    add             r0, r0, #32
+    vld1.16         {q4, q5}, [r0]          ; r q
+    add             r12, r2, #4
+    ; interleave the predictors
+    vld1.32         {d28[0]}, [r2], r1      ; l pre
+    vld1.32         {d28[1]}, [r12], r1     ; r pre
+    vld1.32         {d29[0]}, [r2], r1
+    vld1.32         {d29[1]}, [r12], r1
+    vld1.32         {d30[0]}, [r2], r1
+    vld1.32         {d30[1]}, [r12], r1
+    vld1.32         {d31[0]}, [r2]
+    ldr             r1, [sp, #4]
+    vld1.32         {d31[1]}, [r12]
+
+    adr             r2, cospi8sqrt2minus1   ; pointer to the first constant
+
+    ldrh            r12, [r1], #2           ; lo *dc
+    ldrh            r1, [r1]                ; hi *dc
+
+    ; dequant: q[i] = q[i] * dq[i]
+    vmul.i16        q2, q2, q0
+    vmul.i16        q3, q3, q1
+    vmul.i16        q4, q4, q0
+    vmul.i16        q5, q5, q1
+
+    ; move dc up to neon and overwrite first element
+    vmov.16         d4[0], r12
+    vmov.16         d8[0], r1
+
+    vld1.16         {d0}, [r2]
+
+    ; q2: l0r0  q3: l8r8
+    ; q4: l4r4  q5: l12r12
+    vswp            d5, d8
+    vswp            d7, d10
+
+    ; _CONSTANTS_ * 4,12 >> 16
+    ; q6:  4 * sinpi : c1/temp1
+    ; q7: 12 * sinpi : d1/temp2
+    ; q8:  4 * cospi
+    ; q9: 12 * cospi
+    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q7, q5, d0[2]
+    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
+    vqdmulh.s16     q9, q5, d0[0]
+
+    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
+    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
+
+    ; vqdmulh only accepts signed values. this was a problem because
+    ; our constant had the high bit set, and was treated as a negative value.
+    ; vqdmulh also doubles the value before it shifts by 16. we need to
+    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
+    ; so we can pre-shift the constant right by 1 without losing precision.
+    ; this avoids having to shift again afterward, and also avoids the sign
+    ; issue. win win! for cospi8sqrt2minus1 the lowest bit is 1, so we would
+    ; lose precision if we pre-shifted it
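+    ; (with x a coefficient: sinpi8sqrt2 is stored pre-shifted as 0x4546,
+    ; i.e. 0x8a8c >> 1, so vqdmulh yields (2 * x * 0x4546) >> 16, which is
+    ; exactly the desired (x * 0x8a8c) >> 16 with no extra shift needed)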
+    vshr.s16        q8, q8, #1
+    vshr.s16        q9, q9, #1
+
+    ; q4:  4 +  4 * cospi : d1/temp1
+    ; q5: 12 + 12 * cospi : c1/temp2
+    vqadd.s16       q4, q4, q8
+    vqadd.s16       q5, q5, q9
+
+    ; c1 = temp1 - temp2
+    ; d1 = temp1 + temp2
+    vqsub.s16       q2, q6, q5
+    vqadd.s16       q3, q4, q7
+
+    ; [0]: a1+d1
+    ; [1]: b1+c1
+    ; [2]: b1-c1
+    ; [3]: a1-d1
+    vqadd.s16       q4, q10, q3
+    vqadd.s16       q5, q11, q2
+    vqsub.s16       q6, q11, q2
+    vqsub.s16       q7, q10, q3
+
+    ; rotate
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+    ; idct loop 2
+    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+    ; q6: l 2, 6,10,14 r 2, 6,10,14
+    ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+    ; q8:  1 * sinpi : c1/temp1
+    ; q9:  3 * sinpi : d1/temp2
+    ; q10: 1 * cospi
+    ; q11: 3 * cospi
+    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q9, q7, d0[2]
+    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
+    vqdmulh.s16     q11, q7, d0[0]
+
+    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
+    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
+
+    ; see note on shifting above
+    vshr.s16        q10, q10, #1
+    vshr.s16        q11, q11, #1
+
+    ; q10: 1 + 1 * cospi : d1/temp1
+    ; q11: 3 + 3 * cospi : c1/temp2
+    vqadd.s16       q10, q5, q10
+    vqadd.s16       q11, q7, q11
+
+    ; q8: c1 = temp1 - temp2
+    ; q9: d1 = temp1 + temp2
+    vqsub.s16       q8, q8, q11
+    vqadd.s16       q9, q10, q9
+
+    ; a1+d1
+    ; b1+c1
+    ; b1-c1
+    ; a1-d1
+    vqadd.s16       q4, q2, q9
+    vqadd.s16       q5, q3, q8
+    vqsub.s16       q6, q3, q8
+    vqsub.s16       q7, q2, q9
+
+    ; +4 >> 3 (rounding)
+    vrshr.s16       q4, q4, #3              ; lo
+    vrshr.s16       q5, q5, #3
+    vrshr.s16       q6, q6, #3              ; hi
+    vrshr.s16       q7, q7, #3
+
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+
+    ; adding pre
+    ; input is still packed. pre was read interleaved
+    vaddw.u8        q4, q4, d28
+    vaddw.u8        q5, q5, d29
+    vaddw.u8        q6, q6, d30
+    vaddw.u8        q7, q7, d31
+
+    vmov.i16        q14, #0
+    vmov            q15, q14
+    vst1.16         {q14, q15}, [r0]        ; write over high input
+    sub             r0, r0, #32
+    vst1.16         {q14, q15}, [r0]        ; write over low input
+
+    ;saturate and narrow
+    vqmovun.s16     d0, q4                  ; lo
+    vqmovun.s16     d1, q5
+    vqmovun.s16     d2, q6                  ; hi
+    vqmovun.s16     d3, q7
+
+    ldr             r1, [sp]                ; stride
+    add             r2, r3, #4              ; hi
+    vst1.32         {d0[0]}, [r3], r1       ; lo
+    vst1.32         {d0[1]}, [r2], r1       ; hi
+    vst1.32         {d1[0]}, [r3], r1
+    vst1.32         {d1[1]}, [r2], r1
+    vst1.32         {d2[0]}, [r3], r1
+    vst1.32         {d2[1]}, [r2], r1
+    vst1.32         {d3[0]}, [r3]
+    vst1.32         {d3[1]}, [r2]
+
+    bx             lr
+
+    ENDP           ; |idct_dequant_dc_full_2x_neon|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x4e7b
+; because the lowest bit in 0x8a8c is 0, we can pre-shift this
+sinpi8sqrt2       DCD 0x4546
+
+    END
--- /dev/null
+++ b/vp9/decoder/arm/neon/idct_dequant_full_2x_neon.asm
@@ -1,0 +1,197 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_full_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
+;                               unsigned char *dst, int pitch, int stride);
+; r0    *q,
+; r1    *dq,
+; r2    *pre
+; r3    *dst
+; sp    pitch
+; sp+4  stride
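+;
+; processes two horizontally adjacent 4x4 blocks ("l" and "r") at once,
+; one in each half of the q registers, sharing a single dq table.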
+|idct_dequant_full_2x_neon| PROC
+    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
+    vld1.16         {q2, q3}, [r0]          ; l q
+    ldr             r1, [sp]                ; pitch
+    add             r0, r0, #32
+    vld1.16         {q4, q5}, [r0]          ; r q
+    add             r12, r2, #4
+    ; interleave the predictors
+    vld1.32         {d28[0]}, [r2], r1      ; l pre
+    vld1.32         {d28[1]}, [r12], r1     ; r pre
+    vld1.32         {d29[0]}, [r2], r1
+    vld1.32         {d29[1]}, [r12], r1
+    vld1.32         {d30[0]}, [r2], r1
+    vld1.32         {d30[1]}, [r12], r1
+    vld1.32         {d31[0]}, [r2]
+    vld1.32         {d31[1]}, [r12]
+
+    adr             r2, cospi8sqrt2minus1   ; pointer to the first constant
+
+    ; dequant: q[i] = q[i] * dq[i]
+    vmul.i16        q2, q2, q0
+    vmul.i16        q3, q3, q1
+    vmul.i16        q4, q4, q0
+    vmul.i16        q5, q5, q1
+
+    vld1.16         {d0}, [r2]
+
+    ; q2: l0r0  q3: l8r8
+    ; q4: l4r4  q5: l12r12
+    vswp            d5, d8
+    vswp            d7, d10
+
+    ; _CONSTANTS_ * 4,12 >> 16
+    ; q6:  4 * sinpi : c1/temp1
+    ; q7: 12 * sinpi : d1/temp2
+    ; q8:  4 * cospi
+    ; q9: 12 * cospi
+    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q7, q5, d0[2]
+    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
+    vqdmulh.s16     q9, q5, d0[0]
+
+    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
+    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
+
+    ; vqdmulh only accepts signed values. this was a problem because
+    ; our constant had the high bit set, and was treated as a negative value.
+    ; vqdmulh also doubles the value before it shifts by 16. we need to
+    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
+    ; so we can pre-shift the constant right by 1 without losing precision.
+    ; this avoids having to shift again afterward, and also avoids the sign
+    ; issue. win win! for cospi8sqrt2minus1 the lowest bit is 1, so we would
+    ; lose precision if we pre-shifted it
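+    ; (with x a coefficient: sinpi8sqrt2 is stored pre-shifted as 0x4546,
+    ; i.e. 0x8a8c >> 1, so vqdmulh yields (2 * x * 0x4546) >> 16, which is
+    ; exactly the desired (x * 0x8a8c) >> 16 with no extra shift needed)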
+    vshr.s16        q8, q8, #1
+    vshr.s16        q9, q9, #1
+
+    ; q4:  4 +  4 * cospi : d1/temp1
+    ; q5: 12 + 12 * cospi : c1/temp2
+    vqadd.s16       q4, q4, q8
+    vqadd.s16       q5, q5, q9
+
+    ; c1 = temp1 - temp2
+    ; d1 = temp1 + temp2
+    vqsub.s16       q2, q6, q5
+    vqadd.s16       q3, q4, q7
+
+    ; [0]: a1+d1
+    ; [1]: b1+c1
+    ; [2]: b1-c1
+    ; [3]: a1-d1
+    vqadd.s16       q4, q10, q3
+    vqadd.s16       q5, q11, q2
+    vqsub.s16       q6, q11, q2
+    vqsub.s16       q7, q10, q3
+
+    ; rotate
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+    ; idct loop 2
+    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+    ; q6: l 2, 6,10,14 r 2, 6,10,14
+    ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+    ; q8:  1 * sinpi : c1/temp1
+    ; q9:  3 * sinpi : d1/temp2
+    ; q10: 1 * cospi
+    ; q11: 3 * cospi
+    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q9, q7, d0[2]
+    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
+    vqdmulh.s16     q11, q7, d0[0]
+
+    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
+    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
+
+    ; see note on shifting above
+    vshr.s16        q10, q10, #1
+    vshr.s16        q11, q11, #1
+
+    ; q10: 1 + 1 * cospi : d1/temp1
+    ; q11: 3 + 3 * cospi : c1/temp2
+    vqadd.s16       q10, q5, q10
+    vqadd.s16       q11, q7, q11
+
+    ; q8: c1 = temp1 - temp2
+    ; q9: d1 = temp1 + temp2
+    vqsub.s16       q8, q8, q11
+    vqadd.s16       q9, q10, q9
+
+    ; a1+d1
+    ; b1+c1
+    ; b1-c1
+    ; a1-d1
+    vqadd.s16       q4, q2, q9
+    vqadd.s16       q5, q3, q8
+    vqsub.s16       q6, q3, q8
+    vqsub.s16       q7, q2, q9
+
+    ; +4 >> 3 (rounding)
+    vrshr.s16       q4, q4, #3              ; lo
+    vrshr.s16       q5, q5, #3
+    vrshr.s16       q6, q6, #3              ; hi
+    vrshr.s16       q7, q7, #3
+
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+
+    ; adding pre
+    ; input is still packed. pre was read interleaved
+    vaddw.u8        q4, q4, d28
+    vaddw.u8        q5, q5, d29
+    vaddw.u8        q6, q6, d30
+    vaddw.u8        q7, q7, d31
+
+    vmov.i16        q14, #0
+    vmov            q15, q14
+    vst1.16         {q14, q15}, [r0]        ; write over high input
+    sub             r0, r0, #32
+    vst1.16         {q14, q15}, [r0]        ; write over low input
+
+    ;saturate and narrow
+    vqmovun.s16     d0, q4                  ; lo
+    vqmovun.s16     d1, q5
+    vqmovun.s16     d2, q6                  ; hi
+    vqmovun.s16     d3, q7
+
+    ldr             r1, [sp, #4]            ; stride
+    add             r2, r3, #4              ; hi
+    vst1.32         {d0[0]}, [r3], r1       ; lo
+    vst1.32         {d0[1]}, [r2], r1       ; hi
+    vst1.32         {d1[0]}, [r3], r1
+    vst1.32         {d1[1]}, [r2], r1
+    vst1.32         {d2[0]}, [r3], r1
+    vst1.32         {d2[1]}, [r2], r1
+    vst1.32         {d3[0]}, [r3]
+    vst1.32         {d3[1]}, [r2]
+
+    bx             lr
+
+    ENDP           ; |idct_dequant_full_2x_neon|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x4e7b
+; because the lowest bit in 0x8a8c is 0, we can pre-shift this
+sinpi8sqrt2       DCD 0x4546
+
+    END
--- /dev/null
+++ b/vp9/decoder/asm_dec_offsets.c
@@ -1,0 +1,39 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/asm_offsets.h"
+#include "onyxd_int.h"
+
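+/* Emits the offsets of selected structure members as assembler-visible
+ * constants, so the ARM assembly can address DETOK and BOOL_DECODER
+ * fields by name.
+ */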
+BEGIN
+
+DEFINE(detok_scan,                              offsetof(DETOK, scan));
+DEFINE(detok_ptr_block2leftabove,               offsetof(DETOK, ptr_block2leftabove));
+DEFINE(detok_coef_tree_ptr,                     offsetof(DETOK, vp9_coef_tree_ptr));
+DEFINE(detok_norm_ptr,                          offsetof(DETOK, norm_ptr));
+DEFINE(detok_ptr_coef_bands_x,                  offsetof(DETOK, ptr_coef_bands_x));
+
+DEFINE(detok_A,                                 offsetof(DETOK, A));
+DEFINE(detok_L,                                 offsetof(DETOK, L));
+
+DEFINE(detok_qcoeff_start_ptr,                  offsetof(DETOK, qcoeff_start_ptr));
+DEFINE(detok_coef_probs,                        offsetof(DETOK, coef_probs));
+DEFINE(detok_eob,                               offsetof(DETOK, eob));
+
+DEFINE(bool_decoder_user_buffer_end,            offsetof(BOOL_DECODER, user_buffer_end));
+DEFINE(bool_decoder_user_buffer,                offsetof(BOOL_DECODER, user_buffer));
+DEFINE(bool_decoder_value,                      offsetof(BOOL_DECODER, value));
+DEFINE(bool_decoder_count,                      offsetof(BOOL_DECODER, count));
+DEFINE(bool_decoder_range,                      offsetof(BOOL_DECODER, range));
+
+END
+
+/* add asserts for any offset that is not supported by assembly code */
+/* add asserts for any size that is not supported by assembly code */
--- /dev/null
+++ b/vp9/decoder/dboolhuff.c
@@ -1,0 +1,100 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "dboolhuff.h"
+#include "vpx_ports/mem.h"
+#include "vpx_mem/vpx_mem.h"
+
+int vp9_start_decode(BOOL_DECODER *br,
+                     const unsigned char *source,
+                     unsigned int source_sz) {
+  br->user_buffer_end = source + source_sz;
+  br->user_buffer     = source;
+  br->value    = 0;
+  br->count    = -8;
+  br->range    = 255;
+
+  if (source_sz && !source)
+    return 1;
+
+  /* Populate the buffer */
+  vp9_bool_decoder_fill(br);
+
+  return 0;
+}
+
+
+void vp9_bool_decoder_fill(BOOL_DECODER *br) {
+  const unsigned char *bufptr;
+  const unsigned char *bufend;
+  VP9_BD_VALUE         value;
+  int                  count;
+  bufend = br->user_buffer_end;
+  bufptr = br->user_buffer;
+  value = br->value;
+  count = br->count;
+
+  VP9DX_BOOL_DECODER_FILL(count, value, bufptr, bufend);
+
+  br->user_buffer = bufptr;
+  br->value = value;
+  br->count = count;
+}
+
+
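+/* Returns the number of bits needed to code a value in
+ * [0, num_values - 1]. */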
+static int get_unsigned_bits(unsigned num_values) {
+  int cat = 0;
+  if ((num_values--) <= 1) return 0;
+  while (num_values > 0) {
+    cat++;
+    num_values >>= 1;
+  }
+  return cat;
+}
+
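+/* Inverse of the encoder's recentering around m: small v codes a value
+ * close to m (even v lands above m, odd v below); v > 2 * m is passed
+ * through unchanged. */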
+int vp9_inv_recenter_nonneg(int v, int m) {
+  if (v > (m << 1)) return v;
+  else if ((v & 1) == 0) return (v >> 1) + m;
+  else return m - ((v + 1) >> 1);
+}
+
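+/* Decodes a value in [0, n - 1] with a near-uniform code: the first m
+ * symbols use l - 1 bits, the remaining n - m symbols use l bits. */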
+int vp9_decode_uniform(BOOL_DECODER *br, int n) {
+  int v;
+  int l = get_unsigned_bits(n);
+  int m = (1 << l) - n;
+  if (!l) return 0;
+  v = decode_value(br, l - 1);
+  if (v < m)
+    return v;
+  else
+    return (v << 1) - m + decode_value(br, 1);
+}
+
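+/* Decodes a symbol coded with a terminated sub-exponential code with
+ * parameter k; the final interval falls back to the uniform code above
+ * so that exactly num_syms values are codable. */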
+int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms) {
+  int i = 0, mk = 0, word;
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+    if (num_syms <= mk + 3 * a) {
+      word = vp9_decode_uniform(br, num_syms - mk) + mk;
+      break;
+    } else {
+      if (decode_value(br, 1)) {
+        i++;
+        mk += a;
+      } else {
+        word = decode_value(br, b) + mk;
+        break;
+      }
+    }
+  }
+  return word;
+}
--- /dev/null
+++ b/vp9/decoder/dboolhuff.h
@@ -1,0 +1,153 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DBOOLHUFF_H
+#define DBOOLHUFF_H
+#include <stddef.h>
+#include <limits.h>
+#include "vpx_ports/config.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+typedef size_t VP9_BD_VALUE;
+
+# define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT)
+/*This is meant to be a large, positive constant that can still be efficiently
+   loaded as an immediate (on platforms like ARM, for example).
+  Even relatively modest values like 100 would work fine.*/
+# define VP9_LOTS_OF_BITS (0x40000000)
+
+typedef struct {
+  const unsigned char *user_buffer_end;
+  const unsigned char *user_buffer;
+  VP9_BD_VALUE         value;
+  int                  count;
+  unsigned int         range;
+} BOOL_DECODER;
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
+
+int vp9_start_decode(BOOL_DECODER *br,
+                     const unsigned char *source,
+                     unsigned int source_sz);
+
+void vp9_bool_decoder_fill(BOOL_DECODER *br);
+
+int vp9_decode_uniform(BOOL_DECODER *br, int n);
+int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms);
+int vp9_inv_recenter_nonneg(int v, int m);
+
+/*The refill loop is used in several places, so define it in a macro to make
+   sure they're all consistent.
+  An inline function would be cleaner, but has a significant penalty, because
+   multiple BOOL_DECODER fields must be modified, and the compiler is not smart
+   enough to eliminate the stores to those fields and the subsequent reloads
+   from them when inlining the function.*/
+#define VP9DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \
+  do \
+  { \
+    int shift = VP9_BD_VALUE_SIZE - 8 - ((_count) + 8); \
+    int loop_end, x; \
+    size_t bits_left = ((_bufend)-(_bufptr))*CHAR_BIT; \
+    \
+    x = shift + CHAR_BIT - bits_left; \
+    loop_end = 0; \
+    if(x >= 0) \
+    { \
+      (_count) += VP9_LOTS_OF_BITS; \
+      loop_end = x; \
+      if(!bits_left) break; \
+    } \
+    while(shift >= loop_end) \
+    { \
+      (_count) += CHAR_BIT; \
+      (_value) |= (VP9_BD_VALUE)*(_bufptr)++ << shift; \
+      shift -= CHAR_BIT; \
+    } \
+  } \
+  while(0) \
+
+
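+/* Decodes a single bool: the current range is split in proportion to
+ * probability / 256, and the bit is determined by which sub-range the
+ * decoder's value falls in. */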
+static int decode_bool(BOOL_DECODER *br, int probability) {
+  unsigned int bit = 0;
+  VP9_BD_VALUE value;
+  unsigned int split;
+  VP9_BD_VALUE bigsplit;
+  int count;
+  unsigned int range;
+
+  split = 1 + (((br->range - 1) * probability) >> 8);
+
+  if (br->count < 0)
+    vp9_bool_decoder_fill(br);
+
+  value = br->value;
+  count = br->count;
+
+  bigsplit = (VP9_BD_VALUE)split << (VP9_BD_VALUE_SIZE - 8);
+
+  range = split;
+
+  if (value >= bigsplit) {
+    range = br->range - split;
+    value = value - bigsplit;
+    bit = 1;
+  }
+
+  {
+    register unsigned int shift = vp9_norm[range];
+    range <<= shift;
+    value <<= shift;
+    count -= shift;
+  }
+  br->value = value;
+  br->count = count;
+  br->range = range;
+
+  return bit;
+}
+
+static int decode_value(BOOL_DECODER *br, int bits) {
+  int z = 0;
+  int bit;
+
+  for (bit = bits - 1; bit >= 0; bit--) {
+    z |= (decode_bool(br, 0x80) << bit);
+  }
+
+  return z;
+}
+
+static int bool_error(BOOL_DECODER *br) {
+  /* Check if we have reached the end of the buffer.
+   *
+   * Variable 'count' stores the number of bits in the 'value' buffer, minus
+   * 8. The top byte is part of the algorithm, and the remainder is buffered
+   * to be shifted into it. So if count == 8, the top 16 bits of 'value' are
+   * occupied, 8 for the algorithm and 8 in the buffer.
+   *
+   * When reading a byte from the user's buffer, count is filled with 8 and
+   * one byte is filled into the value buffer. When we reach the end of the
+   * data, count is additionally filled with VP9_LOTS_OF_BITS. So when
+   * count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted.
+   */
+  if ((br->count > VP9_BD_VALUE_SIZE) && (br->count < VP9_LOTS_OF_BITS)) {
+    /* We have tried to decode bits after the end of
+     * stream was encountered.
+     */
+    return 1;
+  }
+
+  /* No error. */
+  return 0;
+}
+
+#endif
--- /dev/null
+++ b/vp9/decoder/decodemv.c
@@ -1,0 +1,1199 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "treereader.h"
+#include "vp9/common/entropymv.h"
+#include "vp9/common/entropymode.h"
+#include "onyxd_int.h"
+#include "vp9/common/findnearmv.h"
+
+#include "vp9/common/seg_common.h"
+#include "vp9/common/pred_common.h"
+#include "vp9/common/entropy.h"
+#include "vp9/decoder/decodemv.h"
+#if CONFIG_DEBUG
+#include <assert.h>
+#endif
+
+// #define DEBUG_DEC_MV
+#ifdef DEBUG_DEC_MV
+int dec_mvcount = 0;
+#endif
+
+static int read_bmode(vp9_reader *bc, const vp9_prob *p) {
+  return treed_read(bc, vp9_bmode_tree, p);
+}
+
+static int read_ymode(vp9_reader *bc, const vp9_prob *p) {
+  return treed_read(bc, vp9_ymode_tree, p);
+}
+
+#if CONFIG_SUPERBLOCKS
+static int read_kf_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
+  return treed_read(bc, vp9_uv_mode_tree, p);
+}
+#endif
+
+static int read_kf_mb_ymode(vp9_reader *bc, const vp9_prob *p) {
+  return treed_read(bc, vp9_kf_ymode_tree, p);
+}
+
+static int read_i8x8_mode(vp9_reader *bc, const vp9_prob *p) {
+  return treed_read(bc, vp9_i8x8_mode_tree, p);
+}
+
+static int read_uv_mode(vp9_reader *bc, const vp9_prob *p) {
+  return treed_read(bc, vp9_uv_mode_tree, p);
+}
+
+// This function reads the current macroblock's segment id from the bitstream.
+// It should only be called if a segment map update is indicated.
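+// The segment id is coded as a two-level binary tree: probs[0] selects
+// between {0, 1} and {2, 3}, while probs[1] and probs[2] pick within
+// each pair.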
+static void read_mb_segid(vp9_reader *r, MB_MODE_INFO *mi,
+                          MACROBLOCKD *xd) {
+  /* Is segmentation enabled */
+  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
+    /* If so then read the segment id. */
+    if (vp9_read(r, xd->mb_segment_tree_probs[0]))
+      mi->segment_id =
+        (unsigned char)(2 + vp9_read(r, xd->mb_segment_tree_probs[2]));
+    else
+      mi->segment_id =
+        (unsigned char)(vp9_read(r, xd->mb_segment_tree_probs[1]));
+  }
+}
+
+#if CONFIG_NEW_MVREF
+int vp9_read_mv_ref_id(vp9_reader *r,
+                       vp9_prob * ref_id_probs) {
+  int ref_index = 0;
+
+  if (vp9_read(r, ref_id_probs[0])) {
+    ref_index++;
+    if (vp9_read(r, ref_id_probs[1])) {
+      ref_index++;
+      if (vp9_read(r, ref_id_probs[2]))
+        ref_index++;
+    }
+  }
+  return ref_index;
+}
+#endif
+
+extern const int vp9_i8x8_block[4];
+static void kfread_modes(VP9D_COMP *pbi,
+                         MODE_INFO *m,
+                         int mb_row,
+                         int mb_col,
+                         BOOL_DECODER* const bc) {
+  VP9_COMMON *const cm = &pbi->common;
+  const int mis = pbi->common.mode_info_stride;
+  int map_index = mb_row * pbi->common.mb_cols + mb_col;
+  MB_PREDICTION_MODE y_mode;
+
+  // Read the Macroblock segmentation map if it is being updated explicitly
+  // this frame (reset to 0 by default).
+  m->mbmi.segment_id = 0;
+  if (pbi->mb.update_mb_segmentation_map) {
+    read_mb_segid(bc, &m->mbmi, &pbi->mb);
+    pbi->common.last_frame_seg_map[map_index] = m->mbmi.segment_id;
+  }
+
+  m->mbmi.mb_skip_coeff = 0;
+  if (pbi->common.mb_no_coeff_skip &&
+      (!vp9_segfeature_active(&pbi->mb,
+                              m->mbmi.segment_id, SEG_LVL_EOB) ||
+       (vp9_get_segdata(&pbi->mb,
+                        m->mbmi.segment_id, SEG_LVL_EOB) != 0))) {
+    MACROBLOCKD *const xd  = &pbi->mb;
+    m->mbmi.mb_skip_coeff =
+      vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
+  } else {
+    if (vp9_segfeature_active(&pbi->mb,
+                              m->mbmi.segment_id, SEG_LVL_EOB) &&
+        (vp9_get_segdata(&pbi->mb,
+                         m->mbmi.segment_id, SEG_LVL_EOB) == 0)) {
+      m->mbmi.mb_skip_coeff = 1;
+    } else
+      m->mbmi.mb_skip_coeff = 0;
+  }
+
+#if CONFIG_SUPERBLOCKS
+  if (m->mbmi.encoded_as_sb) {
+    y_mode = (MB_PREDICTION_MODE) read_kf_sb_ymode(bc,
+      pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
+  } else
+#endif
+  y_mode = (MB_PREDICTION_MODE) read_kf_mb_ymode(bc,
+    pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
+#if CONFIG_COMP_INTRA_PRED
+  m->mbmi.second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+  m->mbmi.ref_frame = INTRA_FRAME;
+
+  if ((m->mbmi.mode = y_mode) == B_PRED) {
+    int i = 0;
+#if CONFIG_COMP_INTRA_PRED
+    int use_comp_pred = vp9_read(bc, 128);
+#endif
+    do {
+      const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
+      const B_PREDICTION_MODE L = left_block_mode(m, i);
+
+      m->bmi[i].as_mode.first =
+        (B_PREDICTION_MODE) read_bmode(
+          bc, pbi->common.kf_bmode_prob [A] [L]);
+#if CONFIG_COMP_INTRA_PRED
+      if (use_comp_pred) {
+        m->bmi[i].as_mode.second =
+          (B_PREDICTION_MODE) read_bmode(
+            bc, pbi->common.kf_bmode_prob [A] [L]);
+      } else {
+        m->bmi[i].as_mode.second = (B_PREDICTION_MODE)(B_DC_PRED - 1);
+      }
+#endif
+    } while (++i < 16);
+  }
+  if ((m->mbmi.mode = y_mode) == I8X8_PRED) {
+    int i;
+    int mode8x8;
+    for (i = 0; i < 4; i++) {
+      int ib = vp9_i8x8_block[i];
+      mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
+      m->bmi[ib + 0].as_mode.first = mode8x8;
+      m->bmi[ib + 1].as_mode.first = mode8x8;
+      m->bmi[ib + 4].as_mode.first = mode8x8;
+      m->bmi[ib + 5].as_mode.first = mode8x8;
+#if CONFIG_COMP_INTRA_PRED
+      m->bmi[ib + 0].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+      m->bmi[ib + 1].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+      m->bmi[ib + 4].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+      m->bmi[ib + 5].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+    }
+  } else
+    m->mbmi.uv_mode = (MB_PREDICTION_MODE)read_uv_mode(bc,
+                                                       pbi->common.kf_uv_mode_prob[m->mbmi.mode]);
+#if CONFIG_COMP_INTRA_PRED
+  m->mbmi.second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+#if CONFIG_SUPERBLOCKS
+  if (m->mbmi.encoded_as_sb)
+    m->mbmi.txfm_size = TX_8X8;
+  else
+#endif
+  if (cm->txfm_mode == TX_MODE_SELECT && m->mbmi.mb_skip_coeff == 0 &&
+      m->mbmi.mode <= I8X8_PRED) {
+    // FIXME(rbultje) code ternary symbol once all experiments are merged
+    m->mbmi.txfm_size = vp9_read(bc, cm->prob_tx[0]);
+    if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED)
+      m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[1]);
+  } else if (cm->txfm_mode >= ALLOW_16X16 && m->mbmi.mode <= TM_PRED) {
+    m->mbmi.txfm_size = TX_16X16;
+  } else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != B_PRED) {
+    m->mbmi.txfm_size = TX_8X8;
+  } else {
+    m->mbmi.txfm_size = TX_4X4;
+  }
+}
+
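+// A motion vector component is coded as a sign, a magnitude class and an
+// integer offset within the class; the fractional and high-precision bits
+// are filled in later by read_nmv_component_fp().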
+static int read_nmv_component(vp9_reader *r,
+                              int rv,
+                              const nmv_component *mvcomp) {
+  int v, s, z, c, o, d;
+  s = vp9_read(r, mvcomp->sign);
+  c = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
+  if (c == MV_CLASS_0) {
+    d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0);
+  } else {
+    int i, b;
+    d = 0;
+    b = c + CLASS0_BITS - 1;  /* number of bits */
+    for (i = 0; i < b; ++i)
+      d |= (vp9_read(r, mvcomp->bits[i]) << i);
+  }
+  o = d << 3;
+
+  z = vp9_get_mv_mag(c, o);
+  v = (s ? -(z + 8) : (z + 8));
+  return v;
+}
+
+static int read_nmv_component_fp(vp9_reader *r,
+                                 int v,
+                                 int rv,
+                                 const nmv_component *mvcomp,
+                                 int usehp) {
+  int s, z, c, o, d, e, f;
+  s = v < 0;
+  z = (s ? -v : v) - 1;       /* magnitude - 1 */
+  z &= ~7;
+
+  c = vp9_get_mv_class(z, &o);
+  d = o >> 3;
+
+  if (c == MV_CLASS_0) {
+    f = treed_read(r, vp9_mv_fp_tree, mvcomp->class0_fp[d]);
+  } else {
+    f = treed_read(r, vp9_mv_fp_tree, mvcomp->fp);
+  }
+  o += (f << 1);
+
+  if (usehp) {
+    if (c == MV_CLASS_0) {
+      e = vp9_read(r, mvcomp->class0_hp);
+    } else {
+      e = vp9_read(r, mvcomp->hp);
+    }
+    o += e;
+  } else {
+    ++o;  /* Note if hp is not used, the default value of the hp bit is 1 */
+  }
+  z = vp9_get_mv_mag(c, o);
+  v = (s ? -(z + 1) : (z + 1));
+  return v;
+}
+
+static void read_nmv(vp9_reader *r, MV *mv, const MV *ref,
+                     const nmv_context *mvctx) {
+  MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, mvctx->joints);
+  mv->row = mv->col = 0;
+  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
+    mv->row = read_nmv_component(r, ref->row, &mvctx->comps[0]);
+  }
+  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
+    mv->col = read_nmv_component(r, ref->col, &mvctx->comps[1]);
+  }
+}
+
+static void read_nmv_fp(vp9_reader *r, MV *mv, const MV *ref,
+                        const nmv_context *mvctx, int usehp) {
+  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
+  usehp = usehp && vp9_use_nmv_hp(ref);
+  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
+    mv->row = read_nmv_component_fp(r, mv->row, ref->row, &mvctx->comps[0],
+                                    usehp);
+  }
+  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
+    mv->col = read_nmv_component_fp(r, mv->col, ref->col, &mvctx->comps[1],
+                                    usehp);
+  }
+  //printf("  %d: %d %d ref: %d %d\n", usehp, mv->row, mv-> col, ref->row, ref->col);
+}
+
+static void update_nmv(vp9_reader *bc, vp9_prob *const p,
+                       const vp9_prob upd_p) {
+  if (vp9_read(bc, upd_p)) {
+#ifdef LOW_PRECISION_MV_UPDATE
+    *p = (vp9_read_literal(bc, 7) << 1) | 1;
+#else
+    *p = (vp9_read_literal(bc, 8));
+#endif
+  }
+}
+
+static void read_nmvprobs(vp9_reader *bc, nmv_context *mvctx,
+                          int usehp) {
+  int i, j, k;
+#ifdef MV_GROUP_UPDATE
+  if (!vp9_read_bit(bc)) return;
+#endif
+  for (j = 0; j < MV_JOINTS - 1; ++j) {
+    update_nmv(bc, &mvctx->joints[j],
+               VP9_NMV_UPDATE_PROB);
+  }
+  for (i = 0; i < 2; ++i) {
+    update_nmv(bc, &mvctx->comps[i].sign,
+               VP9_NMV_UPDATE_PROB);
+    for (j = 0; j < MV_CLASSES - 1; ++j) {
+      update_nmv(bc, &mvctx->comps[i].classes[j],
+                 VP9_NMV_UPDATE_PROB);
+    }
+    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
+      update_nmv(bc, &mvctx->comps[i].class0[j],
+                 VP9_NMV_UPDATE_PROB);
+    }
+    for (j = 0; j < MV_OFFSET_BITS; ++j) {
+      update_nmv(bc, &mvctx->comps[i].bits[j],
+                 VP9_NMV_UPDATE_PROB);
+    }
+  }
+
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      for (k = 0; k < 3; ++k)
+        update_nmv(bc, &mvctx->comps[i].class0_fp[j][k],
+                   VP9_NMV_UPDATE_PROB);
+    }
+    for (j = 0; j < 3; ++j) {
+      update_nmv(bc, &mvctx->comps[i].fp[j],
+                 VP9_NMV_UPDATE_PROB);
+    }
+  }
+
+  if (usehp) {
+    for (i = 0; i < 2; ++i) {
+      update_nmv(bc, &mvctx->comps[i].class0_hp,
+                 VP9_NMV_UPDATE_PROB);
+      update_nmv(bc, &mvctx->comps[i].hp,
+                 VP9_NMV_UPDATE_PROB);
+    }
+  }
+}
+
+// Read the reference frame
+static MV_REFERENCE_FRAME read_ref_frame(VP9D_COMP *pbi,
+                                         vp9_reader *const bc,
+                                         unsigned char segment_id) {
+  MV_REFERENCE_FRAME ref_frame;
+  int seg_ref_active;
+  int seg_ref_count = 0;
+
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+
+  seg_ref_active = vp9_segfeature_active(xd,
+                                         segment_id,
+                                         SEG_LVL_REF_FRAME);
+
+  // If segment coding is enabled, does the segment allow for more than
+  // one possible reference frame?
+  if (seg_ref_active) {
+    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
+                    vp9_check_segref(xd, segment_id, LAST_FRAME) +
+                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
+                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
+  }
+
+  // Segment reference frame features are not available, or the segment
+  // allows multiple reference frame options
+  if (!seg_ref_active || (seg_ref_count > 1)) {
+    // Values used in prediction model coding
+    unsigned char prediction_flag;
+    vp9_prob pred_prob;
+    MV_REFERENCE_FRAME pred_ref;
+
+    // Get the context probability for the prediction flag
+    pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
+
+    // Read the prediction status flag
+    prediction_flag = (unsigned char)vp9_read(bc, pred_prob);
+
+    // Store the prediction flag.
+    vp9_set_pred_flag(xd, PRED_REF, prediction_flag);
+
+    // Get the predicted reference frame.
+    pred_ref = vp9_get_pred_ref(cm, xd);
+
+    // If correctly predicted then use the predicted value
+    if (prediction_flag) {
+      ref_frame = pred_ref;
+    }
+    // else decode the explicitly coded value
+    else {
+      vp9_prob mod_refprobs[PREDICTION_PROBS];
+      vpx_memcpy(mod_refprobs,
+                 cm->mod_refprobs[pred_ref], sizeof(mod_refprobs));
+
+      // If segment coding is enabled, blank out options that can't occur
+      // by setting the branch probability to 0.
+      if (seg_ref_active) {
+        mod_refprobs[INTRA_FRAME] *=
+          vp9_check_segref(xd, segment_id, INTRA_FRAME);
+        mod_refprobs[LAST_FRAME] *=
+          vp9_check_segref(xd, segment_id, LAST_FRAME);
+        mod_refprobs[GOLDEN_FRAME] *=
+          (vp9_check_segref(xd, segment_id, GOLDEN_FRAME) *
+           vp9_check_segref(xd, segment_id, ALTREF_FRAME));
+      }
+
+      // Default to INTRA_FRAME (value 0)
+      ref_frame = INTRA_FRAME;
+
+      // Do we need to decode the Intra/Inter branch
+      if (mod_refprobs[0])
+        ref_frame = (MV_REFERENCE_FRAME) vp9_read(bc, mod_refprobs[0]);
+      else
+        ref_frame++;
+
+      if (ref_frame) {
+        // Do we need to decode the Last/Gf_Arf branch
+        if (mod_refprobs[1])
+          ref_frame += vp9_read(bc, mod_refprobs[1]);
+        else
+          ref_frame++;
+
+        if (ref_frame > 1) {
+          // Do we need to decode the GF/Arf branch
+          if (mod_refprobs[2])
+            ref_frame += vp9_read(bc, mod_refprobs[2]);
+          else {
+            if (seg_ref_active) {
+              if ((pred_ref == GOLDEN_FRAME) ||
+                  !vp9_check_segref(xd, segment_id, GOLDEN_FRAME)) {
+                ref_frame = ALTREF_FRAME;
+              } else
+                ref_frame = GOLDEN_FRAME;
+            } else
+              ref_frame = (pred_ref == GOLDEN_FRAME)
+                          ? ALTREF_FRAME : GOLDEN_FRAME;
+          }
+        }
+      }
+    }
+  }
+
+  // Segment reference frame features are enabled
+  else {
+    // The reference frame for the MB is considered correctly predicted
+    // if it is signalled at the segment level, for the purposes of the
+    // common prediction model
+    vp9_set_pred_flag(xd, PRED_REF, 1);
+    ref_frame = vp9_get_pred_ref(cm, xd);
+  }
+
+  return (MV_REFERENCE_FRAME)ref_frame;
+}
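+
+/* Worked example of the explicit branch decode above (illustrative
+ * sketch, assuming all three modified probabilities are non-zero):
+ *
+ *   vp9_read(bc, mod_refprobs[0]) == 0             -> INTRA_FRAME
+ *   == 1, vp9_read(bc, mod_refprobs[1]) == 0       -> LAST_FRAME
+ *   == 1, == 1, vp9_read(bc, mod_refprobs[2]) == 0 -> GOLDEN_FRAME
+ *   == 1, == 1, == 1                               -> ALTREF_FRAME
+ *
+ * A branch probability of 0 means the encoder never coded that branch,
+ * so the decoder skips the read and advances down the implied path. */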
+
+#if CONFIG_SUPERBLOCKS
+static MB_PREDICTION_MODE read_sb_mv_ref(vp9_reader *bc, const vp9_prob *p) {
+  return (MB_PREDICTION_MODE) treed_read(bc, vp9_sb_mv_ref_tree, p);
+}
+#endif
+
+static MB_PREDICTION_MODE read_mv_ref(vp9_reader *bc, const vp9_prob *p) {
+  return (MB_PREDICTION_MODE) treed_read(bc, vp9_mv_ref_tree, p);
+}
+
+static B_PREDICTION_MODE sub_mv_ref(vp9_reader *bc, const vp9_prob *p) {
+  return (B_PREDICTION_MODE) treed_read(bc, vp9_sub_mv_ref_tree, p);
+}
+
+#ifdef VPX_MODE_COUNT
+unsigned int vp9_mv_cont_count[5][4] = {
+  { 0, 0, 0, 0 },
+  { 0, 0, 0, 0 },
+  { 0, 0, 0, 0 },
+  { 0, 0, 0, 0 },
+  { 0, 0, 0, 0 }
+};
+#endif
+
+static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1};
+static const unsigned char mbsplit_fill_offset[4][16] = {
+  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
+  { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,   6,  7, 10, 11, 14, 15},
+  { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9,  12, 13, 10, 11, 14, 15},
+  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
+};
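+
+/* The two tables above drive the SPLITMV motion fill: for partitioning
+ * s, each coded subset owns mbsplit_fill_count[s] of the 16 4x4 blocks,
+ * listed consecutively in mbsplit_fill_offset[s]. For example, s == 2
+ * (8x8 quadrants) fills four blocks per subset: {0, 1, 4, 5} top-left,
+ * {2, 3, 6, 7} top-right, {8, 9, 12, 13} bottom-left and
+ * {10, 11, 14, 15} bottom-right, in raster order of 4x4 blocks. */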
+
+static void read_switchable_interp_probs(VP9D_COMP* const pbi,
+                                         BOOL_DECODER* const bc) {
+  VP9_COMMON *const cm = &pbi->common;
+  int i, j;
+  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+    for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
+      cm->fc.switchable_interp_prob[j][i] = vp9_read_literal(bc, 8);
+    }
+  }
+  //printf("DECODER: %d %d\n", cm->fc.switchable_interp_prob[0],
+  //cm->fc.switchable_interp_prob[1]);
+}
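+
+/* Note: the loop above reads (VP9_SWITCHABLE_FILTERS + 1) contexts of
+ * (VP9_SWITCHABLE_FILTERS - 1) tree probabilities each, every one coded
+ * as a raw 8-bit literal rather than as a diff update. */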
+
+static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) {
+  VP9_COMMON *const cm = &pbi->common;
+  nmv_context *const nmvc = &pbi->common.fc.nmvc;
+  MACROBLOCKD *const xd  = &pbi->mb;
+
+  if (cm->frame_type == KEY_FRAME) {
+    if (!cm->kf_ymode_probs_update)
+      cm->kf_ymode_probs_index = vp9_read_literal(bc, 3);
+  } else {
+#if CONFIG_PRED_FILTER
+    cm->pred_filter_mode = (vp9_prob)vp9_read_literal(bc, 2);
+
+    if (cm->pred_filter_mode == 2)
+      cm->prob_pred_filter_off = (vp9_prob)vp9_read_literal(bc, 8);
+#endif
+    if (cm->mcomp_filter_type == SWITCHABLE)
+      read_switchable_interp_probs(pbi, bc);
+    // Decode the baseline probabilities for decoding reference frame
+    cm->prob_intra_coded = (vp9_prob)vp9_read_literal(bc, 8);
+    cm->prob_last_coded  = (vp9_prob)vp9_read_literal(bc, 8);
+    cm->prob_gf_coded    = (vp9_prob)vp9_read_literal(bc, 8);
+
+    // Computes a modified set of probabilities for use when reference
+    // frame prediction fails.
+    vp9_compute_mod_refprobs(cm);
+
+    pbi->common.comp_pred_mode = vp9_read(bc, 128);
+    if (cm->comp_pred_mode)
+      cm->comp_pred_mode += vp9_read(bc, 128);
+    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+      int i;
+      for (i = 0; i < COMP_PRED_CONTEXTS; i++)
+        cm->prob_comppred[i] = (vp9_prob)vp9_read_literal(bc, 8);
+    }
+
+    if (vp9_read_bit(bc)) {
+      int i = 0;
+
+      do {
+        cm->fc.ymode_prob[i] = (vp9_prob) vp9_read_literal(bc, 8);
+      } while (++i < VP9_YMODES - 1);
+    }
+
+#if CONFIG_NEW_MVREF
+    // Temporary default probabilities for decoding the MV ref id signal
+    vpx_memset(xd->mb_mv_ref_id_probs, 192, sizeof(xd->mb_mv_ref_id_probs));
+#endif
+
+    read_nmvprobs(bc, nmvc, xd->allow_high_precision_mv);
+  }
+}
+
+// This function either reads the segment id for the current macroblock
+// from the bitstream, or, if the value is temporally predicted, uses the
+// predicted value.
+static void read_mb_segment_id(VP9D_COMP *pbi,
+                               int mb_row, int mb_col,
+                               BOOL_DECODER* const bc) {
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd  = &pbi->mb;
+  MODE_INFO *mi = xd->mode_info_context;
+  MB_MODE_INFO *mbmi = &mi->mbmi;
+  int index = mb_row * pbi->common.mb_cols + mb_col;
+
+  if (xd->segmentation_enabled) {
+    if (xd->update_mb_segmentation_map) {
+      // Is temporal coding of the segment id for this mb enabled.
+      if (cm->temporal_update) {
+        // Get the context based probability for reading the
+        // prediction status flag
+        vp9_prob pred_prob =
+          vp9_get_pred_prob(cm, xd, PRED_SEG_ID);
+
+        // Read the prediction status flag
+        unsigned char seg_pred_flag =
+          (unsigned char)vp9_read(bc, pred_prob);
+
+        // Store the prediction flag.
+        vp9_set_pred_flag(xd, PRED_SEG_ID, seg_pred_flag);
+
+        // If the value is flagged as correctly predicted
+        // then use the predicted value
+        if (seg_pred_flag) {
+          mbmi->segment_id = vp9_get_pred_mb_segid(cm, xd, index);
+        }
+        // Else decode it explicitly
+        else {
+          read_mb_segid(bc, mbmi, xd);
+        }
+      }
+      // Normal unpredicted coding mode
+      else {
+        read_mb_segid(bc, mbmi, xd);
+      }
+#if CONFIG_SUPERBLOCKS
+      if (mbmi->encoded_as_sb) {
+        cm->last_frame_seg_map[index] = mbmi->segment_id;
+        if (mb_col + 1 < cm->mb_cols)
+          cm->last_frame_seg_map[index + 1] = mbmi->segment_id;
+        if (mb_row + 1 < cm->mb_rows) {
+          cm->last_frame_seg_map[index + cm->mb_cols] = mbmi->segment_id;
+          if (mb_col + 1 < cm->mb_cols)
+            cm->last_frame_seg_map[index + cm->mb_cols + 1] = mbmi->segment_id;
+        }
+      } else
+#endif
+      {
+        cm->last_frame_seg_map[index] = mbmi->segment_id;
+      }
+    } else {
+#if CONFIG_SUPERBLOCKS
+      if (mbmi->encoded_as_sb) {
+        mbmi->segment_id = cm->last_frame_seg_map[index];
+        if (mb_col < cm->mb_cols - 1)
+          mbmi->segment_id = mbmi->segment_id &&
+                             cm->last_frame_seg_map[index + 1];
+        if (mb_row < cm->mb_rows - 1) {
+          mbmi->segment_id = mbmi->segment_id &&
+                             cm->last_frame_seg_map[index + cm->mb_cols];
+          if (mb_col < cm->mb_cols - 1)
+            mbmi->segment_id = mbmi->segment_id &&
+                               cm->last_frame_seg_map[index + cm->mb_cols + 1];
+        }
+      } else
+#endif
+      {
+        mbmi->segment_id = cm->last_frame_seg_map[index];
+      }
+    }
+  } else {
+    // The encoder explicitly sets the segment_id to 0
+    // when segmentation is disabled
+    mbmi->segment_id = 0;
+  }
+}
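+
+/* Cost intuition for the path above (illustrative): with temporal
+ * updates enabled, a correctly predicted MB spends a single bool read
+ * on the prediction flag; only a mispredicted MB pays for an explicit
+ * read_mb_segid() decode of the segment id itself. */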
+
+static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
+                             MODE_INFO *prev_mi,
+                             int mb_row, int mb_col,
+                             BOOL_DECODER* const bc) {
+  VP9_COMMON *const cm = &pbi->common;
+  nmv_context *const nmvc = &pbi->common.fc.nmvc;
+  const int mis = pbi->common.mode_info_stride;
+  MACROBLOCKD *const xd  = &pbi->mb;
+
+  int_mv *const mv = &mbmi->mv;
+  int mb_to_left_edge;
+  int mb_to_right_edge;
+  int mb_to_top_edge;
+  int mb_to_bottom_edge;
+
+  mb_to_top_edge = xd->mb_to_top_edge;
+  mb_to_bottom_edge = xd->mb_to_bottom_edge;
+  mb_to_top_edge -= LEFT_TOP_MARGIN;
+  mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN;
+  mbmi->need_to_clamp_mvs = 0;
+  mbmi->need_to_clamp_secondmv = 0;
+  mbmi->second_ref_frame = 0;
+  /* Distance of MB to the various image edges. These are specified to
+   * 1/8th pel as they are always compared to MV values in 1/8th pel
+   * units.
+   */
+  xd->mb_to_left_edge =
+    mb_to_left_edge = -((mb_col * 16) << 3);
+  mb_to_left_edge -= LEFT_TOP_MARGIN;
+
+  xd->mb_to_right_edge =
+    mb_to_right_edge = ((pbi->common.mb_cols - 1 - mb_col) * 16) << 3;
+  mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
+
+  // Make sure the MACROBLOCKD mode info pointer is pointed at the
+  // correct entry for the current macroblock.
+  xd->mode_info_context = mi;
+  xd->prev_mode_info_context = prev_mi;
+
+  // Read the macroblock segment id.
+  read_mb_segment_id(pbi, mb_row, mb_col, bc);
+
+  if (pbi->common.mb_no_coeff_skip &&
+      (!vp9_segfeature_active(xd,
+                              mbmi->segment_id, SEG_LVL_EOB) ||
+       (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) != 0))) {
+    // Read the macroblock coeff skip flag if this feature is in use,
+    // else default to 0
+    mbmi->mb_skip_coeff = vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
+  } else {
+    if (vp9_segfeature_active(xd,
+                              mbmi->segment_id, SEG_LVL_EOB) &&
+        (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) == 0)) {
+      mbmi->mb_skip_coeff = 1;
+    } else
+      mbmi->mb_skip_coeff = 0;
+  }
+
+  // Read the reference frame
+  mbmi->ref_frame = read_ref_frame(pbi, bc, mbmi->segment_id);
+
+  // If the reference frame is an inter frame
+  if (mbmi->ref_frame) {
+    int rct[4];
+    int_mv nearest, nearby, best_mv;
+    int_mv nearest_second, nearby_second, best_mv_second;
+    vp9_prob mv_ref_p [VP9_MVREFS - 1];
+
+#if CONFIG_NEWBESTREFMV
+    int recon_y_stride, recon_yoffset;
+    int recon_uv_stride, recon_uvoffset;
+#endif
+
+    vp9_find_near_mvs(xd, mi,
+                      prev_mi,
+                      &nearest, &nearby, &best_mv, rct,
+                      mbmi->ref_frame, cm->ref_frame_sign_bias);
+
+#if CONFIG_NEWBESTREFMV
+    {
+      int ref_fb_idx;
+      MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
+
+      /* Select the appropriate reference frame for this MB */
+      if (ref_frame == LAST_FRAME)
+        ref_fb_idx = cm->lst_fb_idx;
+      else if (ref_frame == GOLDEN_FRAME)
+        ref_fb_idx = cm->gld_fb_idx;
+      else
+        ref_fb_idx = cm->alt_fb_idx;
+
+      recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+      recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+
+      recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+      recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
+
+      xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+      xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+      xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+      vp9_find_mv_refs(xd, mi, prev_mi,
+                       ref_frame, mbmi->ref_mvs[ref_frame],
+                       cm->ref_frame_sign_bias);
+
+      vp9_find_best_ref_mvs(xd,
+                            xd->pre.y_buffer,
+                            recon_y_stride,
+                            mbmi->ref_mvs[ref_frame],
+                            &best_mv, &nearest, &nearby);
+    }
+#endif
+
+    vp9_mv_ref_probs(&pbi->common, mv_ref_p, rct);
+
+    // Is the segment level mode feature enabled for this segment
+    if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) {
+      mbmi->mode =
+        vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
+    } else {
+#if CONFIG_SUPERBLOCKS
+      if (mbmi->encoded_as_sb) {
+        mbmi->mode = read_sb_mv_ref(bc, mv_ref_p);
+      } else
+#endif
+      mbmi->mode = read_mv_ref(bc, mv_ref_p);
+
+      vp9_accum_mv_refs(&pbi->common, mbmi->mode, rct);
+    }
+
+#if CONFIG_PRED_FILTER
+    if (mbmi->mode >= NEARESTMV && mbmi->mode < SPLITMV) {
+      // Is the prediction filter enabled
+      if (cm->pred_filter_mode == 2)
+        mbmi->pred_filter_enabled =
+          vp9_read(bc, cm->prob_pred_filter_off);
+      else
+        mbmi->pred_filter_enabled = cm->pred_filter_mode;
+    }
+#endif
+    if (mbmi->mode >= NEARESTMV && mbmi->mode <= SPLITMV) {
+      if (cm->mcomp_filter_type == SWITCHABLE) {
+        mbmi->interp_filter = vp9_switchable_interp[
+            treed_read(bc, vp9_switchable_interp_tree,
+                       vp9_get_pred_probs(cm, xd, PRED_SWITCHABLE_INTERP))];
+      } else {
+        mbmi->interp_filter = cm->mcomp_filter_type;
+      }
+    }
+
+    if (cm->comp_pred_mode == COMP_PREDICTION_ONLY ||
+        (cm->comp_pred_mode == HYBRID_PREDICTION &&
+         vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_COMP)))) {
+      /* Since we have 3 reference frames, we can only have 3 unique
+       * combinations of 2 different reference frames (A-G, G-L or
+       * A-L). In the bitstream, we use this to simply derive the second
+       * reference frame from the first reference frame, by saying it's
+       * the next one in the enumerator, and if that's > n_refs, then
+       * the second reference frame is the first one in the enumerator. */
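+      /* Example (illustrative): LAST (1) pairs with GOLDEN (2),
+       * GOLDEN (2) with ALTREF (3), and ALTREF (3) wraps back around
+       * to LAST (1). */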
+      mbmi->second_ref_frame = mbmi->ref_frame + 1;
+      if (mbmi->second_ref_frame == 4)
+        mbmi->second_ref_frame = 1;
+#if CONFIG_NEWBESTREFMV
+      if (mbmi->second_ref_frame) {
+        int second_ref_fb_idx;
+        /* Select the appropriate reference frame for this MB */
+        if (mbmi->second_ref_frame == LAST_FRAME)
+          second_ref_fb_idx = cm->lst_fb_idx;
+        else if (mbmi->second_ref_frame ==
+          GOLDEN_FRAME)
+          second_ref_fb_idx = cm->gld_fb_idx;
+        else
+          second_ref_fb_idx = cm->alt_fb_idx;
+
+        xd->second_pre.y_buffer =
+          cm->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
+        xd->second_pre.u_buffer =
+          cm->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
+        xd->second_pre.v_buffer =
+          cm->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
+        vp9_find_near_mvs(xd, mi, prev_mi,
+                          &nearest_second, &nearby_second, &best_mv_second,
+                          rct,
+                          mbmi->second_ref_frame,
+                          cm->ref_frame_sign_bias);
+
+        vp9_find_mv_refs(xd, mi, prev_mi,
+                         mbmi->second_ref_frame,
+                         mbmi->ref_mvs[mbmi->second_ref_frame],
+                         cm->ref_frame_sign_bias);
+
+        vp9_find_best_ref_mvs(xd,
+                              xd->second_pre.y_buffer,
+                              recon_y_stride,
+                              mbmi->ref_mvs[mbmi->second_ref_frame],
+                              &best_mv_second,
+                              &nearest_second,
+                              &nearby_second);
+      }
+#else
+      vp9_find_near_mvs(xd, mi, prev_mi,
+                        &nearest_second, &nearby_second, &best_mv_second,
+                        rct,
+                        mbmi->second_ref_frame,
+                        pbi->common.ref_frame_sign_bias);
+#endif
+    } else {
+      mbmi->second_ref_frame = 0;
+    }
+
+    mbmi->uv_mode = DC_PRED;
+    switch (mbmi->mode) {
+      case SPLITMV: {
+        const int s = mbmi->partitioning =
+                        treed_read(bc, vp9_mbsplit_tree, cm->fc.mbsplit_prob);
+        const int num_p = vp9_mbsplit_count[s];
+        int j = 0;
+        cm->fc.mbsplit_counts[s]++;
+
+        mbmi->need_to_clamp_mvs = 0;
+        do { /* for each subset j */
+          int_mv leftmv, abovemv, second_leftmv, second_abovemv;
+          int_mv blockmv, secondmv;
+          int k;  /* first block in subset j */
+          int mv_contz;
+          int blockmode;
+
+          k = vp9_mbsplit_offset[s][j];
+
+          leftmv.as_int = left_block_mv(mi, k);
+          abovemv.as_int = above_block_mv(mi, k, mis);
+          if (mbmi->second_ref_frame) {
+            second_leftmv.as_int = left_block_second_mv(mi, k);
+            second_abovemv.as_int = above_block_second_mv(mi, k, mis);
+          }
+          mv_contz = vp9_mv_cont(&leftmv, &abovemv);
+          blockmode = sub_mv_ref(bc, cm->fc.sub_mv_ref_prob[mv_contz]);
+          cm->fc.sub_mv_ref_counts[mv_contz][blockmode - LEFT4X4]++;
+
+          switch (blockmode) {
+            case NEW4X4:
+              read_nmv(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc);
+              read_nmv_fp(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc,
+                          xd->allow_high_precision_mv);
+              vp9_increment_nmv(&blockmv.as_mv, &best_mv.as_mv,
+                                &cm->fc.NMVcount, xd->allow_high_precision_mv);
+              blockmv.as_mv.row += best_mv.as_mv.row;
+              blockmv.as_mv.col += best_mv.as_mv.col;
+
+              if (mbmi->second_ref_frame) {
+                read_nmv(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc);
+                read_nmv_fp(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
+                            xd->allow_high_precision_mv);
+                vp9_increment_nmv(&secondmv.as_mv, &best_mv_second.as_mv,
+                                  &cm->fc.NMVcount, xd->allow_high_precision_mv);
+                secondmv.as_mv.row += best_mv_second.as_mv.row;
+                secondmv.as_mv.col += best_mv_second.as_mv.col;
+              }
+#ifdef VPX_MODE_COUNT
+              vp9_mv_cont_count[mv_contz][3]++;
+#endif
+              break;
+            case LEFT4X4:
+              blockmv.as_int = leftmv.as_int;
+              if (mbmi->second_ref_frame)
+                secondmv.as_int = second_leftmv.as_int;
+#ifdef VPX_MODE_COUNT
+              vp9_mv_cont_count[mv_contz][0]++;
+#endif
+              break;
+            case ABOVE4X4:
+              blockmv.as_int = abovemv.as_int;
+              if (mbmi->second_ref_frame)
+                secondmv.as_int = second_abovemv.as_int;
+#ifdef VPX_MODE_COUNT
+              vp9_mv_cont_count[mv_contz][1]++;
+#endif
+              break;
+            case ZERO4X4:
+              blockmv.as_int = 0;
+              if (mbmi->second_ref_frame)
+                secondmv.as_int = 0;
+#ifdef VPX_MODE_COUNT
+              vp9_mv_cont_count[mv_contz][2]++;
+#endif
+              break;
+            default:
+              break;
+          }
+
+          mbmi->need_to_clamp_mvs |= check_mv_bounds(&blockmv,
+                                                     mb_to_left_edge,
+                                                     mb_to_right_edge,
+                                                     mb_to_top_edge,
+                                                     mb_to_bottom_edge);
+          if (mbmi->second_ref_frame) {
+            mbmi->need_to_clamp_mvs |= check_mv_bounds(&secondmv,
+                                                       mb_to_left_edge,
+                                                       mb_to_right_edge,
+                                                       mb_to_top_edge,
+                                                       mb_to_bottom_edge);
+          }
+
+          {
+            /* Fill (uniform) modes, mvs of jth subset.
+             * Must do it here because ensuing subsets can
+             * refer back to us via "left" or "above". */
+            const unsigned char *fill_offset;
+            unsigned int fill_count = mbsplit_fill_count[s];
+
+            fill_offset = &mbsplit_fill_offset[s][(unsigned char)j * mbsplit_fill_count[s]];
+
+            do {
+              mi->bmi[*fill_offset].as_mv.first.as_int = blockmv.as_int;
+              if (mbmi->second_ref_frame)
+                mi->bmi[*fill_offset].as_mv.second.as_int = secondmv.as_int;
+              fill_offset++;
+            } while (--fill_count);
+          }
+
+        } while (++j < num_p);
+      }
+
+      mv->as_int = mi->bmi[15].as_mv.first.as_int;
+      mbmi->mv[1].as_int = mi->bmi[15].as_mv.second.as_int;
+
+      break;  /* done with SPLITMV */
+
+      case NEARMV:
+        mv->as_int = nearby.as_int;
+        /* Clip "next_nearest" so that it does not extend to far out of image */
+        clamp_mv(mv, mb_to_left_edge, mb_to_right_edge,
+                 mb_to_top_edge, mb_to_bottom_edge);
+        if (mbmi->second_ref_frame) {
+          mbmi->mv[1].as_int = nearby_second.as_int;
+          clamp_mv(&mbmi->mv[1], mb_to_left_edge, mb_to_right_edge,
+                   mb_to_top_edge, mb_to_bottom_edge);
+        }
+        break;
+
+      case NEARESTMV:
+        mv->as_int = nearest.as_int;
+        /* Clip "next_nearest" so that it does not extend to far out of image */
+        clamp_mv(mv, mb_to_left_edge, mb_to_right_edge,
+                 mb_to_top_edge, mb_to_bottom_edge);
+        if (mbmi->second_ref_frame) {
+          mbmi->mv[1].as_int = nearest_second.as_int;
+          clamp_mv(&mbmi->mv[1], mb_to_left_edge, mb_to_right_edge,
+                   mb_to_top_edge, mb_to_bottom_edge);
+        }
+        break;
+
+      case ZEROMV:
+        mv->as_int = 0;
+        if (mbmi->second_ref_frame)
+          mbmi->mv[1].as_int = 0;
+        break;
+
+      case NEWMV:
+
+#if CONFIG_NEW_MVREF
+        {
+          int best_index;
+          MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
+
+          // Decode the index of the choice.
+          best_index =
+            vp9_read_mv_ref_id(bc, xd->mb_mv_ref_id_probs[ref_frame]);
+
+          best_mv.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
+        }
+#endif
+
+        read_nmv(bc, &mv->as_mv, &best_mv.as_mv, nmvc);
+        read_nmv_fp(bc, &mv->as_mv, &best_mv.as_mv, nmvc,
+                    xd->allow_high_precision_mv);
+        vp9_increment_nmv(&mv->as_mv, &best_mv.as_mv, &cm->fc.NMVcount,
+                          xd->allow_high_precision_mv);
+
+        mv->as_mv.row += best_mv.as_mv.row;
+        mv->as_mv.col += best_mv.as_mv.col;
+
+        /* Don't need to check this on NEARMV and NEARESTMV modes
+         * since those modes clamp the MV. The NEWMV mode does not,
+         * so signal to the prediction stage whether special
+         * handling may be required.
+         */
+        mbmi->need_to_clamp_mvs = check_mv_bounds(mv,
+                                                  mb_to_left_edge,
+                                                  mb_to_right_edge,
+                                                  mb_to_top_edge,
+                                                  mb_to_bottom_edge);
+
+        if (mbmi->second_ref_frame) {
+#if CONFIG_NEW_MVREF
+          {
+            int best_index;
+            MV_REFERENCE_FRAME ref_frame = mbmi->second_ref_frame;
+
+            // Decode the index of the choice.
+            best_index =
+              vp9_read_mv_ref_id(bc, xd->mb_mv_ref_id_probs[ref_frame]);
+            best_mv_second.as_int =
+              mbmi->ref_mvs[ref_frame][best_index].as_int;
+          }
+#endif
+
+          read_nmv(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc);
+          read_nmv_fp(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc,
+                      xd->allow_high_precision_mv);
+          vp9_increment_nmv(&mbmi->mv[1].as_mv, &best_mv_second.as_mv,
+                            &cm->fc.NMVcount, xd->allow_high_precision_mv);
+          mbmi->mv[1].as_mv.row += best_mv_second.as_mv.row;
+          mbmi->mv[1].as_mv.col += best_mv_second.as_mv.col;
+          mbmi->need_to_clamp_secondmv |=
+            check_mv_bounds(&mbmi->mv[1],
+                            mb_to_left_edge, mb_to_right_edge,
+                            mb_to_top_edge, mb_to_bottom_edge);
+        }
+        break;
+      default:
+        ;
+#if CONFIG_DEBUG
+        assert(0);
+#endif
+    }
+  } else {
+    /* required for left and above block mv */
+    mbmi->mv[0].as_int = 0;
+
+    if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE))
+      mbmi->mode = (MB_PREDICTION_MODE)
+                   vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
+    else {
+      // FIXME write using SB mode tree
+      mbmi->mode = (MB_PREDICTION_MODE)
+                   read_ymode(bc, pbi->common.fc.ymode_prob);
+      pbi->common.fc.ymode_counts[mbmi->mode]++;
+    }
+#if CONFIG_COMP_INTRA_PRED
+    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+    // If MB mode is BPRED read the block modes
+    if (mbmi->mode == B_PRED) {
+      int j = 0;
+#if CONFIG_COMP_INTRA_PRED
+      int use_comp_pred = vp9_read(bc, 128);
+#endif
+      do {
+        mi->bmi[j].as_mode.first =
+          (B_PREDICTION_MODE)read_bmode(bc, pbi->common.fc.bmode_prob);
+        /*
+        {
+          int p;
+          for (p = 0; p < VP9_BINTRAMODES - 1; ++p)
+            printf(" %d", pbi->common.fc.bmode_prob[p]);
+          printf("\nbmode[%d][%d]: %d\n", pbi->common.current_video_frame, j, mi->bmi[j].as_mode.first);
+        }
+        */
+        pbi->common.fc.bmode_counts[mi->bmi[j].as_mode.first]++;
+#if CONFIG_COMP_INTRA_PRED
+        if (use_comp_pred) {
+          mi->bmi[j].as_mode.second =
+            (B_PREDICTION_MODE)read_bmode(bc, pbi->common.fc.bmode_prob);
+        } else {
+          mi->bmi[j].as_mode.second = (B_PREDICTION_MODE)(B_DC_PRED - 1);
+        }
+#endif
+      } while (++j < 16);
+    }
+
+    if (mbmi->mode == I8X8_PRED) {
+      int i;
+      int mode8x8;
+      for (i = 0; i < 4; i++) {
+        int ib = vp9_i8x8_block[i];
+        mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
+        mi->bmi[ib + 0].as_mode.first = mode8x8;
+        mi->bmi[ib + 1].as_mode.first = mode8x8;
+        mi->bmi[ib + 4].as_mode.first = mode8x8;
+        mi->bmi[ib + 5].as_mode.first = mode8x8;
+        pbi->common.fc.i8x8_mode_counts[mode8x8]++;
+#if CONFIG_COMP_INTRA_PRED
+        mi->bmi[ib + 0].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+        mi->bmi[ib + 1].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+        mi->bmi[ib + 4].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+        mi->bmi[ib + 5].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+      }
+    } else {
+      mbmi->uv_mode = (MB_PREDICTION_MODE)read_uv_mode(
+        bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);
+      pbi->common.fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
+    }
+
+#if CONFIG_COMP_INTRA_PRED
+    mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+  }
+
+#if CONFIG_SUPERBLOCKS
+  if (mbmi->encoded_as_sb)
+    mbmi->txfm_size = TX_8X8;
+  else
+#endif
+  if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
+      ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= I8X8_PRED) ||
+       (mbmi->ref_frame != INTRA_FRAME && !(mbmi->mode == SPLITMV &&
+                           mbmi->partitioning == PARTITIONING_4X4)))) {
+    // FIXME(rbultje) code ternary symbol once all experiments are merged
+    mbmi->txfm_size = vp9_read(bc, cm->prob_tx[0]);
+    if (mbmi->txfm_size != TX_4X4 && mbmi->mode != I8X8_PRED &&
+        mbmi->mode != SPLITMV)
+      mbmi->txfm_size += vp9_read(bc, cm->prob_tx[1]);
+  } else if (cm->txfm_mode >= ALLOW_16X16 &&
+      ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
+       (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
+    mbmi->txfm_size = TX_16X16;
+  } else if (cm->txfm_mode >= ALLOW_8X8 &&
+      (!(mbmi->ref_frame == INTRA_FRAME && mbmi->mode == B_PRED) &&
+       !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV &&
+         mbmi->partitioning == PARTITIONING_4X4))) {
+    mbmi->txfm_size = TX_8X8;
+  } else {
+    mbmi->txfm_size = TX_4X4;
+  }
+}
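+
+/* Note on the transform-size selection at the end of read_mb_modes_mv:
+ * under TX_MODE_SELECT a first bool read separates TX_4X4 from larger
+ * sizes and, for modes that permit 16x16, a second read separates
+ * TX_8X8 from TX_16X16; all other txfm_modes derive the size from the
+ * mode and partitioning alone, with no extra bits. */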
+
+void vp9_decode_mode_mvs_init(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
+  VP9_COMMON *cm = &pbi->common;
+
+  vpx_memset(cm->mbskip_pred_probs, 0, sizeof(cm->mbskip_pred_probs));
+  if (pbi->common.mb_no_coeff_skip) {
+    int k;
+    for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+      cm->mbskip_pred_probs[k] = (vp9_prob)vp9_read_literal(bc, 8);
+  }
+
+  mb_mode_mv_init(pbi, bc);
+}
+
+void vp9_decode_mb_mode_mv(VP9D_COMP *pbi,
+                           MACROBLOCKD *xd,
+                           int mb_row,
+                           int mb_col,
+                           BOOL_DECODER* const bc) {
+  MODE_INFO *mi = xd->mode_info_context;
+  MODE_INFO *prev_mi = xd->prev_mode_info_context;
+
+  if (pbi->common.frame_type == KEY_FRAME)
+    kfread_modes(pbi, mi, mb_row, mb_col, bc);
+  else
+    read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row, mb_col, bc);
+}
--- /dev/null
+++ b/vp9/decoder/decodemv.h
@@ -1,0 +1,19 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyxd_int.h"
+
+void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
+                           MACROBLOCKD* const xd,
+                           int mb_row,
+                           int mb_col,
+                           BOOL_DECODER* const bc);
+void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, BOOL_DECODER* const bc);
--- /dev/null
+++ b/vp9/decoder/decodframe.c
@@ -1,0 +1,1337 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyxd_int.h"
+#include "vp9/common/header.h"
+#include "vp9/common/reconintra.h"
+#include "vp9/common/reconintra4x4.h"
+#include "vp9/common/reconinter.h"
+#include "detokenize.h"
+#include "vp9/common/invtrans.h"
+#include "vp9/common/alloccommon.h"
+#include "vp9/common/entropymode.h"
+#include "vp9/common/quant_common.h"
+#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/yv12extend.h"
+#include "vp9/common/setupintrarecon.h"
+
+#include "decodemv.h"
+#include "vp9/common/extend.h"
+#include "vp9/common/modecont.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/idct.h"
+#include "dboolhuff.h"
+
+#include "vp9/common/seg_common.h"
+#include "vp9/common/entropy.h"
+#include "vpx_rtcd.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+
+#define COEFCOUNT_TESTING
+
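+/* merge_index() and inv_remap_prob() below invert the encoder-side
+ * remapping used by the probability diff update (a descriptive note,
+ * not normative): merge_index() reconstructs the raw index from its
+ * coded form, and inv_remap_prob() then recenters it around the
+ * previous probability value. */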
+static int merge_index(int v, int n, int modulus) {
+  int max1 = (n - 1 - modulus / 2) / modulus + 1;
+  if (v < max1) v = v * modulus + modulus / 2;
+  else {
+    int w;
+    v -= max1;
+    w = v;
+    v += (v + modulus - modulus / 2) / modulus;
+    while (v % modulus == modulus / 2 ||
+           w != v - (v + modulus - modulus / 2) / modulus) v++;
+  }
+  return v;
+}
+
+static int inv_remap_prob(int v, int m) {
+  const int n = 256;
+  const int modulus = MODULUS_PARAM;
+  int i;
+  v = merge_index(v, n - 1, modulus);
+  if ((m << 1) <= n) {
+    i = vp9_inv_recenter_nonneg(v + 1, m);
+  } else {
+    i = n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m);
+  }
+  return i;
+}
+
+static vp9_prob read_prob_diff_update(vp9_reader *const bc, int oldp) {
+  int delp = vp9_decode_term_subexp(bc, SUBEXP_PARAM, 255);
+  return (vp9_prob)inv_remap_prob(delp, oldp);
+}
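+
+/* Usage sketch (illustrative only; mirrors the coefficient-probability
+ * loops later in this file): each probability node carries a per-node
+ * update flag, and an updated node replaces its old value with one
+ * derived from a recentered, subexponentially coded difference. */
+#if 0
+static void example_update_prob(vp9_reader *const bc, vp9_prob *const p) {
+  if (vp9_read(bc, COEF_UPDATE_PROB))    /* per-node update flag */
+    *p = read_prob_diff_update(bc, *p);  /* delta around the old value */
+}
+#endif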
+
+void vp9_init_de_quantizer(VP9D_COMP *pbi) {
+  int i;
+  int Q;
+  VP9_COMMON *const pc = &pbi->common;
+
+  for (Q = 0; Q < QINDEX_RANGE; Q++) {
+    pc->Y1dequant[Q][0] = (short)vp9_dc_quant(Q, pc->y1dc_delta_q);
+    pc->Y2dequant[Q][0] = (short)vp9_dc2quant(Q, pc->y2dc_delta_q);
+    pc->UVdequant[Q][0] = (short)vp9_dc_uv_quant(Q, pc->uvdc_delta_q);
+
+    /* All the AC values use a common quantizer for this Q */
+    for (i = 1; i < 16; i++) {
+      int rc = vp9_default_zig_zag1d[i];
+
+      pc->Y1dequant[Q][rc] = (short)vp9_ac_yquant(Q);
+      pc->Y2dequant[Q][rc] = (short)vp9_ac2quant(Q, pc->y2ac_delta_q);
+      pc->UVdequant[Q][rc] = (short)vp9_ac_uv_quant(Q, pc->uvac_delta_q);
+    }
+  }
+}
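+
+/* Resulting table layout (per Q index): position 0 holds the DC
+ * dequantizer; the remaining zig-zag positions 1..15 all receive the
+ * corresponding AC dequantizer for that plane type. */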
+
+static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *xd) {
+  int i;
+  int QIndex;
+  VP9_COMMON *const pc = &pbi->common;
+  int segment_id = xd->mode_info_context->mbmi.segment_id;
+
+  // Set the Q baseline allowing for any segment level adjustment
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
+    /* Abs Value */
+    if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA)
+      QIndex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
+
+    /* Delta Value */
+    else {
+      QIndex = pc->base_qindex +
+               vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
+      /* Clamp to valid range */
+      QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;
+    }
+  } else
+    QIndex = pc->base_qindex;
+  xd->q_index = QIndex;
+
+  /* Set up the block level dequant pointers */
+  for (i = 0; i < 16; i++) {
+    xd->block[i].dequant = pc->Y1dequant[QIndex];
+  }
+
+#if CONFIG_LOSSLESS
+  if (!QIndex) {
+    pbi->common.rtcd.idct.idct1        = vp9_short_inv_walsh4x4_1_x8_c;
+    pbi->common.rtcd.idct.idct16       = vp9_short_inv_walsh4x4_x8_c;
+    pbi->common.rtcd.idct.idct1_scalar_add  = vp9_dc_only_inv_walsh_add_c;
+    pbi->common.rtcd.idct.iwalsh1      = vp9_short_inv_walsh4x4_1_lossless_c;
+    pbi->common.rtcd.idct.iwalsh16     = vp9_short_inv_walsh4x4_lossless_c;
+    pbi->idct_add            = vp9_dequant_idct_add_lossless_c;
+    pbi->dc_idct_add         = vp9_dequant_dc_idct_add_lossless_c;
+    pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block_lossless_c;
+    pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block_lossless_c;
+    pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block_lossless_c;
+  } else {
+    pbi->common.rtcd.idct.idct1        = vp9_short_idct4x4llm_1_c;
+    pbi->common.rtcd.idct.idct16       = vp9_short_idct4x4llm_c;
+    pbi->common.rtcd.idct.idct1_scalar_add  = vp9_dc_only_idct_add_c;
+    pbi->common.rtcd.idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
+    pbi->common.rtcd.idct.iwalsh16     = vp9_short_inv_walsh4x4_c;
+    pbi->idct_add            = vp9_dequant_idct_add;
+    pbi->dc_idct_add         = vp9_dequant_dc_idct_add;
+    pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
+    pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block;
+    pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block;
+  }
+#else
+  pbi->idct_add            = vp9_dequant_idct_add;
+  pbi->dc_idct_add         = vp9_dequant_dc_idct_add;
+  pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
+  pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block;
+  pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block;
+#endif
+
+  for (i = 16; i < 24; i++) {
+    xd->block[i].dequant = pc->UVdequant[QIndex];
+  }
+
+  xd->block[24].dequant = pc->Y2dequant[QIndex];
+}
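+
+/* Per-block wiring established above: blocks 0..15 (luma) point at
+ * Y1dequant, blocks 16..23 (chroma) at UVdequant, and block 24 (the
+ * second-order DC block) at Y2dequant, all for the segment's QIndex. */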
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD_VTABLE(x) (&(pbi)->common.rtcd.x)
+#else
+#define RTCD_VTABLE(x) NULL
+#endif
+
+/* skip_recon_mb() is modified: instead of writing the result to the
+ * predictor buffer and then copying it to the dst buffer, we write it
+ * directly to the dst buffer, eliminating an unnecessary copy.
+ */
+static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) {
+  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      vp9_build_intra_predictors_sbuv_s(xd);
+      vp9_build_intra_predictors_sby_s(xd);
+    } else {
+#endif
+    vp9_build_intra_predictors_mbuv_s(xd);
+    vp9_build_intra_predictors_mby_s(xd);
+#if CONFIG_SUPERBLOCKS
+    }
+#endif
+  } else {
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+                                         xd->dst.u_buffer, xd->dst.v_buffer,
+                                         xd->dst.y_stride, xd->dst.uv_stride);
+    } else {
+#endif
+    vp9_build_1st_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+                                           xd->dst.u_buffer, xd->dst.v_buffer,
+                                           xd->dst.y_stride, xd->dst.uv_stride);
+
+    if (xd->mode_info_context->mbmi.second_ref_frame) {
+      vp9_build_2nd_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+                                             xd->dst.u_buffer, xd->dst.v_buffer,
+                                             xd->dst.y_stride, xd->dst.uv_stride);
+    }
+#if CONFIG_SUPERBLOCKS
+    }
+#endif
+  }
+}
+
+static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                              int mb_row, unsigned int mb_col,
+                              BOOL_DECODER* const bc) {
+  int eobtotal = 0;
+  MB_PREDICTION_MODE mode;
+  int i;
+  int tx_size;
+  TX_TYPE tx_type;
+  VP9_COMMON *pc = &pbi->common;
+#if CONFIG_SUPERBLOCKS
+  int orig_skip_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
+#endif
+
+  // re-initialize macroblock dequantizer before detokenization
+  if (xd->segmentation_enabled)
+    mb_init_dequantizer(pbi, xd);
+
+  tx_size = xd->mode_info_context->mbmi.txfm_size;
+  mode = xd->mode_info_context->mbmi.mode;
+
+  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
+    vp9_reset_mb_tokens_context(xd);
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb &&
+        (mb_col < pc->mb_cols - 1 || mb_row < pc->mb_rows - 1)) {
+      if (mb_col < pc->mb_cols - 1)
+        xd->above_context++;
+      if (mb_row < pc->mb_rows - 1)
+        xd->left_context++;
+      vp9_reset_mb_tokens_context(xd);
+      if (mb_col < pc->mb_cols - 1)
+        xd->above_context--;
+      if (mb_row < pc->mb_rows - 1)
+        xd->left_context--;
+    }
+#endif
+  } else if (!bool_error(bc)) {
+    for (i = 0; i < 25; i++) {
+      xd->block[i].eob = 0;
+      xd->eobs[i] = 0;
+    }
+    if (tx_size == TX_16X16) {
+      eobtotal = vp9_decode_mb_tokens_16x16(pbi, xd, bc);
+    } else if (tx_size == TX_8X8) {
+      eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
+    } else {
+      eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
+    }
+  }
+
+  //mode = xd->mode_info_context->mbmi.mode;
+  if (pbi->common.frame_type != KEY_FRAME)
+    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter,
+                             &pbi->common);
+
+  if (eobtotal == 0 && mode != B_PRED && mode != SPLITMV
+      && mode != I8X8_PRED
+      && !bool_error(bc)) {
+    /* Special case: Force the loopfilter to skip when eobtotal and
+     * mb_skip_coeff are zero.
+     */
+    xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+
+#if CONFIG_SUPERBLOCKS
+    if (!xd->mode_info_context->mbmi.encoded_as_sb || orig_skip_flag)
+#endif
+    {
+      skip_recon_mb(pbi, xd);
+      return;
+    }
+  }
+
+  // moved to be performed before detokenization
+//  if (xd->segmentation_enabled)
+//    mb_init_dequantizer(pbi, xd);
+
+  /* do prediction */
+  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      vp9_build_intra_predictors_sby_s(xd);
+      vp9_build_intra_predictors_sbuv_s(xd);
+    } else
+#endif
+    if (mode != I8X8_PRED) {
+      vp9_build_intra_predictors_mbuv(xd);
+      if (mode != B_PRED) {
+        vp9_build_intra_predictors_mby(xd);
+      }
+    }
+  } else {
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+                                         xd->dst.u_buffer, xd->dst.v_buffer,
+                                         xd->dst.y_stride, xd->dst.uv_stride);
+    } else
+#endif
+    vp9_build_inter_predictors_mb(xd);
+  }
+
+  /* dequantization and idct */
+  if (mode == I8X8_PRED) {
+    for (i = 0; i < 4; i++) {
+      int ib = vp9_i8x8_block[i];
+      const int iblock[4] = {0, 1, 4, 5};
+      int j;
+      int i8x8mode;
+      BLOCKD *b;
+
+      int idx = (ib & 0x02) ? (ib + 2) : ib;
+
+      short *q  = xd->block[idx].qcoeff;
+      short *dq = xd->block[0].dequant;
+      unsigned char *pre = xd->block[ib].predictor;
+      unsigned char *dst = *(xd->block[ib].base_dst) + xd->block[ib].dst;
+      int stride = xd->dst.y_stride;
+
+      b = &xd->block[ib];
+      i8x8mode = b->bmi.as_mode.first;
+      vp9_intra8x8_predict(b, i8x8mode, b->predictor);
+
+      if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
+        tx_type = get_tx_type(xd, &xd->block[idx]);
+        if (tx_type != DCT_DCT) {
+          vp9_ht_dequant_idct_add_8x8_c(tx_type,
+                                        q, dq, pre, dst, 16, stride);
+        } else {
+          vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
+        }
+        q += 64;
+      } else {
+        for (j = 0; j < 4; j++) {
+          b = &xd->block[ib + iblock[j]];
+          vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
+                                 *(b->base_dst) + b->dst, 16, b->dst_stride);
+        }
+      }
+      b = &xd->block[16 + i];
+      vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor);
+      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
+                    *(b->base_dst) + b->dst, 8, b->dst_stride);
+      b = &xd->block[20 + i];
+      vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor);
+      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,
+                    *(b->base_dst) + b->dst, 8, b->dst_stride);
+    }
+  } else if (mode == B_PRED) {
+    for (i = 0; i < 16; i++) {
+      BLOCKD *b = &xd->block[i];
+      int b_mode = xd->mode_info_context->bmi[i].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+      int b_mode2 = xd->mode_info_context->bmi[i].as_mode.second;
+
+      if (b_mode2 == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
+#endif
+        vp9_intra4x4_predict(b, b_mode, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+      } else {
+        vp9_comp_intra4x4_predict(b, b_mode, b_mode2, b->predictor);
+      }
+#endif
+
+      tx_type = get_tx_type(xd, b);
+      if (tx_type != DCT_DCT) {
+        vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
+                                  b->dequant, b->predictor,
+                                  *(b->base_dst) + b->dst, 16, b->dst_stride);
+      } else {
+        vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
+                               *(b->base_dst) + b->dst, 16, b->dst_stride);
+      }
+    }
+  } else if (mode == SPLITMV) {
+    if (tx_size == TX_8X8) {
+      vp9_dequant_idct_add_y_block_8x8(xd->qcoeff, xd->block[0].dequant,
+                                         xd->predictor, xd->dst.y_buffer,
+                                         xd->dst.y_stride, xd->eobs, xd);
+    } else {
+      pbi->idct_add_y_block(xd->qcoeff, xd->block[0].dequant,
+                                       xd->predictor, xd->dst.y_buffer,
+                                       xd->dst.y_stride, xd->eobs);
+    }
+  } else {
+    BLOCKD *b = &xd->block[24];
+
+    if (tx_size == TX_16X16) {
+      BLOCKD *bd = &xd->block[0];
+      tx_type = get_tx_type(xd, bd);
+      if (tx_type != DCT_DCT) {
+        vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff,
+                                        xd->block[0].dequant, xd->predictor,
+                                        xd->dst.y_buffer, 16, xd->dst.y_stride);
+      } else {
+        vp9_dequant_idct_add_16x16(xd->qcoeff, xd->block[0].dequant,
+                                     xd->predictor, xd->dst.y_buffer,
+                                     16, xd->dst.y_stride);
+      }
+    } else if (tx_size == TX_8X8) {
+#if CONFIG_SUPERBLOCKS
+      void *orig = xd->mode_info_context;
+      int n, num = xd->mode_info_context->mbmi.encoded_as_sb ? 4 : 1;
+      for (n = 0; n < num; n++) {
+        int x_idx = n & 1, y_idx = n >> 1;
+        if (num == 4 && (mb_col + x_idx >= pc->mb_cols ||
+                         mb_row + y_idx >= pc->mb_rows))
+          continue;
+
+        if (n != 0) {
+          for (i = 0; i < 25; i++) {
+            xd->block[i].eob = 0;
+            xd->eobs[i] = 0;
+          }
+          xd->above_context = pc->above_context + mb_col + (n & 1);
+          xd->left_context = pc->left_context + (n >> 1);
+          xd->mode_info_context = orig;
+          xd->mode_info_context += (n & 1);
+          xd->mode_info_context += (n >> 1) * pc->mode_info_stride;
+          if (!orig_skip_flag) {
+            eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
+            if (eobtotal == 0) // skip loopfilter
+              xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+          } else {
+            vp9_reset_mb_tokens_context(xd);
+          }
+        }
+
+        if (xd->mode_info_context->mbmi.mb_skip_coeff)
+          continue; // only happens for SBs, which are already in dest buffer
+#endif
+      vp9_dequantize_b_2x2(b);
+      IDCT_INVOKE(RTCD_VTABLE(idct), ihaar2)(&b->dqcoeff[0], b->diff, 8);
+      // The 2nd order block is zeroed after the inverse transform.
+      ((int *)b->qcoeff)[0] = 0;
+      ((int *)b->qcoeff)[1] = 0;
+      ((int *)b->qcoeff)[2] = 0;
+      ((int *)b->qcoeff)[3] = 0;
+      ((int *)b->qcoeff)[4] = 0;
+      ((int *)b->qcoeff)[5] = 0;
+      ((int *)b->qcoeff)[6] = 0;
+      ((int *)b->qcoeff)[7] = 0;
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(xd->qcoeff,
+          xd->block[0].dequant,
+          xd->dst.y_buffer + (n >> 1) * 16 * xd->dst.y_stride + (n & 1) * 16,
+          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+        // do UV inline also
+        vp9_dequant_idct_add_uv_block_8x8_inplace_c(xd->qcoeff + 16 * 16,
+          xd->block[16].dequant,
+          xd->dst.u_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
+          xd->dst.v_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
+          xd->dst.uv_stride, xd->eobs + 16, xd);
+      } else
+#endif
+        vp9_dequant_dc_idct_add_y_block_8x8(xd->qcoeff,
+          xd->block[0].dequant, xd->predictor, xd->dst.y_buffer,
+          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+#if CONFIG_SUPERBLOCKS
+      }
+      xd->mode_info_context = orig;
+#endif
+    } else {
+      vp9_dequantize_b(b);
+      if (xd->eobs[24] > 1) {
+        IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
+        ((int *)b->qcoeff)[0] = 0;
+        ((int *)b->qcoeff)[1] = 0;
+        ((int *)b->qcoeff)[2] = 0;
+        ((int *)b->qcoeff)[3] = 0;
+        ((int *)b->qcoeff)[4] = 0;
+        ((int *)b->qcoeff)[5] = 0;
+        ((int *)b->qcoeff)[6] = 0;
+        ((int *)b->qcoeff)[7] = 0;
+      } else {
+        IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
+        ((int *)b->qcoeff)[0] = 0;
+      }
+
+      pbi->dc_idct_add_y_block(xd->qcoeff, xd->block[0].dequant, xd->predictor,
+                               xd->dst.y_buffer, xd->dst.y_stride, xd->eobs,
+                               xd->block[24].diff);
+    }
+  }
+
+#if CONFIG_SUPERBLOCKS
+  if (!xd->mode_info_context->mbmi.encoded_as_sb) {
+#endif
+    if ((tx_size == TX_8X8 &&
+         xd->mode_info_context->mbmi.mode != I8X8_PRED &&
+         xd->mode_info_context->mbmi.mode != SPLITMV)
+        || tx_size == TX_16X16
+       )
+      vp9_dequant_idct_add_uv_block_8x8
+          (xd->qcoeff + 16 * 16, xd->block[16].dequant,
+           xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
+           xd->dst.uv_stride, xd->eobs + 16, xd);
+    else if (xd->mode_info_context->mbmi.mode != I8X8_PRED)
+      pbi->idct_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,
+           xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
+           xd->dst.uv_stride, xd->eobs + 16);
+#if CONFIG_SUPERBLOCKS
+  }
+#endif
+}
+
+
+static int get_delta_q(vp9_reader *bc, int prev, int *q_update) {
+  int ret_val = 0;
+
+  if (vp9_read_bit(bc)) {
+    ret_val = vp9_read_literal(bc, 4);
+
+    if (vp9_read_bit(bc))
+      ret_val = -ret_val;
+  }
+
+  /* Trigger a quantizer update if the delta-q value has changed */
+  if (ret_val != prev)
+    *q_update = 1;
+
+  return ret_val;
+}
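+
+/* Wire-format example (illustrative): one flag bit, then, when set, a
+ * 4-bit magnitude and a sign bit, giving deltas in [-15, 15]; e.g. the
+ * bits 1, 0101, 1 decode to -5. */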
+
+#ifdef PACKET_TESTING
+#include <stdio.h>
+FILE *vpxlog = 0;
+#endif
+
+/* Decode a row of Superblocks (2x2 region of MBs) */
+static void
+decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc, int mbrow, MACROBLOCKD *xd,
+              BOOL_DECODER* const bc) {
+  int i;
+  int sb_col;
+  int mb_row, mb_col;
+  int recon_yoffset, recon_uvoffset;
+  int ref_fb_idx = pc->lst_fb_idx;
+  int dst_fb_idx = pc->new_fb_idx;
+  int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
+  int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
+  int row_delta[4] = {  0, +1,  0, -1 };
+  int col_delta[4] = { +1, -1, +1, +1 };
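+  /* The delta tables implement the 2x2 scan used below: from the SB's
+   * top-left MB, step (dx, dy) = (+1, 0) to the top-right, (-1, +1) to
+   * the bottom-left, (+1, 0) to the bottom-right, and (+1, -1) up into
+   * the next SB's top-left column. */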
+  int sb_cols = (pc->mb_cols + 1) >> 1;
+
+  // For a SB there are 2 left contexts, each pertaining to one MB row
+  // within the SB
+  vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
+
+  mb_row = mbrow;
+  mb_col = 0;
+
+  for (sb_col = 0; sb_col < sb_cols; sb_col++) {
+    MODE_INFO *mi = xd->mode_info_context;
+
+#if CONFIG_SUPERBLOCKS
+    mi->mbmi.encoded_as_sb = vp9_read(bc, pc->sb_coded);
+#endif
+
+    // Process the 4 MBs within the SB in the order:
+    // top-left, top-right, bottom-left, bottom-right
+    for (i = 0; i < 4; i++) {
+      int dy = row_delta[i];
+      int dx = col_delta[i];
+      int offset_extended = dy * xd->mode_info_stride + dx;
+
+      xd->mb_index = i;
+
+      mi = xd->mode_info_context;
+      if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
+        // MB lies outside frame, skip on to next
+        mb_row += dy;
+        mb_col += dx;
+        xd->mode_info_context += offset_extended;
+        xd->prev_mode_info_context += offset_extended;
+        continue;
+      }
+
+      // Set above context pointer
+      xd->above_context = pc->above_context + mb_col;
+      xd->left_context = pc->left_context + (i >> 1);
+
+      /* Distance of Mb to the various image edges.
+       * These are specified to 8th pel as they are always compared to
+       * values that are in 1/8th pel units
+       */
+      xd->mb_to_top_edge = -((mb_row * 16) << 3);
+      xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+      xd->mb_to_left_edge = -((mb_col * 16) << 3);
+      xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+
+      xd->up_available = (mb_row != 0);
+      xd->left_available = (mb_col != 0);
+
+      recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+      recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
+
+      xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+      xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+      xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+
+#if CONFIG_SUPERBLOCKS
+      if (i)
+        mi->mbmi.encoded_as_sb = 0;
+#endif
+      vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc);
+
+      update_blockd_bmi(xd);
+
+      /* Select the appropriate reference frame for this MB */
+      if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
+        ref_fb_idx = pc->lst_fb_idx;
+      else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
+        ref_fb_idx = pc->gld_fb_idx;
+      else
+        ref_fb_idx = pc->alt_fb_idx;
+
+      xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+      xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+      xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+      if (xd->mode_info_context->mbmi.second_ref_frame) {
+        int second_ref_fb_idx;
+
+        /* Select the appropriate reference frame for this MB */
+        if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
+          second_ref_fb_idx = pc->lst_fb_idx;
+        else if (xd->mode_info_context->mbmi.second_ref_frame ==
+                 GOLDEN_FRAME)
+          second_ref_fb_idx = pc->gld_fb_idx;
+        else
+          second_ref_fb_idx = pc->alt_fb_idx;
+
+        xd->second_pre.y_buffer =
+          pc->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
+        xd->second_pre.u_buffer =
+          pc->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
+        xd->second_pre.v_buffer =
+          pc->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
+      }
+
+      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
+        /* propagate errors from reference frames */
+        xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
+      }
+
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        if (mb_col < pc->mb_cols - 1)
+          mi[1] = mi[0];
+        if (mb_row < pc->mb_rows - 1) {
+          mi[pc->mode_info_stride] = mi[0];
+          if (mb_col < pc->mb_cols - 1)
+            mi[pc->mode_info_stride + 1] = mi[0];
+        }
+      }
+#endif
+      vp9_intra_prediction_down_copy(xd);
+      decode_macroblock(pbi, xd, mb_row, mb_col, bc);
+
+      /* check if the boolean decoder has suffered an error */
+      xd->corrupted |= bool_error(bc);
+
+#if CONFIG_SUPERBLOCKS
+      if (mi->mbmi.encoded_as_sb) {
+        assert(!i);
+        mb_col += 2;
+        xd->mode_info_context += 2;
+        xd->prev_mode_info_context += 2;
+        break;
+      }
+#endif
+
+      // skip to next MB
+      xd->mode_info_context += offset_extended;
+      xd->prev_mode_info_context += offset_extended;
+      mb_row += dy;
+      mb_col += dx;
+    }
+  }
+
+  /* skip prediction column */
+  xd->mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
+  xd->prev_mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
+}
+
+static unsigned int read_partition_size(const unsigned char *cx_size) {
+  const unsigned int size =
+    cx_size[0] + (cx_size[1] << 8) + (cx_size[2] << 16);
+  return size;
+}
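+
+/* The partition size is stored as a 24-bit little-endian value; e.g.
+ * the bytes {0x34, 0x12, 0x00} decode to 0x001234 (4660). */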
+
+static int read_is_valid(const unsigned char *start,
+                         size_t               len,
+                         const unsigned char *end) {
+  return (start + len > start && start + len <= end);
+}
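+
+/* Note: the 'start + len > start' term rejects zero-length reads (and,
+ * in practice, pointer wrap-around) before the range check against
+ * 'end'. */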
+
+
+static void setup_token_decoder(VP9D_COMP *pbi,
+                                const unsigned char *cx_data,
+                                BOOL_DECODER* const bool_decoder) {
+  VP9_COMMON          *pc = &pbi->common;
+  const unsigned char *user_data_end = pbi->Source + pbi->source_sz;
+  const unsigned char *partition;
+
+  ptrdiff_t            partition_size;
+  ptrdiff_t            bytes_left;
+
+  // Set up pointers to token partition
+  partition = cx_data;
+  bytes_left = user_data_end - partition;
+  partition_size = bytes_left;
+
+  /* Validate the calculated partition length. If the buffer
+   * described by the partition can't be fully read, then restrict
+   * it to the portion that can be (for EC mode) or throw an error.
+   */
+  if (!read_is_valid(partition, partition_size, user_data_end)) {
+    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Truncated packet or corrupt partition "
+                       "%d length", 1);
+  }
+
+  if (vp9_start_decode(bool_decoder, partition, partition_size))
+    vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate bool decoder %d", 1);
+}
+
+static void init_frame(VP9D_COMP *pbi) {
+  VP9_COMMON *const pc = &pbi->common;
+  MACROBLOCKD *const xd  = &pbi->mb;
+
+  if (pc->frame_type == KEY_FRAME) {
+    /* Various keyframe initializations */
+    vp9_init_mv_probs(pc);
+
+    vp9_init_mbmode_probs(pc);
+    vp9_default_bmode_probs(pc->fc.bmode_prob);
+
+    vp9_default_coef_probs(pc);
+    vp9_kf_default_bmode_probs(pc->kf_bmode_prob);
+
+    // Reset the segment feature data to the default stats:
+    // Features disabled, 0, with delta coding (Default state).
+    vp9_clearall_segfeatures(xd);
+
+    xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
+
+    /* Reset the mode/ref deltas for the loop filter */
+    vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
+    vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+
+    /* All buffers are implicitly updated on key frames. */
+    pc->refresh_golden_frame = 1;
+    pc->refresh_alt_ref_frame = 1;
+    pc->copy_buffer_to_gf = 0;
+    pc->copy_buffer_to_arf = 0;
+
+    /* Note that Golden and Altref modes cannot be used on a key frame so
+     * ref_frame_sign_bias[] is undefined and meaningless
+     */
+    pc->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
+    pc->ref_frame_sign_bias[ALTREF_FRAME] = 0;
+
+    vp9_init_mode_contexts(&pbi->common);
+    vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+    vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc));
+
+    vpx_memcpy(pbi->common.fc.vp8_mode_contexts,
+               pbi->common.fc.mode_context,
+               sizeof(pbi->common.fc.mode_context));
+    vpx_memset(pc->prev_mip, 0,
+               (pc->mb_cols + 1) * (pc->mb_rows + 1)* sizeof(MODE_INFO));
+    vpx_memset(pc->mip, 0,
+               (pc->mb_cols + 1) * (pc->mb_rows + 1)* sizeof(MODE_INFO));
+
+    vp9_update_mode_info_border(pc, pc->mip);
+    vp9_update_mode_info_in_image(pc, pc->mi);
+
+  } else {
+
+    if (!pc->use_bilinear_mc_filter)
+      pc->mcomp_filter_type = EIGHTTAP;
+    else
+      pc->mcomp_filter_type = BILINEAR;
+
+    /* To enable choice of different interpolation filters */
+    vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
+  }
+
+  xd->mode_info_context = pc->mi;
+  xd->prev_mode_info_context = pc->prev_mi;
+  xd->frame_type = pc->frame_type;
+  xd->mode_info_context->mbmi.mode = DC_PRED;
+  xd->mode_info_stride = pc->mode_info_stride;
+  xd->corrupted = 0; /* init without corruption */
+
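+  /* In full-pixel-only mode the three fractional (eighth-pel) bits of each
+   * motion vector component are masked off.
+   */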
+  xd->fullpixel_mask = 0xffffffff;
+  if (pc->full_pixel)
+    xd->fullpixel_mask = 0xfffffff8;
+
+}
+
+#if 0
+static void read_coef_probs2(VP9D_COMP *pbi) {
+  const vp9_prob grpupd = 192;
+  int i, j, k, l;
+  vp9_reader *const bc = &pbi->bc;
+  VP9_COMMON *const pc = &pbi->common;
+  for (l = 0; l < ENTROPY_NODES; l++) {
+    if (vp9_read(bc, grpupd)) {
+      // printf("Decoding %d\n", l);
+      for (i = 0; i < BLOCK_TYPES; i++)
+        for (j = !i; j < COEF_BANDS; j++)
+          for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+            if (k >= 3 && ((i == 0 && j == 1) ||
+                           (i > 0 && j == 0)))
+              continue;
+            {
+              vp9_prob *const p = pc->fc.coef_probs [i][j][k] + l;
+              int u = vp9_read(bc, COEF_UPDATE_PROB);
+              if (u) *p = read_prob_diff_update(bc, *p);
+            }
+          }
+    }
+  }
+  if (pbi->common.txfm_mode == ALLOW_8X8) {
+    for (l = 0; l < ENTROPY_NODES; l++) {
+      if (vp9_read(bc, grpupd)) {
+        for (i = 0; i < BLOCK_TYPES_8X8; i++)
+          for (j = !i; j < COEF_BANDS; j++)
+            for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+              if (k >= 3 && ((i == 0 && j == 1) ||
+                             (i > 0 && j == 0)))
+                continue;
+              {
+                vp9_prob *const p = pc->fc.coef_probs_8x8 [i][j][k] + l;
+
+                int u = vp9_read(bc, COEF_UPDATE_PROB_8X8);
+                if (u) *p = read_prob_diff_update(bc, *p);
+              }
+            }
+      }
+    }
+  }
+}
+#endif
+
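+/* If the per-table update flag is set, each node probability may be
+ * individually replaced by a differentially coded update, signalled per
+ * probability with COEF_UPDATE_PROB.
+ */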
+static void read_coef_probs_common(
+    BOOL_DECODER* const bc,
+    vp9_prob coef_probs[BLOCK_TYPES][COEF_BANDS]
+                       [PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
+  int i, j, k, l;
+
+  if (vp9_read_bit(bc)) {
+    for (i = 0; i < BLOCK_TYPES; i++) {
+      for (j = !i; j < COEF_BANDS; j++) {
+        /* NB: This j loop starts from 1 on block type i == 0 */
+        for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+          if (k >= 3 && ((i == 0 && j == 1) ||
+                         (i > 0 && j == 0)))
+            continue;
+          for (l = 0; l < ENTROPY_NODES; l++) {
+            vp9_prob *const p = coef_probs[i][j][k] + l;
+
+            if (vp9_read(bc, COEF_UPDATE_PROB)) {
+              *p = read_prob_diff_update(bc, *p);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
+  VP9_COMMON *const pc = &pbi->common;
+
+  read_coef_probs_common(bc, pc->fc.coef_probs);
+  read_coef_probs_common(bc, pc->fc.hybrid_coef_probs);
+
+  if (pbi->common.txfm_mode != ONLY_4X4) {
+    read_coef_probs_common(bc, pc->fc.coef_probs_8x8);
+    read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_8x8);
+  }
+  if (pbi->common.txfm_mode > ALLOW_8X8) {
+    read_coef_probs_common(bc, pc->fc.coef_probs_16x16);
+    read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_16x16);
+  }
+}
+
+int vp9_decode_frame(VP9D_COMP *pbi) {
+  BOOL_DECODER header_bc, residual_bc;
+  VP9_COMMON *const pc = &pbi->common;
+  MACROBLOCKD *const xd  = &pbi->mb;
+  const unsigned char *data = (const unsigned char *)pbi->Source;
+  const unsigned char *data_end = data + pbi->source_sz;
+  ptrdiff_t first_partition_length_in_bytes = 0;
+
+  int mb_row;
+  int i, j;
+  int corrupt_tokens = 0;
+
+  /* start with no corruption of current frame */
+  xd->corrupted = 0;
+  pc->yv12_fb[pc->new_fb_idx].corrupted = 0;
+
+  if (data_end - data < 3) {
+    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Truncated packet");
+  } else {
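+    /* The first three bytes form an uncompressed header: bit 0 is the
+     * key-frame flag, bits 1-3 the version, bit 4 show_frame, and the
+     * remaining 19 bits the size of the first partition.
+     */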
+    pc->last_frame_type = pc->frame_type;
+    pc->frame_type = (FRAME_TYPE)(data[0] & 1);
+    pc->version = (data[0] >> 1) & 7;
+    pc->show_frame = (data[0] >> 4) & 1;
+    first_partition_length_in_bytes =
+      (data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
+
+    if (data + first_partition_length_in_bytes > data_end ||
+        data + first_partition_length_in_bytes < data)
+      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Truncated packet or corrupt partition 0 length");
+
+    data += 3;
+
+    vp9_setup_version(pc);
+
+    if (pc->frame_type == KEY_FRAME) {
+      const int Width = pc->Width;
+      const int Height = pc->Height;
+
+      /* vet via sync code */
+      /* When error concealment is enabled we should only check the sync
+       * code if we have enough bits available
+       */
+      if (data + 3 < data_end) {
+        if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a)
+          vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
+                             "Invalid frame sync code");
+      }
+
+      /* If error concealment is enabled we should only parse the new size
+       * if we have enough data. Otherwise we will end up with the wrong
+       * size.
+       */
+      if (data + 6 < data_end) {
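+        /* Each dimension is a 14-bit value; the top two bits of its high
+         * byte carry the corresponding scale factor.
+         */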
+        pc->Width = (data[3] | (data[4] << 8)) & 0x3fff;
+        pc->horiz_scale = data[4] >> 6;
+        pc->Height = (data[5] | (data[6] << 8)) & 0x3fff;
+        pc->vert_scale = data[6] >> 6;
+      }
+      data += 7;
+
+      if (Width != pc->Width  ||  Height != pc->Height) {
+        if (pc->Width <= 0) {
+          pc->Width = Width;
+          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                             "Invalid frame width");
+        }
+
+        if (pc->Height <= 0) {
+          pc->Height = Height;
+          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                             "Invalid frame height");
+        }
+
+        if (vp9_alloc_frame_buffers(pc, pc->Width, pc->Height))
+          vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                             "Failed to allocate frame buffers");
+      }
+    }
+  }
+
+  if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME) ||
+      pc->Width == 0 || pc->Height == 0) {
+    return -1;
+  }
+
+  init_frame(pbi);
+
+  if (vp9_start_decode(&header_bc, data, first_partition_length_in_bytes))
+    vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate bool decoder 0");
+  if (pc->frame_type == KEY_FRAME) {
+    pc->clr_type    = (YUV_TYPE)vp9_read_bit(&header_bc);
+    pc->clamp_type  = (CLAMP_TYPE)vp9_read_bit(&header_bc);
+  }
+
+  /* Is segmentation enabled */
+  xd->segmentation_enabled = (unsigned char)vp9_read_bit(&header_bc);
+
+  if (xd->segmentation_enabled) {
+    // Read whether or not the segmentation map is being explicitly
+    // updated this frame.
+    xd->update_mb_segmentation_map = (unsigned char)vp9_read_bit(&header_bc);
+
+    // If so what method will be used.
+    if (xd->update_mb_segmentation_map) {
+      // Which macro block level features are enabled
+
+      // Read the probs used to decode the segment id for each macro
+      // block.
+      for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
+        xd->mb_segment_tree_probs[i] = vp9_read_bit(&header_bc) ?
+            (vp9_prob)vp9_read_literal(&header_bc, 8) : 255;
+      }
+
+      // Read the prediction probs needed to decode the segment id
+      pc->temporal_update = (unsigned char)vp9_read_bit(&header_bc);
+      for (i = 0; i < PREDICTION_PROBS; i++) {
+        if (pc->temporal_update) {
+          pc->segment_pred_probs[i] = vp9_read_bit(&header_bc) ?
+              (vp9_prob)vp9_read_literal(&header_bc, 8) : 255;
+        } else {
+          pc->segment_pred_probs[i] = 255;
+        }
+      }
+    }
+    // Is the segment data being updated
+    xd->update_mb_segmentation_data = (unsigned char)vp9_read_bit(&header_bc);
+
+    if (xd->update_mb_segmentation_data) {
+      int data;
+
+      xd->mb_segment_abs_delta = (unsigned char)vp9_read_bit(&header_bc);
+
+      vp9_clearall_segfeatures(xd);
+
+      // For each segment...
+      for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+        // For each of the segment's features...
+        for (j = 0; j < SEG_LVL_MAX; j++) {
+          // Is the feature enabled
+          if (vp9_read_bit(&header_bc)) {
+            // Update the feature data and mask
+            vp9_enable_segfeature(xd, i, j);
+
+            data = (signed char)vp9_read_literal(
+                     &header_bc, vp9_seg_feature_data_bits(j));
+
+            // Is the segment data signed?
+            if (vp9_is_segfeature_signed(j)) {
+              if (vp9_read_bit(&header_bc))
+                data = -data;
+            }
+          } else
+            data = 0;
+
+          vp9_set_segdata(xd, i, j, data);
+        }
+      }
+    }
+  }
+
+  // Read common prediction model status flag probability updates for the
+  // reference frame
+  if (pc->frame_type == KEY_FRAME) {
+    // Set the prediction probabilities to defaults
+    pc->ref_pred_probs[0] = 120;
+    pc->ref_pred_probs[1] = 80;
+    pc->ref_pred_probs[2] = 40;
+  } else {
+    for (i = 0; i < PREDICTION_PROBS; i++) {
+      if (vp9_read_bit(&header_bc))
+        pc->ref_pred_probs[i] = (vp9_prob)vp9_read_literal(&header_bc, 8);
+    }
+  }
+
+#if CONFIG_SUPERBLOCKS
+  pc->sb_coded = vp9_read_literal(&header_bc, 8);
+#endif
+
+  /* Read the loop filter level and type */
+  pc->txfm_mode = vp9_read_literal(&header_bc, 2);
+  if (pc->txfm_mode == TX_MODE_SELECT) {
+    pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);
+    pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);
+  }
+
+  pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(&header_bc);
+  pc->filter_level = vp9_read_literal(&header_bc, 6);
+  pc->sharpness_level = vp9_read_literal(&header_bc, 3);
+
+  /* Read in loop filter deltas applied at the MB level based on mode or ref frame. */
+  xd->mode_ref_lf_delta_update = 0;
+  xd->mode_ref_lf_delta_enabled = (unsigned char)vp9_read_bit(&header_bc);
+
+  if (xd->mode_ref_lf_delta_enabled) {
+    /* Do the deltas need to be updated */
+    xd->mode_ref_lf_delta_update = (unsigned char)vp9_read_bit(&header_bc);
+
+    if (xd->mode_ref_lf_delta_update) {
+      /* Send update */
+      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
+        if (vp9_read_bit(&header_bc)) {
+          /*sign = vp9_read_bit( &header_bc );*/
+          xd->ref_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);
+
+          if (vp9_read_bit(&header_bc))        /* Apply sign */
+            xd->ref_lf_deltas[i] = -xd->ref_lf_deltas[i];
+        }
+      }
+
+      /* Send update */
+      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+        if (vp9_read_bit(&header_bc)) {
+          /*sign = vp9_read_bit( &header_bc );*/
+          xd->mode_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);
+
+          if (vp9_read_bit(&header_bc))        /* Apply sign */
+            xd->mode_lf_deltas[i] = -xd->mode_lf_deltas[i];
+        }
+      }
+    }
+  }
+
+  // Dummy read for now
+  vp9_read_literal(&header_bc, 2);
+
+  setup_token_decoder(pbi, data + first_partition_length_in_bytes,
+                      &residual_bc);
+
+  /* Read the default quantizers. */
+  {
+    int Q, q_update;
+
+    Q = vp9_read_literal(&header_bc, QINDEX_BITS);
+    pc->base_qindex = Q;
+    q_update = 0;
+    /* Luma AC uses the base index directly; read deltas for the rest */
+    pc->y1dc_delta_q = get_delta_q(&header_bc, pc->y1dc_delta_q, &q_update);
+    pc->y2dc_delta_q = get_delta_q(&header_bc, pc->y2dc_delta_q, &q_update);
+    pc->y2ac_delta_q = get_delta_q(&header_bc, pc->y2ac_delta_q, &q_update);
+    pc->uvdc_delta_q = get_delta_q(&header_bc, pc->uvdc_delta_q, &q_update);
+    pc->uvac_delta_q = get_delta_q(&header_bc, pc->uvac_delta_q, &q_update);
+
+    if (q_update)
+      vp9_init_de_quantizer(pbi);
+
+    /* MB level dequantizer setup */
+    mb_init_dequantizer(pbi, &pbi->mb);
+  }
+
+  /* Determine if the golden frame or ARF buffer should be updated and how.
+   * For all non key frames the GF and ARF refresh flags and sign bias
+   * flags must be set explicitly.
+   */
+  if (pc->frame_type != KEY_FRAME) {
+    /* Should the GF or ARF be updated from the current frame */
+    pc->refresh_golden_frame = vp9_read_bit(&header_bc);
+    pc->refresh_alt_ref_frame = vp9_read_bit(&header_bc);
+
+    if (pc->refresh_alt_ref_frame) {
+      vpx_memcpy(&pc->fc, &pc->lfc_a, sizeof(pc->fc));
+      vpx_memcpy(pc->fc.vp8_mode_contexts,
+                 pc->fc.mode_context_a,
+                 sizeof(pc->fc.vp8_mode_contexts));
+    } else {
+      vpx_memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
+      vpx_memcpy(pc->fc.vp8_mode_contexts,
+                 pc->fc.mode_context,
+                 sizeof(pc->fc.vp8_mode_contexts));
+    }
+
+    /* Buffer to buffer copy flags. */
+    pc->copy_buffer_to_gf = 0;
+
+    if (!pc->refresh_golden_frame)
+      pc->copy_buffer_to_gf = vp9_read_literal(&header_bc, 2);
+
+    pc->copy_buffer_to_arf = 0;
+
+    if (!pc->refresh_alt_ref_frame)
+      pc->copy_buffer_to_arf = vp9_read_literal(&header_bc, 2);
+
+    pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp9_read_bit(&header_bc);
+    pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc);
+
+    /* Is high precision mv allowed */
+    xd->allow_high_precision_mv = (unsigned char)vp9_read_bit(&header_bc);
+    // Read the type of subpel filter to use
+    if (vp9_read_bit(&header_bc)) {
+      pc->mcomp_filter_type = SWITCHABLE;
+    } else {
+      pc->mcomp_filter_type = vp9_read_literal(&header_bc, 2);
+    }
+    /* To enable choice of different interpolation filters */
+    vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
+  }
+
+  pc->refresh_entropy_probs = vp9_read_bit(&header_bc);
+  if (pc->refresh_entropy_probs == 0) {
+    vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+  }
+
+  pc->refresh_last_frame = (pc->frame_type == KEY_FRAME)
+                           || vp9_read_bit(&header_bc);
+
+  if (0) {
+    FILE *z = fopen("decodestats.stt", "a");
+    fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
+            pc->current_video_frame,
+            pc->frame_type,
+            pc->refresh_golden_frame,
+            pc->refresh_alt_ref_frame,
+            pc->refresh_last_frame,
+            pc->base_qindex);
+    fclose(z);
+  }
+
+  vp9_copy(pbi->common.fc.pre_coef_probs,
+           pbi->common.fc.coef_probs);
+  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs,
+           pbi->common.fc.hybrid_coef_probs);
+  vp9_copy(pbi->common.fc.pre_coef_probs_8x8,
+           pbi->common.fc.coef_probs_8x8);
+  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_8x8,
+           pbi->common.fc.hybrid_coef_probs_8x8);
+  vp9_copy(pbi->common.fc.pre_coef_probs_16x16,
+           pbi->common.fc.coef_probs_16x16);
+  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_16x16,
+           pbi->common.fc.hybrid_coef_probs_16x16);
+  vp9_copy(pbi->common.fc.pre_ymode_prob, pbi->common.fc.ymode_prob);
+  vp9_copy(pbi->common.fc.pre_uv_mode_prob, pbi->common.fc.uv_mode_prob);
+  vp9_copy(pbi->common.fc.pre_bmode_prob, pbi->common.fc.bmode_prob);
+  vp9_copy(pbi->common.fc.pre_i8x8_mode_prob, pbi->common.fc.i8x8_mode_prob);
+  vp9_copy(pbi->common.fc.pre_sub_mv_ref_prob, pbi->common.fc.sub_mv_ref_prob);
+  vp9_copy(pbi->common.fc.pre_mbsplit_prob, pbi->common.fc.mbsplit_prob);
+  pbi->common.fc.pre_nmvc = pbi->common.fc.nmvc;
+  vp9_zero(pbi->common.fc.coef_counts);
+  vp9_zero(pbi->common.fc.hybrid_coef_counts);
+  vp9_zero(pbi->common.fc.coef_counts_8x8);
+  vp9_zero(pbi->common.fc.hybrid_coef_counts_8x8);
+  vp9_zero(pbi->common.fc.coef_counts_16x16);
+  vp9_zero(pbi->common.fc.hybrid_coef_counts_16x16);
+  vp9_zero(pbi->common.fc.ymode_counts);
+  vp9_zero(pbi->common.fc.uv_mode_counts);
+  vp9_zero(pbi->common.fc.bmode_counts);
+  vp9_zero(pbi->common.fc.i8x8_mode_counts);
+  vp9_zero(pbi->common.fc.sub_mv_ref_counts);
+  vp9_zero(pbi->common.fc.mbsplit_counts);
+  vp9_zero(pbi->common.fc.NMVcount);
+  vp9_zero(pbi->common.fc.mv_ref_ct);
+  vp9_zero(pbi->common.fc.mv_ref_ct_a);
+
+  read_coef_probs(pbi, &header_bc);
+
+  vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->lst_fb_idx], sizeof(YV12_BUFFER_CONFIG));
+  vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));
+
+  // Create the segmentation map structure and set to 0
+  if (!pc->last_frame_seg_map)
+    CHECK_MEM_ERROR(pc->last_frame_seg_map,
+                    vpx_calloc((pc->mb_rows * pc->mb_cols), 1));
+
+  /* set up the new frame for intra coded blocks */
+  vp9_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);
+
+  vp9_setup_block_dptrs(xd);
+
+  vp9_build_block_doffsets(xd);
+
+  /* clear out the coeff buffer */
+  vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+
+  /* Read the mb_no_coeff_skip flag */
+  pc->mb_no_coeff_skip = (int)vp9_read_bit(&header_bc);
+
+  vp9_decode_mode_mvs_init(pbi, &header_bc);
+
+  vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
+
+  // Reset the macroblock mode info context to the start of the list
+  xd->mode_info_context = pc->mi;
+  xd->prev_mode_info_context = pc->prev_mi;
+
+  /* Decode a row of superblocks */
+  for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 2) {
+    decode_sb_row(pbi, pc, mb_row, xd, &residual_bc);
+  }
+  corrupt_tokens |= xd->corrupted;
+
+  /* Collect information about decoder corruption. */
+  /* 1. Check first boolean decoder for errors. */
+  pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc);
+  /* 2. Check the macroblock information */
+  pc->yv12_fb[pc->new_fb_idx].corrupted |= corrupt_tokens;
+
+  if (!pbi->decoded_key_frame) {
+    if (pc->frame_type == KEY_FRAME &&
+        !pc->yv12_fb[pc->new_fb_idx].corrupted)
+      pbi->decoded_key_frame = 1;
+    else
+      vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
+                         "A stream must start with a complete key frame");
+  }
+
+  vp9_adapt_coef_probs(pc);
+  if (pc->frame_type != KEY_FRAME) {
+    vp9_adapt_mode_probs(pc);
+    vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
+    vp9_update_mode_context(&pbi->common);
+  }
+
+  /* If this was a kf or Gf note the Q used */
+  if ((pc->frame_type == KEY_FRAME) ||
+      pc->refresh_golden_frame || pc->refresh_alt_ref_frame) {
+    pc->last_kf_gf_q = pc->base_qindex;
+  }
+  if (pc->refresh_entropy_probs) {
+    if (pc->refresh_alt_ref_frame)
+      vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc));
+    else
+      vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+  }
+
+#ifdef PACKET_TESTING
+  {
+    FILE *f = fopen("decompressor.VP8", "ab");
+    unsigned int size = residual_bc.pos + header_bc.pos + 8;
+    fwrite((void *) &size, 4, 1, f);
+    fwrite((void *) pbi->Source, size, 1, f);
+    fclose(f);
+  }
+#endif
+  // printf("Frame %d Done\n", frame_count++);
+
+  return 0;
+}
--- /dev/null
+++ b/vp9/decoder/dequantize.c
@@ -1,0 +1,543 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "dequantize.h"
+#include "vp9/common/idct.h"
+#include "vpx_mem/vpx_mem.h"
+#include "onyxd_int.h"
+
+extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch);
+extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+extern void vp9_short_idct8x8_c(short *input, short *output, int pitch);
+extern void vp9_short_idct8x8_1_c(short *input, short *output, int pitch);
+
+#if CONFIG_LOSSLESS
+extern void vp9_short_inv_walsh4x4_x8_c(short *input, short *output,
+                                        int pitch);
+extern void vp9_short_inv_walsh4x4_1_x8_c(short *input, short *output,
+                                          int pitch);
+#endif
+
+#ifdef DEC_DEBUG
+extern int dec_debug;
+#endif
+
+void vp9_dequantize_b_c(BLOCKD *d) {
+  int i;
+  short *DQ  = d->dqcoeff;
+  short *Q   = d->qcoeff;
+  short *DQC = d->dequant;
+
+  for (i = 0; i < 16; i++) {
+    DQ[i] = Q[i] * DQC[i];
+  }
+}
+
+
+void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
+                               unsigned char *pred, unsigned char *dest,
+                               int pitch, int stride) {
+  short output[16];
+  short *diff_ptr = output;
+  int r, c;
+  int i;
+
+  for (i = 0; i < 16; i++) {
+    input[i] = dq[i] * input[i];
+  }
+
+  vp9_ihtllm_c(input, output, 4 << 1, tx_type, 4);
+
+  vpx_memset(input, 0, 32);
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = diff_ptr[c] + pred[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dest[c] = (unsigned char) a;
+    }
+
+    dest += stride;
+    diff_ptr += 4;
+    pred += pitch;
+  }
+}
+
+void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
+                                   unsigned char *pred, unsigned char *dest,
+                                   int pitch, int stride) {
+  short output[64];
+  short *diff_ptr = output;
+  int b, r, c;
+  int i;
+  unsigned char *origdest = dest;
+  unsigned char *origpred = pred;
+
+  input[0] = dq[0] * input[0];
+  for (i = 1; i < 64; i++) {
+    input[i] = dq[1] * input[i];
+  }
+
+  vp9_ihtllm_c(input, output, 16, tx_type, 8);
+
+  vpx_memset(input, 0, 128);
+
+  for (b = 0; b < 4; b++) {
+    for (r = 0; r < 4; r++) {
+      for (c = 0; c < 4; c++) {
+        int a = diff_ptr[c] + pred[c];
+
+        if (a < 0)
+          a = 0;
+
+        if (a > 255)
+          a = 255;
+
+        dest[c] = (unsigned char) a;
+      }
+
+      dest += stride;
+      diff_ptr += 8;
+      pred += pitch;
+    }
+    // shift buffer pointers to next 4x4 block in the submacroblock
+    diff_ptr = output + (b + 1) / 2 * 4 * 8 + ((b + 1) % 2) * 4;
+    dest = origdest + (b + 1) / 2 * 4 * stride + ((b + 1) % 2) * 4;
+    pred = origpred + (b + 1) / 2 * 4 * pitch + ((b + 1) % 2) * 4;
+  }
+}
+
+void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
+                            unsigned char *dest, int pitch, int stride) {
+  short output[16];
+  short *diff_ptr = output;
+  int r, c;
+  int i;
+
+  for (i = 0; i < 16; i++) {
+    input[i] = dq[i] * input[i];
+  }
+
+  /* the idct halves ( >> 1) the pitch */
+  vp9_short_idct4x4llm_c(input, output, 4 << 1);
+
+  vpx_memset(input, 0, 32);
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = diff_ptr[c] + pred[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dest[c] = (unsigned char) a;
+    }
+
+    dest += stride;
+    diff_ptr += 4;
+    pred += pitch;
+  }
+}
+
+void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
+                               unsigned char *dest, int pitch, int stride,
+                               int Dc) {
+  int i;
+  short output[16];
+  short *diff_ptr = output;
+  int r, c;
+
+  input[0] = (short)Dc;
+
+  for (i = 1; i < 16; i++) {
+    input[i] = dq[i] * input[i];
+  }
+
+  /* the idct halves ( >> 1) the pitch */
+  vp9_short_idct4x4llm_c(input, output, 4 << 1);
+
+  vpx_memset(input, 0, 32);
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = diff_ptr[c] + pred[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dest[c] = (unsigned char) a;
+    }
+
+    dest += stride;
+    diff_ptr += 4;
+    pred += pitch;
+  }
+}
+
+#if CONFIG_LOSSLESS
+void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
+                                     unsigned char *pred, unsigned char *dest,
+                                     int pitch, int stride) {
+  short output[16];
+  short *diff_ptr = output;
+  int r, c;
+  int i;
+
+  for (i = 0; i < 16; i++) {
+    input[i] = dq[i] * input[i];
+  }
+
+  vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
+
+  vpx_memset(input, 0, 32);
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = diff_ptr[c] + pred[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dest[c] = (unsigned char) a;
+    }
+
+    dest += stride;
+    diff_ptr += 4;
+    pred += pitch;
+  }
+}
+
+void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
+                                        unsigned char *pred,
+                                        unsigned char *dest,
+                                        int pitch, int stride, int dc) {
+  int i;
+  short output[16];
+  short *diff_ptr = output;
+  int r, c;
+
+  input[0] = (short)dc;
+
+  for (i = 1; i < 16; i++) {
+    input[i] = dq[i] * input[i];
+  }
+
+  vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
+  vpx_memset(input, 0, 32);
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = diff_ptr[c] + pred[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dest[c] = (unsigned char) a;
+    }
+
+    dest += stride;
+    diff_ptr += 4;
+    pred += pitch;
+  }
+}
+#endif
+
+void vp9_dequantize_b_2x2_c(BLOCKD *d) {
+  int i;
+  short *DQ  = d->dqcoeff;
+  short *Q   = d->qcoeff;
+  short *DQC = d->dequant;
+
+  for (i = 0; i < 16; i++) {
+    DQ[i] = (short)(Q[i] * DQC[i]);
+  }
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int j;
+    printf("Dequantize 2x2\n");
+    for (j = 0; j < 16; j++) printf("%d ", Q[j]);
+    printf("\n");
+    for (j = 0; j < 16; j++) printf("%d ", DQ[j]);
+    printf("\n");
+  }
+#endif
+}
+
+void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
+                                unsigned char *dest, int pitch, int stride) {
+  short output[64];
+  short *diff_ptr = output;
+  int r, c, b;
+  int i;
+  unsigned char *origdest = dest;
+  unsigned char *origpred = pred;
+
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int j;
+    printf("Input 8x8\n");
+    for (j = 0; j < 64; j++) {
+      printf("%d ", input[j]);
+      if (j % 8 == 7) printf("\n");
+    }
+  }
+#endif
+
+  input[0] = input[0] * dq[0];
+
+  // dequantize the AC coefficients (dq[1] applies to all but the DC term)
+  for (i = 1; i < 64; i++) {
+    input[i] = input[i] * dq[1];
+  }
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int j;
+    printf("Input DQ 8x8\n");
+    for (j = 0; j < 64; j++) {
+      printf("%d ", input[j]);
+      if (j % 8 == 7) printf("\n");
+    }
+  }
+#endif
+
+  // the idct halves ( >> 1) the pitch
+  vp9_short_idct8x8_c(input, output, 16);
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int j;
+    printf("Output 8x8\n");
+    for (j = 0; j < 64; j++) {
+      printf("%d ", output[j]);
+      if (j % 8 == 7) printf("\n");
+    }
+  }
+#endif
+
+  vpx_memset(input, 0, 128);  /* clear all 64 16-bit coefficients */
+
+  for (b = 0; b < 4; b++) {
+    for (r = 0; r < 4; r++) {
+      for (c = 0; c < 4; c++) {
+        int a = diff_ptr[c] + pred[c];
+
+        if (a < 0)
+          a = 0;
+
+        if (a > 255)
+          a = 255;
+
+        dest[c] = (unsigned char) a;
+      }
+
+      dest += stride;
+      diff_ptr += 8;
+      pred += pitch;
+    }
+    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
+    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
+    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
+  }
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int k, j;
+    printf("Final 8x8\n");
+    for (j = 0; j < 8; j++) {
+      for (k = 0; k < 8; k++) {
+        printf("%d ", origdest[k]);
+      }
+      printf("\n");
+      origdest += stride;
+    }
+  }
+#endif
+}
+
+void vp9_dequant_dc_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
+                                   unsigned char *dest, int pitch, int stride,
+                                   int Dc) { // Dc for the 1st order transform in some rare cases
+  short output[64];
+  short *diff_ptr = output;
+  int r, c, b;
+  int i;
+  unsigned char *origdest = dest;
+  unsigned char *origpred = pred;
+
+  input[0] = (short)Dc;  /* Dc is already the reconstructed value and needs
+                          * no dequantization here */
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int j;
+    printf("Input 8x8\n");
+    for (j = 0; j < 64; j++) {
+      printf("%d ", input[j]);
+      if (j % 8 == 7) printf("\n");
+    }
+  }
+#endif
+  for (i = 1; i < 64; i++) {
+    input[i] = input[i] * dq[1];
+  }
+
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int j;
+    printf("Input DQ 8x8\n");
+    for (j = 0; j < 64; j++) {
+      printf("%d ", input[j]);
+      if (j % 8 == 7) printf("\n");
+    }
+  }
+#endif
+
+  // the idct halves ( >> 1) the pitch
+  vp9_short_idct8x8_c(input, output, 16);
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int j;
+    printf("Output 8x8\n");
+    for (j = 0; j < 64; j++) {
+      printf("%d ", output[j]);
+      if (j % 8 == 7) printf("\n");
+    }
+  }
+#endif
+  vpx_memset(input, 0, 128);
+
+  for (b = 0; b < 4; b++) {
+    for (r = 0; r < 4; r++) {
+      for (c = 0; c < 4; c++) {
+        int a = diff_ptr[c] + pred[c];
+
+        if (a < 0)
+          a = 0;
+
+        if (a > 255)
+          a = 255;
+
+        dest[c] = (unsigned char) a;
+      }
+
+      dest += stride;
+      diff_ptr += 8;
+      pred += pitch;
+    }
+    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
+    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
+    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
+  }
+#ifdef DEC_DEBUG
+  if (dec_debug) {
+    int k, j;
+    printf("Final 8x8\n");
+    for (j = 0; j < 8; j++) {
+      for (k = 0; k < 8; k++) {
+        printf("%d ", origdest[k]);
+      }
+      printf("\n");
+      origdest += stride;
+    }
+  }
+#endif
+}
+
+void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,
+                                     unsigned char *pred, unsigned char *dest,
+                                     int pitch, int stride) {
+  short output[256];
+  short *diff_ptr = output;
+  int r, c, i;
+
+  input[0] = input[0] * dq[0];
+
+  // dequantize the AC coefficients (dq[1] applies to all but the DC term)
+  for (i = 1; i < 256; i++)
+    input[i] = input[i] * dq[1];
+
+  // inverse hybrid transform
+  vp9_ihtllm_c(input, output, 32, tx_type, 16);
+
+  // the idct halves ( >> 1) the pitch
+  // vp9_short_idct16x16_c(input, output, 32);
+
+  vpx_memset(input, 0, 512);
+
+  for (r = 0; r < 16; r++) {
+    for (c = 0; c < 16; c++) {
+      int a = diff_ptr[c] + pred[c];
+
+      if (a < 0)
+        a = 0;
+      else if (a > 255)
+        a = 255;
+
+      dest[c] = (unsigned char) a;
+    }
+
+    dest += stride;
+    diff_ptr += 16;
+    pred += pitch;
+  }
+}
+
+void vp9_dequant_idct_add_16x16_c(short *input, short *dq, unsigned char *pred,
+                                  unsigned char *dest, int pitch, int stride) {
+  short output[256];
+  short *diff_ptr = output;
+  int r, c, i;
+
+  input[0] = input[0] * dq[0];
+
+  // dequantize the AC coefficients (dq[1] applies to all but the DC term)
+  for (i = 1; i < 256; i++)
+    input[i] = input[i] * dq[1];
+
+  // the idct halves ( >> 1) the pitch
+  vp9_short_idct16x16_c(input, output, 32);
+
+  vpx_memset(input, 0, 512);
+
+  for (r = 0; r < 16; r++) {
+    for (c = 0; c < 16; c++) {
+      int a = diff_ptr[c] + pred[c];
+
+      if (a < 0)
+        a = 0;
+      else if (a > 255)
+        a = 255;
+
+      dest[c] = (unsigned char) a;
+    }
+
+    dest += stride;
+    diff_ptr += 16;
+    pred += pitch;
+  }
+}
--- /dev/null
+++ b/vp9/decoder/dequantize.h
@@ -1,0 +1,78 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DEQUANTIZE_H
+#define DEQUANTIZE_H
+#include "vp9/common/blockd.h"
+
+#if CONFIG_LOSSLESS
+extern void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
+                                            unsigned char *pred,
+                                            unsigned char *output,
+                                            int pitch, int stride);
+extern void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
+                                               unsigned char *pred,
+                                               unsigned char *output,
+                                               int pitch, int stride, int dc);
+extern void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q, short *dq,
+                                                       unsigned char *pre,
+                                                       unsigned char *dst,
+                                                       int stride, char *eobs,
+                                                       short *dc);
+extern void vp9_dequant_idct_add_y_block_lossless_c(short *q, short *dq,
+                                                    unsigned char *pre,
+                                                    unsigned char *dst,
+                                                    int stride, char *eobs);
+extern void vp9_dequant_idct_add_uv_block_lossless_c(short *q, short *dq,
+                                                     unsigned char *pre,
+                                                     unsigned char *dst_u,
+                                                     unsigned char *dst_v,
+                                                     int stride, char *eobs);
+#endif
+
+typedef void (*vp9_dequant_idct_add_fn_t)(short *input, short *dq,
+    unsigned char *pred, unsigned char *output, int pitch, int stride);
+typedef void(*vp9_dequant_dc_idct_add_fn_t)(short *input, short *dq,
+    unsigned char *pred, unsigned char *output, int pitch, int stride, int dc);
+
+typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(short *q, short *dq,
+    unsigned char *pre, unsigned char *dst, int stride, char *eobs, short *dc);
+typedef void(*vp9_dequant_idct_add_y_block_fn_t)(short *q, short *dq,
+    unsigned char *pre, unsigned char *dst, int stride, char *eobs);
+typedef void(*vp9_dequant_idct_add_uv_block_fn_t)(short *q, short *dq,
+    unsigned char *pre, unsigned char *dst_u, unsigned char *dst_v, int stride,
+    char *eobs);
+
+void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
+                                    unsigned char *pred, unsigned char *dest,
+                                    int pitch, int stride);
+
+void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
+                                   unsigned char *pred, unsigned char *dest,
+                                   int pitch, int stride);
+
+void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,
+                                     unsigned char *pred, unsigned char *dest,
+                                     int pitch, int stride);
+
+#if CONFIG_SUPERBLOCKS
+void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, short *dq,
+                                                   unsigned char *dst,
+                                                   int stride, char *eobs,
+                                                   short *dc, MACROBLOCKD *xd);
+void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq,
+                                                 unsigned char *dstu,
+                                                 unsigned char *dstv,
+                                                 int stride, char *eobs,
+                                                 MACROBLOCKD *xd);
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/decoder/detokenize.c
@@ -1,0 +1,640 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/type_aliases.h"
+#include "vp9/common/blockd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "detokenize.h"
+
+#include "vp9/common/seg_common.h"
+
+#define BOOL_DATA UINT8
+
+#define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES
+
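+/* The coef_bands_x tables below store each coefficient band pre-multiplied
+ * by OCB_X, so a band value can be added directly to a pointer into the
+ * flattened probability tables (one band spans PREV_COEF_CONTEXTS *
+ * ENTROPY_NODES entries).
+ */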
+DECLARE_ALIGNED(16, static const int, coef_bands_x[16]) = {
+  0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X,
+  6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X,
+  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+  6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X
+};
+DECLARE_ALIGNED(16, static const int, coef_bands_x_8x8[64]) = {
+  0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 4 * OCB_X, 5 * OCB_X,
+  5 * OCB_X, 3 * OCB_X, 6 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+  6 * OCB_X, 5 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+};
+
+DECLARE_ALIGNED(16, static const int, coef_bands_x_16x16[256]) = {
+  0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 4 * OCB_X, 5 * OCB_X, 5 * OCB_X, 3 * OCB_X, 6 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+  6 * OCB_X, 5 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,
+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X
+};
+
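+/* Positions of the nodes in the coefficient token tree, used to index the
+ * per-band probability arrays in decode_coefs().
+ */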
+#define EOB_CONTEXT_NODE            0
+#define ZERO_CONTEXT_NODE           1
+#define ONE_CONTEXT_NODE            2
+#define LOW_VAL_CONTEXT_NODE        3
+#define TWO_CONTEXT_NODE            4
+#define THREE_CONTEXT_NODE          5
+#define HIGH_LOW_CONTEXT_NODE       6
+#define CAT_ONE_CONTEXT_NODE        7
+#define CAT_THREEFOUR_CONTEXT_NODE  8
+#define CAT_THREE_CONTEXT_NODE      9
+#define CAT_FIVE_CONTEXT_NODE       10
+
+#define CAT1_MIN_VAL    5
+#define CAT2_MIN_VAL    7
+#define CAT3_MIN_VAL   11
+#define CAT4_MIN_VAL   19
+#define CAT5_MIN_VAL   35
+#define CAT6_MIN_VAL   67
+#define CAT1_PROB0    159
+#define CAT2_PROB0    145
+#define CAT2_PROB1    165
+
+#define CAT3_PROB0 140
+#define CAT3_PROB1 148
+#define CAT3_PROB2 173
+
+#define CAT4_PROB0 135
+#define CAT4_PROB1 140
+#define CAT4_PROB2 155
+#define CAT4_PROB3 176
+
+#define CAT5_PROB0 130
+#define CAT5_PROB1 134
+#define CAT5_PROB2 141
+#define CAT5_PROB3 157
+#define CAT5_PROB4 180
+
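+/* Extra-bit probabilities for value category 6; the trailing zero
+ * terminates the bit-reading loop in decode_coefs().
+ */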
+static const unsigned char cat6_prob[14] =
+{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
+
+void vp9_reset_mb_tokens_context(MACROBLOCKD *xd) {
+  /* Clear the per-MB entropy contexts. When the mode has no Y2 block
+   * (B_PRED, I8X8_PRED or SPLITMV, without a 16x16 transform), the else
+   * branch leaves the Y2 context untouched. */
+  if ((xd->mode_info_context->mbmi.mode != B_PRED &&
+       xd->mode_info_context->mbmi.mode != I8X8_PRED &&
+       xd->mode_info_context->mbmi.mode != SPLITMV) ||
+      xd->mode_info_context->mbmi.txfm_size == TX_16X16) {
+    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+  } else {
+    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
+    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
+  }
+}
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
+
+// #define PREV_CONTEXT_INC(val) (2+((val)>2))
+// #define PREV_CONTEXT_INC(val) (vp9_prev_token_class[(val)])
+#define PREV_CONTEXT_INC(val) (vp9_prev_token_class[(val)>10?10:(val)])
+
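+/* Map an absolute coefficient magnitude to its token: literal tokens for
+ * 0..4, then value categories with minimum magnitudes 5, 7, 11, 19, 35
+ * and 67.
+ */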
+static int get_token(int v) {
+  if (v < 0) v = -v;
+  if (v == 0) return ZERO_TOKEN;
+  else if (v == 1) return ONE_TOKEN;
+  else if (v == 2) return TWO_TOKEN;
+  else if (v == 3) return THREE_TOKEN;
+  else if (v == 4) return FOUR_TOKEN;
+  else if (v <= 6) return DCT_VAL_CATEGORY1;
+  else if (v <= 10) return DCT_VAL_CATEGORY2;
+  else if (v <= 18) return DCT_VAL_CATEGORY3;
+  else if (v <= 34) return DCT_VAL_CATEGORY4;
+  else if (v <= 66) return DCT_VAL_CATEGORY5;
+  else return DCT_VAL_CATEGORY6;
+}
+
+static void count_tokens_adaptive_scan(const MACROBLOCKD *xd, INT16 *qcoeff_ptr,
+                                       int block, PLANE_TYPE type,
+                                       TX_TYPE tx_type,
+                                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                                       int eob, int seg_eob,
+                                       FRAME_CONTEXT *fc) {
+  int c, pt, token, band;
+  const int *scan;
+
+  switch (tx_type) {
+    case ADST_DCT:
+      scan = vp9_row_scan;
+      break;
+
+    case DCT_ADST:
+      scan = vp9_col_scan;
+      break;
+
+    default:
+      scan = vp9_default_zig_zag1d;
+      break;
+  }
+
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  for (c = !type; c < eob; ++c) {
+    int rc = scan[c];
+    int v = qcoeff_ptr[rc];
+    band = vp9_coef_bands[c];
+    token = get_token(v);
+    if (tx_type != DCT_DCT)
+      fc->hybrid_coef_counts[type][band][pt][token]++;
+    else
+      fc->coef_counts[type][band][pt][token]++;
+    pt = vp9_prev_token_class[token];
+  }
+
+  if (eob < seg_eob) {
+    band = vp9_coef_bands[c];
+    if (tx_type != DCT_DCT)
+      fc->hybrid_coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
+    else
+      fc->coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
+  }
+}
+
+static void count_tokens(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
+                         ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                         int eob, int seg_eob, FRAME_CONTEXT *const fc) {
+  int c, pt, token, band;
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  for (c = !type; c < eob; ++c) {
+    int rc = vp9_default_zig_zag1d[c];
+    int v = qcoeff_ptr[rc];
+    band = vp9_coef_bands[c];
+    token = get_token(v);
+    fc->coef_counts[type][band][pt][token]++;
+    pt = vp9_prev_token_class[token];
+  }
+  if (eob < seg_eob) {
+    band = vp9_coef_bands[c];
+    fc->coef_counts[type][band][pt][DCT_EOB_TOKEN]++;
+  }
+}
+
+static void count_tokens_8x8(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
+                             TX_TYPE tx_type,
+                             ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                             int eob, int seg_eob, FRAME_CONTEXT *fc) {
+  int c, pt, token, band;
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  for (c = !type; c < eob; ++c) {
+    int rc = (type == 1 ? vp9_default_zig_zag1d[c] : vp9_default_zig_zag1d_8x8[c]);
+    int v = qcoeff_ptr[rc];
+    band = (type == 1 ? vp9_coef_bands[c] : vp9_coef_bands_8x8[c]);
+    token = get_token(v);
+    if (tx_type != DCT_DCT)
+      fc->hybrid_coef_counts_8x8[type][band][pt][token]++;
+    else
+      fc->coef_counts_8x8[type][band][pt][token]++;
+    pt = vp9_prev_token_class[token];
+  }
+  if (eob < seg_eob) {
+    band = (type == 1 ? vp9_coef_bands[c] : vp9_coef_bands_8x8[c]);
+    if (tx_type != DCT_DCT)
+      fc->hybrid_coef_counts_8x8[type][band][pt][DCT_EOB_TOKEN]++;
+    else
+      fc->coef_counts_8x8[type][band][pt][DCT_EOB_TOKEN]++;
+  }
+}
+
+static void count_tokens_16x16(INT16 *qcoeff_ptr, int block, PLANE_TYPE type,
+                               TX_TYPE tx_type,
+                               ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                               int eob, int seg_eob, FRAME_CONTEXT *fc) {
+  int c, pt, token;
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  for (c = !type; c < eob; ++c) {
+    int rc = vp9_default_zig_zag1d_16x16[c];
+    int v = qcoeff_ptr[rc];
+    int band = vp9_coef_bands_16x16[c];
+    token = get_token(v);
+    if (tx_type != DCT_DCT)
+      fc->hybrid_coef_counts_16x16[type][band][pt][token]++;
+    else
+      fc->coef_counts_16x16[type][band][pt][token]++;
+    pt = vp9_prev_token_class[token];
+  }
+  if (eob < seg_eob) {
+    int band = vp9_coef_bands_16x16[c];
+    if (tx_type != DCT_DCT)
+      fc->hybrid_coef_counts_16x16[type][band][pt][DCT_EOB_TOKEN]++;
+    else
+      fc->coef_counts_16x16[type][band][pt][DCT_EOB_TOKEN]++;
+  }
+}
+
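+/* Read a sign bit with probability one half (the range splits evenly) and
+ * return value_to_sign with that sign applied.
+ */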
+static int get_signed(BOOL_DECODER *br, int value_to_sign) {
+  const int split = (br->range + 1) >> 1;
+  const VP9_BD_VALUE bigsplit = (VP9_BD_VALUE)split << (VP9_BD_VALUE_SIZE - 8);
+  int v;
+
+  if (br->count < 0)
+    vp9_bool_decoder_fill(br);
+
+  if (br->value < bigsplit) {
+    br->range = split;
+    v = value_to_sign;
+  } else {
+    br->range = br->range - split;
+    br->value = br->value - bigsplit;
+    v = -value_to_sign;
+  }
+  br->range += br->range;
+  br->value += br->value;
+  --br->count;
+
+  return v;
+}
+
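+/* Store a decoded coefficient: read its sign, write it at the current scan
+ * position, select the probability set implied by its magnitude for the
+ * next token, and advance to the next coefficient.
+ */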
+#define WRITE_COEF_CONTINUE(val)                              \
+  {                                                           \
+    prob = coef_probs + (ENTROPY_NODES*PREV_CONTEXT_INC(val));\
+    qcoeff_ptr[scan[c]] = (INT16) get_signed(br, val);        \
+    c++;                                                      \
+    continue;                                                 \
+  }
+
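+/* Read one extra magnitude bit with the given probability and fold it into
+ * val.
+ */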
+#define ADJUST_COEF(prob, bits_count)  \
+  do {                                 \
+    if (vp9_read(br, prob))            \
+      val += (UINT16)(1 << bits_count);\
+  } while (0);
+
+static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
+                        BOOL_DECODER* const br,
+                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                        PLANE_TYPE type,
+                        TX_TYPE tx_type,
+                        int seg_eob, INT16 *qcoeff_ptr, int i,
+                        const int *const scan, int block_type,
+                        const int *coef_bands) {
+  FRAME_CONTEXT *const fc = &dx->common.fc;
+  int tmp, c = (type == PLANE_TYPE_Y_NO_DC);
+  const vp9_prob *prob, *coef_probs;
+
+  switch (block_type) {
+    default:
+    case TX_4X4:
+      coef_probs =
+        tx_type != DCT_DCT ? fc->hybrid_coef_probs[type][0][0] :
+        fc->coef_probs[type][0][0];
+      break;
+    case TX_8X8:
+      coef_probs =
+        tx_type != DCT_DCT ? fc->hybrid_coef_probs_8x8[type][0][0] :
+        fc->coef_probs_8x8[type][0][0];
+      break;
+    case TX_16X16:
+      coef_probs =
+        tx_type != DCT_DCT ? fc->hybrid_coef_probs_16x16[type][0][0] :
+        fc->coef_probs_16x16[type][0][0];
+      break;
+  }
+
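+  /* Combine the above and left entropy contexts into the initial token
+   * context.
+   */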
+  VP9_COMBINEENTROPYCONTEXTS(tmp, *a, *l);
+  prob = coef_probs + tmp * ENTROPY_NODES;
+
+  while (1) {
+    int val;
+    const uint8_t *cat6 = cat6_prob;
+    if (c == seg_eob) break;
+    prob += coef_bands[c];
+    if (!vp9_read(br, prob[EOB_CONTEXT_NODE]))
+      break;
+SKIP_START:
+    if (c == seg_eob) break;
+    if (!vp9_read(br, prob[ZERO_CONTEXT_NODE])) {
+      ++c;
+      prob = coef_probs + coef_bands[c];
+      goto SKIP_START;
+    }
+    // ONE_CONTEXT_NODE_0_
+    if (!vp9_read(br, prob[ONE_CONTEXT_NODE])) {
+      prob = coef_probs + ENTROPY_NODES;
+      qcoeff_ptr[scan[c]] = (INT16) get_signed(br, 1);
+      ++c;
+      continue;
+    }
+    // LOW_VAL_CONTEXT_NODE_0_
+    if (!vp9_read(br, prob[LOW_VAL_CONTEXT_NODE])) {
+      if (!vp9_read(br, prob[TWO_CONTEXT_NODE])) {
+        WRITE_COEF_CONTINUE(2);
+      }
+      if (!vp9_read(br, prob[THREE_CONTEXT_NODE])) {
+        WRITE_COEF_CONTINUE(3);
+      }
+      WRITE_COEF_CONTINUE(4);
+    }
+    // HIGH_LOW_CONTEXT_NODE_0_
+    if (!vp9_read(br, prob[HIGH_LOW_CONTEXT_NODE])) {
+      if (!vp9_read(br, prob[CAT_ONE_CONTEXT_NODE])) {
+        val = CAT1_MIN_VAL;
+        ADJUST_COEF(CAT1_PROB0, 0);
+        WRITE_COEF_CONTINUE(val);
+      }
+      val = CAT2_MIN_VAL;
+      ADJUST_COEF(CAT2_PROB1, 1);
+      ADJUST_COEF(CAT2_PROB0, 0);
+      WRITE_COEF_CONTINUE(val);
+    }
+    // CAT_THREEFOUR_CONTEXT_NODE_0_
+    if (!vp9_read(br, prob[CAT_THREEFOUR_CONTEXT_NODE])) {
+      if (!vp9_read(br, prob[CAT_THREE_CONTEXT_NODE])) {
+        val = CAT3_MIN_VAL;
+        ADJUST_COEF(CAT3_PROB2, 2);
+        ADJUST_COEF(CAT3_PROB1, 1);
+        ADJUST_COEF(CAT3_PROB0, 0);
+        WRITE_COEF_CONTINUE(val);
+      }
+      val = CAT4_MIN_VAL;
+      ADJUST_COEF(CAT4_PROB3, 3);
+      ADJUST_COEF(CAT4_PROB2, 2);
+      ADJUST_COEF(CAT4_PROB1, 1);
+      ADJUST_COEF(CAT4_PROB0, 0);
+      WRITE_COEF_CONTINUE(val);
+    }
+    // CAT_FIVE_CONTEXT_NODE_0_:
+    if (!vp9_read(br, prob[CAT_FIVE_CONTEXT_NODE])) {
+      val = CAT5_MIN_VAL;
+      ADJUST_COEF(CAT5_PROB4, 4);
+      ADJUST_COEF(CAT5_PROB3, 3);
+      ADJUST_COEF(CAT5_PROB2, 2);
+      ADJUST_COEF(CAT5_PROB1, 1);
+      ADJUST_COEF(CAT5_PROB0, 0);
+      WRITE_COEF_CONTINUE(val);
+    }
+    val = 0;
+    while (*cat6) {
+      val = (val << 1) | vp9_read(br, *cat6++);
+    }
+    val += CAT6_MIN_VAL;
+    WRITE_COEF_CONTINUE(val);
+  }
+
+  if (block_type == TX_4X4) {
+    count_tokens_adaptive_scan(xd, qcoeff_ptr, i, type, tx_type,
+                               a, l, c, seg_eob, fc);
+  } else if (block_type == TX_8X8) {
+    count_tokens_8x8(qcoeff_ptr, i, type, tx_type,
+                     a, l, c, seg_eob, fc);
+  } else {
+    count_tokens_16x16(qcoeff_ptr, i, type, tx_type,
+                       a, l, c, seg_eob, fc);
+  }
+  return c;
+}
+
+int vp9_decode_mb_tokens_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                               BOOL_DECODER* const bc) {
+  ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;
+  ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;
+
+  char* const eobs = xd->eobs;
+  PLANE_TYPE type;
+  int c, i, eobtotal = 0, seg_eob;
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+  const int seg_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB);
+  INT16 *qcoeff_ptr = &xd->qcoeff[0];
+  TX_TYPE tx_type = get_tx_type(xd, &xd->block[0]);
+
+  type = PLANE_TYPE_Y_WITH_DC;
+
+  if (seg_active)
+    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+  else
+    seg_eob = 256;
+
+  // Luma block
+  {
+    const int* const scan = vp9_default_zig_zag1d_16x16;
+    c = decode_coefs(pbi, xd, bc, A, L, type,
+                     tx_type,
+                     seg_eob, qcoeff_ptr,
+                     0, scan, TX_16X16, coef_bands_x_16x16);
+    eobs[0] = c;
+    A[0] = L[0] = (c != !type);
+    A[1] = A[2] = A[3] = A[0];
+    L[1] = L[2] = L[3] = L[0];
+    eobtotal += c;
+  }
+
+  // 8x8 chroma blocks
+  qcoeff_ptr += 256;
+  type = PLANE_TYPE_UV;
+  tx_type = DCT_DCT;
+  if (seg_active)
+    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+  else
+    seg_eob = 64;
+  for (i = 16; i < 24; i += 4) {
+    ENTROPY_CONTEXT* const a = A + vp9_block2above_8x8[i];
+    ENTROPY_CONTEXT* const l = L + vp9_block2left_8x8[i];
+    const int* const scan = vp9_default_zig_zag1d_8x8;
+
+    c = decode_coefs(pbi, xd, bc, a, l, type,
+                     tx_type,
+                     seg_eob, qcoeff_ptr,
+                     i, scan, TX_8X8, coef_bands_x_8x8);
+    a[0] = l[0] = ((eobs[i] = c) != !type);
+    a[1] = a[0];
+    l[1] = l[0];
+
+    eobtotal += c;
+    qcoeff_ptr += 64;
+  }
+  vpx_memset(&A[8], 0, sizeof(A[8]));
+  vpx_memset(&L[8], 0, sizeof(L[8]));
+  return eobtotal;
+}
+
+int vp9_decode_mb_tokens_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                             BOOL_DECODER* const bc) {
+  ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
+  ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
+
+  char *const eobs = xd->eobs;
+  PLANE_TYPE type;
+  int c, i, eobtotal = 0, seg_eob;
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+  const int seg_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB);
+  INT16 *qcoeff_ptr = &xd->qcoeff[0];
+  TX_TYPE tx_type = DCT_DCT;
+
+  int bufthred = (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
+                  xd->mode_info_context->mbmi.mode == SPLITMV) ? 16 : 24;
+  if (xd->mode_info_context->mbmi.mode != B_PRED &&
+      xd->mode_info_context->mbmi.mode != SPLITMV &&
+      xd->mode_info_context->mbmi.mode != I8X8_PRED) {
+    ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[24];
+    ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[24];
+    const int *const scan = vp9_default_zig_zag1d;
+    type = PLANE_TYPE_Y2;
+
+    if (seg_active)
+      seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+    else
+      seg_eob = 4;
+    c = decode_coefs(pbi, xd, bc, a, l, type,
+                     tx_type,
+                     seg_eob, qcoeff_ptr + 24 * 16,
+                     24, scan, TX_8X8, coef_bands_x);
+    a[0] = l[0] = ((eobs[24] = c) != !type);
+
+    eobtotal += c - 4;
+
+    type = PLANE_TYPE_Y_NO_DC;
+  } else
+    type = PLANE_TYPE_Y_WITH_DC;
+
+  if (seg_active)
+    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+  else
+    seg_eob = 64;
+
+  for (i = 0; i < bufthred; i += 4) {
+    ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[i];
+    ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[i];
+    const int *const scan = vp9_default_zig_zag1d_8x8;
+    tx_type = DCT_DCT;
+
+    if (i == 16)
+      type = PLANE_TYPE_UV;
+    if (type == PLANE_TYPE_Y_WITH_DC) {
+      tx_type = get_tx_type(xd, xd->block + i);
+    }
+
+    c = decode_coefs(pbi, xd, bc, a, l, type,
+                     tx_type,
+                     seg_eob, qcoeff_ptr,
+                     i, scan, TX_8X8, coef_bands_x_8x8);
+    a[0] = l[0] = ((eobs[i] = c) != !type);
+    a[1] = a[0];
+    l[1] = l[0];
+
+    eobtotal += c;
+    qcoeff_ptr += 64;
+  }
+
+  if (bufthred == 16) {
+    type = PLANE_TYPE_UV;
+    tx_type = DCT_DCT;
+    seg_eob = 16;
+
+    // use 4x4 transform for U, V components in I8X8 prediction mode
+    for (i = 16; i < 24; i++) {
+      ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
+      ENTROPY_CONTEXT *const l = L + vp9_block2left[i];
+      const int *scan = vp9_default_zig_zag1d;
+
+      c = decode_coefs(pbi, xd, bc, a, l, type,
+                       tx_type,
+                       seg_eob, qcoeff_ptr,
+                       i, scan, TX_4X4, coef_bands_x);
+      a[0] = l[0] = ((eobs[i] = c) != !type);
+
+      eobtotal += c;
+      qcoeff_ptr += 16;
+    }
+  }
+
+  return eobtotal;
+}
+
+
+int vp9_decode_mb_tokens(VP9D_COMP *dx, MACROBLOCKD *xd,
+                         BOOL_DECODER* const bc) {
+  ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
+  ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
+
+  char *const eobs = xd->eobs;
+  const int *scan = vp9_default_zig_zag1d;
+  PLANE_TYPE type;
+  int c, i, eobtotal = 0, seg_eob = 16;
+  INT16 *qcoeff_ptr = &xd->qcoeff[0];
+
+  int segment_id = xd->mode_info_context->mbmi.segment_id;
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
+    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+
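+  // B_PRED, I8X8_PRED and SPLITMV have no second-order (Y2) block; all
+  // other modes gather the sixteen luma DCs into the 4x4 WHT block 24
+  // and decode the luma blocks as PLANE_TYPE_Y_NO_DC.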
+  if (xd->mode_info_context->mbmi.mode != B_PRED &&
+      xd->mode_info_context->mbmi.mode != I8X8_PRED &&
+      xd->mode_info_context->mbmi.mode != SPLITMV) {
+    ENTROPY_CONTEXT *const a = A + vp9_block2above[24];
+    ENTROPY_CONTEXT *const l = L + vp9_block2left[24];
+    type = PLANE_TYPE_Y2;
+
+    c = decode_coefs(dx, xd, bc, a, l, type,
+                     DCT_DCT,
+                     seg_eob, qcoeff_ptr + 24 * 16, 24,
+                     scan, TX_4X4, coef_bands_x);
+    a[0] = l[0] = ((eobs[24] = c) != !type);
+    eobtotal += c - 16;
+
+    type = PLANE_TYPE_Y_NO_DC;
+  } else {
+    type = PLANE_TYPE_Y_WITH_DC;
+  }
+
+  for (i = 0; i < 24; ++i) {
+    ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
+    ENTROPY_CONTEXT *const l = L + vp9_block2left[i];
+    TX_TYPE tx_type = DCT_DCT;
+    if (i == 16)
+      type = PLANE_TYPE_UV;
+
+    tx_type = get_tx_type(xd, &xd->block[i]);
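+    // Hybrid 1-D ADST transforms use a row or column scan in place of
+    // the default zig-zag.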
+    switch (tx_type) {
+      case ADST_DCT:
+        scan = vp9_row_scan;
+        break;
+      case DCT_ADST:
+        scan = vp9_col_scan;
+        break;
+      default:
+        scan = vp9_default_zig_zag1d;
+        break;
+    }
+
+    c = decode_coefs(dx, xd, bc, a, l, type, tx_type,
+                     seg_eob, qcoeff_ptr,
+                     i, scan, TX_4X4, coef_bands_x);
+    a[0] = l[0] = ((eobs[i] = c) != !type);
+
+    eobtotal += c;
+    qcoeff_ptr += 16;
+  }
+
+  return eobtotal;
+}
--- /dev/null
+++ b/vp9/decoder/detokenize.h
@@ -1,0 +1,25 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DETOKENIZE_H
+#define DETOKENIZE_H
+
+#include "onyxd_int.h"
+
+void vp9_reset_mb_tokens_context(MACROBLOCKD* const);
+int vp9_decode_mb_tokens(VP9D_COMP* const, MACROBLOCKD* const,
+                         BOOL_DECODER* const);
+int vp9_decode_mb_tokens_8x8(VP9D_COMP* const, MACROBLOCKD* const,
+                             BOOL_DECODER* const);
+int vp9_decode_mb_tokens_16x16(VP9D_COMP* const, MACROBLOCKD* const,
+                               BOOL_DECODER* const);
+
+#endif /* DETOKENIZE_H */
--- /dev/null
+++ b/vp9/decoder/idct_blk.c
@@ -1,0 +1,292 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "dequantize.h"
+
+void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
+                               unsigned char *dest, int pitch, int stride,
+                               int Dc);
+void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
+                            unsigned char *dest, int pitch, int stride);
+void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+                            unsigned char *dst_ptr, int pitch, int stride);
+#if CONFIG_LOSSLESS
+void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
+                                     unsigned char *pred, unsigned char *dest,
+                                     int pitch, int stride);
+void vp9_dc_only_idct_add_lossless_c(short input_dc, unsigned char *pred_ptr,
+                                     unsigned char *dst_ptr,
+                                     int pitch, int stride);
+#endif
+
+void vp9_dequant_dc_idct_add_y_block_c(short *q, short *dq,
+                                       unsigned char *pre,
+                                       unsigned char *dst,
+                                       int stride, char *eobs,
+                                       short *dc) {
+  int i, j;
+
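+  // The sixteen 4x4 luma blocks sit in raster order in a 16x16
+  // macroblock; `pre` has a fixed pitch of 16. After each row of four
+  // blocks, rewind the 16 pixels advanced and step down four rows.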
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_dc_idct_add_c(q, dq, pre, dst, 16, stride, dc[0]);
+      else
+        vp9_dc_only_idct_add_c(dc[0], pre, dst, 16, stride);
+
+      q   += 16;
+      pre += 4;
+      dst += 4;
+      dc++;
+    }
+
+    pre += 64 - 16;
+    dst += 4 * stride - 16;
+  }
+}
+
+void vp9_dequant_idct_add_y_block_c(short *q, short *dq,
+                                    unsigned char *pre,
+                                    unsigned char *dst,
+                                    int stride, char *eobs) {
+  int i, j;
+
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_idct_add_c(q, dq, pre, dst, 16, stride);
+      else {
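+        // An eob of 0 or 1 means at most the DC coefficient is present:
+        // use the cheap DC-only idct, then clear q[0] and q[1] with a
+        // single 32-bit store.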
+        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dst, 16, stride);
+        ((int *)q)[0] = 0;
+      }
+
+      q   += 16;
+      pre += 4;
+      dst += 4;
+    }
+
+    pre += 64 - 16;
+    dst += 4 * stride - 16;
+  }
+}
+
+void vp9_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *pre,
+                                     unsigned char *dstu, unsigned char *dstv,
+                                     int stride, char *eobs) {
+  int i, j;
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_idct_add_c(q, dq, pre, dstu, 8, stride);
+      else {
+        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstu, 8, stride);
+        ((int *)q)[0] = 0;
+      }
+
+      q    += 16;
+      pre  += 4;
+      dstu += 4;
+    }
+
+    pre  += 32 - 8;
+    dstu += 4 * stride - 8;
+  }
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_idct_add_c(q, dq, pre, dstv, 8, stride);
+      else {
+        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstv, 8, stride);
+        ((int *)q)[0] = 0;
+      }
+
+      q    += 16;
+      pre  += 4;
+      dstv += 4;
+    }
+
+    pre  += 32 - 8;
+    dstv += 4 * stride - 8;
+  }
+}
+
+
+void vp9_dequant_dc_idct_add_y_block_8x8_c(short *q, short *dq,
+                                           unsigned char *pre,
+                                           unsigned char *dst,
+                                           int stride, char *eobs, short *dc,
+                                           MACROBLOCKD *xd) {
+  vp9_dequant_dc_idct_add_8x8_c(q, dq, pre, dst, 16, stride, dc[0]);
+  vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, pre + 8,
+                                dst + 8, 16, stride, dc[1]);
+  vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, pre + 8 * 16,
+                                dst + 8 * stride, 16, stride, dc[4]);
+  vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, pre + 8 * 16 + 8,
+                                dst + 8 * stride + 8, 16, stride, dc[8]);
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, short *dq,
+                                                   unsigned char *dst,
+                                                   int stride, char *eobs,
+                                                   short *dc, MACROBLOCKD *xd) {
+  vp9_dequant_dc_idct_add_8x8_c(q, dq, dst, dst, stride, stride, dc[0]);
+  vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, dst + 8,
+                                dst + 8, stride, stride, dc[1]);
+  vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,
+                                dst + 8 * stride, stride, stride, dc[4]);
+  vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,
+                                dst + 8 * stride + 8, stride, stride, dc[8]);
+}
+#endif
+
+void vp9_dequant_idct_add_y_block_8x8_c(short *q, short *dq,
+                                        unsigned char *pre,
+                                        unsigned char *dst,
+                                        int stride, char *eobs,
+                                        MACROBLOCKD *xd) {
+  unsigned char *origdest = dst;
+  unsigned char *origpred = pre;
+
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
+  vp9_dequant_idct_add_8x8_c(&q[64], dq, origpred + 8,
+                             origdest + 8, 16, stride);
+  vp9_dequant_idct_add_8x8_c(&q[128], dq, origpred + 8 * 16,
+                             origdest + 8 * stride, 16, stride);
+  vp9_dequant_idct_add_8x8_c(&q[192], dq, origpred + 8 * 16 + 8,
+                             origdest + 8 * stride + 8, 16, stride);
+}
+
+void vp9_dequant_idct_add_uv_block_8x8_c(short *q, short *dq,
+                                         unsigned char *pre,
+                                         unsigned char *dstu,
+                                         unsigned char *dstv,
+                                         int stride, char *eobs,
+                                         MACROBLOCKD *xd) {
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride);
+
+  q    += 64;
+  pre  += 64;
+
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride);
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq,
+                                                 unsigned char *dstu,
+                                                 unsigned char *dstv,
+                                                 int stride, char *eobs,
+                                                 MACROBLOCKD *xd) {
+  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride);
+
+  q    += 64;
+
+  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride);
+}
+#endif
+
+#if CONFIG_LOSSLESS
+void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q, short *dq,
+                                                unsigned char *pre,
+                                                unsigned char *dst,
+                                                int stride, char *eobs,
+                                                short *dc) {
+  int i, j;
+
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_dc_idct_add_lossless_c(q, dq, pre, dst, 16, stride, dc[0]);
+      else
+        vp9_dc_only_inv_walsh_add_c(dc[0], pre, dst, 16, stride);
+
+      q   += 16;
+      pre += 4;
+      dst += 4;
+      dc++;
+    }
+
+    pre += 64 - 16;
+    dst += 4 * stride - 16;
+  }
+}
+
+void vp9_dequant_idct_add_y_block_lossless_c(short *q, short *dq,
+                                             unsigned char *pre,
+                                             unsigned char *dst,
+                                             int stride, char *eobs) {
+  int i, j;
+
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_idct_add_lossless_c(q, dq, pre, dst, 16, stride);
+      else {
+        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dst, 16, stride);
+        ((int *)q)[0] = 0;
+      }
+
+      q   += 16;
+      pre += 4;
+      dst += 4;
+    }
+
+    pre += 64 - 16;
+    dst += 4 * stride - 16;
+  }
+}
+
+void vp9_dequant_idct_add_uv_block_lossless_c(short *q, short *dq,
+                                              unsigned char *pre,
+                                              unsigned char *dstu,
+                                              unsigned char *dstv,
+                                              int stride, char *eobs) {
+  int i, j;
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_idct_add_lossless_c(q, dq, pre, dstu, 8, stride);
+      else {
+        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstu, 8, stride);
+        ((int *)q)[0] = 0;
+      }
+
+      q    += 16;
+      pre  += 4;
+      dstu += 4;
+    }
+
+    pre  += 32 - 8;
+    dstu += 4 * stride - 8;
+  }
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      if (*eobs++ > 1)
+        vp9_dequant_idct_add_lossless_c(q, dq, pre, dstv, 8, stride);
+      else {
+        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstv, 8, stride);
+        ((int *)q)[0] = 0;
+      }
+
+      q    += 16;
+      pre  += 4;
+      dstv += 4;
+    }
+
+    pre  += 32 - 8;
+    dstv += 4 * stride - 8;
+  }
+}
+#endif
+
--- /dev/null
+++ b/vp9/decoder/onyxd_if.c
@@ -1,0 +1,506 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/onyxc_int.h"
+#if CONFIG_POSTPROC
+#include "vp9/common/postproc.h"
+#endif
+#include "vp9/common/onyxd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/alloccommon.h"
+#include "vpx_scale/yv12extend.h"
+#include "vp9/common/loopfilter.h"
+#include "vp9/common/swapyv12buffer.h"
+#include <stdio.h>
+#include <assert.h>
+
+#include "vp9/common/quant_common.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp9/common/systemdependent.h"
+#include "vpx_ports/vpx_timer.h"
+#include "detokenize.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+
+extern void vp9_init_de_quantizer(VP9D_COMP *pbi);
+static int get_free_fb(VP9_COMMON *cm);
+static void ref_cnt_fb(int *buf, int *idx, int new_idx);
+
+#if CONFIG_DEBUG
+static void recon_write_yuv_frame(char *name, YV12_BUFFER_CONFIG *s) {
+  FILE *yuv_file = fopen(name, "ab");
+  unsigned char *src = s->y_buffer;
+  int h = s->y_height;
+
+  do {
+    fwrite(src, s->y_width, 1,  yuv_file);
+    src += s->y_stride;
+  } while (--h);
+
+  src = s->u_buffer;
+  h = s->uv_height;
+
+  do {
+    fwrite(src, s->uv_width, 1,  yuv_file);
+    src += s->uv_stride;
+  } while (--h);
+
+  src = s->v_buffer;
+  h = s->uv_height;
+
+  do {
+    fwrite(src, s->uv_width, 1, yuv_file);
+    src += s->uv_stride;
+  } while (--h);
+
+  fclose(yuv_file);
+}
+#endif
+#define WRITE_RECON_BUFFER 0
+#if WRITE_RECON_BUFFER
+void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
+  // Dump the decoded frame as raw y/u/v plane files.
+  FILE *yframe;
+  int i;
+  char filename[255];
+
+  sprintf(filename, "dx\\y%04d.raw", this_frame);
+  yframe = fopen(filename, "wb");
+
+  for (i = 0; i < frame->y_height; i++)
+    fwrite(frame->y_buffer + i * frame->y_stride,
+           frame->y_width, 1, yframe);
+
+  fclose(yframe);
+  sprintf(filename, "dx\\u%04d.raw", this_frame);
+  yframe = fopen(filename, "wb");
+
+  for (i = 0; i < frame->uv_height; i++)
+    fwrite(frame->u_buffer + i * frame->uv_stride,
+           frame->uv_width, 1, yframe);
+
+  fclose(yframe);
+  sprintf(filename, "dx\\v%04d.raw", this_frame);
+  yframe = fopen(filename, "wb");
+
+  for (i = 0; i < frame->uv_height; i++)
+    fwrite(frame->v_buffer + i * frame->uv_stride,
+           frame->uv_width, 1, yframe);
+
+  fclose(yframe);
+}
+#endif
+
+void vp9_initialize_dec(void) {
+  static int init_done = 0;
+
+  if (!init_done) {
+    vp9_initialize_common();
+    vp9_init_quant_tables();
+    vp8_scale_machine_specific_config();
+    init_done = 1;
+  }
+}
+
+VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
+  VP9D_COMP *pbi = vpx_memalign(32, sizeof(VP9D_COMP));
+
+  if (!pbi)
+    return NULL;
+
+  vpx_memset(pbi, 0, sizeof(VP9D_COMP));
+
+  if (setjmp(pbi->common.error.jmp)) {
+    pbi->common.error.setjmp = 0;
+    vp9_remove_decompressor(pbi);
+    return 0;
+  }
+
+  pbi->common.error.setjmp = 1;
+  vp9_initialize_dec();
+
+  vp9_create_common(&pbi->common);
+
+  pbi->common.current_video_frame = 0;
+  pbi->ready_for_new_data = 1;
+
+  /* vp9_init_de_quantizer() is first called here. Add a check in
+   * frame_init_dequantizer() to avoid calling vp9_init_de_quantizer()
+   * unnecessarily for every frame.
+   */
+  vp9_init_de_quantizer(pbi);
+
+  vp9_loop_filter_init(&pbi->common);
+
+  pbi->common.error.setjmp = 0;
+
+  pbi->decoded_key_frame = 0;
+
+  return (VP9D_PTR) pbi;
+}
+
+void vp9_remove_decompressor(VP9D_PTR ptr) {
+  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+
+  if (!pbi)
+    return;
+
+  // Delete segmentation map
+  if (pbi->common.last_frame_seg_map != 0)
+    vpx_free(pbi->common.last_frame_seg_map);
+
+  vp9_remove_common(&pbi->common);
+  vpx_free(pbi->mbc);
+  vpx_free(pbi);
+}
+
+
+vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
+                                      YV12_BUFFER_CONFIG *sd) {
+  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+  VP9_COMMON *cm = &pbi->common;
+  int ref_fb_idx;
+
+  if (ref_frame_flag == VP9_LAST_FLAG)
+    ref_fb_idx = cm->lst_fb_idx;
+  else if (ref_frame_flag == VP9_GOLD_FLAG)
+    ref_fb_idx = cm->gld_fb_idx;
+  else if (ref_frame_flag == VP9_ALT_FLAG)
+    ref_fb_idx = cm->alt_fb_idx;
+  else {
+    vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+                       "Invalid reference frame");
+    return pbi->common.error.error_code;
+  }
+
+  if (cm->yv12_fb[ref_fb_idx].y_height != sd->y_height ||
+      cm->yv12_fb[ref_fb_idx].y_width != sd->y_width ||
+      cm->yv12_fb[ref_fb_idx].uv_height != sd->uv_height ||
+      cm->yv12_fb[ref_fb_idx].uv_width != sd->uv_width) {
+    vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+  } else
+    vp8_yv12_copy_frame_ptr(&cm->yv12_fb[ref_fb_idx], sd);
+
+  return pbi->common.error.error_code;
+}
+
+
+vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
+                                      YV12_BUFFER_CONFIG *sd) {
+  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+  VP9_COMMON *cm = &pbi->common;
+  int *ref_fb_ptr = NULL;
+  int free_fb;
+
+  if (ref_frame_flag == VP9_LAST_FLAG)
+    ref_fb_ptr = &cm->lst_fb_idx;
+  else if (ref_frame_flag == VP9_GOLD_FLAG)
+    ref_fb_ptr = &cm->gld_fb_idx;
+  else if (ref_frame_flag == VP9_ALT_FLAG)
+    ref_fb_ptr = &cm->alt_fb_idx;
+  else {
+    vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+                       "Invalid reference frame");
+    return pbi->common.error.error_code;
+  }
+
+  if (cm->yv12_fb[*ref_fb_ptr].y_height != sd->y_height ||
+      cm->yv12_fb[*ref_fb_ptr].y_width != sd->y_width ||
+      cm->yv12_fb[*ref_fb_ptr].uv_height != sd->uv_height ||
+      cm->yv12_fb[*ref_fb_ptr].uv_width != sd->uv_width) {
+    vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+  } else {
+    /* Find an empty frame buffer. */
+    free_fb = get_free_fb(cm);
+    /* Decrease fb_idx_ref_cnt since it will be increased again in
+     * ref_cnt_fb() below. */
+    cm->fb_idx_ref_cnt[free_fb]--;
+
+    /* Manage the reference counters and copy image. */
+    ref_cnt_fb(cm->fb_idx_ref_cnt, ref_fb_ptr, free_fb);
+    vp8_yv12_copy_frame_ptr(sd, &cm->yv12_fb[*ref_fb_ptr]);
+  }
+
+  return pbi->common.error.error_code;
+}
+
+/* For ARM NEON, d8-d15 are callee-saved registers and must be saved/restored by us. */
+#if HAVE_ARMV7
+extern void vp9_push_neon(int64_t *store);
+extern void vp9_pop_neon(int64_t *store);
+#endif
+
+static int get_free_fb(VP9_COMMON *cm) {
+  int i;
+  for (i = 0; i < NUM_YV12_BUFFERS; i++)
+    if (cm->fb_idx_ref_cnt[i] == 0)
+      break;
+
+  assert(i < NUM_YV12_BUFFERS);
+  cm->fb_idx_ref_cnt[i] = 1;
+  return i;
+}
+
+static void ref_cnt_fb(int *buf, int *idx, int new_idx) {
+  if (buf[*idx] > 0)
+    buf[*idx]--;
+
+  *idx = new_idx;
+
+  buf[new_idx]++;
+}
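+
+/* An illustrative sketch (assuming an initialized VP9_COMMON *cm) of how
+ * the two helpers above cooperate when a reference index is retargeted
+ * to a fresh buffer, as vp9_set_reference_dec() does:
+ *
+ *   int free_fb = get_free_fb(cm);         // ref count becomes 1
+ *   cm->fb_idx_ref_cnt[free_fb]--;         // undo; ref_cnt_fb() re-adds it
+ *   ref_cnt_fb(cm->fb_idx_ref_cnt,         // release the old golden
+ *              &cm->gld_fb_idx, free_fb);  // buffer, pin the new one
+ */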
+
+/* If any buffer copy / swapping is signalled it should be done here. */
+static int swap_frame_buffers(VP9_COMMON *cm) {
+  int err = 0;
+
+  /* The alternate reference frame or golden frame can be updated
+   *  using the new, last, or golden/alt ref frame.  If it
+   *  is updated using the newly decoded frame it is a refresh.
+   *  An update using the last or golden/alt ref frame is a copy.
+   */
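+  /* copy_buffer_to_arf: 0 = no copy, 1 = copy from last, 2 = copy from
+   * golden. copy_buffer_to_gf: 0 = no copy, 1 = copy from last, 2 = copy
+   * from alt-ref. */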
+  if (cm->copy_buffer_to_arf) {
+    int new_fb = 0;
+
+    if (cm->copy_buffer_to_arf == 1)
+      new_fb = cm->lst_fb_idx;
+    else if (cm->copy_buffer_to_arf == 2)
+      new_fb = cm->gld_fb_idx;
+    else
+      err = -1;
+
+    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, new_fb);
+  }
+
+  if (cm->copy_buffer_to_gf) {
+    int new_fb = 0;
+
+    if (cm->copy_buffer_to_gf == 1)
+      new_fb = cm->lst_fb_idx;
+    else if (cm->copy_buffer_to_gf == 2)
+      new_fb = cm->alt_fb_idx;
+    else
+      err = -1;
+
+    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, new_fb);
+  }
+
+  if (cm->refresh_golden_frame)
+    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, cm->new_fb_idx);
+
+  if (cm->refresh_alt_ref_frame)
+    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, cm->new_fb_idx);
+
+  if (cm->refresh_last_frame) {
+    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->lst_fb_idx, cm->new_fb_idx);
+
+    cm->frame_to_show = &cm->yv12_fb[cm->lst_fb_idx];
+  } else
+    cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+
+  cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
+  return err;
+}
+
+int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
+                                const unsigned char *source,
+                                int64_t time_stamp) {
+#if HAVE_ARMV7
+  int64_t dx_store_reg[8];
+#endif
+  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+  VP9_COMMON *cm = &pbi->common;
+  int retcode = 0;
+
+  /*if(pbi->ready_for_new_data == 0)
+      return -1;*/
+
+  if (ptr == 0) {
+    return -1;
+  }
+
+  pbi->common.error.error_code = VPX_CODEC_OK;
+
+  pbi->Source = source;
+  pbi->source_sz = size;
+
+  if (pbi->source_sz == 0) {
+    /* This is used to signal that we are missing frames.
+     * We do not know if the missing frame(s) were supposed to update
+     * any of the reference buffers, but we act conservatively and
+     * mark only the last buffer as corrupted.
+     */
+    cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+  }
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (cm->rtcd.flags & HAS_NEON)
+#endif
+  {
+    vp9_push_neon(dx_store_reg);
+  }
+#endif
+
+  cm->new_fb_idx = get_free_fb(cm);
+
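+  /* vpx_internal_error() longjmp()s back here on any decode error while
+   * error.setjmp is set; unwind by releasing the new frame buffer. */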
+  if (setjmp(pbi->common.error.jmp)) {
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (cm->rtcd.flags & HAS_NEON)
+#endif
+    {
+      vp9_pop_neon(dx_store_reg);
+    }
+#endif
+    pbi->common.error.setjmp = 0;
+
+    /* We do not know if the missing frame(s) were supposed to update
+     * any of the reference buffers, but we act conservatively and
+     * mark only the last buffer as corrupted.
+     */
+    cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+
+    if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+      cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+    return -1;
+  }
+
+  pbi->common.error.setjmp = 1;
+
+  retcode = vp9_decode_frame(pbi);
+
+  if (retcode < 0) {
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (cm->rtcd.flags & HAS_NEON)
+#endif
+    {
+      vp9_pop_neon(dx_store_reg);
+    }
+#endif
+    pbi->common.error.error_code = VPX_CODEC_ERROR;
+    pbi->common.error.setjmp = 0;
+    if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+      cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+    return retcode;
+  }
+
+  {
+    if (swap_frame_buffers(cm)) {
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+      if (cm->rtcd.flags & HAS_NEON)
+#endif
+      {
+        vp9_pop_neon(dx_store_reg);
+      }
+#endif
+      pbi->common.error.error_code = VPX_CODEC_ERROR;
+      pbi->common.error.setjmp = 0;
+      return -1;
+    }
+
+#if WRITE_RECON_BUFFER
+    if (cm->show_frame)
+      write_dx_frame_to_file(cm->frame_to_show,
+                             cm->current_video_frame);
+    else
+      write_dx_frame_to_file(cm->frame_to_show,
+                             cm->current_video_frame + 1000);
+#endif
+
+    if (cm->filter_level) {
+      /* Apply the loop filter if appropriate. */
+      vp9_loop_filter_frame(cm, &pbi->mb);
+    }
+    vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
+  }
+
+#if CONFIG_DEBUG
+  if (cm->show_frame)
+    recon_write_yuv_frame("recon.yuv", cm->frame_to_show);
+#endif
+
+  vp9_clear_system_state();
+
+  if (cm->show_frame) {
+    vpx_memcpy(cm->prev_mip, cm->mip,
+               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+  } else {
+    vpx_memset(cm->prev_mip, 0,
+               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+  }
+
+  /*vp9_print_modes_and_motion_vectors(cm->mi, cm->mb_rows,cm->mb_cols,
+                                       cm->current_video_frame);*/
+
+  if (cm->show_frame)
+    cm->current_video_frame++;
+
+  pbi->ready_for_new_data = 0;
+  pbi->last_time_stamp = time_stamp;
+  pbi->source_sz = 0;
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (cm->rtcd.flags & HAS_NEON)
+#endif
+  {
+    vp9_pop_neon(dx_store_reg);
+  }
+#endif
+  pbi->common.error.setjmp = 0;
+  return retcode;
+}
+
+int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd,
+                      int64_t *time_stamp, int64_t *time_end_stamp,
+                      vp9_ppflags_t *flags) {
+  int ret = -1;
+  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+
+  if (pbi->ready_for_new_data == 1)
+    return ret;
+
+  /* i.e. no raw frame to show */
+  if (pbi->common.show_frame == 0)
+    return ret;
+
+  pbi->ready_for_new_data = 1;
+  *time_stamp = pbi->last_time_stamp;
+  *time_end_stamp = 0;
+
+  sd->clrtype = pbi->common.clr_type;
+#if CONFIG_POSTPROC
+  ret = vp9_post_proc_frame(&pbi->common, sd, flags);
+#else
+
+  if (pbi->common.frame_to_show) {
+    *sd = *pbi->common.frame_to_show;
+    sd->y_width = pbi->common.Width;
+    sd->y_height = pbi->common.Height;
+    sd->uv_height = pbi->common.Height / 2;
+    ret = 0;
+  } else {
+    ret = -1;
+  }
+
+#endif /*!CONFIG_POSTPROC*/
+  vp9_clear_system_state();
+  return ret;
+}
--- /dev/null
+++ b/vp9/decoder/onyxd_int.h
@@ -1,0 +1,106 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ONYXD_INT_H
+#define __INC_ONYXD_INT_H
+#include "vpx_ports/config.h"
+#include "vp9/common/onyxd.h"
+#include "treereader.h"
+#include "vp9/common/onyxc_int.h"
+#include "dequantize.h"
+
+// #define DEC_DEBUG
+
+typedef struct {
+  int ithread;
+  void *ptr1;
+  void *ptr2;
+} DECODETHREAD_DATA;
+
+typedef struct {
+  MACROBLOCKD  mbd;
+  int mb_row;
+  int current_mb_col;
+  short *coef_ptr;
+} MB_ROW_DEC;
+
+typedef struct {
+  int const *scan;
+  int const *scan_8x8;
+  UINT8 const *ptr_block2leftabove;
+  vp9_tree_index const *vp9_coef_tree_ptr;
+  unsigned char *norm_ptr;
+  UINT8 *ptr_coef_bands_x;
+  UINT8 *ptr_coef_bands_x_8x8;
+
+  ENTROPY_CONTEXT_PLANES *A;
+  ENTROPY_CONTEXT_PLANES *L;
+
+  INT16 *qcoeff_start_ptr;
+
+  vp9_prob const *coef_probs[BLOCK_TYPES];
+  vp9_prob const *coef_probs_8x8[BLOCK_TYPES_8X8];
+  vp9_prob const *coef_probs_16X16[BLOCK_TYPES_16X16];
+
+  UINT8 eob[25];
+
+} DETOK;
+
+typedef struct VP9Decompressor {
+  DECLARE_ALIGNED(16, MACROBLOCKD, mb);
+
+  DECLARE_ALIGNED(16, VP9_COMMON, common);
+
+  VP9D_CONFIG oxcf;
+
+
+  const unsigned char *Source;
+  unsigned int   source_sz;
+
+  vp9_reader *mbc;
+  int64_t last_time_stamp;
+  int   ready_for_new_data;
+
+  DETOK detoken;
+
+  vp9_dequant_idct_add_fn_t            idct_add;
+  vp9_dequant_dc_idct_add_fn_t         dc_idct_add;
+  vp9_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;
+  vp9_dequant_idct_add_y_block_fn_t    idct_add_y_block;
+  vp9_dequant_idct_add_uv_block_fn_t   idct_add_uv_block;
+
+  vp9_prob prob_skip_false;
+
+  int decoded_key_frame;
+
+} VP9D_COMP;
+
+int vp9_decode_frame(VP9D_COMP *cpi);
+
+
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(lval,expr) do {\
+    lval = (expr); \
+    if(!lval) \
+      vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
+                         "Failed to allocate "#lval" at %s:%d", \
+                         __FILE__,__LINE__);\
+  } while(0)
+#else
+#define CHECK_MEM_ERROR(lval,expr) do {\
+    lval = (expr); \
+    if(!lval) \
+      vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
+                         "Failed to allocate "#lval);\
+  } while(0)
+#endif
+
+#endif  // __INC_ONYXD_INT_H
--- /dev/null
+++ b/vp9/decoder/reconintra_mt.h
@@ -1,0 +1,15 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECONINTRA_MT_H
+#define __INC_RECONINTRA_MT_H
+
+#endif
--- /dev/null
+++ b/vp9/decoder/treereader.h
@@ -1,0 +1,37 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef tree_reader_h
+#define tree_reader_h 1
+
+#include "vp9/common/treecoder.h"
+
+#include "dboolhuff.h"
+
+typedef BOOL_DECODER vp9_reader;
+
+#define vp9_read decode_bool
+#define vp9_read_literal decode_value
+#define vp9_read_bit(R) vp9_read(R, vp9_prob_half)
+
+/* The intent of the tree data structure is to make decoding trivial. */
+
+static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
+                      vp9_tree t,
+                      const vp9_prob *const p) {
+  register vp9_tree_index i = 0;
+
+  while ((i = t[ i + vp9_read(r, p[i >> 1])]) > 0);
+
+  return -i;
+}
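+
+/* Usage sketch with a hypothetical two-leaf tree (token names are
+ * illustrative only): non-positive entries are negated token values,
+ * positive entries index the next node pair, and p[i >> 1] is the
+ * probability of taking the left branch at node i.
+ *
+ *   static const vp9_tree_index two_leaf[2] = { -TOKEN_A, -TOKEN_B };
+ *   const vp9_prob p[1] = { 128 };         // p(left branch) = 0.5
+ *   int tok = treed_read(r, two_leaf, p);  // one bool read
+ */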
+
+#endif /* tree_reader_h */
--- /dev/null
+++ b/vp9/decoder/x86/dequantize_mmx.asm
@@ -1,0 +1,406 @@
+;
+;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+align 16
+x_s1sqr2:      times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1: times 4 dw 0x4E7B
+align 16
+pw_16:         times 4 dw 16
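+; x_s1sqr2      = round(sin(pi/8) * sqrt(2) * 65536)       = 0x8A8C (35468)
+; x_c1sqr2less1 = round((cos(pi/8) * sqrt(2) - 1) * 65536) = 0x4E7B (20091)
+; pw_16 supplies the rounding bias added before the final >> 5.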
+
+SECTION .text
+
+INIT_MMX
+
+
+;void dequantize_b_impl_mmx(short *sq, short *dq, short *q)
+cglobal dequantize_b_impl_mmx, 3,3,0,sq,dq,arg3
+    mova       m1, [sqq]
+    pmullw     m1, [arg3q+0]            ; m1 *= dequant factors (words 0..3)
+    mova [dqq+ 0], m1
+
+    mova       m1, [sqq+8]
+    pmullw     m1, [arg3q+8]            ; m1 *= dequant factors (words 4..7)
+    mova [dqq+ 8], m1
+
+    mova       m1, [sqq+16]
+    pmullw     m1, [arg3q+16]            ; m1 *= dequant factors (words 8..11)
+    mova [dqq+16], m1
+
+    mova       m1, [sqq+24]
+    pmullw     m1, [arg3q+24]            ; m1 *= dequant factors (words 12..15)
+    mova [dqq+24], m1
+    RET
+
+
+;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
+cglobal dequant_idct_add_mmx, 4,6,0,inp,dq,pred,dest,pit,stride
+
+%if ARCH_X86_64
+    movsxd              strideq,  dword stridem
+    movsxd              pitq,     dword pitm
+%else
+    mov                 strideq,  stridem
+    mov                 pitq,     pitm
+%endif
+
+    mova                m0,       [inpq+ 0]
+    pmullw              m0,       [dqq]
+
+    mova                m1,       [inpq+ 8]
+    pmullw              m1,       [dqq+ 8]
+
+    mova                m2,       [inpq+16]
+    pmullw              m2,       [dqq+16]
+
+    mova                m3,       [inpq+24]
+    pmullw              m3,       [dqq+24]
+
+    pxor                m7,        m7
+    mova            [inpq],        m7
+    mova          [inpq+8],        m7
+    mova         [inpq+16],        m7
+    mova         [inpq+24],        m7
+
+
+    psubw               m0,        m2             ; b1= 0-2
+    paddw               m2,        m2             ;
+
+    mova                m5,        m1
+    paddw               m2,        m0             ; a1 =0+2
+
+    pmulhw              m5,       [x_s1sqr2];
+    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
+
+    mova                m7,        m3             ;
+    pmulhw              m7,       [x_c1sqr2less1];
+
+    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
+    psubw               m7,        m5             ; c1
+
+    mova                m5,        m1
+    mova                m4,        m3
+
+    pmulhw              m5,       [x_c1sqr2less1]
+    paddw               m5,        m1
+
+    pmulhw              m3,       [x_s1sqr2]
+    paddw               m3,        m4
+
+    paddw               m3,        m5             ; d1
+    mova                m6,        m2             ; a1
+
+    mova                m4,        m0             ; b1
+    paddw               m2,        m3             ;0
+
+    paddw               m4,        m7             ;1
+    psubw               m0,        m7             ;2
+
+    psubw               m6,        m3             ;3
+
+    mova                m1,        m2             ; 03 02 01 00
+    mova                m3,        m4             ; 23 22 21 20
+
+    punpcklwd           m1,        m0             ; 11 01 10 00
+    punpckhwd           m2,        m0             ; 13 03 12 02
+
+    punpcklwd           m3,        m6             ; 31 21 30 20
+    punpckhwd           m4,        m6             ; 33 23 32 22
+
+    mova                m0,        m1             ; 11 01 10 00
+    mova                m5,        m2             ; 13 03 12 02
+
+    punpckldq           m0,        m3             ; 30 20 10 00
+    punpckhdq           m1,        m3             ; 31 21 11 01
+
+    punpckldq           m2,        m4             ; 32 22 12 02
+    punpckhdq           m5,        m4             ; 33 23 13 03
+
+    mova                m3,        m5             ; 33 23 13 03
+
+    psubw               m0,        m2             ; b1= 0-2
+    paddw               m2,        m2             ;
+
+    mova                m5,        m1
+    paddw               m2,        m0             ; a1 =0+2
+
+    pmulhw              m5,       [x_s1sqr2];
+    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
+
+    mova                m7,        m3             ;
+    pmulhw              m7,       [x_c1sqr2less1];
+
+    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
+    psubw               m7,        m5             ; c1
+
+    mova                m5,        m1
+    mova                m4,        m3
+
+    pmulhw              m5,       [x_c1sqr2less1]
+    paddw               m5,        m1
+
+    pmulhw              m3,       [x_s1sqr2]
+    paddw               m3,        m4
+
+    paddw               m3,        m5             ; d1
+    paddw               m0,       [pw_16]
+
+    paddw               m2,       [pw_16]
+    mova                m6,        m2             ; a1
+
+    mova                m4,        m0             ; b1
+    paddw               m2,        m3             ;0
+
+    paddw               m4,        m7             ;1
+    psubw               m0,        m7             ;2
+
+    psubw               m6,        m3             ;3
+    psraw               m2,        5
+
+    psraw               m0,        5
+    psraw               m4,        5
+
+    psraw               m6,        5
+
+    mova                m1,        m2             ; 03 02 01 00
+    mova                m3,        m4             ; 23 22 21 20
+
+    punpcklwd           m1,        m0             ; 11 01 10 00
+    punpckhwd           m2,        m0             ; 13 03 12 02
+
+    punpcklwd           m3,        m6             ; 31 21 30 20
+    punpckhwd           m4,        m6             ; 33 23 32 22
+
+    mova                m0,        m1             ; 11 01 10 00
+    mova                m5,        m2             ; 13 03 12 02
+
+    punpckldq           m0,        m3             ; 30 20 10 00
+    punpckhdq           m1,        m3             ; 31 21 11 01
+
+    punpckldq           m2,        m4             ; 32 22 12 02
+    punpckhdq           m5,        m4             ; 33 23 13 03
+
+    pxor                m7,        m7
+
+    movh                m4,       [predq]
+    punpcklbw           m4,        m7
+    paddsw              m0,        m4
+    packuswb            m0,        m7
+    movh           [destq],      m0
+
+    movh                m4,       [predq+pitq]
+    punpcklbw           m4,        m7
+    paddsw              m1,        m4
+    packuswb            m1,        m7
+    movh   [destq+strideq],        m1
+
+    movh                m4,       [predq+2*pitq]
+    punpcklbw           m4,        m7
+    paddsw              m2,        m4
+    packuswb            m2,        m7
+    movh [destq+strideq*2],        m2
+
+    add              destq,        strideq
+    add              predq,        pitq
+
+    movh                m4,       [predq+2*pitq]
+    punpcklbw           m4,        m7
+    paddsw              m5,        m4
+    packuswb            m5,        m7
+    movh [destq+strideq*2],        m5
+    RET
+
+
+;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
+cglobal dequant_dc_idct_add_mmx, 4,7,0,inp,dq,pred,dest,pit,stride,Dc
+
+%if ARCH_X86_64
+    movsxd              strideq,   dword stridem
+    movsxd              pitq,      dword pitm
+%else
+    mov                 strideq,   stridem
+    mov                 pitq,      pitm
+%endif
+
+    mov                 Dcq, Dcm
+    mova                m0,       [inpq+ 0]
+    pmullw              m0,       [dqq+ 0]
+
+    mova                m1,       [inpq+ 8]
+    pmullw              m1,       [dqq+ 8]
+
+    mova                m2,       [inpq+16]
+    pmullw              m2,       [dqq+16]
+
+    mova                m3,       [inpq+24]
+    pmullw              m3,       [dqq+24]
+
+    pxor                m7,        m7
+    mova         [inpq+ 0],        m7
+    mova         [inpq+ 8],        m7
+    mova         [inpq+16],        m7
+    mova         [inpq+24],        m7
+
+    ; move lower word of Dc to lower word of m0
+    psrlq               m0,        16
+    psllq               m0,        16
+    and                Dcq,        0xFFFF         ; If Dc < 0, we don't want the full dword precision.
+    movh                m7,        Dcq
+    por                 m0,        m7
+    psubw               m0,        m2             ; b1= 0-2
+    paddw               m2,        m2             ;
+
+    mova                m5,        m1
+    paddw               m2,        m0             ; a1 =0+2
+
+    pmulhw              m5,       [x_s1sqr2];
+    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
+
+    mova                m7,        m3             ;
+    pmulhw              m7,       [x_c1sqr2less1];
+
+    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
+    psubw               m7,        m5             ; c1
+
+    mova                m5,        m1
+    mova                m4,        m3
+
+    pmulhw              m5,       [x_c1sqr2less1]
+    paddw               m5,        m1
+
+    pmulhw              m3,       [x_s1sqr2]
+    paddw               m3,        m4
+
+    paddw               m3,        m5             ; d1
+    mova                m6,        m2             ; a1
+
+    mova                m4,        m0             ; b1
+    paddw               m2,        m3             ;0
+
+    paddw               m4,        m7             ;1
+    psubw               m0,        m7             ;2
+
+    psubw               m6,        m3             ;3
+
+    mova                m1,        m2             ; 03 02 01 00
+    mova                m3,        m4             ; 23 22 21 20
+
+    punpcklwd           m1,        m0             ; 11 01 10 00
+    punpckhwd           m2,        m0             ; 13 03 12 02
+
+    punpcklwd           m3,        m6             ; 31 21 30 20
+    punpckhwd           m4,        m6             ; 33 23 32 22
+
+    mova                m0,        m1             ; 11 01 10 00
+    mova                m5,        m2             ; 13 03 12 02
+
+    punpckldq           m0,        m3             ; 30 20 10 00
+    punpckhdq           m1,        m3             ; 31 21 11 01
+
+    punpckldq           m2,        m4             ; 32 22 12 02
+    punpckhdq           m5,        m4             ; 33 23 13 03
+
+    mova                m3,        m5             ; 33 23 13 03
+
+    psubw               m0,        m2             ; b1= 0-2
+    paddw               m2,        m2             ;
+
+    mova                m5,        m1
+    paddw               m2,        m0             ; a1 =0+2
+
+    pmulhw              m5,       [x_s1sqr2];
+    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)
+
+    mova                m7,        m3             ;
+    pmulhw              m7,       [x_c1sqr2less1];
+
+    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)
+    psubw               m7,        m5             ; c1
+
+    mova                m5,        m1
+    mova                m4,        m3
+
+    pmulhw              m5,       [x_c1sqr2less1]
+    paddw               m5,        m1
+
+    pmulhw              m3,       [x_s1sqr2]
+    paddw               m3,        m4
+
+    paddw               m3,        m5             ; d1
+    paddw               m0,       [pw_16]
+
+    paddw               m2,       [pw_16]
+    mova                m6,        m2             ; a1
+
+    mova                m4,        m0             ; b1
+    paddw               m2,        m3             ;0
+
+    paddw               m4,        m7             ;1
+    psubw               m0,        m7             ;2
+
+    psubw               m6,        m3             ;3
+    psraw               m2,        5
+
+    psraw               m0,        5
+    psraw               m4,        5
+
+    psraw               m6,        5
+
+    mova                m1,        m2             ; 03 02 01 00
+    mova                m3,        m4             ; 23 22 21 20
+
+    punpcklwd           m1,        m0             ; 11 01 10 00
+    punpckhwd           m2,        m0             ; 13 03 12 02
+
+    punpcklwd           m3,        m6             ; 31 21 30 20
+    punpckhwd           m4,        m6             ; 33 23 32 22
+
+    mova                m0,        m1             ; 11 01 10 00
+    mova                m5,        m2             ; 13 03 12 02
+
+    punpckldq           m0,        m3             ; 30 20 10 00
+    punpckhdq           m1,        m3             ; 31 21 11 01
+
+    punpckldq           m2,        m4             ; 32 22 12 02
+    punpckhdq           m5,        m4             ; 33 23 13 03
+
+    pxor                m7,        m7
+
+    movh                m4,       [predq]
+    punpcklbw           m4,        m7
+    paddsw              m0,        m4
+    packuswb            m0,        m7
+    movh           [destq],        m0
+
+    movh                m4,       [predq+pitq]
+    punpcklbw           m4,        m7
+    paddsw              m1,        m4
+    packuswb            m1,        m7
+    movh   [destq+strideq],        m1
+
+    movh                m4,       [predq+2*pitq]
+    punpcklbw           m4,        m7
+    paddsw              m2,        m4
+    packuswb            m2,        m7
+    movh [destq+strideq*2],        m2
+
+    add              destq,        strideq
+    add              predq,        pitq
+
+    movh                m4,       [predq+2*pitq]
+    punpcklbw           m4,        m7
+    paddsw              m5,        m4
+    packuswb            m5,        m7
+    movh [destq+strideq*2],        m5
+    RET
+
--- /dev/null
+++ b/vp9/decoder/x86/idct_blk_mmx.c
@@ -1,0 +1,143 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/decoder/dequantize.h"
+
+void vp9_dequant_dc_idct_add_y_block_mmx(short *q, short *dq,
+                                         unsigned char *pre,
+                                         unsigned char *dst,
+                                         int stride, char *eobs, short *dc) {
+  int i;
+
+  for (i = 0; i < 4; i++) {
+    if (eobs[0] > 1)
+      vp9_dequant_dc_idct_add_mmx(q, dq, pre, dst, 16, stride, dc[0]);
+    else
+      vp9_dc_only_idct_add_mmx(dc[0], pre, dst, 16, stride);
+
+    if (eobs[1] > 1)
+      vp9_dequant_dc_idct_add_mmx(q + 16, dq, pre + 4,
+                                  dst + 4, 16, stride, dc[1]);
+    else
+      vp9_dc_only_idct_add_mmx(dc[1], pre + 4, dst + 4, 16, stride);
+
+    if (eobs[2] > 1)
+      vp9_dequant_dc_idct_add_mmx(q + 32, dq, pre + 8,
+                                  dst + 8, 16, stride, dc[2]);
+    else
+      vp9_dc_only_idct_add_mmx(dc[2], pre + 8, dst + 8, 16, stride);
+
+    if (eobs[3] > 1)
+      vp9_dequant_dc_idct_add_mmx(q + 48, dq, pre + 12,
+                                  dst + 12, 16, stride, dc[3]);
+    else
+      vp9_dc_only_idct_add_mmx(dc[3], pre + 12, dst + 12, 16, stride);
+
+    q    += 64;
+    dc   += 4;
+    pre  += 64;
+    dst  += 4 * stride;
+    eobs += 4;
+  }
+}
+
+void vp9_dequant_idct_add_y_block_mmx(short *q, short *dq,
+                                      unsigned char *pre,
+                                      unsigned char *dst,
+                                      int stride, char *eobs) {
+  int i;
+
+  for (i = 0; i < 4; i++) {
+    if (eobs[0] > 1)
+      vp9_dequant_idct_add_mmx(q, dq, pre, dst, 16, stride);
+    else {
+      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dst, 16, stride);
+      ((int *)q)[0] = 0;
+    }
+
+    if (eobs[1] > 1)
+      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dst + 4, 16, stride);
+    else {
+      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dst + 4, 16, stride);
+      ((int *)(q + 16))[0] = 0;
+    }
+
+    if (eobs[2] > 1)
+      vp9_dequant_idct_add_mmx(q + 32, dq, pre + 8, dst + 8, 16, stride);
+    else {
+      vp9_dc_only_idct_add_mmx(q[32]*dq[0], pre + 8, dst + 8, 16, stride);
+      ((int *)(q + 32))[0] = 0;
+    }
+
+    if (eobs[3] > 1)
+      vp9_dequant_idct_add_mmx(q + 48, dq, pre + 12, dst + 12, 16, stride);
+    else {
+      vp9_dc_only_idct_add_mmx(q[48]*dq[0], pre + 12, dst + 12, 16, stride);
+      ((int *)(q + 48))[0] = 0;
+    }
+
+    q    += 64;
+    pre  += 64;
+    dst  += 4 * stride;
+    eobs += 4;
+  }
+}
+
+void vp9_dequant_idct_add_uv_block_mmx(short *q, short *dq,
+                                       unsigned char *pre,
+                                       unsigned char *dstu,
+                                       unsigned char *dstv,
+                                       int stride, char *eobs) {
+  int i;
+
+  for (i = 0; i < 2; i++) {
+    if (eobs[0] > 1)
+      vp9_dequant_idct_add_mmx(q, dq, pre, dstu, 8, stride);
+    else {
+      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstu, 8, stride);
+      ((int *)q)[0] = 0;
+    }
+
+    if (eobs[1] > 1)
+      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstu + 4, 8, stride);
+    else {
+      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstu + 4, 8, stride);
+      ((int *)(q + 16))[0] = 0;
+    }
+
+    q    += 32;
+    pre  += 32;
+    dstu += 4 * stride;
+    eobs += 2;
+  }
+
+  for (i = 0; i < 2; i++) {
+    if (eobs[0] > 1)
+      vp9_dequant_idct_add_mmx(q, dq, pre, dstv, 8, stride);
+    else {
+      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstv, 8, stride);
+      ((int *)q)[0] = 0;
+    }
+
+    if (eobs[1] > 1)
+      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstv + 4, 8, stride);
+    else {
+      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstv + 4, 8, stride);
+      ((int *)(q + 16))[0] = 0;
+    }
+
+    q    += 32;
+    pre  += 32;
+    dstv += 4 * stride;
+    eobs += 2;
+  }
+}
--- /dev/null
+++ b/vp9/decoder/x86/idct_blk_sse2.c
@@ -1,0 +1,116 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/decoder/dequantize.h"
+
+void vp9_idct_dequant_dc_0_2x_sse2(short *q, short *dq,
+                                   unsigned char *pre, unsigned char *dst,
+                                   int dst_stride, short *dc);
+
+void vp9_idct_dequant_dc_full_2x_sse2(short *q, short *dq,
+                                      unsigned char *pre, unsigned char *dst,
+                                      int dst_stride, short *dc);
+
+void vp9_idct_dequant_0_2x_sse2(short *q, short *dq,
+                                unsigned char *pre, unsigned char *dst,
+                                int dst_stride, int blk_stride);
+
+void vp9_idct_dequant_full_2x_sse2(short *q, short *dq,
+                                   unsigned char *pre, unsigned char *dst,
+                                   int dst_stride, int blk_stride);
+
+void vp9_dequant_dc_idct_add_y_block_sse2(short *q, short *dq,
+                                          unsigned char *pre,
+                                          unsigned char *dst,
+                                          int stride, char *eobs, short *dc) {
+  int i;
+
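+  // Each eob entry is a char; reading two neighbours as a short and
+  // masking with 0xfefe tests whether either of the paired 4x4 blocks
+  // has an eob greater than 1 (i.e. any non-DC coefficient), selecting
+  // the full idct path for that pair.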
+  for (i = 0; i < 4; i++) {
+    if (((short *)(eobs))[0] & 0xfefe)
+      vp9_idct_dequant_dc_full_2x_sse2(q, dq, pre, dst, stride, dc);
+    else
+      vp9_idct_dequant_dc_0_2x_sse2(q, dq, pre, dst, stride, dc);
+
+    if (((short *)(eobs))[1] & 0xfefe)
+      vp9_idct_dequant_dc_full_2x_sse2(q + 32, dq, pre + 8, dst + 8,
+                                       stride, dc + 2);
+    else
+      vp9_idct_dequant_dc_0_2x_sse2(q + 32, dq, pre + 8, dst + 8,
+                                    stride, dc + 2);
+
+    q    += 64;
+    dc   += 4;
+    pre  += 64;
+    dst  += stride * 4;
+    eobs += 4;
+  }
+}
+
+void vp9_dequant_idct_add_y_block_sse2(short *q, short *dq,
+                                       unsigned char *pre, unsigned char *dst,
+                                       int stride, char *eobs) {
+  int i;
+
+  for (i = 0; i < 4; i++) {
+    if (((short *)(eobs))[0] & 0xfefe)
+      vp9_idct_dequant_full_2x_sse2(q, dq, pre, dst, stride, 16);
+    else
+      vp9_idct_dequant_0_2x_sse2(q, dq, pre, dst, stride, 16);
+
+    if (((short *)(eobs))[1] & 0xfefe)
+      vp9_idct_dequant_full_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16);
+    else
+      vp9_idct_dequant_0_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16);
+
+    q    += 64;
+    pre  += 64;
+    dst  += stride * 4;
+    eobs += 4;
+  }
+}
+
+void vp9_dequant_idct_add_uv_block_sse2(short *q, short *dq,
+                                        unsigned char *pre,
+                                        unsigned char *dstu,
+                                        unsigned char *dstv,
+                                        int stride, char *eobs) {
+  if (((short *)(eobs))[0] & 0xfefe)
+    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8);
+  else
+    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8);
+
+  q    += 32;
+  pre  += 32;
+  dstu += stride * 4;
+
+  if (((short *)(eobs))[1] & 0xfefe)
+    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8);
+  else
+    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8);
+
+  q    += 32;
+  pre  += 32;
+
+  if (((short *)(eobs))[2] & 0xfefe)
+    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8);
+  else
+    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8);
+
+  q    += 32;
+  pre  += 32;
+  dstv += stride * 4;
+
+  if (((short *)(eobs))[3] & 0xfefe)
+    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8);
+  else
+    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8);
+}
--- /dev/null
+++ b/vp9/decoder/x86/x86_dsystemdependent.c
@@ -1,0 +1,26 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/x86.h"
+#include "vp9/decoder/onyxd_int.h"
+
+#if HAVE_MMX
+void vp9_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
+
+void vp9_dequantize_b_mmx(BLOCKD *d) {
+  short *sq = (short *) d->qcoeff;
+  short *dq = (short *) d->dqcoeff;
+  short *q = (short *) d->dequant;
+  vp9_dequantize_b_impl_mmx(sq, dq, q);
+}
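+
+/* Scalar equivalent of the MMX routine, as a reference sketch: each of
+ * the block's 16 coefficients is scaled by its dequantization factor.
+ *
+ *   int i;
+ *   for (i = 0; i < 16; i++)
+ *     dq[i] = sq[i] * q[i];
+ */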
+#endif
+
--- /dev/null
+++ b/vp9/encoder/arm/arm_csystemdependent.c
@@ -1,0 +1,129 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/arm.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/encoder/onyx_int.h"
+
+extern void (*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+
+void vp9_arch_arm_encoder_init(VP9_COMP *cpi) {
+#if CONFIG_RUNTIME_CPU_DETECT
+  int flags = cpi->common.rtcd.flags;
+
+#if HAVE_ARMV5TE
+  if (flags & HAS_EDSP) {
+  }
+#endif
+
+#if HAVE_ARMV6
+  if (flags & HAS_MEDIA) {
+    cpi->rtcd.variance.sad16x16              = vp9_sad16x16_armv6;
+    /*cpi->rtcd.variance.sad16x8               = vp9_sad16x8_c;
+    cpi->rtcd.variance.sad8x16               = vp9_sad8x16_c;
+    cpi->rtcd.variance.sad8x8                = vp9_sad8x8_c;
+    cpi->rtcd.variance.sad4x4                = vp9_sad4x4_c;*/
+
+    /*cpi->rtcd.variance.var4x4                = vp9_variance4x4_c;*/
+    cpi->rtcd.variance.var8x8                = vp9_variance8x8_armv6;
+    /*cpi->rtcd.variance.var8x16               = vp9_variance8x16_c;
+    cpi->rtcd.variance.var16x8               = vp9_variance16x8_c;*/
+    cpi->rtcd.variance.var16x16              = vp9_variance16x16_armv6;
+
+    /*cpi->rtcd.variance.subpixvar4x4          = vp9_sub_pixel_variance4x4_c;*/
+    cpi->rtcd.variance.subpixvar8x8          = vp9_sub_pixel_variance8x8_armv6;
+    /*cpi->rtcd.variance.subpixvar8x16         = vp9_sub_pixel_variance8x16_c;
+    cpi->rtcd.variance.subpixvar16x8         = vp9_sub_pixel_variance16x8_c;*/
+    cpi->rtcd.variance.subpixvar16x16        = vp9_sub_pixel_variance16x16_armv6;
+    cpi->rtcd.variance.halfpixvar16x16_h     = vp9_variance_halfpixvar16x16_h_armv6;
+    cpi->rtcd.variance.halfpixvar16x16_v     = vp9_variance_halfpixvar16x16_v_armv6;
+    cpi->rtcd.variance.halfpixvar16x16_hv    = vp9_variance_halfpixvar16x16_hv_armv6;
+
+    cpi->rtcd.variance.mse16x16              = vp9_mse16x16_armv6;
+    /*cpi->rtcd.variance.getmbss               = vp9_get_mb_ss_c;*/
+
+    cpi->rtcd.fdct.short4x4                  = vp9_short_fdct4x4_armv6;
+    cpi->rtcd.fdct.short8x4                  = vp9_short_fdct8x4_armv6;
+    cpi->rtcd.fdct.fast4x4                   = vp9_short_fdct4x4_armv6;
+    cpi->rtcd.fdct.fast8x4                   = vp9_short_fdct8x4_armv6;
+    cpi->rtcd.fdct.walsh_short4x4            = vp9_short_walsh4x4_armv6;
+
+    /*cpi->rtcd.encodemb.berr                  = vp9_block_error_c;
+    cpi->rtcd.encodemb.mberr                 = vp9_mbblock_error_c;
+    cpi->rtcd.encodemb.mbuverr               = vp9_mbuverror_c;*/
+    cpi->rtcd.encodemb.subb                  = vp9_subtract_b_armv6;
+    cpi->rtcd.encodemb.submby                = vp9_subtract_mby_armv6;
+    cpi->rtcd.encodemb.submbuv               = vp9_subtract_mbuv_armv6;
+
+    /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;*/
+    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_armv6;
+  }
+#endif
+
+#if HAVE_ARMV7
+  if (flags & HAS_NEON) {
+    cpi->rtcd.variance.sad16x16              = vp9_sad16x16_neon;
+    cpi->rtcd.variance.sad16x8               = vp9_sad16x8_neon;
+    cpi->rtcd.variance.sad8x16               = vp9_sad8x16_neon;
+    cpi->rtcd.variance.sad8x8                = vp9_sad8x8_neon;
+    cpi->rtcd.variance.sad4x4                = vp9_sad4x4_neon;
+
+    /*cpi->rtcd.variance.var4x4                = vp9_variance4x4_c;*/
+    cpi->rtcd.variance.var8x8                = vp9_variance8x8_neon;
+    cpi->rtcd.variance.var8x16               = vp9_variance8x16_neon;
+    cpi->rtcd.variance.var16x8               = vp9_variance16x8_neon;
+    cpi->rtcd.variance.var16x16              = vp9_variance16x16_neon;
+
+    /*cpi->rtcd.variance.subpixvar4x4          = vp9_sub_pixel_variance4x4_c;*/
+    cpi->rtcd.variance.subpixvar8x8          = vp9_sub_pixel_variance8x8_neon;
+    /*cpi->rtcd.variance.subpixvar8x16         = vp9_sub_pixel_variance8x16_c;
+    cpi->rtcd.variance.subpixvar16x8         = vp9_sub_pixel_variance16x8_c;*/
+    cpi->rtcd.variance.subpixvar16x16        = vp9_sub_pixel_variance16x16_neon;
+    cpi->rtcd.variance.halfpixvar16x16_h     = vp9_variance_halfpixvar16x16_h_neon;
+    cpi->rtcd.variance.halfpixvar16x16_v     = vp9_variance_halfpixvar16x16_v_neon;
+    cpi->rtcd.variance.halfpixvar16x16_hv    = vp9_variance_halfpixvar16x16_hv_neon;
+
+    cpi->rtcd.variance.mse16x16              = vp9_mse16x16_neon;
+    /*cpi->rtcd.variance.getmbss               = vp9_get_mb_ss_c;*/
+
+    cpi->rtcd.fdct.short4x4                  = vp9_short_fdct4x4_neon;
+    cpi->rtcd.fdct.short8x4                  = vp9_short_fdct8x4_neon;
+    cpi->rtcd.fdct.fast4x4                   = vp9_short_fdct4x4_neon;
+    cpi->rtcd.fdct.fast8x4                   = vp9_short_fdct8x4_neon;
+    cpi->rtcd.fdct.walsh_short4x4            = vp9_short_walsh4x4_neon;
+
+    /*cpi->rtcd.encodemb.berr                  = vp9_block_error_c;
+    cpi->rtcd.encodemb.mberr                 = vp9_mbblock_error_c;
+    cpi->rtcd.encodemb.mbuverr               = vp9_mbuverror_c;*/
+    cpi->rtcd.encodemb.subb                  = vp9_subtract_b_neon;
+    cpi->rtcd.encodemb.submby                = vp9_subtract_mby_neon;
+    cpi->rtcd.encodemb.submbuv               = vp9_subtract_mbuv_neon;
+
+    /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
+    cpi->rtcd.quantize.quantb_pair           = vp8_regular_quantize_b_pair;*/
+    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_neon;
+    cpi->rtcd.quantize.fastquantb_pair       = vp8_fast_quantize_b_pair_neon;
+  }
+#endif
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (flags & HAS_NEON)
+#endif
+  {
+    vp9_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon;
+  }
+#endif
+#endif
+}
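vp9_arch_arm_encoder_init follows the runtime-CPU-detection (RTCD) pattern: the rtcd tables start out pointing at portable C versions, and each feature block above swaps in the fastest specialization the detected CPU supports. A stripped-down sketch of the pattern, with hypothetical names and illustrative flag values:

    /* Hypothetical sketch of RTCD dispatch: default to C, then
     * upgrade per detected CPU feature. */
    typedef unsigned (*sad_fn)(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride);

    #define HAS_MEDIA_SKETCH (1 << 0)
    #define HAS_NEON_SKETCH  (1 << 1)

    static sad_fn pick_sad16x16(int flags, sad_fn c_ver,
                                sad_fn media_ver, sad_fn neon_ver) {
      sad_fn fn = c_ver;                            /* always-safe default */
      if (flags & HAS_MEDIA_SKETCH) fn = media_ver;
      if (flags & HAS_NEON_SKETCH)  fn = neon_ver;  /* NEON wins if present */
      return fn;
    }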
--- /dev/null
+++ b/vp9/encoder/arm/armv5te/boolhuff_armv5te.asm
@@ -1,0 +1,286 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_start_encode|
+    EXPORT |vp9_encode_bool|
+    EXPORT |vp8_stop_encode|
+    EXPORT |vp8_encode_value|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
+; r0 BOOL_CODER *br
+; r1 unsigned char *source
+
+|vp8_start_encode| PROC
+    mov     r12, #0
+    mov     r3,  #255
+    mvn     r2,  #23
+    str     r12, [r0, #vp9_writer_lowvalue]
+    str     r3,  [r0, #vp9_writer_range]
+    str     r12, [r0, #vp9_writer_value]
+    str     r2,  [r0, #vp9_writer_count]
+    str     r12, [r0, #vp9_writer_pos]
+    str     r1,  [r0, #vp9_writer_buffer]
+    bx      lr
+    ENDP
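In C terms, vp8_start_encode only seeds the writer state; note that mvn r2, #23 materializes -24, the coder's initial bit count, so the first byte is flushed only after 24 bits have accumulated. A sketch assuming the fields named by the vp9_writer_* offsets:

    /* Minimal stand-in for the writer state the assembly manipulates;
     * field names follow the vp9_writer_* offsets above. */
    typedef struct {
      unsigned int lowvalue;
      unsigned int range;
      unsigned int value;
      int count;
      int pos;
      unsigned char *buffer;
    } BOOL_CODER_SKETCH;

    void start_encode_sketch(BOOL_CODER_SKETCH *br, unsigned char *source) {
      br->lowvalue = 0;
      br->range    = 255;   /* full coder range                 */
      br->value    = 0;
      br->count    = -24;   /* bits owed before the first flush */
      br->pos      = 0;
      br->buffer   = source;
    }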
+
+; r0 BOOL_CODER *br
+; r1 int bit
+; r2 int probability
+|vp9_encode_bool| PROC
+    push    {r4-r9, lr}
+
+    mov     r4, r2
+
+    ldr     r2, [r0, #vp9_writer_lowvalue]
+    ldr     r5, [r0, #vp9_writer_range]
+    ldr     r3, [r0, #vp9_writer_count]
+
+    sub     r7, r5, #1                  ; range-1
+
+    cmp     r1, #0
+    mul     r6, r4, r7                  ; ((range-1) * probability)
+
+    mov     r7, #1
+    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * probability) >> 8)
+
+    addne   r2, r2, r4                  ; if  (bit) lowvalue += split
+    subne   r4, r5, r4                  ; if  (bit) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start
+token_zero_while_loop
+    mov     r9, #0
+    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r1, [r7, r4]
+    cmpge   r1, #0xff
+    beq     token_zero_while_loop
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r9, [r7, r4]                ; w->buffer[x]
+    add     r9, r9, #1
+    strb    r9, [r7, r4]                ; w->buffer[x] + 1
+token_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r9, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r1, r4, #1                  ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r1, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
+
+token_count_lt_zero
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    str     r2, [r0, #vp9_writer_lowvalue]
+    str     r5, [r0, #vp9_writer_range]
+    str     r3, [r0, #vp9_writer_count]
+    pop     {r4-r9, pc}
+    ENDP
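vp9_encode_bool is a textbook binary arithmetic-coder step: the current range is split in proportion to the probability, the bit selects the upper or lower part, and the range is renormalized back above 128 by the shift that clz computes. A C rendering of the same flow, using the BOOL_CODER_SKETCH defined above:

    void encode_bool_sketch(BOOL_CODER_SKETCH *br, int bit, int probability) {
      unsigned int lowvalue = br->lowvalue;
      unsigned int range = br->range;
      int count = br->count;
      unsigned int split = 1 + (((range - 1) * probability) >> 8);
      int shift = 0;

      if (bit) {
        lowvalue += split;
        range -= split;
      } else {
        range = split;
      }

      /* Renormalize; the assembly derives this shift with clz. */
      while (range < 128) {
        range <<= 1;
        shift++;
      }
      count += shift;

      if (count >= 0) {                    /* a full output byte is ready */
        int offset = shift - count;
        if ((lowvalue << (offset - 1)) & 0x80000000) {
          int x = br->pos - 1;             /* carry out: fix up 0xff run */
          while (x >= 0 && br->buffer[x] == 0xff)
            br->buffer[x--] = 0;
          br->buffer[x] += 1;
        }
        br->buffer[br->pos++] = (unsigned char)(lowvalue >> (24 - offset));
        lowvalue = (lowvalue << offset) & 0xffffff;
        shift = count;
        count -= 8;
      }
      lowvalue <<= shift;

      br->lowvalue = lowvalue;
      br->range = range;
      br->count = count;
    }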
+
+; r0 BOOL_CODER *br
+|vp8_stop_encode| PROC
+    push    {r4-r10, lr}
+
+    ldr     r2, [r0, #vp9_writer_lowvalue]
+    ldr     r5, [r0, #vp9_writer_range]
+    ldr     r3, [r0, #vp9_writer_count]
+
+    mov     r10, #32
+
+stop_encode_loop
+    sub     r7, r5, #1                  ; range-1
+
+    mov     r4, r7, lsl #7              ; ((range-1) * 128)
+
+    mov     r7, #1
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero_se      ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set_se
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start_se
+token_zero_while_loop_se
+    mov     r9, #0
+    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start_se
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r1, [r7, r4]
+    cmpge   r1, #0xff
+    beq     token_zero_while_loop_se
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r9, [r7, r4]                ; w->buffer[x]
+    add     r9, r9, #1
+    strb    r9, [r7, r4]                ; w->buffer[x] + 1
+token_high_bit_not_set_se
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r9, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r1, r4, #1                  ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r1, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
+
+token_count_lt_zero_se
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r10, r10, #1
+    bne     stop_encode_loop
+
+    str     r2, [r0, #vp9_writer_lowvalue]
+    str     r5, [r0, #vp9_writer_range]
+    str     r3, [r0, #vp9_writer_count]
+    pop     {r4-r10, pc}
+
+    ENDP
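vp8_stop_encode drains the coder: it runs the same split/renormalize sequence 32 times with an even split (probability 128) and bit 0, which forces every bit still pending in lowvalue out through the byte writer. Reusing the encode_bool sketch above:

    void stop_encode_sketch(BOOL_CODER_SKETCH *br) {
      int i;
      for (i = 0; i < 32; i++)
        encode_bool_sketch(br, 0, 128);  /* flush with even splits */
    }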
+
+; r0 BOOL_CODER *br
+; r1 int data
+; r2 int bits
+|vp8_encode_value| PROC
+    push    {r4-r11, lr}
+
+    mov     r10, r2
+
+    ldr     r2, [r0, #vp9_writer_lowvalue]
+    ldr     r5, [r0, #vp9_writer_range]
+    ldr     r3, [r0, #vp9_writer_count]
+
+    rsb     r4, r10, #32                 ; 32-n
+
+    ; v is kept in r1 during the token pack loop
+    lsl     r1, r1, r4                  ; r1 = v << 32 - n
+
+encode_value_loop
+    sub     r7, r5, #1                  ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsls    r1, r1, #1                  ; bit = v >> n
+    mov     r4, r7, lsl #7              ; ((range-1) * 128)
+
+    mov     r7, #1
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bit) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bit) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero_ev      ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set_ev
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start_ev
+token_zero_while_loop_ev
+    mov     r9, #0
+    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start_ev
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop_ev
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r9, [r7, r4]                ; w->buffer[x]
+    add     r9, r9, #1
+    strb    r9, [r7, r4]                ; w->buffer[x] + 1
+token_high_bit_not_set_ev
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r9, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
+
+token_count_lt_zero_ev
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r10, r10, #1
+    bne     encode_value_loop
+
+    str     r2, [r0, #vp9_writer_lowvalue]
+    str     r5, [r0, #vp9_writer_range]
+    str     r3, [r0, #vp9_writer_count]
+    pop     {r4-r11, pc}
+    ENDP
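vp8_encode_value writes an n-bit literal most-significant-bit first, each bit at the flat probability 128; the assembly pre-shifts the value to the top of r1 so each lsls drops the next bit into the carry flag. The equivalent in terms of the encode_bool sketch:

    void encode_value_sketch(BOOL_CODER_SKETCH *br, int data, int bits) {
      int bit;
      for (bit = bits - 1; bit >= 0; bit--)      /* msb first */
        encode_bool_sketch(br, (data >> bit) & 1, 128);
    }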
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/armv5te/vp8_packtokens_armv5.asm
@@ -1,0 +1,291 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8cx_pack_tokens_armv5|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
+; r0 vp9_writer *w
+; r1 const TOKENEXTRA *p
+; r2 int xcount
+; r3 vp8_coef_encodings
+; s0 vp8_extra_bits
+; s1 vp8_coef_tree
+|vp8cx_pack_tokens_armv5| PROC
+    push    {r4-r11, lr}
+
+    ; Add xcount * sizeof (TOKENEXTRA) to p to get stop
+    ;  sizeof (TOKENEXTRA) is 8
+    sub     sp, sp, #12
+    add     r2, r1, r2, lsl #3          ; stop = p + xcount*sizeof(TOKENEXTRA)
+    str     r2, [sp, #0]
+    str     r3, [sp, #8]                ; save vp8_coef_encodings
+    ldr     r2, [r0, #vp9_writer_lowvalue]
+    ldr     r5, [r0, #vp9_writer_range]
+    ldr     r3, [r0, #vp9_writer_count]
+    b       check_p_lt_stop
+
+while_p_lt_stop
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r4, [sp, #8]                ; vp8_coef_encodings
+    mov     lr, #0
+    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
+    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
+
+    ldrb    r7, [r1, #tokenextra_skip_eob_node]
+
+    ldr     r6, [r4, #vp9_token_value]  ; v
+    ldr     r8, [r4, #vp9_token_len]    ; n
+
+    ; vp8 specific skip_eob_node
+    cmp     r7, #0
+    movne   lr, #2                      ; i = 2
+    subne   r8, r8, #1                  ; --n
+
+    rsb     r4, r8, #32                 ; 32-n
+    ldr     r10, [sp, #52]              ; vp8_coef_tree
+
+    ; v is kept in r12 during the token pack loop
+    lsl     r12, r6, r4                ; r12 = v << 32 - n
+
+; loop start
+token_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
+    sub     r7, r5, #1                  ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsls    r12, r12, #1                ; bb = v >> n
+    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
+
+    ; bb can only be 0 or 1.  So only execute this statement
+    ; if bb == 1, otherwise it will act like i + 0
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
+    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start
+token_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+    ; r10 holds vp8_coef_tree earlier in the loop, but is used as a
+    ; temp variable here.  So once it is free again, reload
+    ; vp8_coef_tree into r10
+    ldr     r10, [sp, #52]              ; vp8_coef_tree
+
+token_count_lt_zero
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r8, r8, #1                  ; --n
+    bne     token_loop
+
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r7, [sp, #48]               ; vp8_extra_bits
+    ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
+    ;  element.  sizeof (vp9_extra_bit_struct) is 16
+    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
+
+    ldr     r4, [r12, #vp9_extra_bit_struct_base_val]
+    cmp     r4, #0
+    beq     skip_extra_bits
+
+;   if( b->base_val)
+    ldr     r8, [r12, #vp9_extra_bit_struct_len] ; L
+    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
+    cmp     r8, #0                      ; if( L)
+    beq     no_extra_bits
+
+    ldr     r9, [r12, #vp9_extra_bit_struct_prob]
+    asr     r7, lr, #1                  ; v=e>>1
+
+    ldr     r10, [r12, #vp9_extra_bit_struct_tree]
+    str     r10, [sp, #4]               ; b->tree
+
+    rsb     r4, r8, #32
+    lsl     r12, r7, r4
+
+    mov     lr, #0                      ; i = 0
+
+extra_bits_loop
+    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
+    sub     r7, r5, #1                  ; range-1
+    lsls    r12, r12, #1                ; v >> n
+    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
+    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    clz     r6, r4
+    sub     r6, r6, #24
+
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     extra_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset= shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     extra_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos - 1
+    b       extra_zero_while_start
+extra_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+extra_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     extra_zero_while_loop
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r10, [r7, r4]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]
+extra_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+    ldr     r10, [sp, #4]               ; b->tree
+extra_count_lt_zero
+    lsl     r2, r2, r6
+
+    subs    r8, r8, #1                  ; --n
+    bne     extra_bits_loop             ; while (n)
+
+no_extra_bits
+    ldr     lr, [r1, #4]                ; e = p->Extra
+    add     r4, r5, #1                  ; range + 1
+    tst     lr, #1
+    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
+    addne   r2, r2, r4                  ; lowvalue += split
+    subne   r4, r5, r4                  ; range = range-split
+    tst     r2, #0x80000000             ; lowvalue & 0x80000000
+    lsl     r5, r4, #1                  ; range <<= 1
+    beq     end_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]
+    mov     r7, #0
+    sub     r4, r4, #1
+    b       end_zero_while_start
+end_zero_while_loop
+    strb    r7, [r6, r4]
+    sub     r4, r4, #1                  ; x--
+end_zero_while_start
+    cmp     r4, #0
+    ldrge   r6, [r0, #vp9_writer_buffer]
+    ldrb    r12, [r6, r4]
+    cmpge   r12, #0xff
+    beq     end_zero_while_loop
+
+    ldr     r6, [r0, #vp9_writer_buffer]
+    ldrb    r7, [r6, r4]
+    add     r7, r7, #1
+    strb    r7, [r6, r4]
+end_high_bit_not_set
+    adds    r3, r3, #1                  ; ++count
+    lsl     r2, r2, #1                  ; lowvalue  <<= 1
+    bne     end_count_zero
+
+    ldr     r4, [r0, #vp9_writer_pos]
+    mvn     r3, #7
+    ldr     r7, [r0, #vp9_writer_buffer]
+    lsr     r6, r2, #24                 ; lowvalue >> 24
+    add     r12, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r12, [r0, #0x10]
+    strb    r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
+check_p_lt_stop
+    ldr     r4, [sp, #0]                ; stop
+    cmp     r1, r4                      ; while( p < stop)
+    bcc     while_p_lt_stop
+
+    str     r2, [r0, #vp9_writer_lowvalue]
+    str     r5, [r0, #vp9_writer_range]
+    str     r3, [r0, #vp9_writer_count]
+    add     sp, sp, #12
+    pop     {r4-r11, pc}
+    ENDP
+
+    END
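The token_zero_while loops that recur throughout this file implement the coder's carry propagation: when renormalization carries out of lowvalue's top bit, the run of 0xff bytes already written must wrap to 0x00, and the byte just before the run is incremented. As a standalone C sketch:

    /* Carry out of the arithmetic coder: 0xff bytes become 0x00 and
     * the first non-0xff byte below them absorbs the +1.  Assumes
     * such a byte exists, which the coder's invariants guarantee. */
    static void propagate_carry_sketch(unsigned char *buffer, int pos) {
      int x = pos - 1;
      while (x >= 0 && buffer[x] == 0xff)
        buffer[x--] = 0;
      buffer[x] += 1;
    }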
--- /dev/null
+++ b/vp9/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
@@ -1,0 +1,327 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8cx_pack_mb_row_tokens_armv5|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
+; r0 VP8_COMP *cpi
+; r1 vp9_writer *w
+; r2 vp8_coef_encodings
+; r3 vp8_extra_bits
+; s0 vp8_coef_tree
+
+|vp8cx_pack_mb_row_tokens_armv5| PROC
+    push    {r4-r11, lr}
+    sub     sp, sp, #24
+
+    ; Compute address of cpi->common.mb_rows
+    ldr     r4, _VP8_COMP_common_
+    ldr     r6, _VP8_COMMON_MBrows_
+    add     r4, r0, r4
+
+    ldr     r5, [r4, r6]                ; load up mb_rows
+
+    str     r2, [sp, #20]               ; save vp8_coef_encodings
+    str     r5, [sp, #12]               ; save mb_rows
+    str     r3, [sp, #8]                ; save vp8_extra_bits
+
+    ldr     r4, _VP8_COMP_tplist_
+    add     r4, r0, r4
+    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
+
+    mov     r0, r1                      ; keep same as other loops
+
+    ldr     r2, [r0, #vp9_writer_lowvalue]
+    ldr     r5, [r0, #vp9_writer_range]
+    ldr     r3, [r0, #vp9_writer_count]
+
+mb_row_loop
+
+    ldr     r1, [r7, #tokenlist_start]
+    ldr     r9, [r7, #tokenlist_stop]
+    str     r9, [sp, #0]                ; save stop for later comparison
+    str     r7, [sp, #16]               ; tokenlist address for next time
+
+    b       check_p_lt_stop
+
+    ; actual work gets done here!
+
+while_p_lt_stop
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r4, [sp, #20]               ; vp8_coef_encodings
+    mov     lr, #0
+    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
+    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
+
+    ldrb    r7, [r1, #tokenextra_skip_eob_node]
+
+    ldr     r6, [r4, #vp9_token_value]  ; v
+    ldr     r8, [r4, #vp9_token_len]    ; n
+
+    ; vp8 specific skip_eob_node
+    cmp     r7, #0
+    movne   lr, #2                      ; i = 2
+    subne   r8, r8, #1                  ; --n
+
+    rsb     r4, r8, #32                 ; 32-n
+    ldr     r10, [sp, #60]              ; vp8_coef_tree
+
+    ; v is kept in r12 during the token pack loop
+    lsl     r12, r6, r4                 ; r12 = v << 32 - n
+
+; loop start
+token_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
+    sub     r7, r5, #1                  ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsls    r12, r12, #1                ; bb = v >> n
+    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
+
+    ; bb can only be 0 or 1.  So only execute this statement
+    ; if bb == 1, otherwise it will act like i + 0
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
+    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start
+token_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+    ; r10 holds vp8_coef_tree earlier in the loop, but is used as a
+    ; temp variable here.  So once it is free again, reload
+    ; vp8_coef_tree into r10
+    ldr     r10, [sp, #60]              ; vp8_coef_tree
+
+token_count_lt_zero
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r8, r8, #1                  ; --n
+    bne     token_loop
+
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r7, [sp, #8]                ; vp8_extra_bits
+    ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
+    ;  element.  sizeof (vp9_extra_bit_struct) is 16
+    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
+
+    ldr     r4, [r12, #vp9_extra_bit_struct_base_val]
+    cmp     r4, #0
+    beq     skip_extra_bits
+
+;   if( b->base_val)
+    ldr     r8, [r12, #vp9_extra_bit_struct_len] ; L
+    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
+    cmp     r8, #0                      ; if( L)
+    beq     no_extra_bits
+
+    ldr     r9, [r12, #vp9_extra_bit_struct_prob]
+    asr     r7, lr, #1                  ; v=e>>1
+
+    ldr     r10, [r12, #vp9_extra_bit_struct_tree]
+    str     r10, [sp, #4]               ; b->tree
+
+    rsb     r4, r8, #32
+    lsl     r12, r7, r4
+
+    mov     lr, #0                      ; i = 0
+
+extra_bits_loop
+    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
+    sub     r7, r5, #1                  ; range-1
+    lsls    r12, r12, #1                ; v >> n
+    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
+    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    clz     r6, r4
+    sub     r6, r6, #24
+
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     extra_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset= shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     extra_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos - 1
+    b       extra_zero_while_start
+extra_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+extra_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     extra_zero_while_loop
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r10, [r7, r4]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]
+extra_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+    ldr     r10, [sp, #4]               ; b->tree
+extra_count_lt_zero
+    lsl     r2, r2, r6
+
+    subs    r8, r8, #1                  ; --n
+    bne     extra_bits_loop             ; while (n)
+
+no_extra_bits
+    ldr     lr, [r1, #4]                ; e = p->Extra
+    add     r4, r5, #1                  ; range + 1
+    tst     lr, #1
+    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
+    addne   r2, r2, r4                  ; lowvalue += split
+    subne   r4, r5, r4                  ; range = range-split
+    tst     r2, #0x80000000             ; lowvalue & 0x80000000
+    lsl     r5, r4, #1                  ; range <<= 1
+    beq     end_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]
+    mov     r7, #0
+    sub     r4, r4, #1
+    b       end_zero_while_start
+end_zero_while_loop
+    strb    r7, [r6, r4]
+    sub     r4, r4, #1                  ; x--
+end_zero_while_start
+    cmp     r4, #0
+    ldrge   r6, [r0, #vp9_writer_buffer]
+    ldrb    r12, [r6, r4]
+    cmpge   r12, #0xff
+    beq     end_zero_while_loop
+
+    ldr     r6, [r0, #vp9_writer_buffer]
+    ldrb    r7, [r6, r4]
+    add     r7, r7, #1
+    strb    r7, [r6, r4]
+end_high_bit_not_set
+    adds    r3, r3, #1                  ; ++count
+    lsl     r2, r2, #1                  ; lowvalue  <<= 1
+    bne     end_count_zero
+
+    ldr     r4, [r0, #vp9_writer_pos]
+    mvn     r3, #7
+    ldr     r7, [r0, #vp9_writer_buffer]
+    lsr     r6, r2, #24                 ; lowvalue >> 24
+    add     r12, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r12, [r0, #0x10]
+    strb    r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
+check_p_lt_stop
+    ldr     r4, [sp, #0]                ; stop
+    cmp     r1, r4                      ; while( p < stop)
+    bcc     while_p_lt_stop
+
+    ldr     r6, [sp, #12]               ; mb_rows
+    ldr     r7, [sp, #16]               ; tokenlist address
+    subs    r6, r6, #1
+    add     r7, r7, #TOKENLIST_SZ       ; next element in the array
+    str     r6, [sp, #12]
+    bne     mb_row_loop
+
+    str     r2, [r0, #vp9_writer_lowvalue]
+    str     r5, [r0, #vp9_writer_range]
+    str     r3, [r0, #vp9_writer_count]
+    add     sp, sp, #24
+    pop     {r4-r11, pc}
+    ENDP
+
+_VP8_COMP_common_
+    DCD     vp8_comp_common
+_VP8_COMMON_MBrows_
+    DCD     vp8_common_mb_rows
+_VP8_COMP_tplist_
+    DCD     vp8_comp_tplist
+
+    END
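Structurally, the mb-row packer is the single-list packer wrapped in a loop over cpi->tp_list, one token list per macroblock row, each packed over its [start, stop) span. A sketch with hypothetical types and helper:

    typedef struct TOKENEXTRA TOKENEXTRA;
    typedef struct { TOKENEXTRA *start, *stop; } TOKENLIST;
    typedef struct vp9_writer vp9_writer;

    /* pack_one_token stands in for the token_loop body above. */
    void pack_one_token(vp9_writer *w, const TOKENEXTRA *p);

    void pack_mb_rows_sketch(vp9_writer *w, const TOKENLIST *tp_list,
                             int mb_rows) {
      int row;
      for (row = 0; row < mb_rows; row++) {
        const TOKENEXTRA *p = tp_list[row].start;
        while (p < tp_list[row].stop)            /* while (p < stop) */
          pack_one_token(w, p++);
      }
    }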
--- /dev/null
+++ b/vp9/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
@@ -1,0 +1,465 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
+; r0 VP8_COMP *cpi
+; r1 unsigned char *cx_data
+; r2 int num_part
+; r3 *size
+; s0 vp8_coef_encodings
+; s1 vp8_extra_bits,
+; s2 const vp9_tree_index *,
+
+|vp8cx_pack_tokens_into_partitions_armv5| PROC
+    push    {r4-r11, lr}
+    sub     sp, sp, #44
+
+    ; Compute address of cpi->common.mb_rows
+    ldr     r4, _VP8_COMP_common_
+    ldr     r6, _VP8_COMMON_MBrows_
+    add     r4, r0, r4
+
+    ldr     r5, [r4, r6]                ; load up mb_rows
+
+    str     r5, [sp, #36]               ; save mb_rows
+    str     r1, [sp, #24]               ; save cx_data
+    str     r2, [sp, #20]               ; save num_part
+    str     r3, [sp, #8]                ; save *size
+
+    ; *size = 3*(num_part -1 );
+    sub     r2, r2, #1                  ; num_part - 1
+    add     r2, r2, r2, lsl #1          ; 3*(num_part - 1)
+    str     r2, [r3]
+
+    add     r2, r2, r1                  ; cx_data + *size
+    str     r2, [sp, #40]               ; ptr
+
+    ldr     r4, _VP8_COMP_tplist_
+    add     r4, r0, r4
+    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
+    str     r7, [sp, #32]               ; store start of cpi->tp_list
+
+    ldr     r11, _VP8_COMP_bc2_         ; load up vp9_writer out of cpi
+    add     r0, r0, r11
+
+    mov     r11, #0
+    str     r11, [sp, #28]              ; i
+
+numparts_loop
+    ldr     r10, [sp, #40]              ; ptr
+    ldr     r5,  [sp, #36]              ; move mb_rows to the counting section
+    sub     r5, r5, r11                 ; move start point with each partition
+                                        ; mb_rows starts at i
+    str     r5,  [sp, #12]
+
+    ; Reset all of the VP8 Writer data for each partition that
+    ; is processed.
+    ; start_encode
+    mov     r2, #0                      ; vp9_writer_lowvalue
+    mov     r5, #255                    ; vp9_writer_range
+    mvn     r3, #23                     ; vp9_writer_count
+
+    str     r2,  [r0, #vp9_writer_value]
+    str     r2,  [r0, #vp9_writer_pos]
+    str     r10, [r0, #vp9_writer_buffer]
+
+mb_row_loop
+
+    ldr     r1, [r7, #tokenlist_start]
+    ldr     r9, [r7, #tokenlist_stop]
+    str     r9, [sp, #0]                ; save stop for later comparison
+    str     r7, [sp, #16]               ; tokenlist address for next time
+
+    b       check_p_lt_stop
+
+    ; actual work gets done here!
+
+while_p_lt_stop
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r4, [sp, #80]               ; vp8_coef_encodings
+    mov     lr, #0
+    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
+    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
+
+    ldrb    r7, [r1, #tokenextra_skip_eob_node]
+
+    ldr     r6, [r4, #vp9_token_value]  ; v
+    ldr     r8, [r4, #vp9_token_len]    ; n
+
+    ; vp8 specific skip_eob_node
+    cmp     r7, #0
+    movne   lr, #2                      ; i = 2
+    subne   r8, r8, #1                  ; --n
+
+    rsb     r4, r8, #32                 ; 32-n
+    ldr     r10, [sp, #88]              ; vp8_coef_tree
+
+    ; v is kept in r12 during the token pack loop
+    lsl     r12, r6, r4                ; r12 = v << 32 - n
+
+; loop start
+token_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
+    sub     r7, r5, #1                  ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsls    r12, r12, #1                ; bb = v >> n
+    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
+
+    ; bb can only be 0 or 1.  So only execute this statement
+    ; if bb == 1, otherwise it will act like i + 0
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
+    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start
+token_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+    ; r10 holds vp8_coef_tree earlier in the loop, but is used as a
+    ; temp variable here.  So once it is free again, reload
+    ; vp8_coef_tree into r10
+    ldr     r10, [sp, #88]              ; vp8_coef_tree
+
+token_count_lt_zero
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r8, r8, #1                  ; --n
+    bne     token_loop
+
+    ldrb    r6, [r1, #tokenextra_token] ; t
+    ldr     r7, [sp, #84]                ; vp8_extra_bits
+    ; Add t * sizeof (vp9_extra_bit_struct) to get the desired
+    ;  element.  sizeof (vp9_extra_bit_struct) is 16
+    add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
+
+    ldr     r4, [r12, #vp9_extra_bit_struct_base_val]
+    cmp     r4, #0
+    beq     skip_extra_bits
+
+;   if( b->base_val)
+    ldr     r8, [r12, #vp9_extra_bit_struct_len] ; L
+    ldrsh   lr, [r1, #tokenextra_extra] ; e = p->Extra
+    cmp     r8, #0                      ; if( L)
+    beq     no_extra_bits
+
+    ldr     r9, [r12, #vp9_extra_bit_struct_prob]
+    asr     r7, lr, #1                  ; v=e>>1
+
+    ldr     r10, [r12, #vp9_extra_bit_struct_tree]
+    str     r10, [sp, #4]               ; b->tree
+
+    rsb     r4, r8, #32
+    lsl     r12, r7, r4
+
+    mov     lr, #0                      ; i = 0
+
+extra_bits_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp[i>>1]
+    sub     r7, r5, #1                  ; range-1
+    lsls    r12, r12, #1                ; v >> n
+    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
+    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    clz     r6, r4
+    sub     r6, r6, #24
+
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     extra_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset= shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     extra_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos - 1
+    b       extra_zero_while_start
+extra_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+extra_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     extra_zero_while_loop
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r10, [r7, r4]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]
+extra_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+    ldr     r10, [sp, #4]               ; b->tree
+extra_count_lt_zero
+    lsl     r2, r2, r6
+
+    subs    r8, r8, #1                  ; --n
+    bne     extra_bits_loop             ; while (n)
+
+no_extra_bits
+    ldr     lr, [r1, #4]                ; e = p->Extra
+    add     r4, r5, #1                  ; range + 1
+    tst     lr, #1
+    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
+    addne   r2, r2, r4                  ; lowvalue += split
+    subne   r4, r5, r4                  ; range = range-split
+    tst     r2, #0x80000000             ; lowvalue & 0x80000000
+    lsl     r5, r4, #1                  ; range <<= 1
+    beq     end_high_bit_not_set
+
+    ldr     r4, [r0, #vp9_writer_pos]
+    mov     r7, #0
+    sub     r4, r4, #1
+    b       end_zero_while_start
+end_zero_while_loop
+    strb    r7, [r6, r4]
+    sub     r4, r4, #1                  ; x--
+end_zero_while_start
+    cmp     r4, #0
+    ldrge   r6, [r0, #vp9_writer_buffer]
+    ldrb    r12, [r6, r4]
+    cmpge   r12, #0xff
+    beq     end_zero_while_loop
+
+    ldr     r6, [r0, #vp9_writer_buffer]
+    ldrb    r7, [r6, r4]
+    add     r7, r7, #1
+    strb    r7, [r6, r4]
+end_high_bit_not_set
+    adds    r3, r3, #1                  ; ++count
+    lsl     r2, r2, #1                  ; lowvalue  <<= 1
+    bne     end_count_zero
+
+    ldr     r4, [r0, #vp9_writer_pos]
+    mvn     r3, #7
+    ldr     r7, [r0, #vp9_writer_buffer]
+    lsr     r6, r2, #24                 ; lowvalue >> 24
+    add     r12, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r12, [r0, #0x10]
+    strb    r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
+check_p_lt_stop
+    ldr     r4, [sp, #0]                ; stop
+    cmp     r1, r4                      ; while( p < stop)
+    bcc     while_p_lt_stop
+
+    ldr     r10, [sp, #20]              ; num_part
+    mov     r1, #TOKENLIST_SZ
+    mul     r1, r10, r1
+
+    ldr     r6, [sp, #12]               ; mb_rows
+    ldr     r7, [sp, #16]               ; tokenlist address
+    subs    r6, r6, r10
+    add     r7, r7, r1                  ; next element in the array
+    str     r6, [sp, #12]
+    bgt     mb_row_loop
+
+    mov     r12, #32
+
+stop_encode_loop
+    sub     r7, r5, #1                  ; range-1
+
+    mov     r4, r7, lsl #7              ; ((range-1) * 128)
+
+    mov     r7, #1
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero_se      ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set_se
+
+    ldr     r4, [r0, #vp9_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start_se
+token_zero_while_loop_se
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start_se
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp9_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop_se
+
+    ldr     r7, [r0, #vp9_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set_se
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp9_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp9_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp9_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+token_count_lt_zero_se
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r12, r12, #1
+    bne     stop_encode_loop
+
+    ldr     r10, [sp, #8]               ; *size
+    ldr     r11, [r10]
+    ldr     r4,  [r0, #vp9_writer_pos]  ; w->pos
+    add     r11, r11, r4                ; *size += w->pos
+    str     r11, [r10]
+
+    ldr     r9, [sp, #20]               ; num_part
+    sub     r9, r9, #1
+    ldr     r10, [sp, #28]              ; i
+    cmp     r10, r9                     ; if(i<(num_part - 1))
+    bge     skip_write_partition
+
+    ldr     r12, [sp, #40]              ; ptr
+    add     r12, r12, r4                ; ptr += w->pos
+    str     r12, [sp, #40]
+
+    ldr     r9, [sp, #24]               ; cx_data
+    mov     r8, r4, asr #8
+    strb    r4, [r9, #0]
+    strb    r8, [r9, #1]
+    mov     r4, r4, asr #16
+    strb    r4, [r9, #2]
+
+    add     r9, r9, #3                  ; cx_data += 3
+    str     r9, [sp, #24]
+
+skip_write_partition
+
+    ldr     r11, [sp, #28]              ; i
+    ldr     r10, [sp, #20]              ; num_part
+
+    add     r11, r11, #1                ; i++
+    str     r11, [sp, #28]
+
+    ldr     r7, [sp, #32]               ; cpi->tp_list[i]
+    mov     r1, #TOKENLIST_SZ
+    add     r7, r7, r1                  ; next element in cpi->tp_list
+    str     r7, [sp, #32]               ; cpi->tp_list[i+1]
+
+    cmp     r10, r11
+    bgt     numparts_loop
+
+
+    add     sp, sp, #44
+    pop     {r4-r11, pc}
+    ENDP
+
+_VP8_COMP_common_
+    DCD     vp8_comp_common
+_VP8_COMMON_MBrows_
+    DCD     vp8_common_mb_rows
+_VP8_COMP_tplist_
+    DCD     vp8_comp_tplist
+_VP8_COMP_bc2_
+    DCD     vp8_comp_bc2
+
+    END
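The partition writer reserves 3*(num_part-1) bytes at the head of cx_data, and after finishing each non-final partition stores that partition's byte count there as a 3-byte little-endian value (the strb/asr sequence near skip_write_partition). A sketch of that size write:

    /* Store one partition's length as 3 little-endian bytes, matching
     * the strb/asr sequence above. */
    static void write_partition_size_sketch(unsigned char *cx_data,
                                            unsigned int size) {
      cx_data[0] = (unsigned char)(size & 0xff);
      cx_data[1] = (unsigned char)((size >> 8) & 0xff);
      cx_data[2] = (unsigned char)((size >> 16) & 0xff);
    }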
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
@@ -1,0 +1,224 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_fast_quantize_b_armv6|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    BLOCK *b
+; r1    BLOCKD *d
+|vp8_fast_quantize_b_armv6| PROC
+    stmfd   sp!, {r1, r4-r11, lr}
+
+    ldr     r3, [r0, #vp8_block_coeff]      ; coeff
+    ldr     r4, [r0, #vp8_block_quant_fast] ; quant_fast
+    ldr     r5, [r0, #vp8_block_round]      ; round
+    ldr     r6, [r1, #vp8_blockd_qcoeff]    ; qcoeff
+    ldr     r7, [r1, #vp8_blockd_dqcoeff]   ; dqcoeff
+    ldr     r8, [r1, #vp8_blockd_dequant]   ; dequant
+
+    ldr     r2, loop_count          ; loop_count=0x1000000. 'lsls' instruction
+                                    ; is used to update the counter so that
+                                    ; it can be used to mark nonzero
+                                    ; quantized coefficient pairs.
+
+    mov     r1, #0                  ; flags for quantized coeffs
+
+    ; PART 1: quantization and dequantization loop
+loop
+    ldr     r9, [r3], #4            ; [z1 | z0]
+    ldr     r10, [r5], #4           ; [r1 | r0]
+    ldr     r11, [r4], #4           ; [q1 | q0]
+
+    ssat16  lr, #1, r9              ; [sz1 | sz0]
+    eor     r9, r9, lr              ; [z1 ^ sz1 | z0 ^ sz0]
+    ssub16  r9, r9, lr              ; x = (z ^ sz) - sz
+    sadd16  r9, r9, r10             ; [x1+r1 | x0+r0]
+
+    ldr     r12, [r3], #4           ; [z3 | z2]
+
+    smulbb  r0, r9, r11             ; [(x0+r0)*q0]
+    smultt  r9, r9, r11             ; [(x1+r1)*q1]
+
+    ldr     r10, [r5], #4           ; [r3 | r2]
+
+    ssat16  r11, #1, r12            ; [sz3 | sz2]
+    eor     r12, r12, r11           ; [z3 ^ sz3 | z2 ^ sz2]
+    pkhtb   r0, r9, r0, asr #16     ; [y1 | y0]
+    ldr     r9, [r4], #4            ; [q3 | q2]
+    ssub16  r12, r12, r11           ; x = (z ^ sz) - sz
+
+    sadd16  r12, r12, r10           ; [x3+r3 | x2+r2]
+
+    eor     r0, r0, lr              ; [(y1 ^ sz1) | (y0 ^ sz0)]
+
+    smulbb  r10, r12, r9            ; [(x2+r2)*q2]
+    smultt  r12, r12, r9            ; [(x3+r3)*q3]
+
+    ssub16  r0, r0, lr              ; x = (y ^ sz) - sz
+
+    cmp     r0, #0                  ; check if zero
+    orrne   r1, r1, r2, lsr #24     ; add flag for nonzero coeffs
+
+    str     r0, [r6], #4            ; *qcoeff++ = x
+    ldr     r9, [r8], #4            ; [dq1 | dq0]
+
+    pkhtb   r10, r12, r10, asr #16  ; [y3 | y2]
+    eor     r10, r10, r11           ; [(y3 ^ sz3) | (y2 ^ sz2)]
+    ssub16  r10, r10, r11           ; x = (y ^ sz) - sz
+
+    cmp     r10, #0                 ; check if zero
+    orrne   r1, r1, r2, lsr #23     ; add flag for nonzero coeffs
+
+    str     r10, [r6], #4           ; *qcoeff++ = x
+    ldr     r11, [r8], #4           ; [dq3 | dq2]
+
+    smulbb  r12, r0, r9             ; [x0*dq0]
+    smultt  r0, r0, r9              ; [x1*dq1]
+
+    smulbb  r9, r10, r11            ; [x2*dq2]
+    smultt  r10, r10, r11           ; [x3*dq3]
+
+    lsls    r2, r2, #2              ; update loop counter
+    strh    r12, [r7, #0]           ; dqcoeff[0] = [x0*dq0]
+    strh    r0, [r7, #2]            ; dqcoeff[1] = [x1*dq1]
+    strh    r9, [r7, #4]            ; dqcoeff[2] = [x2*dq2]
+    strh    r10, [r7, #6]           ; dqcoeff[3] = [x3*dq3]
+    add     r7, r7, #8              ; dqcoeff += 8
+    bne     loop
+
+    ; PART 2: check position for eob...
+    mov     lr, #0                  ; init eob
+    cmp     r1, #0                  ; coeffs after quantization?
+    ldr     r11, [sp, #0]           ; restore BLOCKD pointer
+    beq     end                     ; skip eob calculations if all zero
+
+    ldr     r0, [r11, #vp8_blockd_qcoeff]
+
+    ; check shortcut for nonzero qcoeffs
+    tst    r1, #0x80
+    bne    quant_coeff_15_14
+    tst    r1, #0x20
+    bne    quant_coeff_13_11
+    tst    r1, #0x8
+    bne    quant_coeff_12_7
+    tst    r1, #0x40
+    bne    quant_coeff_10_9
+    tst    r1, #0x10
+    bne    quant_coeff_8_3
+    tst    r1, #0x2
+    bne    quant_coeff_6_5
+    tst    r1, #0x4
+    bne    quant_coeff_4_2
+    b      quant_coeff_1_0
+
+quant_coeff_15_14
+    ldrh    r2, [r0, #30]       ; rc=15, i=15
+    mov     lr, #16
+    cmp     r2, #0
+    bne     end
+
+    ldrh    r3, [r0, #28]       ; rc=14, i=14
+    mov     lr, #15
+    cmp     r3, #0
+    bne     end
+
+quant_coeff_13_11
+    ldrh    r2, [r0, #22]       ; rc=11, i=13
+    mov     lr, #14
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_12_7
+    ldrh    r3, [r0, #14]       ; rc=7,  i=12
+    mov     lr, #13
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #20]       ; rc=10, i=11
+    mov     lr, #12
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_10_9
+    ldrh    r3, [r0, #26]       ; rc=13, i=10
+    mov     lr, #11
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #24]       ; rc=12, i=9
+    mov     lr, #10
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_8_3
+    ldrh    r3, [r0, #18]       ; rc=9,  i=8
+    mov     lr, #9
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #12]       ; rc=6,  i=7
+    mov     lr, #8
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_6_5
+    ldrh    r3, [r0, #6]        ; rc=3,  i=6
+    mov     lr, #7
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #4]        ; rc=2,  i=5
+    mov     lr, #6
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_4_2
+    ldrh    r3, [r0, #10]       ; rc=5,  i=4
+    mov     lr, #5
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #16]       ; rc=8,  i=3
+    mov     lr, #4
+    cmp     r2, #0
+    bne     end
+
+    ldrh    r3, [r0, #8]        ; rc=4,  i=2
+    mov     lr, #3
+    cmp     r3, #0
+    bne     end
+
+quant_coeff_1_0
+    ldrh    r2, [r0, #2]        ; rc=1,  i=1
+    mov     lr, #2
+    cmp     r2, #0
+    bne     end
+
+    mov     lr, #1              ; rc=0,  i=0
+
+end
+    str     lr, [r11, #vp8_blockd_eob]
+    ldmfd   sp!, {r1, r4-r11, pc}
+
+    ENDP
+
+loop_count
+    DCD     0x1000000
+
+    END
+
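
PART 1 above computes, per coefficient, sz = sign(z), x = abs(z), y = ((x + round) * quant_fast) >> 16, re-applies the sign, and writes qcoeff and dqcoeff = x * dequant; the flag bits collected in r1 then let PART 2 locate the end of block without scanning every coefficient. A self-contained C sketch of the same arithmetic (array-based prototype and function name are illustrative; the zig-zag order is the one hard-coded into the shortcut labels above):

    /* Sketch: fast quantize of one 4x4 block; zigzag maps scan
     * position i to raster position rc, eob is last nonzero + 1. */
    static void fast_quantize_b_sketch(const short *coeff,
                                       const short *round,
                                       const short *quant_fast,
                                       const short *dequant,
                                       short *qcoeff, short *dqcoeff,
                                       const int *zigzag, int *eob_out)
    {
        int i, eob = 0;
        for (i = 0; i < 16; i++) {
            int rc = zigzag[i];
            int z  = coeff[rc];
            int sz = z >> 31;                 /* 0 or -1: sign of z */
            int x  = (z ^ sz) - sz;           /* abs(z)             */
            int y  = ((x + round[rc]) * quant_fast[rc]) >> 16;
            x = (y ^ sz) - sz;                /* re-apply the sign  */
            qcoeff[rc]  = (short)x;
            dqcoeff[rc] = (short)(x * dequant[rc]);
            if (y) eob = i + 1;               /* last nonzero + 1   */
        }
        *eob_out = eob;
    }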
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_mse16x16_armv6.asm
@@ -1,0 +1,138 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_mse16x16_armv6|
+
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+;
+; note: Based on vp9_variance16x16_armv6. In this function the sum is never
+;       used, so that part of the calculation has been removed.
+
+|vp8_mse16x16_armv6| PROC
+
+    push    {r4-r9, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     r4, #0              ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r5, [r0, #0x0]      ; load 4 src pixels
+    ldr     r6, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r5, r6          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0x4]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+    ldr     r5, [r0, #0x8]      ; load 4 src pixels
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0xc]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    subs    r12, r12, #1        ; next row
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return stuff
+    ldr     r1, [sp, #28]       ; get address of sse
+    mov     r0, r4              ; return sse
+    str     r4, [r1]            ; store sse
+
+    pop     {r4-r9, pc}
+
+    ENDP
+
+    END
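
As the note above says, this is the variance kernel with the unused sum term stripped out. A scalar sketch of what it computes (function name and prototype are illustrative, mirroring the register comments):

    /* Sketch: sum of squared differences over a 16x16 block. */
    static unsigned int mse16x16_sketch(const unsigned char *src_ptr,
                                        int source_stride,
                                        const unsigned char *ref_ptr,
                                        int recon_stride,
                                        unsigned int *sse)
    {
        unsigned int acc = 0;
        int r, c;
        for (r = 0; r < 16; r++) {
            for (c = 0; c < 16; c++) {
                int d = src_ptr[c] - ref_ptr[c];
                acc += d * d;
            }
            src_ptr += source_stride;
            ref_ptr += recon_stride;
        }
        *sse = acc;   /* stored through the stack argument */
        return acc;   /* also returned in r0               */
    }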
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_sad16x16_armv6.asm
@@ -1,0 +1,96 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sad16x16_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    const unsigned char *src_ptr
+; r1    int  src_stride
+; r2    const unsigned char *ref_ptr
+; r3    int  ref_stride
+; stack max_sad (not used)
+|vp8_sad16x16_armv6| PROC
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    mov     r4, #0              ; sad = 0;
+    mov     r5, #8              ; loop count
+
+loop
+    ; 1st row
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels (1A)
+    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (1A)
+    ldr     r7, [r0, #0x4]      ; load 4 src pixels (1A)
+    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (1A)
+    ldr     r10, [r0, #0x8]     ; load 4 src pixels (1B)
+    ldr     r11, [r0, #0xC]     ; load 4 src pixels (1B)
+
+    usada8  r4, r8, r6, r4      ; calculate sad for 4 pixels
+    usad8   r8, r7, r9          ; calculate sad for 4 pixels
+
+    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (1B)
+    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (1B)
+
+    add     r0, r0, r1          ; set src pointer to next row
+    add     r2, r2, r3          ; set ref pointer to next row
+
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
+    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
+
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels (2A)
+    ldr     r7, [r0, #0x4]      ; load 4 src pixels (2A)
+    add     r4, r4, r8          ; add partial sad values
+
+    ; 2nd row
+    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (2A)
+    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (2A)
+    ldr     r10, [r0, #0x8]     ; load 4 src pixels (2B)
+    ldr     r11, [r0, #0xC]     ; load 4 src pixels (2B)
+
+    usada8  r4, r6, r8, r4      ; calculate sad for 4 pixels
+    usad8   r8, r7, r9          ; calculate sad for 4 pixels
+
+    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (2B)
+    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (2B)
+
+    add     r0, r0, r1          ; set src pointer to next row
+    add     r2, r2, r3          ; set ref pointer to next row
+
+    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
+    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
+
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    subs    r5, r5, #1          ; decrement loop counter
+    add     r4, r4, r8          ; add partial sad values
+
+    bne     loop
+
+    mov     r0, r4              ; return sad
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+    END
+
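
Each usada8 above folds four byte-wise absolute differences into an accumulator in one instruction, so the loop body covers two 16-pixel rows per iteration. The scalar equivalent, as a sketch (name and prototype illustrative):

    /* Sketch: 16x16 sum of absolute differences. */
    static unsigned int sad16x16_sketch(const unsigned char *src_ptr,
                                        int src_stride,
                                        const unsigned char *ref_ptr,
                                        int ref_stride)
    {
        unsigned int sad = 0;
        int r, c;
        for (r = 0; r < 16; r++) {
            for (c = 0; c < 16; c++) {
                int d = src_ptr[c] - ref_ptr[c];
                sad += (d < 0) ? -d : d;
            }
            src_ptr += src_stride;
            ref_ptr += ref_stride;
        }
        return sad;
    }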
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm
@@ -1,0 +1,262 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_short_fdct4x4_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+|vp8_short_fdct4x4_armv6| PROC
+
+    stmfd       sp!, {r4 - r12, lr}
+
+    ; PART 1
+
+    ; coeffs 0-3
+    ldrd        r4, r5, [r0]        ; [i1 | i0] [i3 | i2]
+
+    ldr         r10, c7500
+    ldr         r11, c14500
+    ldr         r12, c0x22a453a0    ; [2217*4 | 5352*4]
+    ldr         lr, c0x00080008
+    ror         r5, r5, #16         ; [i2 | i3]
+
+    qadd16      r6, r4, r5          ; [i1+i2 | i0+i3] = [b1 | a1] without shift
+    qsub16      r7, r4, r5          ; [i1-i2 | i0-i3] = [c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2*[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r4, r6, lr          ; o0 = (i1+i2)*8 + (i0+i3)*8
+    smusd       r5, r6, lr          ; o2 = (i0+i3)*8 - (i1+i2)*8
+
+    smlad       r6, r7, r12, r11    ; o1 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o3 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r8, r9, [r0]        ; [i5 | i4] [i7 | i6]
+
+    pkhbt       r3, r4, r6, lsl #4  ; [o1 | o0], keep in register for PART 2
+    pkhbt       r6, r5, r7, lsl #4  ; [o3 | o2]
+
+    str         r6, [r1, #4]
+
+    ; coeffs 4-7
+    ror         r9, r9, #16         ; [i6 | i7]
+
+    qadd16      r6, r8, r9          ; [i5+i6 | i4+i7] = [b1 | a1] without shift
+    qsub16      r7, r8, r9          ; [i5-i6 | i4-i7] = [c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r9, r6, lr          ; o4 = (i5+i6)*8 + (i4+i7)*8
+    smusd       r8, r6, lr          ; o6 = (i4+i7)*8 - (i5+i6)*8
+
+    smlad       r6, r7, r12, r11    ; o5 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o7 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r4, r5, [r0]        ; [i9 | i8] [i11 | i10]
+
+    pkhbt       r9, r9, r6, lsl #4  ; [o5 | o4], keep in register for PART 2
+    pkhbt       r6, r8, r7, lsl #4  ; [o7 | o6]
+
+    str         r6, [r1, #12]
+
+    ; coeffs 8-11
+    ror         r5, r5, #16         ; [i10 | i11]
+
+    qadd16      r6, r4, r5          ; [i9+i10 | i8+i11]=[b1 | a1] without shift
+    qsub16      r7, r4, r5          ; [i9-i10 | i8-i11]=[c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r2, r6, lr          ; o8 = (i9+i10)*8 + (i8+i11)*8
+    smusd       r8, r6, lr          ; o10 = (i8+i11)*8 - (i9+i10)*8
+
+    smlad       r6, r7, r12, r11    ; o9 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o11 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r4, r5, [r0]        ; [i13 | i12] [i15 | i14]
+
+    pkhbt       r2, r2, r6, lsl #4  ; [o9 | o8], keep in register for PART 2
+    pkhbt       r6, r8, r7, lsl #4  ; [o11 | o10]
+
+    str         r6, [r1, #20]
+
+    ; coeffs 12-15
+    ror         r5, r5, #16         ; [i14 | i15]
+
+    qadd16      r6, r4, r5          ; [i13+i14 | i12+i15]=[b1|a1] without shift
+    qsub16      r7, r4, r5          ; [i13-i14 | i12-i15]=[c1|d1] without shift
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r4, r6, lr          ; o12 = (i13+i14)*8 + (i12+i15)*8
+    smusd       r5, r6, lr          ; o14 = (i12+i15)*8 - (i13+i14)*8
+
+    smlad       r6, r7, r12, r11    ; o13 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o15 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    pkhbt       r0, r4, r6, lsl #4  ; [o13 | o12], keep in register for PART 2
+    pkhbt       r6, r5, r7, lsl #4  ; [o15 | o14]
+
+    str         r6, [r1, #28]
+
+
+    ; PART 2 -------------------------------------------------
+    ldr         r11, c12000
+    ldr         r10, c51000
+    ldr         lr, c0x00070007
+
+    qadd16      r4, r3, r0          ; a1 = [i1+i13 | i0+i12]
+    qadd16      r5, r9, r2          ; b1 = [i5+i9  |  i4+i8]
+    qsub16      r6, r9, r2          ; c1 = [i5-i9  |  i4-i8]
+    qsub16      r7, r3, r0          ; d1 = [i1-i13 | i0-i12]
+
+    qadd16      r4, r4, lr          ; a1 + 7
+
+    add         r0, r11, #0x10000   ; 12000 + (1 << 16): +1 after >>16 (d!=0)
+
+    qadd16      r2, r4, r5          ; a1 + b1 + 7
+    qsub16      r3, r4, r5          ; a1 - b1 + 7
+
+    ldr         r12, c0x08a914e8    ; [2217 | 5352]
+
+    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
+    asr         r2, r2, #4          ; scale top halfword
+    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
+    asr         r3, r3, #4          ; scale top halfword
+    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
+    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
+    str         r4, [r1, #0]        ; [     o1 |      o0]
+    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
+    str         r5, [r1, #16]       ; [     o9 |      o8]
+
+    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
+    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
+
+    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
+    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
+
+    lsls        r6, r7, #16         ; d1 != 0 ?
+    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
+    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+    asrs        r6, r7, #16
+    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
+    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
+    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
+
+    pkhtb       r9, r9, r8, asr #16
+
+    sub         r4, r4, r2
+    sub         r5, r5, r3
+
+    ldr         r3, [r1, #4]        ; [i3 | i2]
+
+    pkhtb       r5, r5, r4, asr #16 ; [o13|o12]
+
+    str         r9, [r1, #8]        ; [o5 | o4]
+
+    ldr         r9, [r1, #12]       ; [i7 | i6]
+    ldr         r8, [r1, #28]       ; [i15|i14]
+    ldr         r2, [r1, #20]       ; [i11|i10]
+    str         r5, [r1, #24]       ; [o13|o12]
+
+    qadd16      r4, r3, r8          ; a1 = [i3+i15 | i2+i14]
+    qadd16      r5, r9, r2          ; b1 = [i7+i11 | i6+i10]
+
+    qadd16      r4, r4, lr          ; a1 + 7
+
+    qsub16      r6, r9, r2          ; c1 = [i7-i11 | i6-i10]
+    qadd16      r2, r4, r5          ; a1 + b1 + 7
+    qsub16      r7, r3, r8          ; d1 = [i3-i15 | i2-i14]
+    qsub16      r3, r4, r5          ; a1 - b1 + 7
+
+    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
+    asr         r2, r2, #4          ; scale top halfword
+    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
+    asr         r3, r3, #4          ; scale top halfword
+    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
+    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
+    str         r4, [r1, #4]        ; [     o3 |      o2]
+    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
+    str         r5, [r1, #20]       ; [    o11 |     o10]
+
+    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
+    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
+
+    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
+    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
+
+    lsls        r6, r7, #16         ; d1 != 0 ?
+    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
+    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+
+    asrs        r6, r7, #16
+    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
+    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
+    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
+
+    pkhtb       r9, r9, r8, asr #16
+
+    sub         r4, r4, r2
+    sub         r5, r5, r3
+
+    str         r9, [r1, #12]       ; [o7 | o6]
+    pkhtb       r5, r5, r4, asr #16 ; [o15|o14]
+
+    str         r5, [r1, #28]       ; [o15|o14]
+
+    ldmfd       sp!, {r4 - r12, pc}
+
+    ENDP
+
+; Used constants
+c7500
+    DCD     7500
+c14500
+    DCD     14500
+c0x22a453a0
+    DCD     0x22a453a0
+c0x00080008
+    DCD     0x00080008
+c12000
+    DCD     12000
+c51000
+    DCD     51000
+c0x00070007
+    DCD     0x00070007
+c0x08a914e8
+    DCD     0x08a914e8
+
+    END
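
The pooled constants above line up with the scalar transform named in the header comment: 0x22a453a0 and 0x08a914e8 pack the DCT multipliers 2217 and 5352 (quadrupled in pass 1), 14500/7500 are the pass-1 rounders with a >>12 scale, and 12000/51000 the pass-2 rounders with >>16 plus the (d1 != 0) correction. A C rendering of the same two-pass 4x4 forward DCT, reconstructed from those constants (treat it as a sketch inferred from the assembly, not an authoritative copy of the reference code):

    /* Sketch of the two-pass 4x4 forward DCT; pitch is in bytes. */
    static void short_fdct4x4_sketch(short *input, short *output, int pitch)
    {
        int i, a1, b1, c1, d1;
        short *ip = input, *op = output;
        for (i = 0; i < 4; i++) {              /* pass 1: rows    */
            a1 = (ip[0] + ip[3]) * 8;
            b1 = (ip[1] + ip[2]) * 8;
            c1 = (ip[1] - ip[2]) * 8;
            d1 = (ip[0] - ip[3]) * 8;
            op[0] = (short)(a1 + b1);
            op[2] = (short)(a1 - b1);
            op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
            op[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);
            ip += pitch / 2;
            op += 4;
        }
        ip = output; op = output;
        for (i = 0; i < 4; i++) {              /* pass 2: columns */
            a1 = ip[0] + ip[12];
            b1 = ip[4] + ip[8];
            c1 = ip[4] - ip[8];
            d1 = ip[0] - ip[12];
            op[0]  = (short)((a1 + b1 + 7) >> 4);
            op[8]  = (short)((a1 - b1 + 7) >> 4);
            op[4]  = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16)
                             + (d1 != 0));
            op[12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);
            ip++; op++;
        }
    }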
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_subtract_armv6.asm
@@ -1,0 +1,265 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_subtract_mby_armv6|
+    EXPORT  |vp8_subtract_mbuv_armv6|
+    EXPORT  |vp8_subtract_b_armv6|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    BLOCK *be
+; r1    BLOCKD *bd
+; r2    int pitch
+|vp8_subtract_b_armv6| PROC
+
+    stmfd   sp!, {r4-r9}
+
+    ldr     r4, [r0, #vp8_block_base_src]
+    ldr     r5, [r0, #vp8_block_src]
+    ldr     r6, [r0, #vp8_block_src_diff]
+
+    ldr     r3, [r4]
+    ldr     r7, [r0, #vp8_block_src_stride]
+    add     r3, r3, r5          ; src = *base_src + src
+    ldr     r8, [r1, #vp8_blockd_predictor]
+
+    mov     r9, #4              ; loop count
+
+loop_block
+
+    ldr     r0, [r3], r7        ; src
+    ldr     r1, [r8], r2        ; pred
+
+    uxtb16  r4, r0              ; [s2 | s0]
+    uxtb16  r5, r1              ; [p2 | p0]
+    uxtb16  r0, r0, ror #8      ; [s3 | s1]
+    uxtb16  r1, r1, ror #8      ; [p3 | p1]
+
+    usub16  r4, r4, r5          ; [d2 | d0]
+    usub16  r5, r0, r1          ; [d3 | d1]
+
+    subs    r9, r9, #1          ; decrement loop counter
+
+    pkhbt   r0, r4, r5, lsl #16 ; [d1 | d0]
+    pkhtb   r1, r5, r4, asr #16 ; [d3 | d2]
+
+    str     r0, [r6, #0]        ; diff
+    str     r1, [r6, #4]        ; diff
+
+    add     r6, r6, r2, lsl #1  ; update diff pointer
+    bne     loop_block
+
+    ldmfd   sp!, {r4-r9}
+    mov     pc, lr
+
+    ENDP
+
+
+; r0    short *diff
+; r1    unsigned char *usrc
+; r2    unsigned char *vsrc
+; r3    unsigned char *pred
+; stack int stride
+|vp8_subtract_mbuv_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    add     r0, r0, #512        ; set *diff to point to Cb
+    add     r3, r3, #256        ; set *pred to point to Cb
+
+    mov     r4, #8              ; loop count
+    ldr     r5, [sp, #40]       ; stride
+
+    ; Subtract U block
+loop_u
+    ldr     r6, [r1]            ; src       (A)
+    ldr     r7, [r3], #4        ; pred      (A)
+
+    uxtb16  r8, r6              ; [s2 | s0] (A)
+    uxtb16  r9, r7              ; [p2 | p0] (A)
+    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
+    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (A)
+    usub16  r7, r10, r11        ; [d3 | d1] (A)
+
+    ldr     r10, [r1, #4]       ; src       (B)
+    ldr     r11, [r3], #4       ; pred      (B)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+    str     r8, [r0], #4        ; diff      (A)
+    uxtb16  r8, r10             ; [s2 | s0] (B)
+    str     r9, [r0], #4        ; diff      (A)
+
+    uxtb16  r9, r11             ; [p2 | p0] (B)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (B)
+    usub16  r7, r10, r11        ; [d3 | d1] (B)
+
+    add     r1, r1, r5          ; update usrc pointer
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+    str     r8, [r0], #4        ; diff      (B)
+    subs    r4, r4, #1          ; update loop counter
+    str     r9, [r0], #4        ; diff      (B)
+
+    bne     loop_u
+
+    mov     r4, #8              ; loop count
+
+    ; Subtract V block
+loop_v
+    ldr     r6, [r2]            ; src       (A)
+    ldr     r7, [r3], #4        ; pred      (A)
+
+    uxtb16  r8, r6              ; [s2 | s0] (A)
+    uxtb16  r9, r7              ; [p2 | p0] (A)
+    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
+    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (A)
+    usub16  r7, r10, r11        ; [d3 | d1] (A)
+
+    ldr     r10, [r2, #4]       ; src       (B)
+    ldr     r11, [r3], #4       ; pred      (B)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+    str     r8, [r0], #4        ; diff      (A)
+    uxtb16  r8, r10             ; [s2 | s0] (B)
+    str     r9, [r0], #4        ; diff      (A)
+
+    uxtb16  r9, r11             ; [p2 | p0] (B)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (B)
+    usub16  r7, r10, r11        ; [d3 | d1] (B)
+
+    add     r2, r2, r5          ; update vsrc pointer
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+    str     r8, [r0], #4        ; diff      (B)
+    subs    r4, r4, #1          ; update loop counter
+    str     r9, [r0], #4        ; diff      (B)
+
+    bne     loop_v
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+
+; r0    short *diff
+; r1    unsigned char *src
+; r2    unsigned char *pred
+; r3    int stride
+|vp8_subtract_mby_armv6| PROC
+
+    stmfd   sp!, {r4-r11}
+
+    mov     r4, #16
+loop
+    ldr     r6, [r1]            ; src       (A)
+    ldr     r7, [r2], #4        ; pred      (A)
+
+    uxtb16  r8, r6              ; [s2 | s0] (A)
+    uxtb16  r9, r7              ; [p2 | p0] (A)
+    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
+    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (A)
+    usub16  r7, r10, r11        ; [d3 | d1] (A)
+
+    ldr     r10, [r1, #4]       ; src       (B)
+    ldr     r11, [r2], #4       ; pred      (B)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+    str     r8, [r0], #4        ; diff      (A)
+    uxtb16  r8, r10             ; [s2 | s0] (B)
+    str     r9, [r0], #4        ; diff      (A)
+
+    uxtb16  r9, r11             ; [p2 | p0] (B)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (B)
+    usub16  r7, r10, r11        ; [d3 | d1] (B)
+
+    ldr     r10, [r1, #8]       ; src       (C)
+    ldr     r11, [r2], #4       ; pred      (C)
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+    str     r8, [r0], #4        ; diff      (B)
+    uxtb16  r8, r10             ; [s2 | s0] (C)
+    str     r9, [r0], #4        ; diff      (B)
+
+    uxtb16  r9, r11             ; [p2 | p0] (C)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (C)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (C)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (C)
+    usub16  r7, r10, r11        ; [d3 | d1] (C)
+
+    ldr     r10, [r1, #12]      ; src       (D)
+    ldr     r11, [r2], #4       ; pred      (D)
+
+    pkhbt   r8, r6, r7, lsl #16  ; [d1 | d0] (C)
+    pkhtb   r9, r7, r6, asr #16  ; [d3 | d2] (C)
+
+    str     r8, [r0], #4        ; diff      (C)
+    uxtb16  r8, r10             ; [s2 | s0] (D)
+    str     r9, [r0], #4        ; diff      (C)
+
+    uxtb16  r9, r11             ; [p2 | p0] (D)
+    uxtb16  r10, r10, ror #8    ; [s3 | s1] (D)
+    uxtb16  r11, r11, ror #8    ; [p3 | p1] (D)
+
+    usub16  r6, r8, r9          ; [d2 | d0] (D)
+    usub16  r7, r10, r11        ; [d3 | d1] (D)
+
+    add     r1, r1, r3          ; update src pointer
+
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (D)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (D)
+
+    str     r8, [r0], #4        ; diff      (D)
+    subs    r4, r4, #1          ; update loop counter
+    str     r9, [r0], #4        ; diff      (D)
+
+    bne     loop
+
+    ldmfd   sp!, {r4-r11}
+    mov     pc, lr
+
+    ENDP
+
+    END
+
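
All three entry points above compute the same per-pixel residual and differ only in block geometry and pointer bookkeeping. The luma routine in scalar form, as a sketch (name and prototype illustrative, mirroring the register comments; pred is a contiguous 16-wide block):

    /* Sketch: 16x16 luma residual, diff[i] = src[i] - pred[i]. */
    static void subtract_mby_sketch(short *diff, const unsigned char *src,
                                    const unsigned char *pred, int stride)
    {
        int r, c;
        for (r = 0; r < 16; r++) {
            for (c = 0; c < 16; c++)
                diff[c] = (short)(src[c] - pred[c]);
            diff += 16;
            pred += 16;
            src  += stride;
        }
    }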
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_variance16x16_armv6.asm
@@ -1,0 +1,154 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_variance16x16_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp9_variance16x16_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+    END
+
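
The return expression sse - ((sum * sum) >> 8) is the usual integer variance: the shift divides by the pixel count, 256 here, and the 8x8 variant in the next file uses >> 6 for its 64 pixels. A scalar sketch covering both (names and the shift parameterization are illustrative):

    /* Sketch: variance = sse - sum^2 / (w*h), shift = log2(w*h),
     * e.g. 16x16 -> shift 8, 8x8 -> shift 6. */
    static unsigned int variance_sketch(const unsigned char *src_ptr,
                                        int src_stride,
                                        const unsigned char *ref_ptr,
                                        int ref_stride,
                                        int w, int h, int shift,
                                        unsigned int *sse)
    {
        int sum = 0, r, c;
        unsigned int sq = 0;
        for (r = 0; r < h; r++) {
            for (c = 0; c < w; c++) {
                int d = src_ptr[c] - ref_ptr[c];
                sum += d;
                sq  += d * d;
            }
            src_ptr += src_stride;
            ref_ptr += ref_stride;
        }
        *sse = sq;
        return sq - (((unsigned int)(sum * sum)) >> shift);
    }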
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_variance8x8_armv6.asm
@@ -1,0 +1,101 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_variance8x8_armv6|
+
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp9_variance8x8_armv6| PROC
+
+    push    {r4-r10, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #8             ; set loop counter to 8 (=block height)
+    mov     r4, #0              ; initialize sum = 0
+    mov     r5, #0              ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels
+    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r6, r7          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+    ; calculate total sum
+    add    r4, r4, r6           ; add positive differences to sum
+    sub    r4, r4, r7           ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r0, #0x4]      ; load 4 src pixels
+    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r6, r7          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r4, r4, r6          ; add positive differences to sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1        ; next row
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return stuff
+    ldr     r8, [sp, #32]       ; get address of sse
+    mul     r1, r4, r4          ; sum * sum
+    str     r5, [r8]            ; store sse
+    sub     r0, r5, r1, asr #6  ; return (sse - ((sum * sum) >> 6))
+
+    pop     {r4-r10, pc}
+
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -1,0 +1,182 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_variance_halfpixvar16x16_h_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp9_variance_halfpixvar16x16_h_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r0, #1]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r0, #5]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r0, #9]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r0, #13]       ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
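
The mvn/uhsub8/eor triple used above is a branch-free per-byte rounded average: uhsub8 with the complemented operand produces (a + b - 255) >> 1 in each byte lane, and XOR with 0x80 adds 128 modulo 256, which corrects that to the half-pel sample (a + b + 1) >> 1. A scalar sketch of the lane arithmetic (function name illustrative); the variance accumulation that follows each average matches vp9_variance16x16_armv6:

    /* Sketch: per-byte rounded average of two packed words, as
     * produced by mvn + uhsub8 + eor 0x80808080 above. */
    static unsigned int avg4_rounded(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;
        int i;
        for (i = 0; i < 4; i++) {
            unsigned int ba = (a >> (8 * i)) & 0xff;
            unsigned int bb = (b >> (8 * i)) & 0xff;
            r |= (((ba + bb + 1) >> 1) & 0xff) << (8 * i);
        }
        return r;
    }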
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -1,0 +1,222 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_variance_halfpixvar16x16_hv_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp9_variance_halfpixvar16x16_hv_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; pointer to pixels on the next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load source pixels a, row N
+    ldr     r6, [r0, #1]        ; load source pixels b, row N
+    ldr     r5, [r9, #0]        ; load source pixels c, row N+1
+    ldr     r7, [r9, #1]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load source pixels a, row N
+    ldr     r6, [r0, #5]        ; load source pixels b, row N
+    ldr     r5, [r9, #4]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #5]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load source pixels a, row N
+    ldr     r6, [r0, #9]        ; load source pixels b, row N
+    ldr     r5, [r9, #8]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #9]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load source pixels a, row N
+    ldr     r6, [r0, #13]       ; load source pixels b, row N
+    ldr     r5, [r9, #12]       ; load source pixels c, row N+1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+    ldr     r7, [r9, #13]       ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
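
Per the x/y/z comments above, the 2-D half-pel sample here is built from three cascaded rounded averages, horizontal on row N, horizontal on row N+1, then vertical, rather than a single (a + b + c + d + 2) >> 2. A per-pixel sketch of that predictor (function name illustrative):

    /* Sketch: hv half-pel predictor as cascaded rounded averages. */
    static unsigned char halfpix_hv(unsigned char a, unsigned char b,
                                    unsigned char c, unsigned char d)
    {
        unsigned char x = (unsigned char)((a + b + 1) >> 1);  /* row N    */
        unsigned char y = (unsigned char)((c + d + 1) >> 1);  /* row N+1  */
        return (unsigned char)((x + y + 1) >> 1);             /* vertical */
    }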
--- /dev/null
+++ b/vp9/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -1,0 +1,184 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_variance_halfpixvar16x16_v_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp9_variance_halfpixvar16x16_v_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; set src pointer to next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r9, #0]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
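+    ; the mvn/uhsub8/eor sequence is the same complement trick used by the
+    ; two-pass variant above: each byte of r4 becomes the rounded average
+    ; (src[i] + src[i + stride] + 1) >> 1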
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r9, #4]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r9, #8]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r9, #12]       ; load 4 src pixels from next row
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
--- /dev/null
+++ b/vp9/encoder/arm/armv6/walsh_v6.asm
@@ -1,0 +1,212 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_short_walsh4x4_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
+; r0    short *input,
+; r1    short *output,
+; r2    int pitch
+|vp8_short_walsh4x4_armv6| PROC
+
+    stmdb       sp!, {r4 - r11, lr}
+
+    ldrd        r4, r5, [r0], r2
+    ldr         lr, c00040004
+    ldrd        r6, r7, [r0], r2
+
+    ; 0-3
+    qadd16      r3, r4, r5          ; [d1|a1] [1+3   |   0+2]
+    qsub16      r4, r4, r5          ; [c1|b1] [1-3   |   0-2]
+
+    ldrd        r8, r9, [r0], r2
+    ; 4-7
+    qadd16      r5, r6, r7          ; [d1|a1] [5+7   |   4+6]
+    qsub16      r6, r6, r7          ; [c1|b1] [5-7   |   4-6]
+
+    ldrd        r10, r11, [r0]
+    ; 8-11
+    qadd16      r7, r8, r9          ; [d1|a1] [9+11  |  8+10]
+    qsub16      r8, r8, r9          ; [c1|b1] [9-11  |  8-10]
+
+    ; 12-15
+    qadd16      r9, r10, r11        ; [d1|a1] [13+15 | 12+14]
+    qsub16      r10, r10, r11       ; [c1|b1] [13-15 | 12-14]
+
+
+    lsls        r2, r3, #16
+    smuad       r11, r3, lr         ; A0 = a1<<2 + d1<<2
+    addne       r11, r11, #1        ; A0 += (a1!=0)
+
+    lsls        r2, r7, #16
+    smuad       r12, r7, lr         ; C0 = a1<<2 + d1<<2
+    addne       r12, r12, #1        ; C0 += (a1!=0)
+
+    add         r0, r11, r12        ; a1_0 = A0 + C0
+    sub         r11, r11, r12       ; b1_0 = A0 - C0
+
+    lsls        r2, r5, #16
+    smuad       r12, r5, lr         ; B0 = a1<<2 + d1<<2
+    addne       r12, r12, #1        ; B0 += (a1!=0)
+
+    lsls        r2, r9, #16
+    smuad       r2, r9, lr          ; D0 = a1<<2 + d1<<2
+    addne       r2, r2, #1          ; D0 += (a1!=0)
+
+    add         lr, r12, r2         ; d1_0 = B0 + D0
+    sub         r12, r12, r2        ; c1_0 = B0 - D0
+
+    ; op[0,4,8,12]
+    adds        r2, r0, lr          ; a2 = a1_0 + d1_0
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r0, r0, lr          ; d2 = a1_0 - d1_0
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1]            ; op[0]
+
+    addmi       r0, r0, #1          ; += d2 < 0
+    add         r0, r0, #3          ; += 3
+    ldr         lr, c00040004
+    mov         r0, r0, asr #3      ; >> 3
+    strh        r0, [r1, #24]       ; op[12]
+
+    adds        r2, r11, r12        ; b2 = b1_0 + c1_0
+    addmi       r2, r2, #1          ; += b2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r0, r11, r12        ; c2 = b1_0 - c1_0
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #8]        ; op[4]
+
+    addmi       r0, r0, #1          ; += c2 < 0
+    add         r0, r0, #3          ; += 3
+    smusd       r3, r3, lr          ; A3 = a1<<2 - d1<<2
+    smusd       r7, r7, lr          ; C3 = a1<<2 - d1<<2
+    mov         r0, r0, asr #3      ; >> 3
+    strh        r0, [r1, #16]       ; op[8]
+
+
+    ; op[3,7,11,15]
+    add         r0, r3, r7          ; a1_3 = A3 + C3
+    sub         r3, r3, r7          ; b1_3 = A3 - C3
+
+    smusd       r5, r5, lr          ; B3 = a1<<2 - d1<<2
+    smusd       r9, r9, lr          ; D3 = a1<<2 - d1<<2
+    add         r7, r5, r9          ; d1_3 = B3 + D3
+    sub         r5, r5, r9          ; c1_3 = B3 - D3
+
+    adds        r2, r0, r7          ; a2 = a1_3 + d1_3
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r3, r5          ; b2 = b1_3 + c1_3
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #6]        ; op[3]
+
+    addmi       r9, r9, #1          ; += b2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r3, r5          ; c2 = b1_3 - c1_3
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #14]       ; op[7]
+
+    addmi       r2, r2, #1          ; += c2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r0, r7          ; d2 = a1_3 - d1_3
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #22]       ; op[11]
+
+    addmi       r9, r9, #1          ; += d2 < 0
+    add         r9, r9, #3          ; += 3
+    smuad       r3, r4, lr          ; A1 = b1<<2 + c1<<2
+    smuad       r5, r8, lr          ; C1 = b1<<2 + c1<<2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #30]       ; op[15]
+
+    ; op[1,5,9,13]
+    add         r0, r3, r5          ; a1_1 = A1 + C1
+    sub         r3, r3, r5          ; b1_1 = A1 - C1
+
+    smuad       r7, r6, lr          ; B1 = b1<<2 + c1<<2
+    smuad       r9, r10, lr         ; D1 = b1<<2 + c1<<2
+    add         r5, r7, r9          ; d1_1 = B1 + D1
+    sub         r7, r7, r9          ; c1_1 = B1 - D1
+
+    adds        r2, r0, r5          ; a2 = a1_1 + d1_1
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r3, r7          ; b2 = b1_1 + c1_1
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #2]        ; op[1]
+
+    addmi       r9, r9, #1          ; += b2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r3, r7          ; c2 = b1_1 - c1_1
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #10]       ; op[5]
+
+    addmi       r2, r2, #1          ; += c2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r0, r5          ; d2 = a1_1 - d1_1
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #18]       ; op[9]
+
+    addmi       r9, r9, #1          ; += d2 < 0
+    add         r9, r9, #3          ; += 3
+    smusd       r4, r4, lr          ; A2 = b1<<2 - c1<<2
+    smusd       r8, r8, lr          ; C2 = b1<<2 - c1<<2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #26]       ; op[13]
+
+
+    ; op[2,6,10,14]
+    add         r11, r4, r8         ; a1_2 = A2 + C2
+    sub         r12, r4, r8         ; b1_2 = A2 - C2
+
+    smusd       r6, r6, lr          ; B2 = b1<<2 - c1<<2
+    smusd       r10, r10, lr        ; D2 = b1<<2 - c1<<2
+    add         r4, r6, r10         ; d1_2 = B2 + D2
+    sub         r8, r6, r10         ; c1_2 = B2 - D2
+
+    adds        r2, r11, r4         ; a2 = a1_2 + d1_2
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r12, r8         ; b2 = b1_2 + c1_2
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #4]        ; op[2]
+
+    addmi       r9, r9, #1          ; += b2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r12, r8         ; c2 = b1_2 - c1_2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #12]       ; op[6]
+
+    addmi       r2, r2, #1          ; += c2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r11, r4         ; d2 = a1_2 - d1_2
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #20]       ; op[10]
+
+    addmi       r9, r9, #1          ; += d2 < 0
+    add         r9, r9, #3          ; += 3
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #28]       ; op[14]
+
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_short_walsh4x4_armv6|
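+
+; Each op[] above is rounded as (v + 3 + (v < 0)) >> 3, a symmetric divide
+; by 8; the addmi instructions supply the (v < 0) term. A minimal C sketch
+; of that rounding step (illustrative only):
+;
+;   short walsh_round(int v) {
+;     return (short)((v + 3 + (v < 0)) >> 3);
+;   }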
+
+c00040004
+    DCD         0x00040004
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/boolhuff_arm.c
@@ -1,0 +1,33 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/encoder/boolhuff.h"
+#include "vp9/common/blockd.h"
+
+const unsigned int vp9_prob_cost[256] = {
+  2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
+  1023, 1000,  979,  959,  940,  922,  905,  889,  873,  858,  843,  829,  816,  803,  790,  778,
+  767,  755,  744,  733,  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
+  617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,  534,  528,  522,  516,
+  511,  505,  499,  494,  488,  483,  477,  472,  467,  462,  457,  452,  447,  442,  437,  433,
+  428,  424,  419,  415,  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
+  361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,  317,  314,  311,  307,
+  304,  301,  297,  294,  291,  288,  285,  281,  278,  275,  272,  269,  266,  263,  260,  257,
+  255,  252,  249,  246,  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
+  211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,  181,  179,  177,  174,
+  172,  170,  168,  165,  163,  161,  159,  156,  154,  152,  150,  148,  145,  143,  141,  139,
+  137,  135,  133,  131,  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
+  105,  103,  101,   99,   97,   95,   93,   92,   90,   88,   86,   84,   82,   81,   79,   77,
+  75,   73,   72,   70,   68,   66,   65,   63,   61,   60,   58,   56,   55,   53,   51,   50,
+  48,   46,   45,   43,   41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
+  22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,   1
+};
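+
+/* The table is indexed by a probability in 1/256 units and gives the cost,
+ * in 1/256th-bit units, of coding a symbol with that probability. One
+ * generator that reproduces the values (a reconstruction, not taken from
+ * the source tree) is:
+ *
+ *   #include <math.h>
+ *   static void gen_prob_cost(unsigned int cost[256]) {
+ *     int p;
+ *     cost[0] = 2047;  // p == 0 is never coded; mirror cost[1]
+ *     for (p = 1; p < 256; p++) {
+ *       int c = (int)(-256.0 * log(p / 256.0) / log(2.0)) - 1;
+ *       cost[p] = c < 1 ? 1 : c;
+ *     }
+ *   }
+ */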
+
--- /dev/null
+++ b/vp9/encoder/arm/dct_arm.c
@@ -1,0 +1,21 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "./vpx_rtcd.h"
+
+#if HAVE_ARMV6
+
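+/* An 8x4 forward DCT is two 4x4 transforms side by side: the second block
+ * starts 4 samples into each input row and 16 coefficients into the output. */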
+void vp9_short_fdct8x4_armv6(short *input, short *output, int pitch) {
+  vp9_short_fdct4x4_armv6(input,     output,      pitch);
+  vp9_short_fdct4x4_armv6(input + 4, output + 16, pitch);
+}
+
+#endif /* HAVE_ARMV6 */
--- /dev/null
+++ b/vp9/encoder/arm/dct_arm.h
@@ -1,0 +1,65 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DCT_ARM_H
+#define DCT_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_fdct(vp9_short_walsh4x4_armv6);
+extern prototype_fdct(vp9_short_fdct4x4_armv6);
+extern prototype_fdct(vp9_short_fdct8x4_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_armv6
+
+#undef  vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp9_short_fdct4x4_armv6
+
+#undef  vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp9_short_fdct8x4_armv6
+
+#undef  vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp9_short_fdct4x4_armv6
+
+#undef  vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp9_short_fdct8x4_armv6
+#endif
+
+#endif /* HAVE_ARMV6 */
+
+#if HAVE_ARMV7
+extern prototype_fdct(vp9_short_fdct4x4_neon);
+extern prototype_fdct(vp9_short_fdct8x4_neon);
+extern prototype_fdct(vp8_fast_fdct4x4_neon);
+extern prototype_fdct(vp8_fast_fdct8x4_neon);
+extern prototype_fdct(vp9_short_walsh4x4_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp9_short_fdct4x4_neon
+
+#undef  vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp9_short_fdct8x4_neon
+
+#undef  vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp9_short_fdct4x4_neon
+
+#undef  vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp9_short_fdct8x4_neon
+
+#undef  vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_neon
+#endif
+
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/encoder/arm/encodemb_arm.h
@@ -1,0 +1,64 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef ENCODEMB_ARM_H
+#define ENCODEMB_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_subb(vp9_subtract_b_armv6);
+extern prototype_submby(vp9_subtract_mby_armv6);
+extern prototype_submbuv(vp9_subtract_mbuv_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_encodemb_subb
+#define vp8_encodemb_subb vp9_subtract_b_armv6
+
+#undef  vp8_encodemb_submby
+#define vp8_encodemb_submby vp9_subtract_mby_armv6
+
+#undef  vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp9_subtract_mbuv_armv6
+#endif
+
+#endif /* HAVE_ARMV6 */
+
+#if HAVE_ARMV7
+// extern prototype_berr(vp9_block_error_c);
+// extern prototype_mberr(vp9_mbblock_error_c);
+// extern prototype_mbuverr(vp9_mbuverror_c);
+
+extern prototype_subb(vp9_subtract_b_neon);
+extern prototype_submby(vp9_subtract_mby_neon);
+extern prototype_submbuv(vp9_subtract_mbuv_neon);
+
+// #undef  vp8_encodemb_berr
+// #define vp8_encodemb_berr vp9_block_error_c
+
+// #undef  vp8_encodemb_mberr
+// #define vp8_encodemb_mberr vp9_mbblock_error_c
+
+// #undef  vp8_encodemb_mbuverr
+// #define vp8_encodemb_mbuverr vp9_mbuverror_c
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_encodemb_subb
+#define vp8_encodemb_subb vp9_subtract_b_neon
+
+#undef  vp8_encodemb_submby
+#define vp8_encodemb_submby vp9_subtract_mby_neon
+
+#undef  vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp9_subtract_mbuv_neon
+#endif
+
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/encoder/arm/neon/fastquantizeb_neon.asm
@@ -1,0 +1,261 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_fast_quantize_b_neon|
+    EXPORT  |vp8_fast_quantize_b_pair_neon|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=4
+
+;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
+|vp8_fast_quantize_b_pair_neon| PROC
+
+    stmfd           sp!, {r4-r9}
+    vstmdb          sp!, {q4-q7}
+
+    ldr             r4, [r0, #vp8_block_coeff]
+    ldr             r5, [r0, #vp8_block_quant_fast]
+    ldr             r6, [r0, #vp8_block_round]
+
+    vld1.16         {q0, q1}, [r4@128]  ; load z
+
+    ldr             r7, [r2, #vp8_blockd_qcoeff]
+
+    vabs.s16        q4, q0              ; calculate x = abs(z)
+    vabs.s16        q5, q1
+
+    ;arithmetic right shift by 15 extracts the sign: all 0s if positive, all 1s if negative
+    vshr.s16        q2, q0, #15         ; sz
+    vshr.s16        q3, q1, #15
+
+    vld1.s16        {q6, q7}, [r6@128]  ; load round_ptr [0-15]
+    vld1.s16        {q8, q9}, [r5@128]  ; load quant_ptr [0-15]
+
+    ldr             r4, [r1, #vp8_block_coeff]
+
+    vadd.s16        q4, q6              ; x + Round
+    vadd.s16        q5, q7
+
+    vld1.16         {q0, q1}, [r4@128]  ; load z2
+
+    vqdmulh.s16     q4, q8              ; y = ((Round+abs(z)) * Quant) >> 16
+    vqdmulh.s16     q5, q9
+
+    vabs.s16        q10, q0             ; calculate x2 = abs(z2)
+    vabs.s16        q11, q1
+    vshr.s16        q12, q0, #15        ; sz2
+    vshr.s16        q13, q1, #15
+
+    ;modify data to have its original sign
+    veor.s16        q4, q2              ; y^sz
+    veor.s16        q5, q3
+
+    vadd.s16        q10, q6             ; x2 + Round
+    vadd.s16        q11, q7
+
+    ldr             r8, [r2, #vp8_blockd_dequant]
+
+    vqdmulh.s16     q10, q8             ; y2 = ((Round+abs(z2)) * Quant) >> 16
+    vqdmulh.s16     q11, q9
+
+    vshr.s16        q4, #1              ; right shift 1 after vqdmulh
+    vshr.s16        q5, #1
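+    ; vqdmulh computes (2 * a * b) >> 16 with saturation, so the extra
+    ; right shift by 1 here (and below) yields the intended (a * b) >> 16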
+
+    vld1.s16        {q6, q7}, [r8@128]  ;load dequant_ptr[i]
+
+    vsub.s16        q4, q2              ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+    vsub.s16        q5, q3
+
+    vshr.s16        q10, #1             ; right shift 1 after vqdmulh
+    vshr.s16        q11, #1
+
+    ldr             r9, [r2, #vp8_blockd_dqcoeff]
+
+    veor.s16        q10, q12            ; y2^sz2
+    veor.s16        q11, q13
+
+    vst1.s16        {q4, q5}, [r7]      ; store: qcoeff = x1
+
+
+    vsub.s16        q10, q12            ; x2=(y2^sz2)-sz2 = (y2^sz2)-(-1) (2's complement)
+    vsub.s16        q11, q13
+
+    ldr             r6, [r3, #vp8_blockd_qcoeff]
+
+    vmul.s16        q2, q6, q4          ; x * Dequant
+    vmul.s16        q3, q7, q5
+
+    ldr             r0, _inv_zig_zag_   ; load ptr of inverse zigzag table
+
+    vceq.s16        q8, q8              ; set q8 to all 1
+
+    vst1.s16        {q10, q11}, [r6]    ; store: qcoeff = x2
+
+    vmul.s16        q12, q6, q10        ; x2 * Dequant
+    vmul.s16        q13, q7, q11
+
+    vld1.16         {q6, q7}, [r0@128]  ; load inverse scan order
+
+    vtst.16         q14, q4, q8         ; now find eob
+    vtst.16         q15, q5, q8         ; non-zero element is set to all 1
+
+    vst1.s16        {q2, q3}, [r9]      ; store dqcoeff = x * Dequant
+
+    ldr             r7, [r3, #vp8_blockd_dqcoeff]
+
+    vand            q0, q6, q14         ; get all valid numbers from scan array
+    vand            q1, q7, q15
+
+    vst1.s16        {q12, q13}, [r7]    ; store dqcoeff = x2 * Dequant
+
+    vtst.16         q2, q10, q8         ; now find eob
+    vtst.16         q3, q11, q8         ; non-zero element is set to all 1
+
+    vmax.u16        q0, q0, q1          ; find maximum value in q0, q1
+
+    vand            q10, q6, q2         ; get all valid numbers from scan array
+    vand            q11, q7, q3
+    vmax.u16        q10, q10, q11       ; find maximum value in q10, q11
+
+    vmax.u16        d0, d0, d1
+    vmax.u16        d20, d20, d21
+    vmovl.u16       q0, d0
+    vmovl.u16       q10, d20
+
+
+    vmax.u32        d0, d0, d1
+    vmax.u32        d20, d20, d21
+    vpmax.u32       d0, d0, d0
+    vpmax.u32       d20, d20, d20
+
+    add             r4, r2, #vp8_blockd_eob
+    add             r5, r3, #vp8_blockd_eob
+
+    vst1.32         {d0[0]}, [r4@32]
+    vst1.32         {d20[0]}, [r5@32]
+
+    vldmia          sp!, {q4-q7}
+    ldmfd           sp!, {r4-r9}
+    bx              lr
+
+    ENDP
+
+;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
+|vp8_fast_quantize_b_neon| PROC
+
+    stmfd           sp!, {r4-r7}
+
+    ldr             r3, [r0, #vp8_block_coeff]
+    ldr             r4, [r0, #vp8_block_quant_fast]
+    ldr             r5, [r0, #vp8_block_round]
+
+    vld1.16         {q0, q1}, [r3@128]  ; load z
+    vorr.s16        q14, q0, q1         ; check if all zero (step 1)
+    ldr             r6, [r1, #vp8_blockd_qcoeff]
+    ldr             r7, [r1, #vp8_blockd_dqcoeff]
+    vorr.s16        d28, d28, d29       ; check if all zero (step 2)
+
+    vabs.s16        q12, q0             ; calculate x = abs(z)
+    vabs.s16        q13, q1
+
+    ;arithmetic right shift by 15 extracts the sign: all 0s if positive, all 1s if negative
+    vshr.s16        q2, q0, #15         ; sz
+    vmov            r2, r3, d28         ; check if all zero (step 3)
+    vshr.s16        q3, q1, #15
+
+    vld1.s16        {q14, q15}, [r5@128]; load round_ptr [0-15]
+    vld1.s16        {q8, q9}, [r4@128]  ; load quant_ptr [0-15]
+
+    vadd.s16        q12, q14            ; x + Round
+    vadd.s16        q13, q15
+
+    ldr             r0, _inv_zig_zag_   ; load ptr of inverse zigzag table
+
+    vqdmulh.s16     q12, q8             ; y = ((Round+abs(z)) * Quant) >> 16
+    vqdmulh.s16     q13, q9
+
+    vld1.16         {q10, q11}, [r0@128]; load inverse scan order
+
+    vceq.s16        q8, q8              ; set q8 to all 1
+
+    ldr             r4, [r1, #vp8_blockd_dequant]
+
+    vshr.s16        q12, #1             ; right shift 1 after vqdmulh
+    vshr.s16        q13, #1
+
+    orr             r2, r2, r3          ; check if all zero (step 4)
+    cmp             r2, #0              ; check if all zero (step 5)
+    beq             zero_output         ; check if all zero (step 6)
+
+    ;modify data to have its original sign
+    veor.s16        q12, q2             ; y^sz
+    veor.s16        q13, q3
+
+    vsub.s16        q12, q2             ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+    vsub.s16        q13, q3
+
+    vld1.s16        {q2, q3}, [r4@128]  ; load dequant_ptr[i]
+
+    vtst.16         q14, q12, q8        ; now find eob
+    vtst.16         q15, q13, q8        ; non-zero element is set to all 1
+
+    vst1.s16        {q12, q13}, [r6@128]; store: qcoeff = x1
+
+    vand            q10, q10, q14       ; get all valid numbers from scan array
+    vand            q11, q11, q15
+
+
+    vmax.u16        q0, q10, q11        ; find maximum value in q0, q1
+    vmax.u16        d0, d0, d1
+    vmovl.u16       q0, d0
+
+    vmul.s16        q2, q12             ; x * Dequant
+    vmul.s16        q3, q13
+
+    vmax.u32        d0, d0, d1
+    vpmax.u32       d0, d0, d0
+
+    vst1.s16        {q2, q3}, [r7@128]  ; store dqcoeff = x * Dequant
+
+    add             r4, r1, #vp8_blockd_eob
+    vst1.32         {d0[0]}, [r4@32]
+
+    ldmfd           sp!, {r4-r7}
+    bx              lr
+
+zero_output
+    str             r2, [r1, #vp8_blockd_eob]
+    vst1.s16        {q0, q1}, [r6@128]  ; qcoeff = 0
+    vst1.s16        {q0, q1}, [r7@128]  ; dqcoeff = 0
+
+    ldmfd           sp!, {r4-r7}
+    bx              lr
+
+    ENDP
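+
+; A hypothetical C reference for the quantizer above (illustrative names;
+; the NEON code additionally short-circuits an all-zero input block):
+;
+;   void fast_quantize_b(const short *z, const short *round,
+;                        const short *quant, const short *dequant,
+;                        short *qcoeff, short *dqcoeff, char *eob,
+;                        const short *inv_zig_zag) {
+;     int i, last = 0;
+;     for (i = 0; i < 16; i++) {
+;       int sz = z[i] >> 15;                     /* 0 or -1 */
+;       int x  = z[i] < 0 ? -z[i] : z[i];        /* abs(z) */
+;       int y  = ((x + round[i]) * quant[i]) >> 16;
+;       int x1 = (y ^ sz) - sz;                  /* restore the sign */
+;       qcoeff[i]  = (short)x1;
+;       dqcoeff[i] = (short)(x1 * dequant[i]);
+;       if (x1 && inv_zig_zag[i] > last)
+;         last = inv_zig_zag[i];
+;     }
+;     *eob = (char)last;
+;   }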
+
+; default inverse zigzag table is defined in vp9/common/entropy.c
+_inv_zig_zag_
+    DCD inv_zig_zag
+
+    ALIGN 16    ; enable use of @128 bit aligned loads
+inv_zig_zag
+    DCW 0x0001, 0x0002, 0x0006, 0x0007
+    DCW 0x0003, 0x0005, 0x0008, 0x000d
+    DCW 0x0004, 0x0009, 0x000c, 0x000e
+    DCW 0x000a, 0x000b, 0x000f, 0x0010
+
+    END
+
--- /dev/null
+++ b/vp9/encoder/arm/neon/picklpf_arm.c
@@ -1,0 +1,49 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/onyxc_int.h"
+#include "vp9/encoder/onyx_int.h"
+#include "vp9/encoder/quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp9/common/alloccommon.h"
+
+extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
+
+
+void
+vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
+  unsigned char *src_y, *dst_y;
+  int yheight;
+  int ystride;
+  int border;
+  int yoffset;
+  int linestocopy;
+
+  border   = src_ybc->border;
+  yheight  = src_ybc->y_height;
+  ystride  = src_ybc->y_stride;
+
+  linestocopy = (yheight >> (Fraction + 4));
+
+  if (linestocopy < 1)
+    linestocopy = 1;
+
+  linestocopy <<= 4;
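+  /* The shifts above select a whole number of 16-line units to copy:
+   * yheight >> (Fraction + 4) gives the unit count (clamped to at least
+   * one) and the << 4 converts it back to a line count. */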
+
+  yoffset  = ystride * ((yheight >> 5) * 16 - 8);
+  src_y = src_ybc->y_buffer + yoffset;
+  dst_y = dst_ybc->y_buffer + yoffset;
+
+  // vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16));
+  vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride * (linestocopy + 16)));
+}
--- /dev/null
+++ b/vp9/encoder/arm/neon/sad16_neon.asm
@@ -1,0 +1,207 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sad16x16_neon|
+    EXPORT  |vp8_sad16x8_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int  src_stride
+; r2    unsigned char *ref_ptr
+; r3    int  ref_stride
+|vp8_sad16x16_neon| PROC
+;;
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+    vabdl.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+;;
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+    vabal.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+;;
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+    vabal.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+;;
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+    vabal.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0]
+    vld1.8          {q7}, [r2]
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vadd.u16        q0, q12, q13
+
+    vpaddl.u16      q1, q0
+    vpaddl.u32      q0, q1
+
+    vadd.u32        d0, d0, d1
+
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
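+
+; A hypothetical C reference for the SAD kernels in this file (illustrative
+; names, not part of the source):
+;
+;   unsigned int sad_c(const unsigned char *src, int src_stride,
+;                      const unsigned char *ref, int ref_stride,
+;                      int width, int height) {
+;     unsigned int sad = 0;
+;     int r, c;
+;     for (r = 0; r < height; r++) {
+;       for (c = 0; c < width; c++) {
+;         int d = src[c] - ref[c];
+;         sad += d < 0 ? -d : d;
+;       }
+;       src += src_stride;
+;       ref += ref_stride;
+;     }
+;     return sad;
+;   }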
+
+;==============================
+;unsigned int vp8_sad16x8_c(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+|vp8_sad16x8_neon| PROC
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+    vabdl.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+    vabal.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vadd.u16        q0, q12, q13
+
+    vpaddl.u16      q1, q0
+    vpaddl.u32      q0, q1
+
+    vadd.u32        d0, d0, d1
+
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/sad8_neon.asm
@@ -1,0 +1,209 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sad8x8_neon|
+    EXPORT  |vp8_sad8x16_neon|
+    EXPORT  |vp8_sad4x4_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; unsigned int vp8_sad8x8_c(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+
+|vp8_sad8x8_neon| PROC
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q12, d6, d14
+
+    vpaddl.u16      q1, q12
+    vpaddl.u32      q0, q1
+    vadd.u32        d0, d0, d1
+
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+;============================
+;unsigned int vp8_sad8x16_c(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+
+|vp8_sad8x16_neon| PROC
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q12, d6, d14
+
+    vpaddl.u16      q1, q12
+    vpaddl.u32      q0, q1
+    vadd.u32        d0, d0, d1
+
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+;===========================
+;unsigned int vp8_sad4x4_c(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+
+|vp8_sad4x4_neon| PROC
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q12, d6, d14
+
+    vpaddl.u16      d1, d24
+    vpaddl.u32      d0, d1
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/shortfdct_neon.asm
@@ -1,0 +1,221 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_short_fdct4x4_neon|
+    EXPORT  |vp8_short_fdct8x4_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=4
+
+
+    ALIGN 16    ; enable use of @128 bit aligned loads
+coeff
+    DCW      5352,  5352,  5352, 5352
+    DCW      2217,  2217,  2217, 2217
+    DCD     14500, 14500, 14500, 14500
+    DCD      7500,  7500,  7500, 7500
+    DCD     12000, 12000, 12000, 12000
+    DCD     51000, 51000, 51000, 51000
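+
+; The multipliers correspond to the 4x4 DCT rotation in Q12 fixed point:
+; 5352 = round(4096 * sqrt(2) * cos(pi/8)) and 2217 = round(4096 * sqrt(2) *
+; sin(pi/8)); 14500/7500 and 12000/51000 are the rounding biases applied
+; before the >>12 (first pass) and >>16 (second pass) shifts below.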
+
+;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+|vp8_short_fdct4x4_neon| PROC
+
+    ; Part one
+    vld1.16         {d0}, [r0@64], r2
+    adr             r12, coeff
+    vld1.16         {d1}, [r0@64], r2
+    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
+    vld1.16         {d2}, [r0@64], r2
+    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
+    vld1.16         {d3}, [r0@64], r2
+
+    ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
+    vtrn.32         d0, d2
+    vtrn.32         d1, d3
+    vld1.32         {q11,q12}, [r12@128]    ; q11=12000, q12=51000
+    vtrn.16         d0, d1
+    vtrn.16         d2, d3
+
+    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[3]
+    vadd.s16        d5, d1, d2      ; b1 = ip[1] + ip[2]
+    vsub.s16        d6, d1, d2      ; c1 = ip[1] - ip[2]
+    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[3]
+
+    vshl.s16        q2, q2, #3      ; (a1, b1) << 3
+    vshl.s16        q3, q3, #3      ; (c1, d1) << 3
+
+    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1
+    vsub.s16        d2, d4, d5      ; op[2] = a1 - b1
+
+    vmlal.s16       q9, d7, d16     ; d1*5352 + 14500
+    vmlal.s16       q10, d7, d17    ; d1*2217 + 7500
+    vmlal.s16       q9, d6, d17     ; c1*2217 + d1*5352 + 14500
+    vmlsl.s16       q10, d6, d16    ; d1*2217 - c1*5352 + 7500
+
+    vshrn.s32       d1, q9, #12     ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
+    vshrn.s32       d3, q10, #12    ; op[3] = (d1*2217 - c1*5352 +  7500)>>12
+
+
+    ; Part two
+
+    ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+    vtrn.32         d0, d2
+    vtrn.32         d1, d3
+    vtrn.16         d0, d1
+    vtrn.16         d2, d3
+
+    vmov.s16        d26, #7
+
+    vadd.s16        d4, d0, d3      ; a1 = ip[0] + ip[12]
+    vadd.s16        d5, d1, d2      ; b1 = ip[4] + ip[8]
+    vsub.s16        d6, d1, d2      ; c1 = ip[4] - ip[8]
+    vadd.s16        d4, d4, d26     ; a1 + 7
+    vsub.s16        d7, d0, d3      ; d1 = ip[0] - ip[12]
+
+    vadd.s16        d0, d4, d5      ; op[0] = a1 + b1 + 7
+    vsub.s16        d2, d4, d5      ; op[8] = a1 - b1 + 7
+
+    vmlal.s16       q11, d7, d16    ; d1*5352 + 12000
+    vmlal.s16       q12, d7, d17    ; d1*2217 + 51000
+
+    vceq.s16        d4, d7, #0
+
+    vshr.s16        d0, d0, #4
+    vshr.s16        d2, d2, #4
+
+    vmlal.s16       q11, d6, d17    ; c1*2217 + d1*5352 + 12000
+    vmlsl.s16       q12, d6, d16    ; d1*2217 - c1*5352 + 51000
+
+    vmvn.s16        d4, d4
+    vshrn.s32       d1, q11, #16    ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
+    vsub.s16        d1, d1, d4      ; op[4] += (d1!=0)
+    vshrn.s32       d3, q12, #16    ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
+
+    vst1.16         {q0, q1}, [r1@128]
+
+    bx              lr
+
+    ENDP
+
+;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
+|vp8_short_fdct8x4_neon| PROC
+
+    ; Part one
+
+    vld1.16         {q0}, [r0@128], r2
+    adr             r12, coeff
+    vld1.16         {q1}, [r0@128], r2
+    vld1.16         {q8}, [r12@128]!        ; d16=5352,  d17=2217
+    vld1.16         {q2}, [r0@128], r2
+    vld1.32         {q9, q10}, [r12@128]!   ;  q9=14500, q10=7500
+    vld1.16         {q3}, [r0@128], r2
+
+    ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
+    vtrn.32         q0, q2          ; [A0|B0]
+    vtrn.32         q1, q3          ; [A1|B1]
+    vtrn.16         q0, q1          ; [A2|B2]
+    vtrn.16         q2, q3          ; [A3|B3]
+
+    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[3]
+    vadd.s16        q12, q1, q2     ; b1 = ip[1] + ip[2]
+    vsub.s16        q13, q1, q2     ; c1 = ip[1] - ip[2]
+    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[3]
+
+    vshl.s16        q11, q11, #3    ; a1 << 3
+    vshl.s16        q12, q12, #3    ; b1 << 3
+    vshl.s16        q13, q13, #3    ; c1 << 3
+    vshl.s16        q14, q14, #3    ; d1 << 3
+
+    vadd.s16        q0, q11, q12    ; [A0 | B0] = a1 + b1
+    vsub.s16        q2, q11, q12    ; [A2 | B2] = a1 - b1
+
+    vmov.s16        q11, q9         ; 14500
+    vmov.s16        q12, q10        ; 7500
+
+    vmlal.s16       q9, d28, d16    ; A[1] = d1*5352 + 14500
+    vmlal.s16       q10, d28, d17   ; A[3] = d1*2217 + 7500
+    vmlal.s16       q11, d29, d16   ; B[1] = d1*5352 + 14500
+    vmlal.s16       q12, d29, d17   ; B[3] = d1*2217 + 7500
+
+    vmlal.s16       q9, d26, d17    ; A[1] = c1*2217 + d1*5352 + 14500
+    vmlsl.s16       q10, d26, d16   ; A[3] = d1*2217 - c1*5352 + 7500
+    vmlal.s16       q11, d27, d17   ; B[1] = c1*2217 + d1*5352 + 14500
+    vmlsl.s16       q12, d27, d16   ; B[3] = d1*2217 - c1*5352 + 7500
+
+    vshrn.s32       d2, q9, #12     ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
+    vshrn.s32       d6, q10, #12    ; A[3] = (d1*2217 - c1*5352 +  7500)>>12
+    vshrn.s32       d3, q11, #12    ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
+    vshrn.s32       d7, q12, #12    ; B[3] = (d1*2217 - c1*5352 +  7500)>>12
+
+
+    ; Part two
+    vld1.32         {q9,q10}, [r12@128]    ; q9=12000, q10=51000
+
+    ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
+    vtrn.32         q0, q2          ; q0=[A0 | B0]
+    vtrn.32         q1, q3          ; q1=[A4 | B4]
+    vtrn.16         q0, q1          ; q2=[A8 | B8]
+    vtrn.16         q2, q3          ; q3=[A12|B12]
+
+    vmov.s16        q15, #7
+
+    vadd.s16        q11, q0, q3     ; a1 = ip[0] + ip[12]
+    vadd.s16        q12, q1, q2     ; b1 = ip[4] + ip[8]
+    vadd.s16        q11, q11, q15   ; a1 + 7
+    vsub.s16        q13, q1, q2     ; c1 = ip[4] - ip[8]
+    vsub.s16        q14, q0, q3     ; d1 = ip[0] - ip[12]
+
+    vadd.s16        q0, q11, q12    ; a1 + b1 + 7
+    vsub.s16        q1, q11, q12    ; a1 - b1 + 7
+
+    vmov.s16        q11, q9         ; 12000
+    vmov.s16        q12, q10        ; 51000
+
+    vshr.s16        d0, d0, #4      ; A[0] = (a1 + b1 + 7)>>4
+    vshr.s16        d4, d1, #4      ; B[0] = (a1 + b1 + 7)>>4
+    vshr.s16        d2, d2, #4      ; A[8] = (a1 + b1 + 7)>>4
+    vshr.s16        d6, d3, #4      ; B[8] = (a1 + b1 + 7)>>4
+
+
+    vmlal.s16       q9, d28, d16    ; A[4]  = d1*5352 + 12000
+    vmlal.s16       q10, d28, d17   ; A[12] = d1*2217 + 51000
+    vmlal.s16       q11, d29, d16   ; B[4]  = d1*5352 + 12000
+    vmlal.s16       q12, d29, d17   ; B[12] = d1*2217 + 51000
+
+    vceq.s16        q14, q14, #0
+
+    vmlal.s16       q9, d26, d17    ; A[4]  = c1*2217 + d1*5352 + 12000
+    vmlsl.s16       q10, d26, d16   ; A[12] = d1*2217 - c1*5352 + 51000
+    vmlal.s16       q11, d27, d17   ; B[4]  = c1*2217 + d1*5352 + 12000
+    vmlsl.s16       q12, d27, d16   ; B[12] = d1*2217 - c1*5352 + 51000
+
+    vmvn.s16        q14, q14
+
+    vshrn.s32       d1, q9, #16     ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
+    vshrn.s32       d3, q10, #16    ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
+    vsub.s16        d1, d1, d28     ; A[4] += (d1!=0)
+
+    vshrn.s32       d5, q11, #16    ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
+    vshrn.s32       d7, q12, #16    ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
+    vsub.s16        d5, d5, d29     ; B[4] += (d1!=0)
+
+    vst1.16         {q0, q1}, [r1@128]! ; block A
+    vst1.16         {q2, q3}, [r1@128]! ; block B
+
+    bx              lr
+
+    ENDP
+
+    END
+
--- /dev/null
+++ b/vp9/encoder/arm/neon/subtract_neon.asm
@@ -1,0 +1,185 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_subtract_b_neon|
+    EXPORT |vp8_subtract_mby_neon|
+    EXPORT |vp8_subtract_mbuv_neon|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
+|vp8_subtract_b_neon| PROC
+
+    stmfd   sp!, {r4-r7}
+
+    ldr     r3, [r0, #vp8_block_base_src]
+    ldr     r4, [r0, #vp8_block_src]
+    ldr     r5, [r0, #vp8_block_src_diff]
+    ldr     r3, [r3]
+    ldr     r6, [r0, #vp8_block_src_stride]
+    add     r3, r3, r4                      ; src = *base_src + src
+    ldr     r7, [r1, #vp8_blockd_predictor]
+
+    vld1.8          {d0}, [r3], r6          ;load src
+    vld1.8          {d1}, [r7], r2          ;load pred
+    vld1.8          {d2}, [r3], r6
+    vld1.8          {d3}, [r7], r2
+    vld1.8          {d4}, [r3], r6
+    vld1.8          {d5}, [r7], r2
+    vld1.8          {d6}, [r3], r6
+    vld1.8          {d7}, [r7], r2
+
+    vsubl.u8        q10, d0, d1
+    vsubl.u8        q11, d2, d3
+    vsubl.u8        q12, d4, d5
+    vsubl.u8        q13, d6, d7
+
+    mov             r2, r2, lsl #1
+
+    vst1.16         {d20}, [r5], r2         ;store diff
+    vst1.16         {d22}, [r5], r2
+    vst1.16         {d24}, [r5], r2
+    vst1.16         {d26}, [r5], r2
+
+    ldmfd   sp!, {r4-r7}
+    bx              lr
+
+    ENDP
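+
+; A hypothetical C reference for the 4x4 block subtraction above
+; (illustrative names):
+;
+;   void subtract_b(const unsigned char *src, int src_stride,
+;                   const unsigned char *pred, int pitch, short *diff) {
+;     int r, c;
+;     for (r = 0; r < 4; r++) {
+;       for (c = 0; c < 4; c++)
+;         diff[c] = (short)(src[c] - pred[c]);
+;       src  += src_stride;
+;       pred += pitch;
+;       diff += pitch;
+;     }
+;   }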
+
+
+;==========================================
+;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
+|vp8_subtract_mby_neon| PROC
+    mov             r12, #4
+
+subtract_mby_loop
+    vld1.8          {q0}, [r1], r3          ;load src
+    vld1.8          {q1}, [r2]!             ;load pred
+    vld1.8          {q2}, [r1], r3
+    vld1.8          {q3}, [r2]!
+    vld1.8          {q4}, [r1], r3
+    vld1.8          {q5}, [r2]!
+    vld1.8          {q6}, [r1], r3
+    vld1.8          {q7}, [r2]!
+
+    vsubl.u8        q8, d0, d2
+    vsubl.u8        q9, d1, d3
+    vsubl.u8        q10, d4, d6
+    vsubl.u8        q11, d5, d7
+    vsubl.u8        q12, d8, d10
+    vsubl.u8        q13, d9, d11
+    vsubl.u8        q14, d12, d14
+    vsubl.u8        q15, d13, d15
+
+    vst1.16         {q8}, [r0]!             ;store diff
+    vst1.16         {q9}, [r0]!
+    vst1.16         {q10}, [r0]!
+    vst1.16         {q11}, [r0]!
+    vst1.16         {q12}, [r0]!
+    vst1.16         {q13}, [r0]!
+    vst1.16         {q14}, [r0]!
+    vst1.16         {q15}, [r0]!
+
+    subs            r12, r12, #1
+    bne             subtract_mby_loop
+
+    bx              lr
+    ENDP
+
+;=================================
+;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+|vp8_subtract_mbuv_neon| PROC
+    ldr             r12, [sp]
+
+;u
+    add             r0, r0, #512        ;   short *udiff = diff + 256;
+    add             r3, r3, #256        ;   unsigned char *upred = pred + 256;
+
+    vld1.8          {d0}, [r1], r12         ;load src
+    vld1.8          {d1}, [r3]!             ;load pred
+    vld1.8          {d2}, [r1], r12
+    vld1.8          {d3}, [r3]!
+    vld1.8          {d4}, [r1], r12
+    vld1.8          {d5}, [r3]!
+    vld1.8          {d6}, [r1], r12
+    vld1.8          {d7}, [r3]!
+    vld1.8          {d8}, [r1], r12
+    vld1.8          {d9}, [r3]!
+    vld1.8          {d10}, [r1], r12
+    vld1.8          {d11}, [r3]!
+    vld1.8          {d12}, [r1], r12
+    vld1.8          {d13}, [r3]!
+    vld1.8          {d14}, [r1], r12
+    vld1.8          {d15}, [r3]!
+
+    vsubl.u8        q8, d0, d1
+    vsubl.u8        q9, d2, d3
+    vsubl.u8        q10, d4, d5
+    vsubl.u8        q11, d6, d7
+    vsubl.u8        q12, d8, d9
+    vsubl.u8        q13, d10, d11
+    vsubl.u8        q14, d12, d13
+    vsubl.u8        q15, d14, d15
+
+    vst1.16         {q8}, [r0]!             ;store diff
+    vst1.16         {q9}, [r0]!
+    vst1.16         {q10}, [r0]!
+    vst1.16         {q11}, [r0]!
+    vst1.16         {q12}, [r0]!
+    vst1.16         {q13}, [r0]!
+    vst1.16         {q14}, [r0]!
+    vst1.16         {q15}, [r0]!
+
+;v
+    vld1.8          {d0}, [r2], r12         ;load src
+    vld1.8          {d1}, [r3]!             ;load pred
+    vld1.8          {d2}, [r2], r12
+    vld1.8          {d3}, [r3]!
+    vld1.8          {d4}, [r2], r12
+    vld1.8          {d5}, [r3]!
+    vld1.8          {d6}, [r2], r12
+    vld1.8          {d7}, [r3]!
+    vld1.8          {d8}, [r2], r12
+    vld1.8          {d9}, [r3]!
+    vld1.8          {d10}, [r2], r12
+    vld1.8          {d11}, [r3]!
+    vld1.8          {d12}, [r2], r12
+    vld1.8          {d13}, [r3]!
+    vld1.8          {d14}, [r2], r12
+    vld1.8          {d15}, [r3]!
+
+    vsubl.u8        q8, d0, d1
+    vsubl.u8        q9, d2, d3
+    vsubl.u8        q10, d4, d5
+    vsubl.u8        q11, d6, d7
+    vsubl.u8        q12, d8, d9
+    vsubl.u8        q13, d10, d11
+    vsubl.u8        q14, d12, d13
+    vsubl.u8        q15, d14, d15
+
+    vst1.16         {q8}, [r0]!             ;store diff
+    vst1.16         {q9}, [r0]!
+    vst1.16         {q10}, [r0]!
+    vst1.16         {q11}, [r0]!
+    vst1.16         {q12}, [r0]!
+    vst1.16         {q13}, [r0]!
+    vst1.16         {q14}, [r0]!
+    vst1.16         {q15}, [r0]!
+
+    bx              lr
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/variance_neon.asm
@@ -1,0 +1,276 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_variance16x16_neon|
+    EXPORT  |vp9_variance16x8_neon|
+    EXPORT  |vp9_variance8x16_neon|
+    EXPORT  |vp9_variance8x8_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp9_variance16x16_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #8
+
+variance16x16_neon_loop
+    vld1.8          {q0}, [r0], r1              ;Load up source and reference
+    vld1.8          {q2}, [r2], r3
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    ;VPADAL adds adjacent pairs of elements of a vector and accumulates
+    ;the results into the elements of the destination vector. The explanation
+    ;in the ARM guide is wrong.
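+    ;A scalar sketch of vpadal.s16 q8, q11 (illustrative indexing):
+    ;  for (i = 0; i < 4; i++)
+    ;    q8.s32[i] += (int32_t)q11.s16[2 * i] + q11.s16[2 * i + 1];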
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance16x16_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
+    ;vmov.32        r1, d1[0]
+    ;mul            r0, r0, r0
+    ;str            r1, [r12]
+    ;sub            r0, r1, r0, asr #8
+
+    ;sum is in [-255*256, 255*256], so sum*sum fits in 32 bits. The right
+    ;shift must sign-extend, which is what vshr.s does; s32 has to be used
+    ;to get this right.
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
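+;For reference, a scalar sketch of what the function above computes
+;(illustrative names, not part of the build):
+;  unsigned int sse = 0; int sum = 0;
+;  for (i = 0; i < 16; i++)
+;    for (j = 0; j < 16; j++) {
+;      int d = src_ptr[i * source_stride + j] - ref_ptr[i * recon_stride + j];
+;      sum += d; sse += d * d;
+;    }
+;  *sse_ptr = sse;
+;  return sse - (unsigned int)(((int64_t)sum * sum) >> 8);  /* >>8 == /256 */
+;The 16x8 and 8x16 variants below shift by 7 (128 pixels), 8x8 by 6 (64).
+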
+;================================
+;unsigned int vp9_variance16x8_c(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;   unsigned int *sse)
+|vp9_variance16x8_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #4
+
+variance16x8_neon_loop
+    vld1.8          {q0}, [r0], r1              ;Load up source and reference
+    vld1.8          {q2}, [r2], r3
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance16x8_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.s32        d10, d10, #7
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+;=================================
+;unsigned int vp9_variance8x16_c(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;   unsigned int *sse)
+
+|vp9_variance8x16_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #8
+
+variance8x16_neon_loop
+    vld1.8          {d0}, [r0], r1              ;Load up source and reference
+    vld1.8          {d4}, [r2], r3
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d6}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d2, d6
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+
+    bne             variance8x16_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.s32        d10, d10, #7
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+;==================================
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp9_variance8x8_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #2
+
+variance8x8_neon_loop
+    vld1.8          {d0}, [r0], r1              ;Load up source and reference
+    vld1.8          {d4}, [r2], r3
+    vld1.8          {d1}, [r0], r1
+    vld1.8          {d5}, [r2], r3
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d6}, [r2], r3
+    vld1.8          {d3}, [r0], r1
+    vld1.8          {d7}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance8x8_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.s32        d10, d10, #6
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_memcpy_neon.asm
@@ -1,0 +1,68 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_memcpy_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;=========================================
+;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
+|vp8_memcpy_neon| PROC
+    ;pld                [r1]                        ;preload pred data
+    ;pld                [r1, #128]
+    ;pld                [r1, #256]
+    ;pld                [r1, #384]
+
+    mov             r12, r2, lsr #8                 ;copy 256 bytes of data at a time
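+
+    ;Rough C model of the copy strategy (illustrative): sz >> 8 full
+    ;256-byte blocks in the main loop, then the sz & 0xff remainder in
+    ;16-byte chunks -- so sz is assumed to be a multiple of 16:
+    ;  for (n = sz >> 8; n > 0; n--)        copy_256_bytes();
+    ;  for (r = sz & 0xff; r > 0; r -= 16)  copy_16_bytes();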
+
+memcpy_neon_loop
+    vld1.8          {q0, q1}, [r1]!                 ;load src data
+    subs            r12, r12, #1
+    vld1.8          {q2, q3}, [r1]!
+    vst1.8          {q0, q1}, [r0]!                 ;copy to dst_ptr
+    vld1.8          {q4, q5}, [r1]!
+    vst1.8          {q2, q3}, [r0]!
+    vld1.8          {q6, q7}, [r1]!
+    vst1.8          {q4, q5}, [r0]!
+    vld1.8          {q8, q9}, [r1]!
+    vst1.8          {q6, q7}, [r0]!
+    vld1.8          {q10, q11}, [r1]!
+    vst1.8          {q8, q9}, [r0]!
+    vld1.8          {q12, q13}, [r1]!
+    vst1.8          {q10, q11}, [r0]!
+    vld1.8          {q14, q15}, [r1]!
+    vst1.8          {q12, q13}, [r0]!
+    vst1.8          {q14, q15}, [r0]!
+
+    ;pld                [r1]                        ;preload pred data -- need to adjust for real device
+    ;pld                [r1, #128]
+    ;pld                [r1, #256]
+    ;pld                [r1, #384]
+
+    bne             memcpy_neon_loop
+
+    ands            r3, r2, #0xff                   ;extra copy
+    beq             done_copy_neon_loop
+
+extra_copy_neon_loop
+    vld1.8          {q0}, [r1]!                 ;load src data
+    subs            r3, r3, #16
+    vst1.8          {q0}, [r0]!
+    bne             extra_copy_neon_loop
+
+done_copy_neon_loop
+    bx              lr
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_mse16x16_neon.asm
@@ -1,0 +1,116 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_mse16x16_neon|
+    EXPORT  |vp8_get4x4sse_cs_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;============================
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+;note: in this function the sum is never used, so the sum calculation
+;inherited from vp9_variance() can be removed.
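+;A scalar sketch (illustrative names):
+;  unsigned int sse = 0;
+;  for (i = 0; i < 16; i++)
+;    for (j = 0; j < 16; j++) {
+;      int d = src_ptr[i * source_stride + j] - ref_ptr[i * recon_stride + j];
+;      sse += d * d;
+;    }
+;  *sse_ptr = sse; return sse;   /* no sum/mean correction */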
+
+|vp8_mse16x16_neon| PROC
+    vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse
+    vmov.i8         q8, #0
+    vmov.i8         q9, #0
+    vmov.i8         q10, #0
+
+    mov             r12, #8
+
+mse16x16_neon_loop
+    vld1.8          {q0}, [r0], r1              ;Load up source and reference
+    vld1.8          {q2}, [r2], r3
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vsubl.u8        q11, d0, d4
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vmlal.s16       q7, d22, d22
+    vmlal.s16       q8, d23, d23
+
+    subs            r12, r12, #1
+
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vmlal.s16       q7, d26, d26
+    vmlal.s16       q8, d27, d27
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             mse16x16_neon_loop
+
+    vadd.u32        q7, q7, q8
+    vadd.u32        q9, q9, q10
+
+    ldr             r12, [sp]               ;load *sse from stack
+
+    vadd.u32        q10, q7, q9
+    vpaddl.u32      q1, q10
+    vadd.u64        d0, d2, d3
+
+    vst1.32         {d0[0]}, [r12]
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+
+;=============================
+; r0    unsigned char *src_ptr,
+; r1    int  source_stride,
+; r2    unsigned char *ref_ptr,
+; r3    int  recon_stride
+|vp8_get4x4sse_cs_neon| PROC
+    vld1.8          {d0}, [r0], r1              ;Load up source and reference
+    vld1.8          {d4}, [r2], r3
+    vld1.8          {d1}, [r0], r1
+    vld1.8          {d5}, [r2], r3
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d6}, [r2], r3
+    vld1.8          {d3}, [r0], r1
+    vld1.8          {d7}, [r2], r3
+
+    vsubl.u8        q11, d0, d4
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vmull.s16       q7, d22, d22
+    vmull.s16       q8, d24, d24
+    vmull.s16       q9, d26, d26
+    vmull.s16       q10, d28, d28
+
+    vadd.u32        q7, q7, q8
+    vadd.u32        q9, q9, q10
+    vadd.u32        q9, q7, q9
+
+    vpaddl.u32      q1, q9
+    vadd.u64        d0, d2, d3
+
+    vmov.32         r0, d0[0]
+    bx              lr
+
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
@@ -1,0 +1,103 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_short_walsh4x4_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
+; r0   short *input,
+; r1   short *output,
+; r2   int pitch
+|vp8_short_walsh4x4_neon| PROC
+
+    vld1.16         {d0}, [r0@64], r2   ; load input
+    vld1.16         {d1}, [r0@64], r2
+    vld1.16         {d2}, [r0@64], r2
+    vld1.16         {d3}, [r0@64]
+
+    ;First for-loop
+    ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
+    vtrn.32         d0, d2
+    vtrn.32         d1, d3
+
+    vmov.s32        q15, #3             ; add 3 to all values
+
+    vtrn.16         d0, d1
+    vtrn.16         d2, d3
+
+    vadd.s16        d4, d0, d2          ; ip[0] + ip[2]
+    vadd.s16        d5, d1, d3          ; ip[1] + ip[3]
+    vsub.s16        d6, d1, d3          ; ip[1] - ip[3]
+    vsub.s16        d7, d0, d2          ; ip[0] - ip[2]
+
+    vshl.s16        d4, d4, #2          ; a1 = (ip[0] + ip[2]) << 2
+    vshl.s16        d5, d5, #2          ; d1 = (ip[1] + ip[3]) << 2
+    vshl.s16        d6, d6, #2          ; c1 = (ip[1] - ip[3]) << 2
+    vceq.s16        d16, d4, #0         ; a1 == 0
+    vshl.s16        d7, d7, #2          ; b1 = (ip[0] - ip[2]) << 2
+
+    vadd.s16        d0, d4, d5          ; a1 + d1
+    vmvn            d16, d16            ; a1 != 0
+    vsub.s16        d3, d4, d5          ; op[3] = a1 - d1
+    vadd.s16        d1, d7, d6          ; op[1] = b1 + c1
+    vsub.s16        d2, d7, d6          ; op[2] = b1 - c1
+    vsub.s16        d0, d0, d16         ; op[0] = a1 + d1 + (a1 != 0)
+
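+    ;First-pass result per column (C sketch of the lines above):
+    ;  op[0] = a1 + d1 + (a1 != 0);  op[1] = b1 + c1;
+    ;  op[2] = b1 - c1;              op[3] = a1 - d1;
+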
+    ;Second for-loop
+    ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+    vtrn.32         d1, d3
+    vtrn.32         d0, d2
+    vtrn.16         d2, d3
+    vtrn.16         d0, d1
+
+    vaddl.s16       q8, d0, d2          ; a1 = ip[0]+ip[8]
+    vaddl.s16       q9, d1, d3          ; d1 = ip[4]+ip[12]
+    vsubl.s16       q10, d1, d3         ; c1 = ip[4]-ip[12]
+    vsubl.s16       q11, d0, d2         ; b1 = ip[0]-ip[8]
+
+    vadd.s32        q0, q8, q9          ; a2 = a1 + d1
+    vadd.s32        q1, q11, q10        ; b2 = b1 + c1
+    vsub.s32        q2, q11, q10        ; c2 = b1 - c1
+    vsub.s32        q3, q8, q9          ; d2 = a1 - d1
+
+    vclt.s32        q8, q0, #0
+    vclt.s32        q9, q1, #0
+    vclt.s32        q10, q2, #0
+    vclt.s32        q11, q3, #0
+
+    ; subtract -1 (or 0)
+    vsub.s32        q0, q0, q8          ; a2 += a2 < 0
+    vsub.s32        q1, q1, q9          ; b2 += b2 < 0
+    vsub.s32        q2, q2, q10         ; c2 += c2 < 0
+    vsub.s32        q3, q3, q11         ; d2 += d2 < 0
+
+    vadd.s32        q8, q0, q15         ; a2 + 3
+    vadd.s32        q9, q1, q15         ; b2 + 3
+    vadd.s32        q10, q2, q15        ; c2 + 3
+    vadd.s32        q11, q3, q15        ; d2 + 3
+
+    ; vrshrn is not used here: it would add a rounding constant of 1 << (3-1) = 4
+    vshrn.s32       d0, q8, #3
+    vshrn.s32       d1, q9, #3
+    vshrn.s32       d2, q10, #3
+    vshrn.s32       d3, q11, #3
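+
+    ;Net effect per value (sketch): op = (x + 3 + (x < 0)) >> 3, i.e. an
+    ;arithmetic divide by 8 with a +3 bias (+4 for negative values)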
+
+    vst1.16         {q0, q1}, [r1@128]
+
+    bx              lr
+
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -1,0 +1,425 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_sub_pixel_variance16x16_neon_func|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(r6) unsigned int *sse
+;note: most of the code is copied from bilinear_predict16x16_neon and vp9_variance16x16_neon.
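+;The filter pair is read from the bilinear_taps_coeff table at the end of this
+;file: Filter[0] = taps[2 * offset], Filter[1] = taps[2 * offset + 1]; the two
+;always sum to 128, so each pass computes, per pixel (sketch):
+;  out[i] = (src[i] * Filter[0] + src[i + step] * Filter[1] + 64) >> 7;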
+
+|vp9_sub_pixel_variance16x16_neon_func| PROC
+    push            {r4-r6, lr}
+
+    ldr             r12, _BilinearTaps_coeff_
+    ldr             r4, [sp, #16]           ;load *dst_ptr from stack
+    ldr             r5, [sp, #20]           ;load dst_pixels_per_line from stack
+    ldr             r6, [sp, #24]           ;load *sse from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_bfilter16x16_only
+
+    add             r2, r12, r2, lsl #3     ;calculate filter location
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+
+    vld1.s32        {d31}, [r2]             ;load first_pass filter
+
+    beq             firstpass_bfilter16x16_only
+
+    sub             sp, sp, #272            ;reserve space on stack for temporary storage
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    mov             lr, sp
+    vld1.u8         {d5, d6, d7}, [r0], r1
+
+    mov             r2, #3                  ;loop counter
+    vld1.u8         {d8, d9, d10}, [r0], r1
+
+    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    vdup.8          d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16_loop_neon
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q8, d3, d0
+    vmull.u8        q9, d5, d0
+    vmull.u8        q10, d6, d0
+    vmull.u8        q11, d8, d0
+    vmull.u8        q12, d9, d0
+    vmull.u8        q13, d11, d0
+    vmull.u8        q14, d12, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+    vext.8          d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q9, d5, d1
+    vmlal.u8        q11, d8, d1
+    vmlal.u8        q13, d11, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+    vext.8          d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q10, d6, d1
+    vmlal.u8        q12, d9, d1
+    vmlal.u8        q14, d12, d1
+
+    subs            r2, r2, #1
+
+    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d15, q8, #7
+    vqrshrn.u16    d16, q9, #7
+    vqrshrn.u16    d17, q10, #7
+    vqrshrn.u16    d18, q11, #7
+    vqrshrn.u16    d19, q12, #7
+    vqrshrn.u16    d20, q13, #7
+
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    vqrshrn.u16    d21, q14, #7
+    vld1.u8         {d5, d6, d7}, [r0], r1
+
+    vst1.u8         {d14, d15, d16, d17}, [lr]!     ;store result
+    vld1.u8         {d8, d9, d10}, [r0], r1
+    vst1.u8         {d18, d19, d20, d21}, [lr]!
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    bne             vp8e_filt_blk2d_fp16x16_loop_neon
+
+;First-pass filtering for the remaining 5 lines
+    vld1.u8         {d14, d15, d16}, [r0], r1
+
+    vmull.u8        q9, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q10, d3, d0
+    vmull.u8        q11, d5, d0
+    vmull.u8        q12, d6, d0
+    vmull.u8        q13, d8, d0
+    vmull.u8        q14, d9, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+
+    vmlal.u8        q9, d2, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q11, d5, d1
+    vmlal.u8        q13, d8, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+
+    vmlal.u8        q10, d3, d1             ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q12, d6, d1
+    vmlal.u8        q14, d9, d1
+
+    vmull.u8        q1, d11, d0
+    vmull.u8        q2, d12, d0
+    vmull.u8        q3, d14, d0
+    vmull.u8        q4, d15, d0
+
+    vext.8          d11, d11, d12, #1       ;construct src_ptr[1]
+    vext.8          d14, d14, d15, #1
+
+    vmlal.u8        q1, d11, d1             ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q3, d14, d1
+
+    vext.8          d12, d12, d13, #1
+    vext.8          d15, d15, d16, #1
+
+    vmlal.u8        q2, d12, d1             ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q4, d15, d1
+
+    vqrshrn.u16    d10, q9, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d11, q10, #7
+    vqrshrn.u16    d12, q11, #7
+    vqrshrn.u16    d13, q12, #7
+    vqrshrn.u16    d14, q13, #7
+    vqrshrn.u16    d15, q14, #7
+    vqrshrn.u16    d16, q1, #7
+    vqrshrn.u16    d17, q2, #7
+    vqrshrn.u16    d18, q3, #7
+    vqrshrn.u16    d19, q4, #7
+
+    vst1.u8         {d10, d11, d12, d13}, [lr]!         ;store result
+    vst1.u8         {d14, d15, d16, d17}, [lr]!
+    vst1.u8         {d18, d19}, [lr]!
+
+;Second pass: 16x16
+;secondpass_filter
+    add             r3, r12, r3, lsl #3
+    sub             lr, lr, #272
+
+    vld1.u32        {d31}, [r3]             ;load second_pass filter
+
+    sub             sp, sp, #256
+    mov             r3, sp
+
+    vld1.u8         {d22, d23}, [lr]!       ;load src data
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+    mov             r12, #4                 ;loop counter
+
+vp8e_filt_blk2d_sp16x16_loop_neon
+    vld1.u8         {d24, d25}, [lr]!
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
+    vld1.u8         {d26, d27}, [lr]!
+    vmull.u8        q2, d23, d0
+    vld1.u8         {d28, d29}, [lr]!
+    vmull.u8        q3, d24, d0
+    vld1.u8         {d30, d31}, [lr]!
+
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
+    vmlal.u8        q2, d25, d1
+    vmlal.u8        q3, d26, d1
+    vmlal.u8        q4, d27, d1
+    vmlal.u8        q5, d28, d1
+    vmlal.u8        q6, d29, d1
+    vmlal.u8        q7, d30, d1
+    vmlal.u8        q8, d31, d1
+
+    subs            r12, r12, #1
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+    vqrshrn.u16    d6, q5, #7
+    vqrshrn.u16    d7, q6, #7
+    vqrshrn.u16    d8, q7, #7
+    vqrshrn.u16    d9, q8, #7
+
+    vst1.u8         {d2, d3}, [r3]!         ;store result
+    vst1.u8         {d4, d5}, [r3]!
+    vst1.u8         {d6, d7}, [r3]!
+    vmov            q11, q15
+    vst1.u8         {d8, d9}, [r3]!
+
+    bne             vp8e_filt_blk2d_sp16x16_loop_neon
+
+    b               sub_pixel_variance16x16_neon
+
+;--------------------
+firstpass_bfilter16x16_only
+    mov             r2, #4                      ;loop counter
+    sub             sp, sp, #528            ;reserve space on stack for temporary storage
+    vdup.8          d0, d31[0]                  ;first_pass filter (d0 d1)
+    vdup.8          d1, d31[4]
+    mov             r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16_loop_neon
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    vld1.u8         {d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10}, [r0], r1
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q8, d3, d0
+    vmull.u8        q9, d5, d0
+    vmull.u8        q10, d6, d0
+    vmull.u8        q11, d8, d0
+    vmull.u8        q12, d9, d0
+    vmull.u8        q13, d11, d0
+    vmull.u8        q14, d12, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+    vext.8          d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q9, d5, d1
+    vmlal.u8        q11, d8, d1
+    vmlal.u8        q13, d11, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+    vext.8          d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q10, d6, d1
+    vmlal.u8        q12, d9, d1
+    vmlal.u8        q14, d12, d1
+
+    subs            r2, r2, #1
+
+    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d15, q8, #7
+    vqrshrn.u16    d16, q9, #7
+    vqrshrn.u16    d17, q10, #7
+    vqrshrn.u16    d18, q11, #7
+    vqrshrn.u16    d19, q12, #7
+    vqrshrn.u16    d20, q13, #7
+    vst1.u8         {d14, d15}, [r3]!       ;store result
+    vqrshrn.u16    d21, q14, #7
+
+    vst1.u8         {d16, d17}, [r3]!
+    vst1.u8         {d18, d19}, [r3]!
+    vst1.u8         {d20, d21}, [r3]!
+
+    bne             vp8e_filt_blk2d_fpo16x16_loop_neon
+
+    b               sub_pixel_variance16x16_neon
+
+;---------------------
+secondpass_bfilter16x16_only
+;Second pass: 16x16
+;secondpass_filter
+    sub             sp, sp, #528            ;reserve space on stack for temporary storage
+    add             r3, r12, r3, lsl #3
+    mov             r12, #4                     ;loop counter
+    vld1.u32        {d31}, [r3]                 ;load second_pass filter
+    vld1.u8         {d22, d23}, [r0], r1        ;load src data
+    mov             r3, sp
+
+    vdup.8          d0, d31[0]                  ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+vp8e_filt_blk2d_spo16x16_loop_neon
+    vld1.u8         {d24, d25}, [r0], r1
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
+    vld1.u8         {d26, d27}, [r0], r1
+    vmull.u8        q2, d23, d0
+    vld1.u8         {d28, d29}, [r0], r1
+    vmull.u8        q3, d24, d0
+    vld1.u8         {d30, d31}, [r0], r1
+
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
+    vmlal.u8        q2, d25, d1
+    vmlal.u8        q3, d26, d1
+    vmlal.u8        q4, d27, d1
+    vmlal.u8        q5, d28, d1
+    vmlal.u8        q6, d29, d1
+    vmlal.u8        q7, d30, d1
+    vmlal.u8        q8, d31, d1
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+    vqrshrn.u16    d6, q5, #7
+    vqrshrn.u16    d7, q6, #7
+    vqrshrn.u16    d8, q7, #7
+    vqrshrn.u16    d9, q8, #7
+
+    vst1.u8         {d2, d3}, [r3]!         ;store result
+    subs            r12, r12, #1
+    vst1.u8         {d4, d5}, [r3]!
+    vmov            q11, q15
+    vst1.u8         {d6, d7}, [r3]!
+    vst1.u8         {d8, d9}, [r3]!
+
+    bne             vp8e_filt_blk2d_spo16x16_loop_neon
+
+    b               sub_pixel_variance16x16_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16_neon
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    sub             r3, r3, #256
+    mov             r12, #8
+
+sub_pixel_variance16x16_neon_loop
+    vld1.8          {q0}, [r3]!                 ;Load up source and reference
+    vld1.8          {q2}, [r4], r5
+    vld1.8          {q1}, [r3]!
+    vld1.8          {q3}, [r4], r5
+
+    vsubl.u8        q11, d0, d4                 ;diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vpadal.s16      q8, q11                     ;sum
+    vmlal.s16       q9, d22, d22                ;sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             sub_pixel_variance16x16_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r6]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    add             sp, sp, #528
+    vmov.32         r0, d0[0]                   ;return
+
+    pop             {r4-r6,pc}
+
+    ENDP
+
+;-----------------
+
+_BilinearTaps_coeff_
+    DCD     bilinear_taps_coeff
+bilinear_taps_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
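+;The eight pairs above are {128 - 16 * i, 16 * i} for i = 0..7: eighth-pel
+;bilinear weights in 7-bit fixed point, always summing to 128.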
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -1,0 +1,572 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_variance_halfpixvar16x16_h_neon|
+    EXPORT  |vp9_variance_halfpixvar16x16_v_neon|
+    EXPORT  |vp9_variance_halfpixvar16x16_hv_neon|
+    EXPORT  |vp9_sub_pixel_variance16x16s_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;================================================
+;unsigned int vp9_variance_halfpixvar16x16_h_neon
+;(
+;    unsigned char  *src_ptr, r0
+;    int  src_pixels_per_line,  r1
+;    unsigned char *dst_ptr,  r2
+;    int dst_pixels_per_line,   r3
+;    unsigned int *sse
+;);
+;================================================
+|vp9_variance_halfpixvar16x16_h_neon| PROC
+    push            {lr}
+
+    mov             r12, #4                  ;loop counter
+    ldr             lr, [sp, #4]           ;load *sse from stack
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8_filt_fpo16x16s_4_0_loop_neon
+    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
+    vld1.8          {q11}, [r2], r3
+    vld1.u8         {d4, d5, d6, d7}, [r0], r1
+    vld1.8          {q12}, [r2], r3
+    vld1.u8         {d8, d9, d10, d11}, [r0], r1
+    vld1.8          {q13}, [r2], r3
+    vld1.u8         {d12, d13, d14, d15}, [r0], r1
+
+    ;pld                [r0]
+    ;pld                [r0, r1]
+    ;pld                [r0, r1, lsl #1]
+
+    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
+    vext.8          q3, q2, q3, #1
+    vext.8          q5, q4, q5, #1
+    vext.8          q7, q6, q7, #1
+
+    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+    vld1.8          {q14}, [r2], r3
+    vrhadd.u8       q1, q2, q3
+    vrhadd.u8       q2, q4, q5
+    vrhadd.u8       q3, q6, q7
+
+    vsubl.u8        q4, d0, d22                 ;diff
+    vsubl.u8        q5, d1, d23
+    vsubl.u8        q6, d2, d24
+    vsubl.u8        q7, d3, d25
+    vsubl.u8        q0, d4, d26
+    vsubl.u8        q1, d5, d27
+    vsubl.u8        q2, d6, d28
+    vsubl.u8        q3, d7, d29
+
+    vpadal.s16      q8, q4                     ;sum
+    vmlal.s16       q9, d8, d8                ;sse
+    vmlal.s16       q10, d9, d9
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q5
+    vmlal.s16       q9, d10, d10
+    vmlal.s16       q10, d11, d11
+    vpadal.s16      q8, q6
+    vmlal.s16       q9, d12, d12
+    vmlal.s16       q10, d13, d13
+    vpadal.s16      q8, q7
+    vmlal.s16       q9, d14, d14
+    vmlal.s16       q10, d15, d15
+
+    vpadal.s16      q8, q0                     ;sum
+    vmlal.s16       q9, d0, d0                ;sse
+    vmlal.s16       q10, d1, d1
+    vpadal.s16      q8, q1
+    vmlal.s16       q9, d2, d2
+    vmlal.s16       q10, d3, d3
+    vpadal.s16      q8, q2
+    vmlal.s16       q9, d4, d4
+    vmlal.s16       q10, d5, d5
+    vpadal.s16      q8, q3
+    vmlal.s16       q9, d6, d6
+    vmlal.s16       q10, d7, d7
+
+    bne             vp8_filt_fpo16x16s_4_0_loop_neon
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    pop             {pc}
+    ENDP
+
+;================================================
+;unsigned int vp9_variance_halfpixvar16x16_v_neon
+;(
+;    unsigned char  *src_ptr, r0
+;    int  src_pixels_per_line,  r1
+;    unsigned char *dst_ptr,  r2
+;    int dst_pixels_per_line,   r3
+;    unsigned int *sse
+;);
+;================================================
+|vp9_variance_halfpixvar16x16_v_neon| PROC
+    push            {lr}
+
+    mov             r12, #4                     ;loop counter
+
+    vld1.u8         {q0}, [r0], r1              ;load src data
+    ldr             lr, [sp, #4]                ;load *sse from stack
+
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+vp8_filt_spo16x16s_0_4_loop_neon
+    vld1.u8         {q2}, [r0], r1
+    vld1.8          {q1}, [r2], r3
+    vld1.u8         {q4}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+    vld1.u8         {q6}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+    vld1.u8         {q15}, [r0], r1
+
+    vrhadd.u8       q0, q0, q2
+    vld1.8          {q7}, [r2], r3
+    vrhadd.u8       q2, q2, q4
+    vrhadd.u8       q4, q4, q6
+    vrhadd.u8       q6, q6, q15
+
+    vsubl.u8        q11, d0, d2                 ;diff
+    vsubl.u8        q12, d1, d3
+    vsubl.u8        q13, d4, d6
+    vsubl.u8        q14, d5, d7
+    vsubl.u8        q0, d8, d10
+    vsubl.u8        q1, d9, d11
+    vsubl.u8        q2, d12, d14
+    vsubl.u8        q3, d13, d15
+
+    vpadal.s16      q8, q11                     ;sum
+    vmlal.s16       q9, d22, d22                ;sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    vpadal.s16      q8, q0                     ;sum
+    vmlal.s16       q9, d0, d0                 ;sse
+    vmlal.s16       q10, d1, d1
+    vpadal.s16      q8, q1
+    vmlal.s16       q9, d2, d2
+    vmlal.s16       q10, d3, d3
+    vpadal.s16      q8, q2
+    vmlal.s16       q9, d4, d4
+    vmlal.s16       q10, d5, d5
+
+    vmov            q0, q15
+
+    vpadal.s16      q8, q3
+    vmlal.s16       q9, d6, d6
+    vmlal.s16       q10, d7, d7
+
+    bne             vp8_filt_spo16x16s_0_4_loop_neon
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    pop             {pc}
+    ENDP
+
+;================================================
+;unsigned int vp9_variance_halfpixvar16x16_hv_neon
+;(
+;    unsigned char  *src_ptr, r0
+;    int  src_pixels_per_line,  r1
+;    unsigned char *dst_ptr,  r2
+;    int dst_pixels_per_line,   r3
+;    unsigned int *sse
+;);
+;================================================
+|vp9_variance_halfpixvar16x16_hv_neon| PROC
+    push            {lr}
+
+    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
+
+    ldr             lr, [sp, #4]           ;load *sse from stack
+    vmov.i8         q13, #0                      ;q13 - sum
+    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
+
+    vmov.i8         q14, #0                      ;q14, q15 - sse
+    vmov.i8         q15, #0
+
+    mov             r12, #4                  ;loop counter
+    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8_filt16x16s_4_4_loop_neon
+    vld1.u8         {d4, d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10, d11}, [r0], r1
+    vld1.u8         {d12, d13, d14, d15}, [r0], r1
+    vld1.u8         {d16, d17, d18, d19}, [r0], r1
+
+    ;pld                [r0]
+    ;pld                [r0, r1]
+    ;pld                [r0, r1, lsl #1]
+
+    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
+    vext.8          q5, q4, q5, #1
+    vext.8          q7, q6, q7, #1
+    vext.8          q9, q8, q9, #1
+
+    vrhadd.u8       q1, q2, q3              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+    vrhadd.u8       q2, q4, q5
+    vrhadd.u8       q3, q6, q7
+    vrhadd.u8       q4, q8, q9
+
+    vld1.8          {q5}, [r2], r3
+    vrhadd.u8       q0, q0, q1
+    vld1.8          {q6}, [r2], r3
+    vrhadd.u8       q1, q1, q2
+    vld1.8          {q7}, [r2], r3
+    vrhadd.u8       q2, q2, q3
+    vld1.8          {q8}, [r2], r3
+    vrhadd.u8       q3, q3, q4
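+
+    ;Sketch: each output pixel is the 2-D half-pel average of the
+    ;horizontally averaged rows computed above, i.e.
+    ;  out[i] = (hrow0[i] + hrow1[i] + 1) >> 1, where
+    ;  hrowN[i] = (rowN[i] + rowN[i + 1] + 1) >> 1;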
+
+    vsubl.u8        q9, d0, d10                 ;diff
+    vsubl.u8        q10, d1, d11
+    vsubl.u8        q11, d2, d12
+    vsubl.u8        q12, d3, d13
+
+    vsubl.u8        q0, d4, d14                 ;diff
+    vsubl.u8        q1, d5, d15
+    vsubl.u8        q5, d6, d16
+    vsubl.u8        q6, d7, d17
+
+    vpadal.s16      q13, q9                     ;sum
+    vmlal.s16       q14, d18, d18                ;sse
+    vmlal.s16       q15, d19, d19
+
+    vpadal.s16      q13, q10                     ;sum
+    vmlal.s16       q14, d20, d20                ;sse
+    vmlal.s16       q15, d21, d21
+
+    vpadal.s16      q13, q11                     ;sum
+    vmlal.s16       q14, d22, d22                ;sse
+    vmlal.s16       q15, d23, d23
+
+    vpadal.s16      q13, q12                     ;sum
+    vmlal.s16       q14, d24, d24                ;sse
+    vmlal.s16       q15, d25, d25
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q13, q0                     ;sum
+    vmlal.s16       q14, d0, d0                ;sse
+    vmlal.s16       q15, d1, d1
+
+    vpadal.s16      q13, q1                     ;sum
+    vmlal.s16       q14, d2, d2                ;sse
+    vmlal.s16       q15, d3, d3
+
+    vpadal.s16      q13, q5                     ;sum
+    vmlal.s16       q14, d10, d10                ;sse
+    vmlal.s16       q15, d11, d11
+
+    vmov            q0, q4
+
+    vpadal.s16      q13, q6                     ;sum
+    vmlal.s16       q14, d12, d12                ;sse
+    vmlal.s16       q15, d13, d13
+
+    bne             vp8_filt16x16s_4_4_loop_neon
+
+    vadd.u32        q15, q14, q15                ;accumulate sse
+    vpaddl.s32      q0, q13                      ;accumulate sum
+
+    vpaddl.u32      q1, q15
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    pop             {pc}
+    ENDP
+
+;==============================
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack unsigned char *dst_ptr,
+; stack int dst_pixels_per_line,
+; stack unsigned int *sse
+;note: this function is called from vp8_find_best_half_pixel_step() (used when
+;8 < Speed < 15) and from the first call of vp8_find_best_sub_pixel_step()
+;(used when speed <= 8). In both cases xoffset/yoffset can only be 4 or 0,
+;which means the filter is either bypassed or its coefficients are {64, 64}.
+;This simplified routine only works in that situation.
+;note: it can happen that both xoffset and yoffset are zero; that case can be
+;handled in C code later.
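+;With {64, 64} coefficients the bilinear filter reduces to a half-pel average,
+;which vrhadd.u8 implements directly (sketch):
+;  out[i] = (src[i] + src[i + step] + 1) >> 1;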
+
+|vp9_sub_pixel_variance16x16s_neon| PROC
+    push            {r4, lr}
+
+    ldr             r4, [sp, #8]            ;load *dst_ptr from stack
+    ldr             r12, [sp, #12]          ;load dst_pixels_per_line from stack
+    ldr             lr, [sp, #16]           ;load *sse from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_bfilter16x16s_only
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    beq             firstpass_bfilter16x16s_only
+
+    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
+    sub             sp, sp, #256            ;reserve space on stack for temporary storage
+    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
+    mov             r3, sp
+    mov             r2, #4                  ;loop counter
+    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16s_loop_neon
+    vld1.u8         {d4, d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10, d11}, [r0], r1
+    vld1.u8         {d12, d13, d14, d15}, [r0], r1
+    vld1.u8         {d16, d17, d18, d19}, [r0], r1
+
+    ;pld                [r0]
+    ;pld                [r0, r1]
+    ;pld                [r0, r1, lsl #1]
+
+    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
+    vext.8          q5, q4, q5, #1
+    vext.8          q7, q6, q7, #1
+    vext.8          q9, q8, q9, #1
+
+    vrhadd.u8       q1, q2, q3              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+    vrhadd.u8       q2, q4, q5
+    vrhadd.u8       q3, q6, q7
+    vrhadd.u8       q4, q8, q9
+
+    vrhadd.u8       q0, q0, q1
+    vrhadd.u8       q1, q1, q2
+    vrhadd.u8       q2, q2, q3
+    vrhadd.u8       q3, q3, q4
+
+    subs            r2, r2, #1
+    vst1.u8         {d0, d1 ,d2, d3}, [r3]!         ;store result
+    vmov            q0, q4
+    vst1.u8         {d4, d5, d6, d7}, [r3]!
+
+    bne             vp8e_filt_blk2d_fp16x16s_loop_neon
+
+    b               sub_pixel_variance16x16s_neon
+
+;--------------------
+firstpass_bfilter16x16s_only
+    mov             r2, #2                  ;loop counter
+    sub             sp, sp, #256            ;reserve space on stack for temporary storage
+    mov             r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16s_loop_neon
+    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
+    vld1.u8         {d4, d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10, d11}, [r0], r1
+    vld1.u8         {d12, d13, d14, d15}, [r0], r1
+
+    ;pld                [r0]
+    ;pld                [r0, r1]
+    ;pld                [r0, r1, lsl #1]
+
+    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
+    vld1.u8         {d16, d17, d18, d19}, [r0], r1
+    vext.8          q3, q2, q3, #1
+    vld1.u8         {d20, d21, d22, d23}, [r0], r1
+    vext.8          q5, q4, q5, #1
+    vld1.u8         {d24, d25, d26, d27}, [r0], r1
+    vext.8          q7, q6, q7, #1
+    vld1.u8         {d28, d29, d30, d31}, [r0], r1
+    vext.8          q9, q8, q9, #1
+    vext.8          q11, q10, q11, #1
+    vext.8          q13, q12, q13, #1
+    vext.8          q15, q14, q15, #1
+
+    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+    vrhadd.u8       q1, q2, q3
+    vrhadd.u8       q2, q4, q5
+    vrhadd.u8       q3, q6, q7
+    vrhadd.u8       q4, q8, q9
+    vrhadd.u8       q5, q10, q11
+    vrhadd.u8       q6, q12, q13
+    vrhadd.u8       q7, q14, q15
+
+    subs            r2, r2, #1
+
+    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
+    vst1.u8         {d4, d5, d6, d7}, [r3]!
+    vst1.u8         {d8, d9, d10, d11}, [r3]!
+    vst1.u8         {d12, d13, d14, d15}, [r3]!
+
+    bne             vp8e_filt_blk2d_fpo16x16s_loop_neon
+
+    b               sub_pixel_variance16x16s_neon
+
+;---------------------
+secondpass_bfilter16x16s_only
+    sub             sp, sp, #256            ;reserve space on stack for temporary storage
+
+    mov             r2, #2                  ;loop counter
+    vld1.u8         {d0, d1}, [r0], r1      ;load src data
+    mov             r3, sp
+
+vp8e_filt_blk2d_spo16x16s_loop_neon
+    vld1.u8         {d2, d3}, [r0], r1
+    vld1.u8         {d4, d5}, [r0], r1
+    vld1.u8         {d6, d7}, [r0], r1
+    vld1.u8         {d8, d9}, [r0], r1
+
+    vrhadd.u8       q0, q0, q1
+    vld1.u8         {d10, d11}, [r0], r1
+    vrhadd.u8       q1, q1, q2
+    vld1.u8         {d12, d13}, [r0], r1
+    vrhadd.u8       q2, q2, q3
+    vld1.u8         {d14, d15}, [r0], r1
+    vrhadd.u8       q3, q3, q4
+    vld1.u8         {d16, d17}, [r0], r1
+    vrhadd.u8       q4, q4, q5
+    vrhadd.u8       q5, q5, q6
+    vrhadd.u8       q6, q6, q7
+    vrhadd.u8       q7, q7, q8
+
+    subs            r2, r2, #1
+
+    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
+    vmov            q0, q8
+    vst1.u8         {d4, d5, d6, d7}, [r3]!
+    vst1.u8         {d8, d9, d10, d11}, [r3]!           ;store result
+    vst1.u8         {d12, d13, d14, d15}, [r3]!
+
+    bne             vp8e_filt_blk2d_spo16x16s_loop_neon
+
+    b               sub_pixel_variance16x16s_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16s_neon
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    sub             r3, r3, #256
+    mov             r2, #4
+
+sub_pixel_variance16x16s_neon_loop
+    vld1.8          {q0}, [r3]!                 ;Load up source and reference
+    vld1.8          {q1}, [r4], r12
+    vld1.8          {q2}, [r3]!
+    vld1.8          {q3}, [r4], r12
+    vld1.8          {q4}, [r3]!
+    vld1.8          {q5}, [r4], r12
+    vld1.8          {q6}, [r3]!
+    vld1.8          {q7}, [r4], r12
+
+    vsubl.u8        q11, d0, d2                 ;diff
+    vsubl.u8        q12, d1, d3
+    vsubl.u8        q13, d4, d6
+    vsubl.u8        q14, d5, d7
+    vsubl.u8        q0, d8, d10
+    vsubl.u8        q1, d9, d11
+    vsubl.u8        q2, d12, d14
+    vsubl.u8        q3, d13, d15
+
+    vpadal.s16      q8, q11                     ;sum
+    vmlal.s16       q9, d22, d22                ;sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r2, r2, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    vpadal.s16      q8, q0                     ;sum
+    vmlal.s16       q9, d0, d0                ;sse
+    vmlal.s16       q10, d1, d1
+    vpadal.s16      q8, q1
+    vmlal.s16       q9, d2, d2
+    vmlal.s16       q10, d3, d3
+    vpadal.s16      q8, q2
+    vmlal.s16       q9, d4, d4
+    vmlal.s16       q10, d5, d5
+    vpadal.s16      q8, q3
+    vmlal.s16       q9, d6, d6
+    vmlal.s16       q10, d7, d7
+
+    bne             sub_pixel_variance16x16s_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    add             sp, sp, #256
+    vmov.32         r0, d0[0]                   ;return
+
+    pop             {r4, pc}
+    ENDP
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -1,0 +1,224 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp9_sub_pixel_variance8x8_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(r6) unsigned int *sse
+;note: most of the code is copied from bilinear_predict8x8_neon and vp9_variance8x8_neon.
+
+|vp9_sub_pixel_variance8x8_neon| PROC
+    push            {r4-r5, lr}
+
+    ldr             r12, _BilinearTaps_coeff_
+    ldr             r4, [sp, #12]           ;load *dst_ptr from stack
+    ldr             r5, [sp, #16]           ;load dst_pixels_per_line from stack
+    ldr             lr, [sp, #20]           ;load *sse from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (9x8)
+    add             r2, r12, r2, lsl #3     ;calculate filter location
+
+    vld1.u8         {q1}, [r0], r1          ;load src data
+    vld1.u32        {d31}, [r2]             ;load first_pass filter
+    vld1.u8         {q2}, [r0], r1
+    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
+    vld1.u8         {q3}, [r0], r1
+    vdup.8          d1, d31[4]
+    vld1.u8         {q4}, [r0], r1
+
+    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q7, d4, d0
+    vmull.u8        q8, d6, d0
+    vmull.u8        q9, d8, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d4, d5, #1
+    vext.8          d7, d6, d7, #1
+    vext.8          d9, d8, d9, #1
+
+    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q7, d5, d1
+    vmlal.u8        q8, d7, d1
+    vmlal.u8        q9, d9, d1
+
+    vld1.u8         {q1}, [r0], r1          ;load src data
+    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
+    vld1.u8         {q2}, [r0], r1
+    vqrshrn.u16    d23, q7, #7
+    vld1.u8         {q3}, [r0], r1
+    vqrshrn.u16    d24, q8, #7
+    vld1.u8         {q4}, [r0], r1
+    vqrshrn.u16    d25, q9, #7
+
+    ;first_pass filtering on the remaining 5 lines of data
+    vld1.u8         {q5}, [r0], r1
+
+    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q7, d4, d0
+    vmull.u8        q8, d6, d0
+    vmull.u8        q9, d8, d0
+    vmull.u8        q10, d10, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d4, d5, #1
+    vext.8          d7, d6, d7, #1
+    vext.8          d9, d8, d9, #1
+    vext.8          d11, d10, d11, #1
+
+    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q7, d5, d1
+    vmlal.u8        q8, d7, d1
+    vmlal.u8        q9, d9, d1
+    vmlal.u8        q10, d11, d1
+
+    vqrshrn.u16    d26, q6, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d27, q7, #7
+    vqrshrn.u16    d28, q8, #7
+    vqrshrn.u16    d29, q9, #7
+    vqrshrn.u16    d30, q10, #7
+
+;Second pass: 8x8
+secondpass_filter
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    ;skip_secondpass_filter
+    beq             sub_pixel_variance8x8_neon
+
+    add             r3, r12, r3, lsl #3
+
+    vld1.u32        {d31}, [r3]             ;load second_pass filter
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
+    vmull.u8        q2, d23, d0
+    vmull.u8        q3, d24, d0
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * Filter[1])
+    vmlal.u8        q2, d24, d1
+    vmlal.u8        q3, d25, d1
+    vmlal.u8        q4, d26, d1
+    vmlal.u8        q5, d27, d1
+    vmlal.u8        q6, d28, d1
+    vmlal.u8        q7, d29, d1
+    vmlal.u8        q8, d30, d1
+
+    vqrshrn.u16    d22, q1, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d23, q2, #7
+    vqrshrn.u16    d24, q3, #7
+    vqrshrn.u16    d25, q4, #7
+    vqrshrn.u16    d26, q5, #7
+    vqrshrn.u16    d27, q6, #7
+    vqrshrn.u16    d28, q7, #7
+    vqrshrn.u16    d29, q8, #7
+
+    b               sub_pixel_variance8x8_neon
+
+;--------------------
+skip_firstpass_filter
+    vld1.u8         {d22}, [r0], r1         ;load src data
+    vld1.u8         {d23}, [r0], r1
+    vld1.u8         {d24}, [r0], r1
+    vld1.u8         {d25}, [r0], r1
+    vld1.u8         {d26}, [r0], r1
+    vld1.u8         {d27}, [r0], r1
+    vld1.u8         {d28}, [r0], r1
+    vld1.u8         {d29}, [r0], r1
+    vld1.u8         {d30}, [r0], r1
+
+    b               secondpass_filter
+
+;----------------------
+;vp9_variance8x8_neon
+sub_pixel_variance8x8_neon
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #2
+
+sub_pixel_variance8x8_neon_loop
+    vld1.8          {d0}, [r4], r5              ;load dst data
+    subs            r12, r12, #1
+    vld1.8          {d1}, [r4], r5
+    vld1.8          {d2}, [r4], r5
+    vsubl.u8        q4, d22, d0                 ;calculate diff
+    vld1.8          {d3}, [r4], r5
+
+    vsubl.u8        q5, d23, d1
+    vsubl.u8        q6, d24, d2
+
+    vpadal.s16      q8, q4                      ;sum
+    vmlal.s16       q9, d8, d8                  ;sse
+    vmlal.s16       q10, d9, d9
+
+    vsubl.u8        q7, d25, d3
+
+    vpadal.s16      q8, q5
+    vmlal.s16       q9, d10, d10
+    vmlal.s16       q10, d11, d11
+
+    vmov            q11, q13
+
+    vpadal.s16      q8, q6
+    vmlal.s16       q9, d12, d12
+    vmlal.s16       q10, d13, d13
+
+    vmov            q12, q14
+
+    vpadal.s16      q8, q7
+    vmlal.s16       q9, d14, d14
+    vmlal.s16       q10, d15, d15
+
+    bne             sub_pixel_variance8x8_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
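+;variance = sse - sum*sum/64, where 64 is the pixel count of the 8x8 block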
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #6
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    pop             {r4-r5, pc}
+
+    ENDP
+
+;-----------------
+
+_BilinearTaps_coeff_
+    DCD     bilinear_taps_coeff
+bilinear_taps_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+    END
--- /dev/null
+++ b/vp9/encoder/arm/quantize_arm.c
@@ -1,0 +1,59 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/encoder/quantize.h"
+#include "vp9/common/entropy.h"
+
+
+#if HAVE_ARMV7
+
+/* The vp8_quantize_mbX functions here differ from the corresponding ones in
+ * quantize.c only by using the quantize_b_pair function pointer instead of
+ * the regular quantize_b function pointer. */
+void vp8_quantize_mby_neon(MACROBLOCK *x) {
+  int i;
+  int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+                       && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+  for (i = 0; i < 16; i += 2)
+    x->quantize_b_pair(&x->block[i], &x->block[i + 1],
+                       &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
+
+  if (has_2nd_order)
+    x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
+}
+
+void vp8_quantize_mb_neon(MACROBLOCK *x) {
+  int i;
+  int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+                       && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+  for (i = 0; i < 24; i += 2)
+    x->quantize_b_pair(&x->block[i], &x->block[i + 1],
+                       &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
+
+  if (has_2nd_order)
+    x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
+}
+
+
+void vp8_quantize_mbuv_neon(MACROBLOCK *x) {
+  int i;
+
+  for (i = 16; i < 24; i += 2)
+    x->quantize_b_pair(&x->block[i], &x->block[i + 1],
+                       &x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
+}
+
+#endif /* HAVE_ARMV7 */
--- /dev/null
+++ b/vp9/encoder/arm/quantize_arm.h
@@ -1,0 +1,52 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef QUANTIZE_ARM_H
+#define QUANTIZE_ARM_H
+
+#if HAVE_ARMV6
+
+extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_quantize_fastquantb
+#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6
+#endif
+
+#endif /* HAVE_ARMV6 */
+
+
+#if HAVE_ARMV7
+
+extern prototype_quantize_block(vp8_fast_quantize_b_neon);
+extern prototype_quantize_block_pair(vp8_fast_quantize_b_pair_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_quantize_fastquantb
+#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon
+
+#undef  vp8_quantize_fastquantb_pair
+#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_neon
+
+#undef vp8_quantize_mb
+#define vp8_quantize_mb vp8_quantize_mb_neon
+
+#undef vp8_quantize_mbuv
+#define vp8_quantize_mbuv vp8_quantize_mbuv_neon
+
+#undef vp8_quantize_mby
+#define vp8_quantize_mby vp8_quantize_mby_neon
+#endif
+
+#endif /* HAVE_ARMV7 */
+
+#endif
+
--- /dev/null
+++ b/vp9/encoder/arm/variance_arm.c
@@ -1,0 +1,112 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/common/filter.h"
+#include "vp9/common/arm/bilinearfilter_arm.h"
+
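+/* HALFNDX is the index of the half-pixel position in vp8_bilinear_filters
+ * (the tap pair with two equal taps); the half-pel cases are special-cased
+ * below to the dedicated halfpixvar kernels instead of the generic two-pass
+ * bilinear filter. */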
+#define HALFNDX 8
+
+#if HAVE_ARMV6
+
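+/* Sub-pixel variance is computed in two bilinear passes: the first filters
+ * horizontally into an (h + 1)-row intermediate buffer, the second filters
+ * vertically, and the result is compared against dst_ptr with the
+ * integer-pel variance kernel. */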
+unsigned int vp9_sub_pixel_variance8x8_armv6
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  unsigned short first_pass[10 * 8];
+  unsigned char  second_pass[8 * 8];
+  const short *HFilter, *VFilter;
+
+  HFilter = vp8_bilinear_filters[xoffset];
+  VFilter = vp8_bilinear_filters[yoffset];
+
+  vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+                                          src_pixels_per_line,
+                                          9, 8, HFilter);
+  vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+                                           8, 8, 8, VFilter);
+
+  return vp9_variance8x8_armv6(second_pass, 8, dst_ptr,
+                               dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance16x16_armv6
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  unsigned short first_pass[36 * 16];
+  unsigned char  second_pass[20 * 16];
+  const short *HFilter, *VFilter;
+  unsigned int var;
+
+  if (xoffset == HALFNDX && yoffset == 0) {
+    var = vp9_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
+                                               dst_ptr, dst_pixels_per_line, sse);
+  } else if (xoffset == 0 && yoffset == HALFNDX) {
+    var = vp9_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
+                                               dst_ptr, dst_pixels_per_line, sse);
+  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+    var = vp9_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
+                                                dst_ptr, dst_pixels_per_line, sse);
+  } else {
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+                                            src_pixels_per_line,
+                                            17, 16, HFilter);
+    vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+                                             16, 16, 16, VFilter);
+
+    var = vp9_variance16x16_armv6(second_pass, 16, dst_ptr,
+                                  dst_pixels_per_line, sse);
+  }
+  return var;
+}
+
+#endif /* HAVE_ARMV6 */
+
+
+#if HAVE_ARMV7
+
+unsigned int vp9_sub_pixel_variance16x16_neon
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  if (xoffset == HALFNDX && yoffset == 0)
+    return vp9_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line,
+                                               dst_ptr, dst_pixels_per_line,
+                                               sse);
+  else if (xoffset == 0 && yoffset == HALFNDX)
+    return vp9_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line,
+                                               dst_ptr, dst_pixels_per_line,
+                                               sse);
+  else if (xoffset == HALFNDX && yoffset == HALFNDX)
+    return vp9_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line,
+                                                dst_ptr, dst_pixels_per_line,
+                                                sse);
+  else
+    return vp9_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line,
+                                                 xoffset, yoffset, dst_ptr,
+                                                 dst_pixels_per_line, sse);
+}
+
+#endif
--- /dev/null
+++ b/vp9/encoder/arm/variance_arm.h
@@ -1,0 +1,132 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VARIANCE_ARM_H
+#define VARIANCE_ARM_H
+
+#if HAVE_ARMV6
+
+extern prototype_sad(vp9_sad16x16_armv6);
+extern prototype_variance(vp9_variance16x16_armv6);
+extern prototype_variance(vp9_variance8x8_armv6);
+extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_armv6);
+extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_armv6);
+extern prototype_variance(vp9_variance_halfpixvar16x16_h_armv6);
+extern prototype_variance(vp9_variance_halfpixvar16x16_v_armv6);
+extern prototype_variance(vp9_variance_halfpixvar16x16_hv_armv6);
+extern prototype_variance(vp9_mse16x16_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp9_variance_sad16x16
+#define vp9_variance_sad16x16 vp9_sad16x16_armv6
+
+#undef  vp9_variance_subpixvar16x16
+#define vp9_variance_subpixvar16x16 vp9_sub_pixel_variance16x16_armv6
+
+#undef  vp9_variance_subpixvar8x8
+#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_armv6
+
+#undef  vp9_variance_var16x16
+#define vp9_variance_var16x16 vp9_variance16x16_armv6
+
+#undef  vp9_variance_mse16x16
+#define vp9_variance_mse16x16 vp9_mse16x16_armv6
+
+#undef  vp9_variance_var8x8
+#define vp9_variance_var8x8 vp9_variance8x8_armv6
+
+#undef  vp9_variance_halfpixvar16x16_h
+#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_armv6
+
+#undef  vp9_variance_halfpixvar16x16_v
+#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_armv6
+
+#undef  vp9_variance_halfpixvar16x16_hv
+#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_armv6
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_ARMV6 */
+
+
+#if HAVE_ARMV7
+extern prototype_sad(vp9_sad4x4_neon);
+extern prototype_sad(vp9_sad8x8_neon);
+extern prototype_sad(vp9_sad8x16_neon);
+extern prototype_sad(vp9_sad16x8_neon);
+extern prototype_sad(vp9_sad16x16_neon);
+
+extern prototype_variance(vp9_variance8x8_neon);
+extern prototype_variance(vp9_variance8x16_neon);
+extern prototype_variance(vp9_variance16x8_neon);
+extern prototype_variance(vp9_variance16x16_neon);
+
+extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_neon);
+extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon);
+extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon_func);
+extern prototype_variance(vp9_variance_halfpixvar16x16_h_neon);
+extern prototype_variance(vp9_variance_halfpixvar16x16_v_neon);
+extern prototype_variance(vp9_variance_halfpixvar16x16_hv_neon);
+
+extern prototype_variance(vp9_mse16x16_neon);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp9_variance_sad4x4
+#define vp9_variance_sad4x4 vp9_sad4x4_neon
+
+#undef  vp9_variance_sad8x8
+#define vp9_variance_sad8x8 vp9_sad8x8_neon
+
+#undef  vp9_variance_sad8x16
+#define vp9_variance_sad8x16 vp9_sad8x16_neon
+
+#undef  vp9_variance_sad16x8
+#define vp9_variance_sad16x8 vp9_sad16x8_neon
+
+#undef  vp9_variance_sad16x16
+#define vp9_variance_sad16x16 vp9_sad16x16_neon
+
+#undef  vp9_variance_var8x8
+#define vp9_variance_var8x8 vp9_variance8x8_neon
+
+#undef  vp9_variance_var8x16
+#define vp9_variance_var8x16 vp9_variance8x16_neon
+
+#undef  vp9_variance_var16x8
+#define vp9_variance_var16x8 vp9_variance16x8_neon
+
+#undef  vp9_variance_var16x16
+#define vp9_variance_var16x16 vp9_variance16x16_neon
+
+#undef  vp9_variance_subpixvar8x8
+#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_neon
+
+#undef  vp9_variance_subpixvar16x16
+#define vp9_variance_subpixvar16x16 vp9_sub_pixel_variance16x16_neon
+
+#undef  vp9_variance_halfpixvar16x16_h
+#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_neon
+
+#undef  vp9_variance_halfpixvar16x16_v
+#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_neon
+
+#undef  vp9_variance_halfpixvar16x16_hv
+#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_neon
+
+#undef  vp9_variance_mse16x16
+#define vp9_variance_mse16x16 vp9_mse16x16_neon
+
+#endif
+
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/encoder/asm_enc_offsets.c
@@ -1,0 +1,90 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/asm_offsets.h"
+#include "vpx_config.h"
+#include "block.h"
+#include "vp9/common/blockd.h"
+#include "onyx_int.h"
+#include "treewriter.h"
+#include "tokenize.h"
+
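+/* Each DEFINE() below emits a named constant into the generated assembly
+ * offsets file, so the hand-written encoder assembly stays in sync with the
+ * C structure layout. */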
+BEGIN
+
+/* regular quantize */
+DEFINE(vp9_block_coeff,                         offsetof(BLOCK, coeff));
+DEFINE(vp9_block_zbin,                          offsetof(BLOCK, zbin));
+DEFINE(vp9_block_round,                         offsetof(BLOCK, round));
+DEFINE(vp9_block_quant,                         offsetof(BLOCK, quant));
+DEFINE(vp9_block_quant_fast,                    offsetof(BLOCK, quant_fast));
+DEFINE(vp9_block_zbin_extra,                    offsetof(BLOCK, zbin_extra));
+DEFINE(vp9_block_zrun_zbin_boost,               offsetof(BLOCK, zrun_zbin_boost));
+DEFINE(vp9_block_quant_shift,                   offsetof(BLOCK, quant_shift));
+
+DEFINE(vp9_blockd_qcoeff,                       offsetof(BLOCKD, qcoeff));
+DEFINE(vp9_blockd_dequant,                      offsetof(BLOCKD, dequant));
+DEFINE(vp9_blockd_dqcoeff,                      offsetof(BLOCKD, dqcoeff));
+DEFINE(vp9_blockd_eob,                          offsetof(BLOCKD, eob));
+
+/* subtract */
+DEFINE(vp9_block_base_src,                      offsetof(BLOCK, base_src));
+DEFINE(vp9_block_src,                           offsetof(BLOCK, src));
+DEFINE(vp9_block_src_diff,                      offsetof(BLOCK, src_diff));
+DEFINE(vp9_block_src_stride,                    offsetof(BLOCK, src_stride));
+
+DEFINE(vp9_blockd_predictor,                    offsetof(BLOCKD, predictor));
+
+/* pack tokens */
+DEFINE(vp9_writer_lowvalue,                     offsetof(vp9_writer, lowvalue));
+DEFINE(vp9_writer_range,                        offsetof(vp9_writer, range));
+DEFINE(vp9_writer_value,                        offsetof(vp9_writer, value));
+DEFINE(vp9_writer_count,                        offsetof(vp9_writer, count));
+DEFINE(vp9_writer_pos,                          offsetof(vp9_writer, pos));
+DEFINE(vp9_writer_buffer,                       offsetof(vp9_writer, buffer));
+
+DEFINE(tokenextra_token,                        offsetof(TOKENEXTRA, Token));
+DEFINE(tokenextra_extra,                        offsetof(TOKENEXTRA, Extra));
+DEFINE(tokenextra_context_tree,                 offsetof(TOKENEXTRA, context_tree));
+DEFINE(tokenextra_skip_eob_node,                offsetof(TOKENEXTRA, skip_eob_node));
+DEFINE(TOKENEXTRA_SZ,                           sizeof(TOKENEXTRA));
+
+DEFINE(vp9_extra_bit_struct_sz,                 sizeof(vp9_extra_bit_struct));
+
+DEFINE(vp9_token_value,                         offsetof(vp9_token, value));
+DEFINE(vp9_token_len,                           offsetof(vp9_token, Len));
+
+DEFINE(vp9_extra_bit_struct_tree,               offsetof(vp9_extra_bit_struct, tree));
+DEFINE(vp9_extra_bit_struct_prob,               offsetof(vp9_extra_bit_struct, prob));
+DEFINE(vp9_extra_bit_struct_len,                offsetof(vp9_extra_bit_struct, Len));
+DEFINE(vp9_extra_bit_struct_base_val,           offsetof(vp9_extra_bit_struct, base_val));
+
+DEFINE(vp9_comp_tplist,                         offsetof(VP9_COMP, tplist));
+DEFINE(vp9_comp_common,                         offsetof(VP9_COMP, common));
+
+DEFINE(tokenlist_start,                         offsetof(TOKENLIST, start));
+DEFINE(tokenlist_stop,                          offsetof(TOKENLIST, stop));
+DEFINE(TOKENLIST_SZ,                            sizeof(TOKENLIST));
+
+DEFINE(vp9_common_mb_rows,                      offsetof(VP9_COMMON, mb_rows));
+
+END
+
+/* Add asserts for any offset that is not supported by assembly code, and
+ * for any size that is not supported by assembly code.
+ *
+ * These are used in vp8cx_pack_tokens.  They are hard coded so if their
+ * sizes change they will have to be adjusted.
+ */
+
+#if HAVE_ARMV5TE
+ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
+ct_assert(vp9_extra_bit_struct_sz, sizeof(vp9_extra_bit_struct) == 16)
+#endif
--- /dev/null
+++ b/vp9/encoder/bitstream.c
@@ -1,0 +1,2394 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/header.h"
+#include "encodemv.h"
+#include "vp9/common/entropymode.h"
+#include "vp9/common/findnearmv.h"
+#include "mcomp.h"
+#include "vp9/common/systemdependent.h"
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+#include "vp9/common/pragmas.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_mem/vpx_mem.h"
+#include "bitstream.h"
+#include "segmentation.h"
+
+#include "vp9/common/seg_common.h"
+#include "vp9/common/pred_common.h"
+#include "vp9/common/entropy.h"
+#include "vp9/encoder/encodemv.h"
+#include "vp9/common/entropymv.h"
+
+#if CONFIG_NEWBESTREFMV
+#include "vp9/common/mvref_common.h"
+#endif
+
+#if defined(SECTIONBITS_OUTPUT)
+unsigned __int64 Sectionbits[500];
+#endif
+
+#ifdef ENTROPY_STATS
+int intra_mode_stats [VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES];
+unsigned int tree_update_hist [BLOCK_TYPES]
+                              [COEF_BANDS]
+                              [PREV_COEF_CONTEXTS]
+                              [ENTROPY_NODES][2];
+unsigned int hybrid_tree_update_hist [BLOCK_TYPES]
+                                     [COEF_BANDS]
+                                     [PREV_COEF_CONTEXTS]
+                                     [ENTROPY_NODES][2];
+unsigned int tree_update_hist_8x8 [BLOCK_TYPES_8X8]
+                                  [COEF_BANDS]
+                                  [PREV_COEF_CONTEXTS]
+                                  [ENTROPY_NODES] [2];
+unsigned int hybrid_tree_update_hist_8x8 [BLOCK_TYPES_8X8]
+                                         [COEF_BANDS]
+                                         [PREV_COEF_CONTEXTS]
+                                         [ENTROPY_NODES] [2];
+unsigned int tree_update_hist_16x16 [BLOCK_TYPES_16X16]
+                                    [COEF_BANDS]
+                                    [PREV_COEF_CONTEXTS]
+                                    [ENTROPY_NODES] [2];
+unsigned int hybrid_tree_update_hist_16x16 [BLOCK_TYPES_16X16]
+                                           [COEF_BANDS]
+                                           [PREV_COEF_CONTEXTS]
+                                           [ENTROPY_NODES] [2];
+
+extern unsigned int active_section;
+#endif
+
+#ifdef MODE_STATS
+int count_mb_seg[4] = { 0, 0, 0, 0 };
+#endif
+
+#define vp9_cost_upd  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8)
+#define vp9_cost_upd256  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
+
+#define SEARCH_NEWP
+static int update_bits[255];
+
+static void compute_update_table(void) {
+  int i;
+  for (i = 0; i < 255; i++)
+    update_bits[i] = vp9_count_term_subexp(i, SUBEXP_PARAM, 255);
+}
+
+static int split_index(int i, int n, int modulus) {
+  int max1 = (n - 1 - modulus / 2) / modulus + 1;
+  if (i % modulus == modulus / 2) i = i / modulus;
+  else i = max1 + i - (i + modulus - modulus / 2) / modulus;
+  return i;
+}
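+/* split_index() reorders indices so that those congruent to modulus/2
+ * (mod modulus) come first and therefore get the cheapest subexponential
+ * codes; e.g. with n = 10 and modulus = 3 (illustration only), indices
+ * 1, 4, 7 map to 0, 1, 2 and the remaining indices follow in order. */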
+
+static int remap_prob(int v, int m) {
+  const int n = 256;
+  const int modulus = MODULUS_PARAM;
+  int i;
+  if ((m << 1) <= n)
+    i = vp9_recenter_nonneg(v, m) - 1;
+  else
+    i = vp9_recenter_nonneg(n - 1 - v, n - 1 - m) - 1;
+
+  i = split_index(i, n - 1, modulus);
+  return i;
+}
+
+static void write_prob_diff_update(vp9_writer *const bc,
+                                   vp9_prob newp, vp9_prob oldp) {
+  int delp = remap_prob(newp, oldp);
+  vp9_encode_term_subexp(bc, delp, SUBEXP_PARAM, 255);
+}
+
+static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
+  int delp = remap_prob(newp, oldp);
+  return update_bits[delp] * 256;
+}
+
+static void update_mode(
+  vp9_writer *const bc,
+  int n,
+  vp9_token tok               [/* n */],
+  vp9_tree tree,
+  vp9_prob Pnew               [/* n-1 */],
+  vp9_prob Pcur               [/* n-1 */],
+  unsigned int bct            [/* n-1 */] [2],
+  const unsigned int num_events[/* n */]
+) {
+  unsigned int new_b = 0, old_b = 0;
+  int i = 0;
+
+  vp9_tree_probs_from_distribution(
+    n--, tok, tree,
+    Pnew, bct, num_events,
+    256, 1
+  );
+
+  do {
+    new_b += cost_branch(bct[i], Pnew[i]);
+    old_b += cost_branch(bct[i], Pcur[i]);
+  } while (++i < n);
+
+  if (new_b + (n << 8) < old_b) {
+    int i = 0;
+
+    vp9_write_bit(bc, 1);
+
+    do {
+      const vp9_prob p = Pnew[i];
+
+      vp9_write_literal(bc, Pcur[i] = p ? p : 1, 8);
+    } while (++i < n);
+  } else
+    vp9_write_bit(bc, 0);
+}
+
+static void update_mbintra_mode_probs(VP9_COMP* const cpi,
+                                      vp9_writer* const bc) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  {
+    vp9_prob Pnew   [VP9_YMODES - 1];
+    unsigned int bct [VP9_YMODES - 1] [2];
+
+    update_mode(
+      bc, VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
+      Pnew, cm->fc.ymode_prob, bct, (unsigned int *)cpi->ymode_count
+    );
+  }
+}
+
+static int get_prob(int num, int den) {
+  int p;
+  if (den <= 0)
+    return 128;
+  p = (num * 255 + (den >> 1)) / den;
+  if (p > 255)
+    return 255;
+  else if (p < 1)
+    return 1;
+  return p;
+}
+
+static int get_binary_prob(int n0, int n1) {
+  return get_prob(n0, n0 + n1);
+}
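+/* These are straight rounded ratios clamped to [1, 255], e.g.
+ * get_prob(3, 10) = (3 * 255 + 5) / 10 = 77 and
+ * get_binary_prob(1, 3) = get_prob(1, 4) = (255 + 2) / 4 = 64. */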
+
+void vp9_update_skip_probs(VP9_COMP *cpi) {
+  VP9_COMMON *const pc = &cpi->common;
+  int k;
+
+  for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
+    pc->mbskip_pred_probs[k] = get_binary_prob(cpi->skip_false_count[k],
+                                               cpi->skip_true_count[k]);
+  }
+}
+
+static void update_switchable_interp_probs(VP9_COMP *cpi,
+                                           vp9_writer* const bc) {
+  VP9_COMMON *const pc = &cpi->common;
+  unsigned int branch_ct[32][2];
+  int i, j;
+  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+    vp9_tree_probs_from_distribution(
+        VP9_SWITCHABLE_FILTERS,
+        vp9_switchable_interp_encodings, vp9_switchable_interp_tree,
+        pc->fc.switchable_interp_prob[j], branch_ct,
+        cpi->switchable_interp_count[j], 256, 1);
+    for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
+      if (pc->fc.switchable_interp_prob[j][i] < 1)
+        pc->fc.switchable_interp_prob[j][i] = 1;
+      vp9_write_literal(bc, pc->fc.switchable_interp_prob[j][i], 8);
+    }
+  }
+}
+
+// This function updates the reference frame prediction stats
+static void update_refpred_stats(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int i;
+  int tot_count;
+  vp9_prob new_pred_probs[PREDICTION_PROBS];
+  int old_cost, new_cost;
+
+  if (cm->frame_type == KEY_FRAME) {
+    // Set the prediction probabilities to defaults
+    cm->ref_pred_probs[0] = 120;
+    cm->ref_pred_probs[1] = 80;
+    cm->ref_pred_probs[2] = 40;
+
+    vpx_memset(cpi->ref_pred_probs_update, 0,
+               sizeof(cpi->ref_pred_probs_update));
+  } else {
+    // From the prediction counts set the probabilities for each context
+    for (i = 0; i < PREDICTION_PROBS; i++) {
+      new_pred_probs[i] = get_binary_prob(cpi->ref_pred_count[i][0],
+                                          cpi->ref_pred_count[i][1]);
+
+      // Decide whether or not to update the reference frame probs;
+      // the costs below are in 1/256 bit units.
+      old_cost =
+        (cpi->ref_pred_count[i][0] * vp9_cost_zero(cm->ref_pred_probs[i])) +
+        (cpi->ref_pred_count[i][1] * vp9_cost_one(cm->ref_pred_probs[i]));
+
+      new_cost =
+        (cpi->ref_pred_count[i][0] * vp9_cost_zero(new_pred_probs[i])) +
+        (cpi->ref_pred_count[i][1] * vp9_cost_one(new_pred_probs[i]));
+
+      // Cost saving must be >= 8 bits (2048 in these units)
+      if ((old_cost - new_cost) >= 2048) {
+        cpi->ref_pred_probs_update[i] = 1;
+        cm->ref_pred_probs[i] = new_pred_probs[i];
+      } else
+        cpi->ref_pred_probs_update[i] = 0;
+
+    }
+  }
+}
+
+static void update_mvcount(VP9_COMP *cpi, MACROBLOCK *x,
+                           int_mv *best_ref_mv, int_mv *second_best_ref_mv) {
+  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+  MV mv;
+
+  if (mbmi->mode == SPLITMV) {
+    int i;
+
+    for (i = 0; i < x->partition_info->count; i++) {
+      if (x->partition_info->bmi[i].mode == NEW4X4) {
+        if (x->e_mbd.allow_high_precision_mv) {
+          mv.row = (x->partition_info->bmi[i].mv.as_mv.row
+                    - best_ref_mv->as_mv.row);
+          mv.col = (x->partition_info->bmi[i].mv.as_mv.col
+                    - best_ref_mv->as_mv.col);
+          vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 1);
+          if (x->e_mbd.mode_info_context->mbmi.second_ref_frame) {
+            mv.row = (x->partition_info->bmi[i].second_mv.as_mv.row
+                      - second_best_ref_mv->as_mv.row);
+            mv.col = (x->partition_info->bmi[i].second_mv.as_mv.col
+                      - second_best_ref_mv->as_mv.col);
+            vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv,
+                              &cpi->NMVcount, 1);
+          }
+        } else {
+          mv.row = (x->partition_info->bmi[i].mv.as_mv.row
+                    - best_ref_mv->as_mv.row);
+          mv.col = (x->partition_info->bmi[i].mv.as_mv.col
+                    - best_ref_mv->as_mv.col);
+          vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 0);
+          if (x->e_mbd.mode_info_context->mbmi.second_ref_frame) {
+            mv.row = (x->partition_info->bmi[i].second_mv.as_mv.row
+                      - second_best_ref_mv->as_mv.row);
+            mv.col = (x->partition_info->bmi[i].second_mv.as_mv.col
+                      - second_best_ref_mv->as_mv.col);
+            vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv,
+                              &cpi->NMVcount, 0);
+          }
+        }
+      }
+    }
+  } else if (mbmi->mode == NEWMV) {
+    if (x->e_mbd.allow_high_precision_mv) {
+      mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
+      mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
+      vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 1);
+      if (mbmi->second_ref_frame) {
+        mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
+        mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
+        vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 1);
+      }
+    } else {
+      mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
+      mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
+      vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 0);
+      if (mbmi->second_ref_frame) {
+        mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
+        mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
+        vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 0);
+      }
+    }
+  }
+}
+
+static void write_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
+  write_token(bc, vp9_ymode_tree, p, vp9_ymode_encodings + m);
+}
+
+static void kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
+  write_token(bc, vp9_kf_ymode_tree, p, vp9_kf_ymode_encodings + m);
+}
+
+#if CONFIG_SUPERBLOCKS
+static void sb_kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
+  write_token(bc, vp9_uv_mode_tree, p, vp9_sb_kf_ymode_encodings + m);
+}
+#endif
+
+static void write_i8x8_mode(vp9_writer *bc, int m, const vp9_prob *p) {
+  write_token(bc, vp9_i8x8_mode_tree, p, vp9_i8x8_mode_encodings + m);
+}
+
+static void write_uv_mode(vp9_writer *bc, int m, const vp9_prob *p) {
+  write_token(bc, vp9_uv_mode_tree, p, vp9_uv_mode_encodings + m);
+}
+
+
+static void write_bmode(vp9_writer *bc, int m, const vp9_prob *p) {
+  write_token(bc, vp9_bmode_tree, p, vp9_bmode_encodings + m);
+}
+
+static void write_split(vp9_writer *bc, int x, const vp9_prob *p) {
+  write_token(bc, vp9_mbsplit_tree, p, vp9_mbsplit_encodings + x);
+}
+
+static int prob_update_savings(const unsigned int *ct,
+                               const vp9_prob oldp, const vp9_prob newp,
+                               const vp9_prob upd) {
+  const int old_b = cost_branch256(ct, oldp);
+  const int new_b = cost_branch256(ct, newp);
+  const int update_b = 2048 + vp9_cost_upd256;
+  return (old_b - new_b - update_b);
+}
+
+static int prob_diff_update_savings(const unsigned int *ct,
+                                    const vp9_prob oldp, const vp9_prob newp,
+                                    const vp9_prob upd) {
+  const int old_b = cost_branch256(ct, oldp);
+  const int new_b = cost_branch256(ct, newp);
+  const int update_b = (newp == oldp ? 0 :
+                        prob_diff_update_cost(newp, oldp) + vp9_cost_upd256);
+  return (old_b - new_b - update_b);
+}
+
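+/* Walks newp from the candidate *bestp back towards oldp one step at a
+ * time and keeps the probability whose coding gain minus the cost of
+ * signalling the update (all in 1/256 bit units) is largest. */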
+static int prob_diff_update_savings_search(const unsigned int *ct,
+                                           const vp9_prob oldp, vp9_prob *bestp,
+                                           const vp9_prob upd) {
+  const int old_b = cost_branch256(ct, oldp);
+  int new_b, update_b, savings, bestsavings, step;
+  vp9_prob newp, bestnewp;
+
+  bestsavings = 0;
+  bestnewp = oldp;
+
+  step = (*bestp > oldp ? -1 : 1);
+  for (newp = *bestp; newp != oldp; newp += step) {
+    new_b = cost_branch256(ct, newp);
+    update_b = prob_diff_update_cost(newp, oldp) + vp9_cost_upd256;
+    savings = old_b - new_b - update_b;
+    if (savings > bestsavings) {
+      bestsavings = savings;
+      bestnewp = newp;
+    }
+  }
+  *bestp = bestnewp;
+  return bestsavings;
+}
+
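+/* Boolean-coder write step, repeated inline below for speed: 'split'
+ * divides the current range in proportion to the node probability,
+ * renormalization shifts finished bits out through 'shift'/'count', and
+ * the while() loop over 0xff bytes propagates a carry back through the
+ * already-written part of the buffer. */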
+static void pack_mb_tokens(vp9_writer* const bc,
+                           TOKENEXTRA **tp,
+                           const TOKENEXTRA *const stop) {
+  unsigned int split;
+  unsigned int shift;
+  int count = bc->count;
+  unsigned int range = bc->range;
+  unsigned int lowvalue = bc->lowvalue;
+  TOKENEXTRA *p = *tp;
+
+  while (p < stop) {
+    const int t = p->Token;
+    vp9_token *const a = vp9_coef_encodings + t;
+    const vp9_extra_bit_struct *const b = vp9_extra_bits + t;
+    int i = 0;
+    const unsigned char *pp = p->context_tree;
+    int v = a->value;
+    int n = a->Len;
+
+    if (t == EOSB_TOKEN) {
+      ++p;
+      break;
+    }
+
+    /* skip one or two nodes */
+    if (p->skip_eob_node) {
+      n -= p->skip_eob_node;
+      i = 2 * p->skip_eob_node;
+    }
+
+    do {
+      const int bb = (v >> --n) & 1;
+      split = 1 + (((range - 1) * pp[i >> 1]) >> 8);
+      i = vp9_coef_tree[i + bb];
+
+      if (bb) {
+        lowvalue += split;
+        range = range - split;
+      } else {
+        range = split;
+      }
+
+      shift = vp9_norm[range];
+      range <<= shift;
+      count += shift;
+
+      if (count >= 0) {
+        int offset = shift - count;
+
+        if ((lowvalue << (offset - 1)) & 0x80000000) {
+          int x = bc->pos - 1;
+
+          while (x >= 0 && bc->buffer[x] == 0xff) {
+            bc->buffer[x] = (unsigned char)0;
+            x--;
+          }
+
+          bc->buffer[x] += 1;
+        }
+
+        bc->buffer[bc->pos++] = (lowvalue >> (24 - offset));
+        lowvalue <<= offset;
+        shift = count;
+        lowvalue &= 0xffffff;
+        count -= 8;
+      }
+
+      lowvalue <<= shift;
+    } while (n);
+
+    if (b->base_val) {
+      const int e = p->Extra, L = b->Len;
+
+      if (L) {
+        const unsigned char *pp = b->prob;
+        int v = e >> 1;
+        int n = L;              /* number of bits in v, assumed nonzero */
+        int i = 0;
+
+        do {
+          const int bb = (v >> --n) & 1;
+          split = 1 + (((range - 1) * pp[i >> 1]) >> 8);
+          i = b->tree[i + bb];
+
+          if (bb) {
+            lowvalue += split;
+            range = range - split;
+          } else {
+            range = split;
+          }
+
+          shift = vp9_norm[range];
+          range <<= shift;
+          count += shift;
+
+          if (count >= 0) {
+            int offset = shift - count;
+
+            if ((lowvalue << (offset - 1)) & 0x80000000) {
+              int x = bc->pos - 1;
+
+              while (x >= 0 && bc->buffer[x] == 0xff) {
+                bc->buffer[x] = (unsigned char)0;
+                x--;
+              }
+
+              bc->buffer[x] += 1;
+            }
+
+            bc->buffer[bc->pos++] = (lowvalue >> (24 - offset));
+            lowvalue <<= offset;
+            shift = count;
+            lowvalue &= 0xffffff;
+            count -= 8;
+          }
+
+          lowvalue <<= shift;
+        } while (n);
+      }
+
+      {
+        split = (range + 1) >> 1;
+
+        if (e & 1) {
+          lowvalue += split;
+          range = range - split;
+        } else {
+          range = split;
+        }
+
+        range <<= 1;
+
+        if ((lowvalue & 0x80000000)) {
+          int x = bc->pos - 1;
+
+          while (x >= 0 && bc->buffer[x] == 0xff) {
+            bc->buffer[x] = (unsigned char)0;
+            x--;
+          }
+
+          bc->buffer[x] += 1;
+        }
+
+        lowvalue <<= 1;
+
+        if (!++count) {
+          count = -8;
+          bc->buffer[bc->pos++] = (lowvalue >> 24);
+          lowvalue &= 0xffffff;
+        }
+      }
+
+    }
+    ++p;
+  }
+
+  bc->count = count;
+  bc->lowvalue = lowvalue;
+  bc->range = range;
+  *tp = p;
+}
+
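+/* Stores a partition size as three little-endian bytes; e.g. size 0x0301ff
+ * is written as ff 01 03. */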
+static void write_partition_size(unsigned char *cx_data, int size) {
+  signed char csize;
+
+  csize = size & 0xff;
+  *cx_data = csize;
+  csize = (size >> 8) & 0xff;
+  *(cx_data + 1) = csize;
+  csize = (size >> 16) & 0xff;
+  *(cx_data + 2) = csize;
+}
+
+static void write_mv_ref
+(
+  vp9_writer *bc, MB_PREDICTION_MODE m, const vp9_prob *p
+) {
+#if CONFIG_DEBUG
+  assert(NEARESTMV <= m  &&  m <= SPLITMV);
+#endif
+  write_token(bc, vp9_mv_ref_tree, p,
+              vp9_mv_ref_encoding_array - NEARESTMV + m);
+}
+
+#if CONFIG_SUPERBLOCKS
+static void write_sb_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m,
+                            const vp9_prob *p) {
+#if CONFIG_DEBUG
+  assert(NEARESTMV <= m  &&  m < SPLITMV);
+#endif
+  write_token(bc, vp9_sb_mv_ref_tree, p,
+              vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
+}
+#endif
+
+static void write_sub_mv_ref
+(
+  vp9_writer *bc, B_PREDICTION_MODE m, const vp9_prob *p
+) {
+#if CONFIG_DEBUG
+  assert(LEFT4X4 <= m  &&  m <= NEW4X4);
+#endif
+  write_token(bc, vp9_sub_mv_ref_tree, p,
+              vp9_sub_mv_ref_encoding_array - LEFT4X4 + m);
+}
+
+static void write_nmv(vp9_writer *bc, const MV *mv, const int_mv *ref,
+                      const nmv_context *nmvc, int usehp) {
+  MV e;
+  e.row = mv->row - ref->as_mv.row;
+  e.col = mv->col - ref->as_mv.col;
+
+  vp9_encode_nmv(bc, &e, &ref->as_mv, nmvc);
+  vp9_encode_nmv_fp(bc, &e, &ref->as_mv, nmvc, usehp);
+}
+
+#if CONFIG_NEW_MVREF
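+/* The reference id is coded with a simple tree: id 0 -> "0", 1 -> "10",
+ * 2 -> "110", 3 -> "111", each branch conditioned on ref_id_probs[0..2];
+ * the two helpers below respectively cost and write that code. */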
+static int vp9_cost_mv_ref_id(vp9_prob * ref_id_probs, int mv_ref_id) {
+  int cost;
+
+  // Encode the index for the MV reference.
+  switch (mv_ref_id) {
+    case 0:
+      cost = vp9_cost_zero(ref_id_probs[0]);
+      break;
+    case 1:
+      cost = vp9_cost_one(ref_id_probs[0]);
+      cost += vp9_cost_zero(ref_id_probs[1]);
+      break;
+    case 2:
+      cost = vp9_cost_one(ref_id_probs[0]);
+      cost += vp9_cost_one(ref_id_probs[1]);
+      cost += vp9_cost_zero(ref_id_probs[2]);
+      break;
+    case 3:
+      cost = vp9_cost_one(ref_id_probs[0]);
+      cost += vp9_cost_one(ref_id_probs[1]);
+      cost += vp9_cost_one(ref_id_probs[2]);
+      break;
+
+      // TRAP.. This should not happen
+    default:
+      assert(0);
+      break;
+  }
+
+  return cost;
+}
+
+static void vp9_write_mv_ref_id(vp9_writer *w,
+                                vp9_prob * ref_id_probs,
+                                int mv_ref_id) {
+  // Encode the index for the MV reference.
+  switch (mv_ref_id) {
+    case 0:
+      vp9_write(w, 0, ref_id_probs[0]);
+      break;
+    case 1:
+      vp9_write(w, 1, ref_id_probs[0]);
+      vp9_write(w, 0, ref_id_probs[1]);
+      break;
+    case 2:
+      vp9_write(w, 1, ref_id_probs[0]);
+      vp9_write(w, 1, ref_id_probs[1]);
+      vp9_write(w, 0, ref_id_probs[2]);
+      break;
+    case 3:
+      vp9_write(w, 1, ref_id_probs[0]);
+      vp9_write(w, 1, ref_id_probs[1]);
+      vp9_write(w, 1, ref_id_probs[2]);
+      break;
+
+      // TRAP.. This should not happen
+    default:
+      assert(0);
+      break;
+  }
+}
+
+// Estimate the cost of coding the vector against each reference candidate
+static unsigned int pick_best_mv_ref(MACROBLOCK *x,
+                                     MV_REFERENCE_FRAME ref_frame,
+                                     int_mv target_mv,
+                                     int_mv * mv_ref_list,
+                                     int_mv * best_ref) {
+  int i;
+  int best_index = 0;
+  int cost, cost2;
+  int zero_seen = (mv_ref_list[0].as_int) ? FALSE : TRUE;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int max_mv = MV_MAX;
+
+  cost = vp9_cost_mv_ref_id(xd->mb_mv_ref_id_probs[ref_frame], 0) +
+         vp9_mv_bit_cost(&target_mv,
+                         &mv_ref_list[0],
+                         XMVCOST, 96,
+                         xd->allow_high_precision_mv);
+
+
+  // Use 4 candidates for now: for (i = 1; i < MAX_MV_REFS; ++i) {
+  for (i = 1; i < 4; ++i) {
+    // If we see a 0,0 reference vector for a second time we have reached
+    // the end of the list of valid candidate vectors.
+    if (!mv_ref_list[i].as_int) {
+      if (zero_seen)
+        break;
+      else
+        zero_seen = TRUE;
+    }
+
+    // Check for cases where the reference choice would give rise to an
+    // uncodable/out of range residual for row or col.
+    if ((abs(target_mv.as_mv.row - mv_ref_list[i].as_mv.row) > max_mv) ||
+        (abs(target_mv.as_mv.col - mv_ref_list[i].as_mv.col) > max_mv)) {
+      continue;
+    }
+
+    cost2 = vp9_cost_mv_ref_id(xd->mb_mv_ref_id_probs[ref_frame], i) +
+            vp9_mv_bit_cost(&target_mv,
+                            &mv_ref_list[i],
+                            XMVCOST, 96,
+                            xd->allow_high_precision_mv);
+
+    if (cost2 < cost) {
+      cost = cost2;
+      best_index = i;
+    }
+  }
+
+  (*best_ref).as_int = mv_ref_list[best_index].as_int;
+
+  return best_index;
+}
+#endif
+
+// This function writes the current macroblock's segment id to the bitstream.
+// It should only be called if a segment map update is indicated.
+static void write_mb_segid(vp9_writer *bc,
+                           const MB_MODE_INFO *mi, const MACROBLOCKD *xd) {
+  // Encode the MB segment id.
+  int seg_id = mi->segment_id;
+#if CONFIG_SUPERBLOCKS
+  if (mi->encoded_as_sb) {
+    if (xd->mb_to_right_edge > 0)
+      seg_id = seg_id && xd->mode_info_context[1].mbmi.segment_id;
+    if (xd->mb_to_bottom_edge > 0) {
+      seg_id = seg_id &&
+               xd->mode_info_context[xd->mode_info_stride].mbmi.segment_id;
+      if (xd->mb_to_right_edge > 0)
+        seg_id = seg_id &&
+                xd->mode_info_context[xd->mode_info_stride + 1].mbmi.segment_id;
+    }
+  }
+#endif
+  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
+    switch (seg_id) {
+      case 0:
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
+        break;
+      case 1:
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[1]);
+        break;
+      case 2:
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[2]);
+        break;
+      case 3:
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
+        vp9_write(bc, 1, xd->mb_segment_tree_probs[2]);
+        break;
+
+        // TRAP.. This should not happen
+      default:
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
+        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
+        break;
+    }
+  }
+}
+
+// This function encodes the reference frame
+static void encode_ref_frame(vp9_writer *const bc,
+                             VP9_COMMON *const cm,
+                             MACROBLOCKD *xd,
+                             int segment_id,
+                             MV_REFERENCE_FRAME rf) {
+  int seg_ref_active;
+  int seg_ref_count = 0;
+  seg_ref_active = vp9_segfeature_active(xd,
+                                         segment_id,
+                                         SEG_LVL_REF_FRAME);
+
+  if (seg_ref_active) {
+    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
+                    vp9_check_segref(xd, segment_id, LAST_FRAME) +
+                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
+                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
+  }
+
+  // If segment level coding of this signal is disabled...
+  // or the segment allows multiple reference frame options
+  if (!seg_ref_active || (seg_ref_count > 1)) {
+    // Values used in prediction model coding
+    unsigned char prediction_flag;
+    vp9_prob pred_prob;
+    MV_REFERENCE_FRAME pred_rf;
+
+    // Get the context probability for the prediction flag
+    pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
+
+    // Get the predicted value.
+    pred_rf = vp9_get_pred_ref(cm, xd);
+
+    // Did the chosen reference frame match its predicted value?
+    prediction_flag =
+      (xd->mode_info_context->mbmi.ref_frame == pred_rf);
+
+    vp9_set_pred_flag(xd, PRED_REF, prediction_flag);
+    vp9_write(bc, prediction_flag, pred_prob);
+
+    // If not predicted correctly then code value explicitly
+    if (!prediction_flag) {
+      vp9_prob mod_refprobs[PREDICTION_PROBS];
+
+      vpx_memcpy(mod_refprobs,
+                 cm->mod_refprobs[pred_rf], sizeof(mod_refprobs));
+
+      // If segment coding is enabled, blank out options that can't occur by
+      // setting the branch probability to 0.
+      if (seg_ref_active) {
+        mod_refprobs[INTRA_FRAME] *=
+          vp9_check_segref(xd, segment_id, INTRA_FRAME);
+        mod_refprobs[LAST_FRAME] *=
+          vp9_check_segref(xd, segment_id, LAST_FRAME);
+        mod_refprobs[GOLDEN_FRAME] *=
+          (vp9_check_segref(xd, segment_id, GOLDEN_FRAME) *
+           vp9_check_segref(xd, segment_id, ALTREF_FRAME));
+      }
+
+      if (mod_refprobs[0]) {
+        vp9_write(bc, (rf != INTRA_FRAME), mod_refprobs[0]);
+      }
+
+      // Inter coded
+      if (rf != INTRA_FRAME) {
+        if (mod_refprobs[1]) {
+          vp9_write(bc, (rf != LAST_FRAME), mod_refprobs[1]);
+        }
+
+        if (rf != LAST_FRAME) {
+          if (mod_refprobs[2]) {
+            vp9_write(bc, (rf != GOLDEN_FRAME), mod_refprobs[2]);
+          }
+        }
+      }
+    }
+  }
+
+  // If using the prediction model we have nothing further to do because
+  // the reference frame is fully coded by the segment.
+}
+
+// Update the probabilities used to encode reference frame data
+static void update_ref_probs(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  const int *const rfct = cpi->count_mb_ref_frame_usage;
+  const int rf_intra = rfct[INTRA_FRAME];
+  const int rf_inter = rfct[LAST_FRAME] +
+                       rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+
+  cm->prob_intra_coded = get_binary_prob(rf_intra, rf_inter);
+  cm->prob_last_coded = get_prob(rfct[LAST_FRAME], rf_inter);
+  cm->prob_gf_coded = get_binary_prob(rfct[GOLDEN_FRAME], rfct[ALTREF_FRAME]);
+
+  // Compute a modified set of probabilities to use when prediction of the
+  // reference frame fails
+  vp9_compute_mod_refprobs(cm);
+}
+
+static void pack_inter_mode_mvs(VP9_COMP *const cpi, vp9_writer *const bc) {
+  VP9_COMMON *const pc = &cpi->common;
+  const nmv_context *nmvc = &pc->fc.nmvc;
+  MACROBLOCK *x = &cpi->mb;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  MODE_INFO *m;
+  MODE_INFO *prev_m;
+  TOKENEXTRA *tok = cpi->tok;
+  TOKENEXTRA *tok_end = tok + cpi->tok_count;
+
+  const int mis = pc->mode_info_stride;
+  int mb_row, mb_col;
+  int row, col;
+
+  // Values used in prediction model coding
+  vp9_prob pred_prob;
+  unsigned char prediction_flag;
+
+  int row_delta[4] = { 0, +1,  0, -1};
+  int col_delta[4] = { +1, -1, +1, +1};
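+  // Starting from the top-left MB, the deltas visit (0,0) -> (0,1) ->
+  // (1,0) -> (1,1) within each 2x2 superblock and end at the top-left of
+  // the next superblock column.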
+
+  cpi->mb.partition_info = cpi->mb.pi;
+
+  mb_row = 0;
+  for (row = 0; row < pc->mb_rows; row += 2) {
+    m = pc->mi + row * mis;
+    prev_m = pc->prev_mi + row * mis;
+
+    mb_col = 0;
+    for (col = 0; col < pc->mb_cols; col += 2) {
+      int i;
+
+      // Process the 4 MBs in the order:
+      // top-left, top-right, bottom-left, bottom-right
+#if CONFIG_SUPERBLOCKS
+      vp9_write(bc, m->mbmi.encoded_as_sb, pc->sb_coded);
+#endif
+      for (i = 0; i < 4; i++) {
+        MB_MODE_INFO *mi;
+        MV_REFERENCE_FRAME rf;
+        MB_PREDICTION_MODE mode;
+        int segment_id;
+
+        int dy = row_delta[i];
+        int dx = col_delta[i];
+        int offset_extended = dy * mis + dx;
+
+        if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
+          // MB lies outside frame, move on
+          mb_row += dy;
+          mb_col += dx;
+          m += offset_extended;
+          prev_m += offset_extended;
+          cpi->mb.partition_info += offset_extended;
+          continue;
+        }
+
+        mi = &m->mbmi;
+        rf = mi->ref_frame;
+        mode = mi->mode;
+        segment_id = mi->segment_id;
+
+        // Distance of MB to the various image edges.
+        // These are specified in 1/8th pel units as they are always
+        // compared to MV values that are in 1/8th pel units.
+        xd->mb_to_left_edge = -((mb_col * 16) << 3);
+        xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+        xd->mb_to_top_edge = -((mb_row * 16) << 3);
+        xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+        // Make sure the MacroBlockD mode info pointer is set correctly
+        xd->mode_info_context = m;
+        xd->prev_mode_info_context = prev_m;
+
+#ifdef ENTROPY_STATS
+        active_section = 9;
+#endif
+        if (cpi->mb.e_mbd.update_mb_segmentation_map) {
+          // Is temporal coding of the segment map enabled
+          if (pc->temporal_update) {
+            prediction_flag = vp9_get_pred_flag(xd, PRED_SEG_ID);
+            pred_prob = vp9_get_pred_prob(pc, xd, PRED_SEG_ID);
+
+            // Code the segment id prediction flag for this mb
+            vp9_write(bc, prediction_flag, pred_prob);
+
+            // If the mb segment id wasn't predicted code explicitly
+            if (!prediction_flag)
+              write_mb_segid(bc, mi, &cpi->mb.e_mbd);
+          } else {
+            // Normal unpredicted coding
+            write_mb_segid(bc, mi, &cpi->mb.e_mbd);
+          }
+        }
+
+        if (pc->mb_no_coeff_skip &&
+            (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+             (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
+          int skip_coeff = mi->mb_skip_coeff;
+#if CONFIG_SUPERBLOCKS
+          if (mi->encoded_as_sb) {
+            skip_coeff &= m[1].mbmi.mb_skip_coeff;
+            skip_coeff &= m[mis].mbmi.mb_skip_coeff;
+            skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
+          }
+#endif
+          vp9_write(bc, skip_coeff,
+                    vp9_get_pred_prob(pc, xd, PRED_MBSKIP));
+        }
+
+        // Encode the reference frame.
+        encode_ref_frame(bc, pc, xd, segment_id, rf);
+
+        if (rf == INTRA_FRAME) {
+#ifdef ENTROPY_STATS
+          active_section = 6;
+#endif
+
+          // TODO(rbultje) write using SB tree structure
+
+          if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+            write_ymode(bc, mode, pc->fc.ymode_prob);
+          }
+
+          if (mode == B_PRED) {
+            int j = 0;
+#if CONFIG_COMP_INTRA_PRED
+            int uses_second =
+              m->bmi[0].as_mode.second !=
+              (B_PREDICTION_MODE)(B_DC_PRED - 1);
+            vp9_write(bc, uses_second, 128);
+#endif
+            do {
+#if CONFIG_COMP_INTRA_PRED
+              B_PREDICTION_MODE mode2 = m->bmi[j].as_mode.second;
+#endif
+              write_bmode(bc, m->bmi[j].as_mode.first,
+                          pc->fc.bmode_prob);
+#if CONFIG_COMP_INTRA_PRED
+              if (uses_second) {
+                write_bmode(bc, mode2, pc->fc.bmode_prob);
+              }
+#endif
+            } while (++j < 16);
+          }
+          if (mode == I8X8_PRED) {
+            write_i8x8_mode(bc, m->bmi[0].as_mode.first,
+                            pc->fc.i8x8_mode_prob);
+            write_i8x8_mode(bc, m->bmi[2].as_mode.first,
+                            pc->fc.i8x8_mode_prob);
+            write_i8x8_mode(bc, m->bmi[8].as_mode.first,
+                            pc->fc.i8x8_mode_prob);
+            write_i8x8_mode(bc, m->bmi[10].as_mode.first,
+                            pc->fc.i8x8_mode_prob);
+          } else {
+            write_uv_mode(bc, mi->uv_mode,
+                          pc->fc.uv_mode_prob[mode]);
+          }
+        } else {
+          int_mv best_mv, best_second_mv;
+          int ct[4];
+
+          vp9_prob mv_ref_p [VP9_MVREFS - 1];
+
+          {
+            int_mv n1, n2;
+
+            // Only used for context just now and soon to be deprecated.
+            vp9_find_near_mvs(xd, m, prev_m, &n1, &n2, &best_mv, ct,
+                              rf, cpi->common.ref_frame_sign_bias);
+#if CONFIG_NEWBESTREFMV
+            best_mv.as_int = mi->ref_mvs[rf][0].as_int;
+#endif
+
+            vp9_mv_ref_probs(&cpi->common, mv_ref_p, ct);
+
+#ifdef ENTROPY_STATS
+            accum_mv_refs(mode, ct);
+#endif
+          }
+
+#ifdef ENTROPY_STATS
+          active_section = 3;
+#endif
+
+          // Is the segment coding of mode enabled
+          if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+#if CONFIG_SUPERBLOCKS
+            if (mi->encoded_as_sb) {
+              write_sb_mv_ref(bc, mode, mv_ref_p);
+            } else
+#endif
+            {
+              write_mv_ref(bc, mode, mv_ref_p);
+            }
+            vp9_accum_mv_refs(&cpi->common, mode, ct);
+          }
+
+#if CONFIG_PRED_FILTER
+          // Is the prediction filter enabled
+          if (mode >= NEARESTMV && mode < SPLITMV) {
+            if (cpi->common.pred_filter_mode == 2)
+              vp9_write(bc, mi->pred_filter_enabled,
+                        pc->prob_pred_filter_off);
+            else
+              assert(mi->pred_filter_enabled ==
+                     cpi->common.pred_filter_mode);
+          }
+#endif
+          if (mode >= NEARESTMV && mode <= SPLITMV) {
+            if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+              write_token(bc, vp9_switchable_interp_tree,
+                          vp9_get_pred_probs(&cpi->common, xd,
+                                             PRED_SWITCHABLE_INTERP),
+                          vp9_switchable_interp_encodings +
+                              vp9_switchable_interp_map[mi->interp_filter]);
+            } else {
+              assert(mi->interp_filter ==
+                     cpi->common.mcomp_filter_type);
+            }
+          }
+          if (mi->second_ref_frame &&
+              (mode == NEWMV || mode == SPLITMV)) {
+            int_mv n1, n2;
+
+            // Only used for context just now and soon to be deprecated.
+            vp9_find_near_mvs(xd, m, prev_m,
+                              &n1, &n2, &best_second_mv, ct,
+                              mi->second_ref_frame,
+                              cpi->common.ref_frame_sign_bias);
+
+#if CONFIG_NEWBESTREFMV
+            best_second_mv.as_int =
+              mi->ref_mvs[mi->second_ref_frame][0].as_int;
+#endif
+          }
+
+          // does the feature use compound prediction or not
+          // (if not specified at the frame/segment level)
+          if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+            vp9_write(bc, mi->second_ref_frame != INTRA_FRAME,
+                      vp9_get_pred_prob(pc, xd, PRED_COMP));
+          }
+
+          {
+            switch (mode) { /* new, split require MVs */
+              case NEWMV:
+#ifdef ENTROPY_STATS
+                active_section = 5;
+#endif
+
+#if CONFIG_NEW_MVREF
+                {
+                  unsigned int best_index;
+
+                  // Choose the best mv reference
+                  best_index = pick_best_mv_ref(x, rf, mi->mv[0],
+                                                mi->ref_mvs[rf], &best_mv);
+
+                  // Encode the index of the choice.
+                  vp9_write_mv_ref_id(bc,
+                                      xd->mb_mv_ref_id_probs[rf], best_index);
+
+                  cpi->best_ref_index_counts[rf][best_index]++;
+
+                }
+#endif
+
+                write_nmv(bc, &mi->mv[0].as_mv, &best_mv,
+                          (const nmv_context*) nmvc,
+                          xd->allow_high_precision_mv);
+
+                if (mi->second_ref_frame) {
+#if CONFIG_NEW_MVREF
+                  unsigned int best_index;
+                  MV_REFERENCE_FRAME sec_ref_frame = mi->second_ref_frame;
+
+                  best_index =
+                    pick_best_mv_ref(x, sec_ref_frame, mi->mv[1],
+                                     mi->ref_mvs[sec_ref_frame],
+                                     &best_second_mv);
+
+                  // Encode the index of the choice.
+                  vp9_write_mv_ref_id(bc,
+                                      xd->mb_mv_ref_id_probs[sec_ref_frame],
+                                      best_index);
+
+                  cpi->best_ref_index_counts[sec_ref_frame][best_index]++;
+#endif
+                  write_nmv(bc, &mi->mv[1].as_mv, &best_second_mv,
+                            (const nmv_context*) nmvc,
+                            xd->allow_high_precision_mv);
+                }
+                break;
+              case SPLITMV: {
+                int j = 0;
+
+#ifdef MODE_STATS
+                ++count_mb_seg [mi->partitioning];
+#endif
+
+                write_split(bc, mi->partitioning, cpi->common.fc.mbsplit_prob);
+                cpi->mbsplit_count[mi->partitioning]++;
+
+                do {
+                  B_PREDICTION_MODE blockmode;
+                  int_mv blockmv;
+                  const int *const  L =
+                    vp9_mbsplits [mi->partitioning];
+                  int k = -1;  /* first block in subset j */
+                  int mv_contz;
+                  int_mv leftmv, abovemv;
+
+                  blockmode = cpi->mb.partition_info->bmi[j].mode;
+                  blockmv = cpi->mb.partition_info->bmi[j].mv;
+#if CONFIG_DEBUG
+                  while (j != L[++k])
+                    if (k >= 16)
+                      assert(0);
+#else
+                  while (j != L[++k]);
+#endif
+                  leftmv.as_int = left_block_mv(m, k);
+                  abovemv.as_int = above_block_mv(m, k, mis);
+                  mv_contz = vp9_mv_cont(&leftmv, &abovemv);
+
+                  write_sub_mv_ref(bc, blockmode,
+                                   cpi->common.fc.sub_mv_ref_prob [mv_contz]);
+                  cpi->sub_mv_ref_count[mv_contz][blockmode - LEFT4X4]++;
+                  if (blockmode == NEW4X4) {
+#ifdef ENTROPY_STATS
+                    active_section = 11;
+#endif
+                    write_nmv(bc, &blockmv.as_mv, &best_mv,
+                              (const nmv_context*) nmvc,
+                              xd->allow_high_precision_mv);
+
+                    if (mi->second_ref_frame) {
+                      write_nmv(bc,
+                                &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
+                                &best_second_mv,
+                                (const nmv_context*) nmvc,
+                                xd->allow_high_precision_mv);
+                    }
+                  }
+                } while (++j < cpi->mb.partition_info->count);
+              }
+              break;
+              default:
+                break;
+            }
+          }
+
+          // Update the mvcounts used to tune mv probs but only if this is
+          // the real pack run.
+          if ( !cpi->dummy_packing ) {
+            update_mvcount(cpi, x, &best_mv, &best_second_mv);
+          }
+        }
+
+        if (
+#if CONFIG_SUPERBLOCKS
+            !mi->encoded_as_sb &&
+#endif
+            ((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
+             (rf != INTRA_FRAME && !(mode == SPLITMV &&
+                                     mi->partitioning == PARTITIONING_4X4))) &&
+            pc->txfm_mode == TX_MODE_SELECT &&
+            !((pc->mb_no_coeff_skip && mi->mb_skip_coeff) ||
+              (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+               vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+          TX_SIZE sz = mi->txfm_size;
+          // FIXME(rbultje) code ternary symbol once all experiments are merged
+          vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
+          if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV)
+            vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
+        }
+
+#ifdef ENTROPY_STATS
+        active_section = 1;
+#endif
+        assert(tok < tok_end);
+        pack_mb_tokens(bc, &tok, tok_end);
+
+#if CONFIG_SUPERBLOCKS
+        if (m->mbmi.encoded_as_sb) {
+          assert(!i);
+          mb_col += 2;
+          m += 2;
+          cpi->mb.partition_info += 2;
+          prev_m += 2;
+          break;
+        }
+#endif
+
+        // Next MB
+        mb_row += dy;
+        mb_col += dx;
+        m += offset_extended;
+        prev_m += offset_extended;
+        cpi->mb.partition_info += offset_extended;
+#if CONFIG_DEBUG
+        assert((prev_m - cpi->common.prev_mip) == (m - cpi->common.mip));
+        assert((prev_m - cpi->common.prev_mi) == (m - cpi->common.mi));
+#endif
+      }
+    }
+
+    // Next SB
+    mb_row += 2;
+    m += mis + (1 - (pc->mb_cols & 0x1));
+    prev_m += mis + (1 - (pc->mb_cols & 0x1));
+    cpi->mb.partition_info += mis + (1 - (pc->mb_cols & 0x1));
+  }
+}
+
+
+static void write_mb_modes_kf(const VP9_COMMON  *c,
+                              const MACROBLOCKD *xd,
+                              const MODE_INFO   *m,
+                              int                mode_info_stride,
+                              vp9_writer *const  bc) {
+  const int mis = mode_info_stride;
+  int ym;
+  int segment_id;
+
+  ym = m->mbmi.mode;
+  segment_id = m->mbmi.segment_id;
+
+  if (xd->update_mb_segmentation_map) {
+    write_mb_segid(bc, &m->mbmi, xd);
+  }
+
+  if (c->mb_no_coeff_skip &&
+      (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+       (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
+        int skip_coeff = m->mbmi.mb_skip_coeff;
+#if CONFIG_SUPERBLOCKS
+        if (m->mbmi.encoded_as_sb) {
+          skip_coeff &= m[1].mbmi.mb_skip_coeff;
+          skip_coeff &= m[mis].mbmi.mb_skip_coeff;
+          skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
+        }
+#endif
+        vp9_write(bc, skip_coeff,
+                  vp9_get_pred_prob(c, xd, PRED_MBSKIP));
+  }
+
+#if CONFIG_SUPERBLOCKS
+  if (m->mbmi.encoded_as_sb) {
+    sb_kfwrite_ymode(bc, ym,
+                     c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
+  } else
+#endif
+  {
+    kfwrite_ymode(bc, ym,
+                  c->kf_ymode_prob[c->kf_ymode_probs_index]);
+  }
+
+  if (ym == B_PRED) {
+    const int mis = c->mode_info_stride;
+    int i = 0;
+#if CONFIG_COMP_INTRA_PRED
+    int uses_second =
+      m->bmi[0].as_mode.second !=
+      (B_PREDICTION_MODE)(B_DC_PRED - 1);
+    vp9_write(bc, uses_second, 128);
+#endif
+    do {
+      const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
+      const B_PREDICTION_MODE L = left_block_mode(m, i);
+      const int bm = m->bmi[i].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+      const int bm2 = m->bmi[i].as_mode.second;
+#endif
+
+#ifdef ENTROPY_STATS
+      ++intra_mode_stats [A] [L] [bm];
+#endif
+
+      write_bmode(bc, bm, c->kf_bmode_prob [A] [L]);
+      // printf("    mode: %d\n", bm);
+#if CONFIG_COMP_INTRA_PRED
+      if (uses_second) {
+        write_bmode(bc, bm2, c->kf_bmode_prob [A] [L]);
+      }
+#endif
+    } while (++i < 16);
+  }
+  if (ym == I8X8_PRED) {
+    write_i8x8_mode(bc, m->bmi[0].as_mode.first,
+                    c->fc.i8x8_mode_prob);
+    // printf("    mode: %d\n", m->bmi[0].as_mode.first); fflush(stdout);
+    write_i8x8_mode(bc, m->bmi[2].as_mode.first,
+                    c->fc.i8x8_mode_prob);
+    // printf("    mode: %d\n", m->bmi[2].as_mode.first); fflush(stdout);
+    write_i8x8_mode(bc, m->bmi[8].as_mode.first,
+                    c->fc.i8x8_mode_prob);
+    // printf("    mode: %d\n", m->bmi[8].as_mode.first); fflush(stdout);
+    write_i8x8_mode(bc, m->bmi[10].as_mode.first,
+                    c->fc.i8x8_mode_prob);
+    // printf("    mode: %d\n", m->bmi[10].as_mode.first); fflush(stdout);
+  } else
+    write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
+
+  if (
+#if CONFIG_SUPERBLOCKS
+      !m->mbmi.encoded_as_sb &&
+#endif
+      ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
+      !((c->mb_no_coeff_skip && m->mbmi.mb_skip_coeff) ||
+        (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+    TX_SIZE sz = m->mbmi.txfm_size;
+    // FIXME(rbultje) code ternary symbol once all experiments are merged
+    vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
+    if (sz != TX_4X4 && ym <= TM_PRED)
+      vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);
+  }
+}
+
+static void write_kfmodes(VP9_COMP* const cpi, vp9_writer* const bc) {
+  VP9_COMMON *const c = &cpi->common;
+  const int mis = c->mode_info_stride;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  MODE_INFO *m;
+  int i;
+  int row, col;
+  int mb_row, mb_col;
+  int row_delta[4] = { 0, +1,  0, -1};
+  int col_delta[4] = { +1, -1, +1, +1};
+  TOKENEXTRA *tok = cpi->tok;
+  TOKENEXTRA *tok_end = tok + cpi->tok_count;
+
+  mb_row = 0;
+  for (row = 0; row < c->mb_rows; row += 2) {
+    m = c->mi + row * mis;
+
+    mb_col = 0;
+    for (col = 0; col < c->mb_cols; col += 2) {
+#if CONFIG_SUPERBLOCKS
+      vp9_write(bc, m->mbmi.encoded_as_sb, c->sb_coded);
+#endif
+      // Process the 4 MBs in the order:
+      // top-left, top-right, bottom-left, bottom-right
+      for (i = 0; i < 4; i++) {
+        int dy = row_delta[i];
+        int dx = col_delta[i];
+        int offset_extended = dy * mis + dx;
+
+        if ((mb_row >= c->mb_rows) || (mb_col >= c->mb_cols)) {
+          // MB lies outside frame, move on
+          mb_row += dy;
+          mb_col += dx;
+          m += offset_extended;
+          continue;
+        }
+
+        // Make sure the MacroBlockD mode info pointer is set correctly
+        xd->mode_info_context = m;
+
+        write_mb_modes_kf(c, xd, m, mis, bc);
+#ifdef ENTROPY_STATS
+        active_section = 8;
+#endif
+        assert(tok < tok_end);
+        pack_mb_tokens(bc, &tok, tok_end);
+
+#if CONFIG_SUPERBLOCKS
+        if (m->mbmi.encoded_as_sb) {
+          assert(!i);
+          mb_col += 2;
+          m += 2;
+          break;
+        }
+#endif
+        // Next MB
+        mb_row += dy;
+        mb_col += dx;
+        m += offset_extended;
+      }
+    }
+    mb_row += 2;
+  }
+}
+
+
+/* This function is used for debugging probability trees. */
+static void print_prob_tree(vp9_prob
+                            coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
+  /* print coef probability tree */
+  int i, j, k, l;
+  FILE *f = fopen("enc_tree_probs.txt", "a");
+  fprintf(f, "{\n");
+  for (i = 0; i < BLOCK_TYPES; i++) {
+    fprintf(f, "  {\n");
+    for (j = 0; j < COEF_BANDS; j++) {
+      fprintf(f, "    {\n");
+      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+        fprintf(f, "      {");
+        for (l = 0; l < ENTROPY_NODES; l++) {
+          fprintf(f, "%3u, ",
+                  (unsigned int)(coef_probs [i][j][k][l]));
+        }
+        fprintf(f, " }\n");
+      }
+      fprintf(f, "    }\n");
+    }
+    fprintf(f, "  }\n");
+  }
+  fprintf(f, "}\n");
+  fclose(f);
+}
+
+static void build_coeff_contexts(VP9_COMP *cpi) {
+  int i = 0, j, k;
+#ifdef ENTROPY_STATS
+  int t = 0;
+#endif
+  for (i = 0; i < BLOCK_TYPES; ++i) {
+    for (j = 0; j < COEF_BANDS; ++j) {
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
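+        // These band/context combinations never occur during tokenization,
+        // so their probabilities need not be computed or updated.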
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          cpi->frame_coef_probs [i][j][k],
+          cpi->frame_branch_ct [i][j][k],
+          cpi->coef_counts [i][j][k],
+          256, 1
+        );
+#ifdef ENTROPY_STATS
+        if (!cpi->dummy_packing)
+          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+            context_counters[i][j][k][t] += cpi->coef_counts[i][j][k][t];
+#endif
+      }
+    }
+  }
+  for (i = 0; i < BLOCK_TYPES; ++i) {
+    for (j = 0; j < COEF_BANDS; ++j) {
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          cpi->frame_hybrid_coef_probs [i][j][k],
+          cpi->frame_hybrid_branch_ct [i][j][k],
+          cpi->hybrid_coef_counts [i][j][k],
+          256, 1
+        );
+#ifdef ENTROPY_STATS
+        if (!cpi->dummy_packing)
+          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+            hybrid_context_counters[i][j][k][t] += cpi->hybrid_coef_counts[i][j][k][t];
+#endif
+      }
+    }
+  }
+
+  if (cpi->common.txfm_mode != ONLY_4X4) {
+    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
+      for (j = 0; j < COEF_BANDS; ++j) {
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+          /* at every context */
+          /* calc probs and branch cts for this frame only */
+          // vp9_prob new_p           [ENTROPY_NODES];
+          // unsigned int branch_ct   [ENTROPY_NODES] [2];
+          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+            continue;
+          vp9_tree_probs_from_distribution(
+            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+            cpi->frame_coef_probs_8x8 [i][j][k],
+            cpi->frame_branch_ct_8x8 [i][j][k],
+            cpi->coef_counts_8x8 [i][j][k],
+            256, 1
+          );
+#ifdef ENTROPY_STATS
+          if (!cpi->dummy_packing)
+            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+              context_counters_8x8[i][j][k][t] += cpi->coef_counts_8x8[i][j][k][t];
+#endif
+        }
+      }
+    }
+    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
+      for (j = 0; j < COEF_BANDS; ++j) {
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+          /* at every context */
+          /* calc probs and branch cts for this frame only */
+          // vp9_prob new_p           [ENTROPY_NODES];
+          // unsigned int branch_ct   [ENTROPY_NODES] [2];
+          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+            continue;
+          vp9_tree_probs_from_distribution(
+            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+            cpi->frame_hybrid_coef_probs_8x8 [i][j][k],
+            cpi->frame_hybrid_branch_ct_8x8 [i][j][k],
+            cpi->hybrid_coef_counts_8x8 [i][j][k],
+            256, 1
+          );
+#ifdef ENTROPY_STATS
+          if (!cpi->dummy_packing)
+            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+              hybrid_context_counters_8x8[i][j][k][t] += cpi->hybrid_coef_counts_8x8[i][j][k][t];
+#endif
+        }
+      }
+    }
+  }
+
+  if (cpi->common.txfm_mode > ALLOW_8X8) {
+    for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
+      for (j = 0; j < COEF_BANDS; ++j) {
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+            continue;
+          vp9_tree_probs_from_distribution(
+            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+            cpi->frame_coef_probs_16x16[i][j][k],
+            cpi->frame_branch_ct_16x16[i][j][k],
+            cpi->coef_counts_16x16[i][j][k], 256, 1);
+#ifdef ENTROPY_STATS
+          if (!cpi->dummy_packing)
+            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+              context_counters_16x16[i][j][k][t] += cpi->coef_counts_16x16[i][j][k][t];
+#endif
+        }
+      }
+    }
+  }
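+  // Note: unlike the guarded 16x16 loop above, this hybrid loop is not
+  // inside the txfm_mode > ALLOW_8X8 check; update_coef_probs() only writes
+  // 16x16 updates in that mode, so the values computed here are otherwise
+  // unused.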
+  for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
+    for (j = 0; j < COEF_BANDS; ++j) {
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          cpi->frame_hybrid_coef_probs_16x16[i][j][k],
+          cpi->frame_hybrid_branch_ct_16x16[i][j][k],
+          cpi->hybrid_coef_counts_16x16[i][j][k], 256, 1);
+#ifdef ENTROPY_STATS
+        if (!cpi->dummy_packing)
+          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+            hybrid_context_counters_16x16[i][j][k][t] += cpi->hybrid_coef_counts_16x16[i][j][k][t];
+#endif
+      }
+    }
+  }
+}
+
+static void update_coef_probs_common(
+    vp9_writer* const bc,
+    vp9_prob new_frame_coef_probs[BLOCK_TYPES][COEF_BANDS]
+                                 [PREV_COEF_CONTEXTS][ENTROPY_NODES],
+    vp9_prob old_frame_coef_probs[BLOCK_TYPES][COEF_BANDS]
+                                 [PREV_COEF_CONTEXTS][ENTROPY_NODES],
+    unsigned int frame_branch_ct[BLOCK_TYPES][COEF_BANDS]
+                                [PREV_COEF_CONTEXTS][ENTROPY_NODES][2]) {
+  int i, j, k, t;
+  int update[2] = {0, 0};
+  int savings;
+  // vp9_prob bestupd = find_coef_update_prob(cpi);
+
+  /* dry run to see if there is any update at all needed */
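+  // An update is only worthwhile if the bits saved by coding with the new
+  // probabilities outweigh the cost of signalling the updates themselves.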
+  savings = 0;
+  for (i = 0; i < BLOCK_TYPES; ++i) {
+    for (j = !i; j < COEF_BANDS; ++j) {
+      int prev_coef_savings[ENTROPY_NODES] = {0};
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        for (t = 0; t < ENTROPY_NODES; ++t) {
+          vp9_prob newp = new_frame_coef_probs[i][j][k][t];
+          const vp9_prob oldp = old_frame_coef_probs[i][j][k][t];
+          const vp9_prob upd = COEF_UPDATE_PROB;
+          int s = prev_coef_savings[t];
+          int u = 0;
+          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+            continue;
+#if defined(SEARCH_NEWP)
+          s = prob_diff_update_savings_search(
+                frame_branch_ct[i][j][k][t],
+                oldp, &newp, upd);
+          if (s > 0 && newp != oldp)
+            u = 1;
+          if (u)
+            savings += s - (int)(vp9_cost_zero(upd));
+          else
+            savings -= (int)(vp9_cost_zero(upd));
+#else
+          s = prob_update_savings(
+                frame_branch_ct[i][j][k][t],
+                oldp, newp, upd);
+          if (s > 0)
+            u = 1;
+          if (u)
+            savings += s;
+#endif
+
+          update[u]++;
+        }
+      }
+    }
+  }
+
+  // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
+  /* Is coef updated at all */
+  if (update[1] == 0 || savings < 0) {
+    vp9_write_bit(bc, 0);
+  } else {
+    vp9_write_bit(bc, 1);
+    for (i = 0; i < BLOCK_TYPES; ++i) {
+      for (j = !i; j < COEF_BANDS; ++j) {
+        int prev_coef_savings[ENTROPY_NODES] = {0};
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+          // calc probs and branch cts for this frame only
+          for (t = 0; t < ENTROPY_NODES; ++t) {
+            vp9_prob newp = new_frame_coef_probs[i][j][k][t];
+            vp9_prob *oldp = old_frame_coef_probs[i][j][k] + t;
+            const vp9_prob upd = COEF_UPDATE_PROB;
+            int s = prev_coef_savings[t];
+            int u = 0;
+            if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+              continue;
+
+#if defined(SEARCH_NEWP)
+            s = prob_diff_update_savings_search(
+                  frame_branch_ct[i][j][k][t],
+                  *oldp, &newp, upd);
+            if (s > 0 && newp != *oldp)
+              u = 1;
+#else
+            s = prob_update_savings(
+                  frame_branch_ct[i][j][k][t],
+                  *oldp, newp, upd);
+            if (s > 0)
+              u = 1;
+#endif
+            vp9_write(bc, u, upd);
+#ifdef ENTROPY_STATS
+            if (!cpi->dummy_packing)
+              ++ tree_update_hist [i][j][k][t] [u];
+#endif
+            if (u) {
+              /* send/use new probability */
+              write_prob_diff_update(bc, newp, *oldp);
+              *oldp = newp;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
+  vp9_clear_system_state();
+
+  // Build the coefficient contexts based on counts collected in the encode loop
+  build_coeff_contexts(cpi);
+
+  update_coef_probs_common(bc,
+                           cpi->frame_coef_probs,
+                           cpi->common.fc.coef_probs,
+                           cpi->frame_branch_ct);
+
+  update_coef_probs_common(bc,
+                           cpi->frame_hybrid_coef_probs,
+                           cpi->common.fc.hybrid_coef_probs,
+                           cpi->frame_hybrid_branch_ct);
+
+  /* do not do this if not even allowed */
+  if (cpi->common.txfm_mode != ONLY_4X4) {
+    update_coef_probs_common(bc,
+                             cpi->frame_coef_probs_8x8,
+                             cpi->common.fc.coef_probs_8x8,
+                             cpi->frame_branch_ct_8x8);
+
+    update_coef_probs_common(bc,
+                             cpi->frame_hybrid_coef_probs_8x8,
+                             cpi->common.fc.hybrid_coef_probs_8x8,
+                             cpi->frame_hybrid_branch_ct_8x8);
+  }
+
+  if (cpi->common.txfm_mode > ALLOW_8X8) {
+    update_coef_probs_common(bc,
+                             cpi->frame_coef_probs_16x16,
+                             cpi->common.fc.coef_probs_16x16,
+                             cpi->frame_branch_ct_16x16);
+    update_coef_probs_common(bc,
+                             cpi->frame_hybrid_coef_probs_16x16,
+                             cpi->common.fc.hybrid_coef_probs_16x16,
+                             cpi->frame_hybrid_branch_ct_16x16);
+  }
+}
+
+#ifdef PACKET_TESTING
+FILE *vpxlogc = 0;
+#endif
+
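+// A delta-q value is coded as an update flag, a 4-bit magnitude and a
+// sign bit.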
+static void put_delta_q(vp9_writer *bc, int delta_q) {
+  if (delta_q != 0) {
+    vp9_write_bit(bc, 1);
+    vp9_write_literal(bc, abs(delta_q), 4);
+
+    if (delta_q < 0)
+      vp9_write_bit(bc, 1);
+    else
+      vp9_write_bit(bc, 0);
+  } else
+    vp9_write_bit(bc, 0);
+}
+
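+// From the 8 candidate key-frame ymode probability sets, pick the one that
+// minimises the total cost of the ymodes counted during the encode pass.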
+static void decide_kf_ymode_entropy(VP9_COMP *cpi) {
+  int mode_cost[MB_MODE_COUNT];
+  int cost;
+  int bestcost = INT_MAX;
+  int bestindex = 0;
+  int i, j;
+
+  for (i = 0; i < 8; i++) {
+    vp9_cost_tokens(mode_cost, cpi->common.kf_ymode_prob[i], vp9_kf_ymode_tree);
+    cost = 0;
+    for (j = 0; j < VP9_YMODES; j++) {
+      cost += mode_cost[j] * cpi->ymode_count[j];
+    }
+#if CONFIG_SUPERBLOCKS
+    vp9_cost_tokens(mode_cost, cpi->common.sb_kf_ymode_prob[i],
+                    vp9_sb_ymode_tree);
+    for (j = 0; j < VP9_I32X32_MODES; j++) {
+      cost += mode_cost[j] * cpi->sb_ymode_count[j];
+    }
+#endif
+    if (cost < bestcost) {
+      bestindex = i;
+      bestcost = cost;
+    }
+  }
+  cpi->common.kf_ymode_probs_index = bestindex;
+}
+
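+// For each segment, collect a bitmask of the reference frames used by its
+// macroblocks and enable SEG_LVL_REF_FRAME accordingly. (The only call
+// site below is currently commented out.)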
+static void segment_reference_frames(VP9_COMP *cpi) {
+  VP9_COMMON *oci = &cpi->common;
+  MODE_INFO *mi = oci->mi;
+  int ref[MAX_MB_SEGMENTS] = {0};
+  int i, j;
+  int mb_index = 0;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+
+  for (i = 0; i < oci->mb_rows; i++) {
+    for (j = 0; j < oci->mb_cols; j++, mb_index++) {
+      ref[mi[mb_index].mbmi.segment_id] |= (1 << mi[mb_index].mbmi.ref_frame);
+    }
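+    // Skip the border entry at the end of each mode_info row
+    // (mode_info_stride is mb_cols + 1).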
+    mb_index++;
+  }
+  for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+    vp9_enable_segfeature(xd, i, SEG_LVL_REF_FRAME);
+    vp9_set_segdata(xd, i, SEG_LVL_REF_FRAME, ref[i]);
+  }
+}
+
+void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
+                        unsigned long *size) {
+  int i, j;
+  VP9_HEADER oh;
+  VP9_COMMON *const pc = &cpi->common;
+  vp9_writer header_bc, residual_bc;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  int extra_bytes_packed = 0;
+
+  unsigned char *cx_data = dest;
+
+  oh.show_frame = (int) pc->show_frame;
+  oh.type = (int)pc->frame_type;
+  oh.version = pc->version;
+  oh.first_partition_length_in_bytes = 0;
+
+  cx_data += 3;
+
+#if defined(SECTIONBITS_OUTPUT)
+  Sectionbits[active_section = 1] += sizeof(VP9_HEADER) * 8 * 256;
+#endif
+
+  compute_update_table();
+
+  /* vp9_kf_default_bmode_probs() is called in vp9_setup_key_frame() once
+   * for each key frame before the frame is encoded. pc->kf_bmode_prob
+   * doesn't get changed anywhere else, so there is no need to call it
+   * again here. --yw
+   * vp9_kf_default_bmode_probs( pc->kf_bmode_prob);
+   */
+
+  /* Every key frame sends a start code, width, height, scale factor,
+   * clamp and color type.
+   */
+  if (oh.type == KEY_FRAME) {
+    int v;
+
+    // Start / synch code
+    cx_data[0] = 0x9D;
+    cx_data[1] = 0x01;
+    cx_data[2] = 0x2a;
+
+    v = (pc->horiz_scale << 14) | pc->Width;
+    cx_data[3] = v;
+    cx_data[4] = v >> 8;
+
+    v = (pc->vert_scale << 14) | pc->Height;
+    cx_data[5] = v;
+    cx_data[6] = v >> 8;
+
+    extra_bytes_packed = 7;
+    cx_data += extra_bytes_packed;
+
+    vp9_start_encode(&header_bc, cx_data);
+
+    // Signal the color type and clamp type
+    vp9_write_bit(&header_bc, pc->clr_type);
+    vp9_write_bit(&header_bc, pc->clamp_type);
+
+  } else {
+    vp9_start_encode(&header_bc, cx_data);
+  }
+
+  // Signal whether or not Segmentation is enabled
+  vp9_write_bit(&header_bc, (xd->segmentation_enabled) ? 1 : 0);
+
+  // Indicate which features are enabled
+  if (xd->segmentation_enabled) {
+    // Indicate whether or not the segmentation map is being updated.
+    vp9_write_bit(&header_bc, (xd->update_mb_segmentation_map) ? 1 : 0);
+
+    // If it is, then indicate the method that will be used.
+    if (xd->update_mb_segmentation_map) {
+      // Select the coding strategy (temporal or spatial)
+      vp9_choose_segmap_coding_method(cpi);
+      // Send the tree probabilities used to decode unpredicted
+      // macro-block segments
+      for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
+        int data = xd->mb_segment_tree_probs[i];
+
+        if (data != 255) {
+          vp9_write_bit(&header_bc, 1);
+          vp9_write_literal(&header_bc, data, 8);
+        } else {
+          vp9_write_bit(&header_bc, 0);
+        }
+      }
+
+      // Write out the chosen coding method.
+      vp9_write_bit(&header_bc, (pc->temporal_update) ? 1 : 0);
+      if (pc->temporal_update) {
+        for (i = 0; i < PREDICTION_PROBS; i++) {
+          int data = pc->segment_pred_probs[i];
+
+          if (data != 255) {
+            vp9_write_bit(&header_bc, 1);
+            vp9_write_literal(&header_bc, data, 8);
+          } else {
+            vp9_write_bit(&header_bc, 0);
+          }
+        }
+      }
+    }
+
+    vp9_write_bit(&header_bc, (xd->update_mb_segmentation_data) ? 1 : 0);
+
+    // segment_reference_frames(cpi);
+
+    if (xd->update_mb_segmentation_data) {
+      signed char Data;
+
+      vp9_write_bit(&header_bc, (xd->mb_segment_abs_delta) ? 1 : 0);
+
+      // For each segment id...
+      for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+        // For each segmentation codable feature...
+        for (j = 0; j < SEG_LVL_MAX; j++) {
+          Data = vp9_get_segdata(xd, i, j);
+
+          // If the feature is enabled...
+          if (vp9_segfeature_active(xd, i, j)) {
+            vp9_write_bit(&header_bc, 1);
+
+            // Is the segment data signed?
+            if (vp9_is_segfeature_signed(j)) {
+              // Encode the relevant feature data
+              if (Data < 0) {
+                Data = - Data;
+                vp9_write_literal(&header_bc, Data,
+                                  vp9_seg_feature_data_bits(j));
+                vp9_write_bit(&header_bc, 1);
+              } else {
+                vp9_write_literal(&header_bc, Data,
+                                  vp9_seg_feature_data_bits(j));
+                vp9_write_bit(&header_bc, 0);
+              }
+            }
+            // Unsigned data element so no sign bit needed
+            else
+              vp9_write_literal(&header_bc, Data,
+                                vp9_seg_feature_data_bits(j));
+          } else
+            vp9_write_bit(&header_bc, 0);
+        }
+      }
+    }
+  }
+
+  // Encode any updates to the probabilities of the reference frame
+  // prediction status flags
+  update_refpred_stats(cpi);
+  if (pc->frame_type != KEY_FRAME) {
+    for (i = 0; i < PREDICTION_PROBS; i++) {
+      if (cpi->ref_pred_probs_update[i]) {
+        vp9_write_bit(&header_bc, 1);
+        vp9_write_literal(&header_bc, pc->ref_pred_probs[i], 8);
+      } else {
+        vp9_write_bit(&header_bc, 0);
+      }
+    }
+  }
+
+#if CONFIG_SUPERBLOCKS
+  {
+    /* sb mode probability */
+    const int sb_max = (((pc->mb_rows + 1) >> 1) * ((pc->mb_cols + 1) >> 1));
+
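+    // Probability that a 2x2 MB group is not coded as a single superblock.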
+    pc->sb_coded = get_prob(sb_max - cpi->sb_count, sb_max);
+    vp9_write_literal(&header_bc, pc->sb_coded, 8);
+  }
+#endif
+
+  {
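+    // prob_tx[0] is the probability that a block uses a 4x4 transform;
+    // prob_tx[1] the probability it uses 8x8 rather than 16x16, given not 4x4.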
+    if (pc->txfm_mode == TX_MODE_SELECT) {
+      pc->prob_tx[0] = get_prob(cpi->txfm_count[0] + cpi->txfm_count_8x8p[0],
+                                cpi->txfm_count[0] + cpi->txfm_count[1] + cpi->txfm_count[2] +
+                                cpi->txfm_count_8x8p[0] + cpi->txfm_count_8x8p[1]);
+      pc->prob_tx[1] = get_prob(cpi->txfm_count[1], cpi->txfm_count[1] + cpi->txfm_count[2]);
+    } else {
+      pc->prob_tx[0] = 128;
+      pc->prob_tx[1] = 128;
+    }
+    vp9_write_literal(&header_bc, pc->txfm_mode, 2);
+    if (pc->txfm_mode == TX_MODE_SELECT) {
+      vp9_write_literal(&header_bc, pc->prob_tx[0], 8);
+      vp9_write_literal(&header_bc, pc->prob_tx[1], 8);
+    }
+  }
+
+  // Encode the loop filter level and type
+  vp9_write_bit(&header_bc, pc->filter_type);
+  vp9_write_literal(&header_bc, pc->filter_level, 6);
+  vp9_write_literal(&header_bc, pc->sharpness_level, 3);
+
+  // Write out loop filter deltas applied at the MB level based on mode
+  // or ref frame (if they are enabled).
+  vp9_write_bit(&header_bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0);
+
+  if (xd->mode_ref_lf_delta_enabled) {
+    // Do the deltas need to be updated
+    int send_update = xd->mode_ref_lf_delta_update;
+
+    vp9_write_bit(&header_bc, send_update);
+    if (send_update) {
+      int Data;
+
+      // Send update
+      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
+        Data = xd->ref_lf_deltas[i];
+
+        // Frame level data
+        if (xd->ref_lf_deltas[i] != xd->last_ref_lf_deltas[i]) {
+          xd->last_ref_lf_deltas[i] = xd->ref_lf_deltas[i];
+          vp9_write_bit(&header_bc, 1);
+
+          if (Data > 0) {
+            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
+            vp9_write_bit(&header_bc, 0);    // sign
+          } else {
+            Data = -Data;
+            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
+            vp9_write_bit(&header_bc, 1);    // sign
+          }
+        } else {
+          vp9_write_bit(&header_bc, 0);
+        }
+      }
+
+      // Send update
+      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+        Data = xd->mode_lf_deltas[i];
+
+        if (xd->mode_lf_deltas[i] != xd->last_mode_lf_deltas[i]) {
+          xd->last_mode_lf_deltas[i] = xd->mode_lf_deltas[i];
+          vp9_write_bit(&header_bc, 1);
+
+          if (Data > 0) {
+            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
+            vp9_write_bit(&header_bc, 0);    // sign
+          } else {
+            Data = -Data;
+            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
+            vp9_write_bit(&header_bc, 1);    // sign
+          }
+        } else {
+          vp9_write_bit(&header_bc, 0);
+        }
+      }
+    }
+  }
+
+  // Signal here if multi-token partitioning is enabled
+  // vp9_write_literal(&header_bc, pc->multi_token_partition, 2);
+  vp9_write_literal(&header_bc, 0, 2);
+
+  // Frame Q baseline quantizer index
+  vp9_write_literal(&header_bc, pc->base_qindex, QINDEX_BITS);
+
+  // Transmit Dc, Second order and Uv quantizer delta information
+  put_delta_q(&header_bc, pc->y1dc_delta_q);
+  put_delta_q(&header_bc, pc->y2dc_delta_q);
+  put_delta_q(&header_bc, pc->y2ac_delta_q);
+  put_delta_q(&header_bc, pc->uvdc_delta_q);
+  put_delta_q(&header_bc, pc->uvac_delta_q);
+
+  // When there is a key frame all reference buffers are updated using the new key frame
+  if (pc->frame_type != KEY_FRAME) {
+    // Should the GF or ARF be updated using the transmitted frame or buffer
+    vp9_write_bit(&header_bc, pc->refresh_golden_frame);
+    vp9_write_bit(&header_bc, pc->refresh_alt_ref_frame);
+
+    // For inter frames the current default behavior is that when
+    // cm->refresh_golden_frame is set we copy the old GF over to
+    // the ARF buffer. This is purely an encoder decision at present.
+    if (pc->refresh_golden_frame)
+      pc->copy_buffer_to_arf  = 2;
+
+    // If not updated from the current frame, should the GF or ARF
+    // be updated from another buffer?
+    if (!pc->refresh_golden_frame)
+      vp9_write_literal(&header_bc, pc->copy_buffer_to_gf, 2);
+
+    if (!pc->refresh_alt_ref_frame)
+      vp9_write_literal(&header_bc, pc->copy_buffer_to_arf, 2);
+
+    // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer)
+    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
+    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
+
+    // Signal whether to allow high MV precision
+    vp9_write_bit(&header_bc, (xd->allow_high_precision_mv) ? 1 : 0);
+    if (pc->mcomp_filter_type == SWITCHABLE) {
+      /* Check to see if only one of the filters is actually used */
+      int count[VP9_SWITCHABLE_FILTERS];
+      int i, j, c = 0;
+      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+        count[i] = 0;
+        for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+          count[i] += cpi->switchable_interp_count[j][i];
+        }
+        c += (count[i] > 0);
+      }
+      if (c == 1) {
+        /* Only one filter is used. So set the filter at frame level */
+        for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+          if (count[i]) {
+            pc->mcomp_filter_type = vp9_switchable_interp[i];
+            break;
+          }
+        }
+      }
+    }
+    // Signal the type of subpel filter to use
+    vp9_write_bit(&header_bc, (pc->mcomp_filter_type == SWITCHABLE));
+    if (pc->mcomp_filter_type != SWITCHABLE)
+      vp9_write_literal(&header_bc, (pc->mcomp_filter_type), 2);
+  }
+
+  vp9_write_bit(&header_bc, pc->refresh_entropy_probs);
+
+  if (pc->frame_type != KEY_FRAME)
+    vp9_write_bit(&header_bc, pc->refresh_last_frame);
+
+#ifdef ENTROPY_STATS
+  if (pc->frame_type == INTER_FRAME)
+    active_section = 0;
+  else
+    active_section = 7;
+#endif
+
+  vp9_clear_system_state();  // __asm emms;
+
+  vp9_copy(cpi->common.fc.pre_coef_probs, cpi->common.fc.coef_probs);
+  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs, cpi->common.fc.hybrid_coef_probs);
+  vp9_copy(cpi->common.fc.pre_coef_probs_8x8, cpi->common.fc.coef_probs_8x8);
+  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_8x8, cpi->common.fc.hybrid_coef_probs_8x8);
+  vp9_copy(cpi->common.fc.pre_coef_probs_16x16, cpi->common.fc.coef_probs_16x16);
+  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_16x16, cpi->common.fc.hybrid_coef_probs_16x16);
+  vp9_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob);
+  vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);
+  vp9_copy(cpi->common.fc.pre_bmode_prob, cpi->common.fc.bmode_prob);
+  vp9_copy(cpi->common.fc.pre_sub_mv_ref_prob, cpi->common.fc.sub_mv_ref_prob);
+  vp9_copy(cpi->common.fc.pre_mbsplit_prob, cpi->common.fc.mbsplit_prob);
+  vp9_copy(cpi->common.fc.pre_i8x8_mode_prob, cpi->common.fc.i8x8_mode_prob);
+  cpi->common.fc.pre_nmvc = cpi->common.fc.nmvc;
+  vp9_zero(cpi->sub_mv_ref_count);
+  vp9_zero(cpi->mbsplit_count);
+  vp9_zero(cpi->common.fc.mv_ref_ct)
+  vp9_zero(cpi->common.fc.mv_ref_ct_a)
+
+  update_coef_probs(cpi, &header_bc);
+
+#ifdef ENTROPY_STATS
+  active_section = 2;
+#endif
+
+  // Write out the mb_no_coeff_skip flag
+  vp9_write_bit(&header_bc, pc->mb_no_coeff_skip);
+  if (pc->mb_no_coeff_skip) {
+    int k;
+
+    vp9_update_skip_probs(cpi);
+    for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+      vp9_write_literal(&header_bc, pc->mbskip_pred_probs[k], 8);
+  }
+
+  if (pc->frame_type == KEY_FRAME) {
+    if (!pc->kf_ymode_probs_update) {
+      vp9_write_literal(&header_bc, pc->kf_ymode_probs_index, 3);
+    }
+  } else {
+    // Update the probabilities used to encode reference frame data
+    update_ref_probs(cpi);
+
+#ifdef ENTROPY_STATS
+    active_section = 1;
+#endif
+
+#if CONFIG_PRED_FILTER
+    // Write the prediction filter mode used for this frame
+    vp9_write_literal(&header_bc, pc->pred_filter_mode, 2);
+
+    // Write prediction filter on/off probability if signaling at MB level
+    if (pc->pred_filter_mode == 2)
+      vp9_write_literal(&header_bc, pc->prob_pred_filter_off, 8);
+
+#endif
+    if (pc->mcomp_filter_type == SWITCHABLE)
+      update_switchable_interp_probs(cpi, &header_bc);
+
+    vp9_write_literal(&header_bc, pc->prob_intra_coded, 8);
+    vp9_write_literal(&header_bc, pc->prob_last_coded, 8);
+    vp9_write_literal(&header_bc, pc->prob_gf_coded, 8);
+
+    {
+      const int comp_pred_mode = cpi->common.comp_pred_mode;
+      const int use_compound_pred = (comp_pred_mode != SINGLE_PREDICTION_ONLY);
+      const int use_hybrid_pred = (comp_pred_mode == HYBRID_PREDICTION);
+
+      vp9_write(&header_bc, use_compound_pred, 128);
+      if (use_compound_pred) {
+        vp9_write(&header_bc, use_hybrid_pred, 128);
+        if (use_hybrid_pred) {
+          for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
+            pc->prob_comppred[i] = get_binary_prob(cpi->single_pred_count[i],
+                                                   cpi->comp_pred_count[i]);
+            vp9_write_literal(&header_bc, pc->prob_comppred[i], 8);
+          }
+        }
+      }
+    }
+
+    update_mbintra_mode_probs(cpi, &header_bc);
+
+#if CONFIG_NEW_MVREF
+    // Temporary default probabilities for encoding the MV ref id signal
+    vpx_memset(xd->mb_mv_ref_id_probs, 192, sizeof(xd->mb_mv_ref_id_probs));
+#endif
+
+    vp9_write_nmvprobs(cpi, xd->allow_high_precision_mv, &header_bc);
+  }
+
+  vp9_stop_encode(&header_bc);
+
+  oh.first_partition_length_in_bytes = header_bc.pos;
+
+  /* update frame tag */
+  {
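+    // 24-bit frame tag: 1 bit frame type, 3 bits version, 1 bit show_frame,
+    // 19 bits first partition length, packed little-endian.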
+    int v = (oh.first_partition_length_in_bytes << 5) |
+            (oh.show_frame << 4) |
+            (oh.version << 1) |
+            oh.type;
+
+    dest[0] = v;
+    dest[1] = v >> 8;
+    dest[2] = v >> 16;
+  }
+
+  *size = VP9_HEADER_SIZE + extra_bytes_packed + header_bc.pos;
+  vp9_start_encode(&residual_bc, cx_data + header_bc.pos);
+
+  if (pc->frame_type == KEY_FRAME) {
+    decide_kf_ymode_entropy(cpi);
+    write_kfmodes(cpi, &residual_bc);
+  } else {
+    pack_inter_mode_mvs(cpi, &residual_bc);
+    vp9_update_mode_context(&cpi->common);
+  }
+
+  vp9_stop_encode(&residual_bc);
+
+  *size += residual_bc.pos;
+}
+
+#ifdef ENTROPY_STATS
+void print_tree_update_probs() {
+  int i, j, k, l;
+  FILE *f = fopen("coefupdprob.h", "w");
+  fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
+
+  fprintf(f, "const vp9_prob\n"
+          "vp9_coef_update_probs[BLOCK_TYPES]\n"
+          "                     [COEF_BANDS]\n"
+          "                     [PREV_COEF_CONTEXTS]\n"
+          "                     [ENTROPY_NODES] = {\n");
+  for (i = 0; i < BLOCK_TYPES; i++) {
+    fprintf(f, "  { \n");
+    for (j = 0; j < COEF_BANDS; j++) {
+      fprintf(f, "    {\n");
+      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+        fprintf(f, "      {");
+        for (l = 0; l < ENTROPY_NODES; l++) {
+          fprintf(f, "%3ld, ",
+              get_binary_prob(tree_update_hist[i][j][k][l][0],
+                              tree_update_hist[i][j][k][l][1]));
+        }
+        fprintf(f, "},\n");
+      }
+      fprintf(f, "    },\n");
+    }
+    fprintf(f, "  },\n");
+  }
+  fprintf(f, "};\n");
+
+  fprintf(f, "const vp9_prob\n"
+          "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]\n"
+          "                         [COEF_BANDS]\n"
+          "                         [PREV_COEF_CONTEXTS]\n"
+          "                         [ENTROPY_NODES] = {\n");
+  for (i = 0; i < BLOCK_TYPES_8X8; i++) {
+    fprintf(f, "  { \n");
+    for (j = 0; j < COEF_BANDS; j++) {
+      fprintf(f, "    {\n");
+      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+        fprintf(f, "      {");
+        for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {
+          fprintf(f, "%3ld, ",
+              get_binary_prob(tree_update_hist_8x8[i][j][k][l][0],
+                              tree_update_hist_8x8[i][j][k][l][1]));
+        }
+        fprintf(f, "},\n");
+      }
+      fprintf(f, "    },\n");
+    }
+    fprintf(f, "  },\n");
+  }
+  fprintf(f, "};\n");
+
+  fprintf(f, "const vp9_prob\n"
+          "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]\n"
+          "                           [COEF_BANDS]\n"
+          "                           [PREV_COEF_CONTEXTS]\n"
+          "                           [ENTROPY_NODES] = {\n");
+  for (i = 0; i < BLOCK_TYPES_16X16; i++) {
+    fprintf(f, "  { \n");
+    for (j = 0; j < COEF_BANDS; j++) {
+      fprintf(f, "    {\n");
+      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+        fprintf(f, "      {");
+        for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {
+          fprintf(f, "%3ld, ",
+              get_binary_prob(tree_update_hist_16x16[i][j][k][l][0],
+                              tree_update_hist_16x16[i][j][k][l][1]));
+        }
+        fprintf(f, "},\n");
+      }
+      fprintf(f, "    },\n");
+    }
+    fprintf(f, "  },\n");
+  }
+  fprintf(f, "};\n");
+
+  fclose(f);
+  f = fopen("treeupdate.bin", "wb");
+  fwrite(tree_update_hist, sizeof(tree_update_hist), 1, f);
+  fwrite(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
+  fwrite(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
+  fclose(f);
+}
+#endif
--- /dev/null
+++ b/vp9/encoder/bitstream.h
@@ -1,0 +1,17 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BITSTREAM_H
+#define __INC_BITSTREAM_H
+
+void vp9_update_skip_probs(VP9_COMP *cpi);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/block.h
@@ -1,0 +1,184 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BLOCK_H
+#define __INC_BLOCK_H
+
+#include "vp9/common/onyx.h"
+#include "vp9/common/entropymv.h"
+#include "vp9/common/entropy.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/onyxc_int.h"
+
+// motion search site
+typedef struct {
+  MV mv;
+  int offset;
+} search_site;
+
+typedef struct block {
+  // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
+  short *src_diff;
+  short *coeff;
+
+  // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
+  short *quant;
+  short *quant_fast;      // fast quant deprecated for now
+  unsigned char *quant_shift;
+  short *zbin;
+  short *zbin_8x8;
+  short *zbin_16x16;
+  short *zrun_zbin_boost;
+  short *zrun_zbin_boost_8x8;
+  short *zrun_zbin_boost_16x16;
+  short *round;
+
+  // Zbin Over Quant value
+  short zbin_extra;
+
+  unsigned char **base_src;
+  unsigned char **base_second_src;
+  int src;
+  int src_stride;
+
+  int eob_max_offset;
+  int eob_max_offset_8x8;
+  int eob_max_offset_16x16;
+} BLOCK;
+
+typedef struct {
+  int count;
+  struct {
+    B_PREDICTION_MODE mode;
+    int_mv mv;
+    int_mv second_mv;
+  } bmi[16];
+} PARTITION_INFO;
+
+// Structure to hold snapshot of coding context during the mode picking process
+// TODO: do we need all of these?
+typedef struct {
+  MODE_INFO mic;
+  PARTITION_INFO partition_info;
+  int_mv best_ref_mv;
+  int_mv second_best_ref_mv;
+#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
+  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
+#endif
+  int rate;
+  int distortion;
+  int64_t intra_error;
+  int best_mode_index;
+  int rddiv;
+  int rdmult;
+  int hybrid_pred_diff;
+  int comp_pred_diff;
+  int single_pred_diff;
+  int64_t txfm_rd_diff[NB_TXFM_MODES];
+} PICK_MODE_CONTEXT;
+
+typedef struct macroblock {
+  DECLARE_ALIGNED(16, short, src_diff[400]);  // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
+  DECLARE_ALIGNED(16, short, coeff[400]);     // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
+  DECLARE_ALIGNED(16, unsigned char, thismb[256]);    // 16x16 Y
+
+  unsigned char *thismb_ptr;
+  // 16 Y blocks, 4 U blocks, 4 V blocks,
+  // 1 DC 2nd order block each with 16 entries
+  BLOCK block[25];
+
+  YV12_BUFFER_CONFIG src;
+
+  MACROBLOCKD e_mbd;
+  PARTITION_INFO *partition_info; /* work pointer */
+  PARTITION_INFO *pi;   /* Corresponds to upper left visible macroblock */
+  PARTITION_INFO *pip;  /* Base of allocated array */
+
+  search_site *ss;
+  int ss_count;
+  int searches_per_step;
+
+  int errorperbit;
+  int sadperbit16;
+  int sadperbit4;
+  int rddiv;
+  int rdmult;
+  unsigned int *mb_activity_ptr;
+  int *mb_norm_activity_ptr;
+  signed int act_zbin_adj;
+
+  int nmvjointcost[MV_JOINTS];
+  int nmvcosts[2][MV_VALS];
+  int *nmvcost[2];
+  int nmvcosts_hp[2][MV_VALS];
+  int *nmvcost_hp[2];
+
+  int nmvjointsadcost[MV_JOINTS];
+  int nmvsadcosts[2][MV_VALS];
+  int *nmvsadcost[2];
+  int nmvsadcosts_hp[2][MV_VALS];
+  int *nmvsadcost_hp[2];
+
+  int mbmode_cost[2][MB_MODE_COUNT];
+  int intra_uv_mode_cost[2][MB_MODE_COUNT];
+  int bmode_costs[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES];
+  int i8x8_mode_costs[MB_MODE_COUNT];
+  int inter_bmode_costs[B_MODE_COUNT];
+  int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
+                             [VP9_SWITCHABLE_FILTERS];
+
+  // These define limits to motion vector components to prevent them
+  // from extending outside the UMV borders
+  int mv_col_min;
+  int mv_col_max;
+  int mv_row_min;
+  int mv_row_max;
+
+  int skip;
+
+  int encode_breakout;
+
+  // char * gf_active_ptr;
+  signed char *gf_active_ptr;
+
+  unsigned char *active_ptr;
+
+  unsigned int token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
+    [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+  unsigned int hybrid_token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
+    [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+
+  int optimize;
+
+  // Structure to hold context for each of the 4 MBs within a SB:
+  // when encoded as 4 independent MBs:
+  PICK_MODE_CONTEXT mb_context[4];
+#if CONFIG_SUPERBLOCKS
+  // when 4 MBs share coding parameters:
+  PICK_MODE_CONTEXT sb_context[4];
+#endif
+
+  void (*vp9_short_fdct4x4)(short *input, short *output, int pitch);
+  void (*vp9_short_fdct8x4)(short *input, short *output, int pitch);
+  void (*short_walsh4x4)(short *input, short *output, int pitch);
+  void (*quantize_b_4x4)(BLOCK *b, BLOCKD *d);
+  void (*quantize_b_4x4_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
+  void (*vp9_short_fdct8x8)(short *input, short *output, int pitch);
+  void (*vp9_short_fdct16x16)(short *input, short *output, int pitch);
+  void (*short_fhaar2x2)(short *input, short *output, int pitch);
+  void (*quantize_b_16x16)(BLOCK *b, BLOCKD *d);
+  void (*quantize_b_8x8)(BLOCK *b, BLOCKD *d);
+  void (*quantize_b_2x2)(BLOCK *b, BLOCKD *d);
+
+} MACROBLOCK;
+
+
+#endif
--- /dev/null
+++ b/vp9/encoder/boolhuff.c
@@ -1,0 +1,153 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "boolhuff.h"
+
+#if defined(SECTIONBITS_OUTPUT)
+unsigned __int64 Sectionbits[500];
+
+#endif
+
+#ifdef ENTROPY_STATS
+unsigned int active_section = 0;
+#endif
+
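+// Cost, in 1/256-bit units, of coding a symbol whose probability is p/256:
+// approximately -256 * log2(p / 256).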
+const unsigned int vp9_prob_cost[256] = {
+  2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
+  1023, 1000,  979,  959,  940,  922,  905,  889,  873,  858,  843,  829,  816,  803,  790,  778,
+  767,  755,  744,  733,  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
+  617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,  534,  528,  522,  516,
+  511,  505,  499,  494,  488,  483,  477,  472,  467,  462,  457,  452,  447,  442,  437,  433,
+  428,  424,  419,  415,  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
+  361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,  317,  314,  311,  307,
+  304,  301,  297,  294,  291,  288,  285,  281,  278,  275,  272,  269,  266,  263,  260,  257,
+  255,  252,  249,  246,  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
+  211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,  181,  179,  177,  174,
+  172,  170,  168,  165,  163,  161,  159,  156,  154,  152,  150,  148,  145,  143,  141,  139,
+  137,  135,  133,  131,  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
+  105,  103,  101,   99,   97,   95,   93,   92,   90,   88,   86,   84,   82,   81,   79,   77,
+  75,   73,   72,   70,   68,   66,   65,   63,   61,   60,   58,   56,   55,   53,   51,   50,
+  48,   46,   45,   43,   41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
+  22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,   1
+};
+
+void vp9_start_encode(BOOL_CODER *br, unsigned char *source) {
+
+  br->lowvalue = 0;
+  br->range    = 255;
+  br->value    = 0;
+  br->count    = -24;
+  br->buffer   = source;
+  br->pos      = 0;
+}
+
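+// Flush the bool coder by writing 32 trailing zero bits at probability 1/2.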
+void vp9_stop_encode(BOOL_CODER *br) {
+  int i;
+
+  for (i = 0; i < 32; i++)
+    encode_bool(br, 0, 128);
+}
+
+
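+// Write 'bits' bits of 'data', most significant bit first, as raw
+// (probability one-half) bits.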
+void vp9_encode_value(BOOL_CODER *br, int data, int bits) {
+  int bit;
+
+  for (bit = bits - 1; bit >= 0; bit--)
+    encode_bool(br, (1 & (data >> bit)), 0x80);
+}
+
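+// Recenter v around a prediction m so that values close to m map to small
+// non-negative codes: v > 2m is left as-is; otherwise |v - m| is folded
+// into alternating even (v >= m) and odd (v < m) values.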
+int vp9_recenter_nonneg(int v, int m) {
+  if (v > (m << 1)) return v;
+  else if (v >= m) return ((v - m) << 1);
+  else return ((m - v) << 1) - 1;
+}
+
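+// Number of bits needed to represent values in [0, num_values), i.e.
+// ceil(log2(num_values)).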
+static int get_unsigned_bits(unsigned num_values) {
+  int cat = 0;
+  if ((num_values--) <= 1) return 0;
+  while (num_values > 0) {
+    cat++;
+    num_values >>= 1;
+  }
+  return cat;
+}
+
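+// Quasi-uniform code for v in [0, n): with l = ceil(log2(n)) and
+// m = 2^l - n, the first m values take l - 1 bits and the rest take l bits.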
+void vp9_encode_uniform(BOOL_CODER *br, int v, int n) {
+  int l = get_unsigned_bits(n);
+  int m;
+  if (l == 0) return;
+  m = (1 << l) - n;
+  if (v < m)
+    vp9_encode_value(br, v, l - 1);
+  else {
+    vp9_encode_value(br, m + ((v - m) >> 1), l - 1);
+    vp9_encode_value(br, (v - m) & 1, 1);
+  }
+}
+
+int vp9_count_uniform(int v, int n) {
+  int l = get_unsigned_bits(n);
+  int m;
+  if (l == 0) return 0;
+  m = (1 << l) - n;
+  if (v < m)
+    return l - 1;
+  else
+    return l;
+}
+
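+// Terminated subexponential code: a unary prefix selects geometrically
+// growing value ranges of size 2^k, 2^k, 2^(k+1), ...; once the remaining
+// symbols fit within three ranges, the tail is coded with the
+// quasi-uniform code above.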
+void vp9_encode_term_subexp(BOOL_CODER *br, int word, int k, int num_syms) {
+  int i = 0;
+  int mk = 0;
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+    if (num_syms <= mk + 3 * a) {
+      vp9_encode_uniform(br, word - mk, num_syms - mk);
+      break;
+    } else {
+      int t = (word >= mk + a);
+      vp9_encode_value(br, t, 1);
+      if (t) {
+        i = i + 1;
+        mk += a;
+      } else {
+        vp9_encode_value(br, word - mk, b);
+        break;
+      }
+    }
+  }
+}
+
+int vp9_count_term_subexp(int word, int k, int num_syms) {
+  int count = 0;
+  int i = 0;
+  int mk = 0;
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+    if (num_syms <= mk + 3 * a) {
+      count += vp9_count_uniform(word - mk, num_syms - mk);
+      break;
+    } else {
+      int t = (word >= mk + a);
+      count++;
+      if (t) {
+        i = i + 1;
+        mk += a;
+      } else {
+        count += b;
+        break;
+      }
+    }
+  }
+  return count;
+}
--- /dev/null
+++ b/vp9/encoder/boolhuff.h
@@ -1,0 +1,111 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     boolhuff.h
+*
+*   Description  :     Bool Coder header file.
+*
+****************************************************************************/
+#ifndef __INC_BOOLHUFF_H
+#define __INC_BOOLHUFF_H
+
+#include "vpx_ports/mem.h"
+
+typedef struct {
+  unsigned int lowvalue;
+  unsigned int range;
+  unsigned int value;
+  int count;
+  unsigned int pos;
+  unsigned char *buffer;
+
+  // Variables used to track bit costs without outputting to the bitstream
+  unsigned int  measure_cost;
+  unsigned long bit_counter;
+} BOOL_CODER;
+
+extern void vp9_start_encode(BOOL_CODER *bc, unsigned char *buffer);
+
+extern void vp9_encode_value(BOOL_CODER *br, int data, int bits);
+extern void vp9_stop_encode(BOOL_CODER *bc);
+extern const unsigned int vp9_prob_cost[256];
+
+extern void vp9_encode_uniform(BOOL_CODER *bc, int v, int n);
+extern void vp9_encode_term_subexp(BOOL_CODER *bc, int v, int k, int n);
+extern int vp9_count_uniform(int v, int n);
+extern int vp9_count_term_subexp(int v, int k, int n);
+extern int vp9_recenter_nonneg(int v, int m);
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
+
+
+static void encode_bool(BOOL_CODER *br, int bit, int probability) {
+  unsigned int split;
+  int count = br->count;
+  unsigned int range = br->range;
+  unsigned int lowvalue = br->lowvalue;
+  register unsigned int shift;
+
+#ifdef ENTROPY_STATS
+#if defined(SECTIONBITS_OUTPUT)
+
+  if (bit)
+    Sectionbits[active_section] += vp9_prob_cost[255 - probability];
+  else
+    Sectionbits[active_section] += vp9_prob_cost[probability];
+
+#endif
+#endif
+
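+  // Split the coding range in proportion to the probability of a zero bit.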
+  split = 1 + (((range - 1) * probability) >> 8);
+
+  range = split;
+
+  if (bit) {
+    lowvalue += split;
+    range = br->range - split;
+  }
+
+  shift = vp9_norm[range];
+
+  range <<= shift;
+  count += shift;
+
+  if (count >= 0) {
+    int offset = shift - count;
+
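+    // If the pending low value would carry out of the top byte, propagate
+    // the carry back through any 0xff bytes already in the buffer.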
+    if ((lowvalue << (offset - 1)) & 0x80000000) {
+      int x = br->pos - 1;
+
+      while (x >= 0 && br->buffer[x] == 0xff) {
+        br->buffer[x] = (unsigned char)0;
+        x--;
+      }
+
+      br->buffer[x] += 1;
+    }
+
+    br->buffer[br->pos++] = (lowvalue >> (24 - offset));
+    lowvalue <<= offset;
+    shift = count;
+    lowvalue &= 0xffffff;
+    count -= 8;
+  }
+
+  lowvalue <<= shift;
+  br->count = count;
+  br->lowvalue = lowvalue;
+  br->range = range;
+}
+
+#endif
--- /dev/null
+++ b/vp9/encoder/dct.c
@@ -1,0 +1,1109 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <assert.h>
+#include <math.h>
+#include "vpx_ports/config.h"
+#include "vp9/common/idct.h"
+#include "vp9/common/systemdependent.h"
+
+#include "vp9/common/blockd.h"
+
+// TODO: these transforms can be converted into integer forms to reduce
+//       the complexity
+static const float dct_4[16] = {
+  0.500000000000000,  0.500000000000000,  0.500000000000000,  0.500000000000000,
+  0.653281482438188,  0.270598050073099, -0.270598050073099, -0.653281482438188,
+  0.500000000000000, -0.500000000000000, -0.500000000000000,  0.500000000000000,
+  0.270598050073099, -0.653281482438188,  0.653281482438188, -0.270598050073099
+};
+
+static const float adst_4[16] = {
+  0.228013428883779,  0.428525073124360,  0.577350269189626,  0.656538502008139,
+  0.577350269189626,  0.577350269189626,  0.000000000000000, -0.577350269189626,
+  0.656538502008139, -0.228013428883779, -0.577350269189626,  0.428525073124359,
+  0.428525073124360, -0.656538502008139,  0.577350269189626, -0.228013428883779
+};
+
+static const float dct_8[64] = {
+  0.353553390593274,   0.353553390593274,   0.353553390593274,   0.353553390593274,
+  0.353553390593274,   0.353553390593274,   0.353553390593274,   0.353553390593274,
+  0.490392640201615,   0.415734806151273,   0.277785116509801,   0.097545161008064,
+ -0.097545161008064,  -0.277785116509801,  -0.415734806151273,  -0.490392640201615,
+  0.461939766255643,   0.191341716182545,  -0.191341716182545,  -0.461939766255643,
+ -0.461939766255643,  -0.191341716182545,   0.191341716182545,   0.461939766255643,
+  0.415734806151273,  -0.097545161008064,  -0.490392640201615,  -0.277785116509801,
+  0.277785116509801,   0.490392640201615,   0.097545161008064,  -0.415734806151273,
+  0.353553390593274,  -0.353553390593274,  -0.353553390593274,   0.353553390593274,
+  0.353553390593274,  -0.353553390593274,  -0.353553390593274,   0.353553390593274,
+  0.277785116509801,  -0.490392640201615,   0.097545161008064,   0.415734806151273,
+ -0.415734806151273,  -0.097545161008064,   0.490392640201615,  -0.277785116509801,
+  0.191341716182545,  -0.461939766255643,   0.461939766255643,  -0.191341716182545,
+ -0.191341716182545,   0.461939766255643,  -0.461939766255643,   0.191341716182545,
+  0.097545161008064,  -0.277785116509801,   0.415734806151273,  -0.490392640201615,
+  0.490392640201615,  -0.415734806151273,   0.277785116509801,  -0.097545161008064
+};
+
+static const float adst_8[64] = {
+  0.089131608307533,   0.175227946595735,   0.255357107325376,   0.326790388032145,
+  0.387095214016349,   0.434217976756762,   0.466553967085785,   0.483002021635509,
+  0.255357107325376,   0.434217976756762,   0.483002021635509,   0.387095214016349,
+  0.175227946595735,  -0.089131608307533,  -0.326790388032145,  -0.466553967085785,
+  0.387095214016349,   0.466553967085785,   0.175227946595735,  -0.255357107325376,
+ -0.483002021635509,  -0.326790388032145,   0.089131608307533,   0.434217976756762,
+  0.466553967085785,   0.255357107325376,  -0.326790388032145,  -0.434217976756762,
+  0.089131608307533,   0.483002021635509,   0.175227946595735,  -0.387095214016348,
+  0.483002021635509,  -0.089131608307533,  -0.466553967085785,   0.175227946595735,
+  0.434217976756762,  -0.255357107325376,  -0.387095214016348,   0.326790388032145,
+  0.434217976756762,  -0.387095214016348,  -0.089131608307533,   0.466553967085786,
+ -0.326790388032145,  -0.175227946595735,   0.483002021635509,  -0.255357107325375,
+  0.326790388032145,  -0.483002021635509,   0.387095214016349,  -0.089131608307534,
+ -0.255357107325377,   0.466553967085785,  -0.434217976756762,   0.175227946595736,
+  0.175227946595735,  -0.326790388032145,   0.434217976756762,  -0.483002021635509,
+  0.466553967085785,  -0.387095214016348,   0.255357107325376,  -0.089131608307532
+};
+
+/* Integer (Q15) versions of the floating-point transforms above. */
+static const int16_t dct_i4[16] = {
+  16384,  16384,  16384,  16384,
+  21407,   8867,  -8867, -21407,
+  16384, -16384, -16384,  16384,
+   8867, -21407,  21407,  -8867
+};
+
+static const int16_t adst_i4[16] = {
+   7472,  14042,  18919,  21513,
+  18919,  18919,      0, -18919,
+  21513,  -7472, -18919,  14042,
+  14042, -21513,  18919,  -7472
+};
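+
+// Note: the integer tables appear to be the floating-point matrices above in
+// Q15 fixed point, i.e. round(f * 32768): 0.5 -> 16384, 0.228013 -> 7472.
+// A sketch of how such a table could be regenerated (hypothetical helper,
+// not part of this change):
+#if 0
+static void float_to_q15(const float *in, int16_t *out, int n) {
+  int i;
+  for (i = 0; i < n; i++)  /* round to nearest, away from zero */
+    out[i] = (int16_t)(in[i] * 32768.0f + (in[i] >= 0 ? 0.5f : -0.5f));
+}
+#endif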
+
+static const int16_t dct_i8[64] = {
+   11585,  11585,  11585,  11585,
+   11585,  11585,  11585,  11585,
+   16069,  13623,   9102,   3196,
+   -3196,  -9102, -13623, -16069,
+   15137,   6270,  -6270, -15137,
+  -15137,  -6270,   6270,  15137,
+   13623,  -3196, -16069,  -9102,
+    9102,  16069,   3196, -13623,
+   11585, -11585, -11585,  11585,
+   11585, -11585, -11585,  11585,
+    9102, -16069,   3196,  13623,
+  -13623,  -3196,  16069,  -9102,
+    6270, -15137,  15137,  -6270,
+   -6270,  15137, -15137,   6270,
+    3196,  -9102,  13623, -16069,
+   16069, -13623,   9102,  -3196
+};
+
+static const int16_t adst_i8[64] = {
+    2921,   5742,   8368,  10708,
+   12684,  14228,  15288,  15827,
+    8368,  14228,  15827,  12684,
+    5742,  -2921, -10708, -15288,
+   12684,  15288,   5742,  -8368,
+  -15827, -10708,   2921,  14228,
+   15288,   8368, -10708, -14228,
+    2921,  15827,   5742, -12684,
+   15827,  -2921, -15288,   5742,
+   14228,  -8368, -12684,  10708,
+   14228, -12684,  -2921,  15288,
+  -10708,  -5742,  15827,  -8368,
+   10708, -15827,  12684,  -2921,
+   -8368,  15288, -14228,   5742,
+    5742, -10708,  14228, -15827,
+   15288, -12684,   8368,  -2921
+};
+
+static const float dct_16[256] = {
+  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,
+  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,
+  0.351851,  0.338330,  0.311806,  0.273300,  0.224292,  0.166664,  0.102631,  0.034654,
+ -0.034654, -0.102631, -0.166664, -0.224292, -0.273300, -0.311806, -0.338330, -0.351851,
+  0.346760,  0.293969,  0.196424,  0.068975, -0.068975, -0.196424, -0.293969, -0.346760,
+ -0.346760, -0.293969, -0.196424, -0.068975,  0.068975,  0.196424,  0.293969,  0.346760,
+  0.338330,  0.224292,  0.034654, -0.166664, -0.311806, -0.351851, -0.273300, -0.102631,
+  0.102631,  0.273300,  0.351851,  0.311806,  0.166664, -0.034654, -0.224292, -0.338330,
+  0.326641,  0.135299, -0.135299, -0.326641, -0.326641, -0.135299,  0.135299,  0.326641,
+  0.326641,  0.135299, -0.135299, -0.326641, -0.326641, -0.135299,  0.135299,  0.326641,
+  0.311806,  0.034654, -0.273300, -0.338330, -0.102631,  0.224292,  0.351851,  0.166664,
+ -0.166664, -0.351851, -0.224292,  0.102631,  0.338330,  0.273300, -0.034654, -0.311806,
+  0.293969, -0.068975, -0.346760, -0.196424,  0.196424,  0.346760,  0.068975, -0.293969,
+ -0.293969,  0.068975,  0.346760,  0.196424, -0.196424, -0.346760, -0.068975,  0.293969,
+  0.273300, -0.166664, -0.338330,  0.034654,  0.351851,  0.102631, -0.311806, -0.224292,
+  0.224292,  0.311806, -0.102631, -0.351851, -0.034654,  0.338330,  0.166664, -0.273300,
+  0.250000, -0.250000, -0.250000,  0.250000,  0.250000, -0.250000, -0.250000,  0.250000,
+  0.250000, -0.250000, -0.250000,  0.250000,  0.250000, -0.250000, -0.250000,  0.250000,
+  0.224292, -0.311806, -0.102631,  0.351851, -0.034654, -0.338330,  0.166664,  0.273300,
+ -0.273300, -0.166664,  0.338330,  0.034654, -0.351851,  0.102631,  0.311806, -0.224292,
+  0.196424, -0.346760,  0.068975,  0.293969, -0.293969, -0.068975,  0.346760, -0.196424,
+ -0.196424,  0.346760, -0.068975, -0.293969,  0.293969,  0.068975, -0.346760,  0.196424,
+  0.166664, -0.351851,  0.224292,  0.102631, -0.338330,  0.273300,  0.034654, -0.311806,
+  0.311806, -0.034654, -0.273300,  0.338330, -0.102631, -0.224292,  0.351851, -0.166664,
+  0.135299, -0.326641,  0.326641, -0.135299, -0.135299,  0.326641, -0.326641,  0.135299,
+  0.135299, -0.326641,  0.326641, -0.135299, -0.135299,  0.326641, -0.326641,  0.135299,
+  0.102631, -0.273300,  0.351851, -0.311806,  0.166664,  0.034654, -0.224292,  0.338330,
+ -0.338330,  0.224292, -0.034654, -0.166664,  0.311806, -0.351851,  0.273300, -0.102631,
+  0.068975, -0.196424,  0.293969, -0.346760,  0.346760, -0.293969,  0.196424, -0.068975,
+ -0.068975,  0.196424, -0.293969,  0.346760, -0.346760,  0.293969, -0.196424,  0.068975,
+  0.034654, -0.102631,  0.166664, -0.224292,  0.273300, -0.311806,  0.338330, -0.351851,
+  0.351851, -0.338330,  0.311806, -0.273300,  0.224292, -0.166664,  0.102631, -0.034654
+};
+
+static const float adst_16[256] = {
+  0.033094,  0.065889,  0.098087,  0.129396,  0.159534,  0.188227,  0.215215,  0.240255,
+  0.263118,  0.283599,  0.301511,  0.316693,  0.329007,  0.338341,  0.344612,  0.347761,
+  0.098087,  0.188227,  0.263118,  0.316693,  0.344612,  0.344612,  0.316693,  0.263118,
+  0.188227,  0.098087,  0.000000, -0.098087, -0.188227, -0.263118, -0.316693, -0.344612,
+  0.159534,  0.283599,  0.344612,  0.329007,  0.240255,  0.098087, -0.065889, -0.215215,
+ -0.316693, -0.347761, -0.301511, -0.188227, -0.033094,  0.129396,  0.263118,  0.338341,
+  0.215215,  0.338341,  0.316693,  0.159534, -0.065889, -0.263118, -0.347761, -0.283599,
+ -0.098087,  0.129396,  0.301511,  0.344612,  0.240255,  0.033094, -0.188227, -0.329007,
+  0.263118,  0.344612,  0.188227, -0.098087, -0.316693, -0.316693, -0.098087,  0.188227,
+  0.344612,  0.263118,  0.000000, -0.263118, -0.344612, -0.188227,  0.098087,  0.316693,
+  0.301511,  0.301511,  0.000000, -0.301511, -0.301511, -0.000000,  0.301511,  0.301511,
+  0.000000, -0.301511, -0.301511, -0.000000,  0.301511,  0.301511,  0.000000, -0.301511,
+  0.329007,  0.215215, -0.188227, -0.338341, -0.033094,  0.316693,  0.240255, -0.159534,
+ -0.344612, -0.065889,  0.301511,  0.263118, -0.129396, -0.347761, -0.098087,  0.283599,
+  0.344612,  0.098087, -0.316693, -0.188227,  0.263118,  0.263118, -0.188227, -0.316693,
+  0.098087,  0.344612,  0.000000, -0.344612, -0.098087,  0.316693,  0.188227, -0.263118,
+  0.347761, -0.033094, -0.344612,  0.065889,  0.338341, -0.098087, -0.329007,  0.129396,
+  0.316693, -0.159534, -0.301511,  0.188227,  0.283599, -0.215215, -0.263118,  0.240255,
+  0.338341, -0.159534, -0.263118,  0.283599,  0.129396, -0.344612,  0.033094,  0.329007,
+ -0.188227, -0.240255,  0.301511,  0.098087, -0.347761,  0.065889,  0.316693, -0.215215,
+  0.316693, -0.263118, -0.098087,  0.344612, -0.188227, -0.188227,  0.344612, -0.098087,
+ -0.263118,  0.316693,  0.000000, -0.316693,  0.263118,  0.098087, -0.344612,  0.188227,
+  0.283599, -0.329007,  0.098087,  0.215215, -0.347761,  0.188227,  0.129396, -0.338341,
+  0.263118,  0.033094, -0.301511,  0.316693, -0.065889, -0.240255,  0.344612, -0.159534,
+  0.240255, -0.347761,  0.263118, -0.033094, -0.215215,  0.344612, -0.283599,  0.065889,
+  0.188227, -0.338341,  0.301511, -0.098087, -0.159534,  0.329007, -0.316693,  0.129396,
+  0.188227, -0.316693,  0.344612, -0.263118,  0.098087,  0.098087, -0.263118,  0.344612,
+ -0.316693,  0.188227,  0.000000, -0.188227,  0.316693, -0.344612,  0.263118, -0.098087,
+  0.129396, -0.240255,  0.316693, -0.347761,  0.329007, -0.263118,  0.159534, -0.033094,
+ -0.098087,  0.215215, -0.301511,  0.344612, -0.338341,  0.283599, -0.188227,  0.065889,
+  0.065889, -0.129396,  0.188227, -0.240255,  0.283599, -0.316693,  0.338341, -0.347761,
+  0.344612, -0.329007,  0.301511, -0.263118,  0.215215, -0.159534,  0.098087, -0.033094
+};
+
+/* Integer (Q15) versions of the 16x16 transforms above. */
+static const int16_t dct_i16[256] = {
+    8192,   8192,   8192,   8192,   8192,   8192,   8192,   8192,
+    8192,   8192,   8192,   8192,   8192,   8192,   8192,   8192,
+   11529,  11086,  10217,   8955,   7350,   5461,   3363,   1136,
+   -1136,  -3363,  -5461,  -7350,  -8955, -10217, -11086, -11529,
+   11363,   9633,   6436,   2260,  -2260,  -6436,  -9633, -11363,
+  -11363,  -9633,  -6436,  -2260,   2260,   6436,   9633,  11363,
+   11086,   7350,   1136,  -5461, -10217, -11529,  -8955,  -3363,
+    3363,   8955,  11529,  10217,   5461,  -1136,  -7350, -11086,
+   10703,   4433,  -4433, -10703, -10703,  -4433,   4433,  10703,
+   10703,   4433,  -4433, -10703, -10703,  -4433,   4433,  10703,
+   10217,   1136,  -8955, -11086,  -3363,   7350,  11529,   5461,
+   -5461, -11529,  -7350,   3363,  11086,   8955,  -1136, -10217,
+    9633,  -2260, -11363,  -6436,   6436,  11363,   2260,  -9633,
+   -9633,   2260,  11363,   6436,  -6436, -11363,  -2260,   9633,
+    8955,  -5461, -11086,   1136,  11529,   3363, -10217,  -7350,
+    7350,  10217,  -3363, -11529,  -1136,  11086,   5461,  -8955,
+    8192,  -8192,  -8192,   8192,   8192,  -8192,  -8192,   8192,
+    8192,  -8192,  -8192,   8192,   8192,  -8192,  -8192,   8192,
+    7350, -10217,  -3363,  11529,  -1136, -11086,   5461,   8955,
+   -8955,  -5461,  11086,   1136, -11529,   3363,  10217,  -7350,
+    6436, -11363,   2260,   9633,  -9633,  -2260,  11363,  -6436,
+   -6436,  11363,  -2260,  -9633,   9633,   2260, -11363,   6436,
+    5461, -11529,   7350,   3363, -11086,   8955,   1136, -10217,
+   10217,  -1136,  -8955,  11086,  -3363,  -7350,  11529,  -5461,
+    4433, -10703,  10703,  -4433,  -4433,  10703, -10703,   4433,
+    4433, -10703,  10703,  -4433,  -4433,  10703, -10703,   4433,
+    3363,  -8955,  11529, -10217,   5461,   1136,  -7350,  11086,
+  -11086,   7350,  -1136,  -5461,  10217, -11529,   8955,  -3363,
+    2260,  -6436,   9633, -11363,  11363,  -9633,   6436,  -2260,
+   -2260,   6436,  -9633,  11363, -11363,   9633,  -6436,   2260,
+    1136,  -3363,   5461,  -7350,   8955, -10217,  11086, -11529,
+   11529, -11086,  10217,  -8955,   7350,  -5461,   3363,  -1136
+};
+
+static const int16_t adst_i16[256] = {
+    1084,   2159,   3214,   4240,   5228,   6168,   7052,   7873,
+    8622,   9293,   9880,  10377,  10781,  11087,  11292,  11395,
+    3214,   6168,   8622,  10377,  11292,  11292,  10377,   8622,
+    6168,   3214,      0,  -3214,  -6168,  -8622, -10377, -11292,
+    5228,   9293,  11292,  10781,   7873,   3214,  -2159,  -7052,
+  -10377, -11395,  -9880,  -6168,  -1084,   4240,   8622,  11087,
+    7052,  11087,  10377,   5228,  -2159,  -8622, -11395,  -9293,
+   -3214,   4240,   9880,  11292,   7873,   1084,  -6168, -10781,
+    8622,  11292,   6168,  -3214, -10377, -10377,  -3214,   6168,
+   11292,   8622,      0,  -8622, -11292,  -6168,   3214,  10377,
+    9880,   9880,      0,  -9880,  -9880,      0,   9880,   9880,
+       0,  -9880,  -9880,      0,   9880,   9880,      0,  -9880,
+   10781,   7052,  -6168, -11087,  -1084,  10377,   7873,  -5228,
+  -11292,  -2159,   9880,   8622,  -4240, -11395,  -3214,   9293,
+   11292,   3214, -10377,  -6168,   8622,   8622,  -6168, -10377,
+    3214,  11292,      0, -11292,  -3214,  10377,   6168,  -8622,
+   11395,  -1084, -11292,   2159,  11087,  -3214, -10781,   4240,
+   10377,  -5228,  -9880,   6168,   9293,  -7052,  -8622,   7873,
+   11087,  -5228,  -8622,   9293,   4240, -11292,   1084,  10781,
+   -6168,  -7873,   9880,   3214, -11395,   2159,  10377,  -7052,
+   10377,  -8622,  -3214,  11292,  -6168,  -6168,  11292,  -3214,
+   -8622,  10377,      0, -10377,   8622,   3214, -11292,   6168,
+    9293, -10781,   3214,   7052, -11395,   6168,   4240, -11087,
+    8622,   1084,  -9880,  10377,  -2159,  -7873,  11292,  -5228,
+    7873, -11395,   8622,  -1084,  -7052,  11292,  -9293,   2159,
+    6168, -11087,   9880,  -3214,  -5228,  10781, -10377,   4240,
+    6168, -10377,  11292,  -8622,   3214,   3214,  -8622,  11292,
+  -10377,   6168,      0,  -6168,  10377, -11292,   8622,  -3214,
+    4240,  -7873,  10377, -11395,  10781,  -8622,   5228,  -1084,
+   -3214,   7052,  -9880,  11292, -11087,   9293,  -6168,   2159,
+    2159,  -4240,   6168,  -7873,   9293, -10377,  11087, -11395,
+   11292, -10781,   9880,  -8622,   7052,  -5228,   3214,  -1084
+};
+
+static const int xC1S7 = 16069;
+static const int xC2S6 = 15137;
+static const int xC3S5 = 13623;
+static const int xC4S4 = 11585;
+static const int xC5S3 =  9102;
+static const int xC6S2 =  6270;
+static const int xC7S1 =  3196;
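+
+// These constants appear to be cos(k * pi / 16) in Q14, with the names read
+// as xCkS(8-k) since cos(k * pi / 16) == sin((8 - k) * pi / 16); e.g.
+// xC4S4 = round(cos(pi/4) * 16384) = 11585.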
+
+#define SHIFT_BITS 14
+#define DOROUND(X) X += (1<<(SHIFT_BITS-1));
+
+#define FINAL_SHIFT 3
+#define FINAL_ROUNDING (1<<(FINAL_SHIFT -1))
+#define IN_SHIFT (FINAL_SHIFT+1)
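+
+// DOROUND adds half of the divisor before the arithmetic right shift, i.e.
+// a round-to-nearest Q14 rescale. A minimal standalone sketch of the idiom
+// (hypothetical helper, not part of this change):
+#if 0
+static int mul_q14(int x, int c_q14) {
+  int p = x * c_q14;            /* product carries a 2^14 scale factor */
+  p += 1 << (SHIFT_BITS - 1);   /* add 0.5 ulp so the shift rounds */
+  return p >> SHIFT_BITS;       /* back to the input scale */
+}
+#endif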
+
+
+void vp9_short_fdct8x8_c(short *InputData, short *OutputData, int pitch) {
+  int loop;
+  int short_pitch = pitch >> 1;
+  int is07, is12, is34, is56;
+  int is0734, is1256;
+  int id07, id12, id34, id56;
+  int irot_input_x, irot_input_y;
+  int icommon_product1;      // Re-used product  (c4s4 * (s12 - s56))
+  int icommon_product2;      // Re-used product  (c4s4 * (d12 + d56))
+  int temp1, temp2;          // intermediate variable for computation
+
+  int  InterData[64];
+  int  *ip = InterData;
+  short *op = OutputData;
+
+  for (loop = 0; loop < 8; loop++) {
+    // Precalculate some common sums and differences.
+    is07 = (InputData[0] + InputData[7]) << IN_SHIFT;
+    is12 = (InputData[1] + InputData[2]) << IN_SHIFT;
+    is34 = (InputData[3] + InputData[4]) << IN_SHIFT;
+    is56 = (InputData[5] + InputData[6]) << IN_SHIFT;
+    id07 = (InputData[0] - InputData[7]) << IN_SHIFT;
+    id12 = (InputData[1] - InputData[2]) << IN_SHIFT;
+    id34 = (InputData[3] - InputData[4]) << IN_SHIFT;
+    id56 = (InputData[5] - InputData[6]) << IN_SHIFT;
+
+    is0734 = is07 + is34;
+    is1256 = is12 + is56;
+
+    // Precalculate some common product terms.
+    icommon_product1 = xC4S4 * (is12 - is56);
+    DOROUND(icommon_product1)
+    icommon_product1 >>= SHIFT_BITS;
+
+    icommon_product2 = xC4S4 * (id12 + id56);
+    DOROUND(icommon_product2)
+    icommon_product2 >>= SHIFT_BITS;
+
+
+    ip[0] = (xC4S4 * (is0734 + is1256));
+    DOROUND(ip[0]);
+    ip[0] >>= SHIFT_BITS;
+
+    ip[4] = (xC4S4 * (is0734 - is1256));
+    DOROUND(ip[4]);
+    ip[4] >>= SHIFT_BITS;
+
+    // Define inputs to rotation for outputs 2 and 6
+    irot_input_x = id12 - id56;
+    irot_input_y = is07 - is34;
+
+    // Apply rotation for outputs 2 and 6.
+    temp1 = xC6S2 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC2S6 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    ip[2] = temp1 + temp2;
+
+    temp1 = xC6S2 * irot_input_y;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC2S6 * irot_input_x;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    ip[6] = temp1 - temp2;
+
+    // Define inputs to rotation for outputs 1 and 7
+    irot_input_x = icommon_product1 + id07;
+    irot_input_y = -(id34 + icommon_product2);
+
+    // Apply rotation for outputs 1 and 7.
+    temp1 = xC1S7 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC7S1 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    ip[1] = temp1 - temp2;
+
+    temp1 = xC7S1 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC1S7 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    ip[7] = temp1 + temp2;
+
+    // Define inputs to rotation for outputs 3 and 5
+    irot_input_x = id07 - icommon_product1;
+    irot_input_y = id34 - icommon_product2;
+
+    // Apply rotation for outputs 3 and 5.
+    temp1 = xC3S5 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC5S3 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    ip[3] = temp1 - temp2;
+
+
+    temp1 = xC5S3 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC3S5 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    ip[5] = temp1 + temp2;
+
+    // Increment data pointer for next row
+    InputData += short_pitch;
+    ip += 8;
+  }
+
+  // The rows have been transformed; now apply the DCT to the columns.
+  ip = InterData;
+  for (loop = 0; loop < 8; loop++) {
+    // Precalculate some common sums and differences.
+    is07 = ip[0 * 8] + ip[7 * 8];
+    is12 = ip[1 * 8] + ip[2 * 8];
+    is34 = ip[3 * 8] + ip[4 * 8];
+    is56 = ip[5 * 8] + ip[6 * 8];
+
+    id07 = ip[0 * 8] - ip[7 * 8];
+    id12 = ip[1 * 8] - ip[2 * 8];
+    id34 = ip[3 * 8] - ip[4 * 8];
+    id56 = ip[5 * 8] - ip[6 * 8];
+
+    is0734 = is07 + is34;
+    is1256 = is12 + is56;
+
+    // Precalculate some common product terms.
+    icommon_product1 = xC4S4 * (is12 - is56);
+    icommon_product2 = xC4S4 * (id12 + id56);
+    DOROUND(icommon_product1)
+    DOROUND(icommon_product2)
+    icommon_product1 >>= SHIFT_BITS;
+    icommon_product2 >>= SHIFT_BITS;
+
+
+    temp1 = xC4S4 * (is0734 + is1256);
+    temp2 = xC4S4 * (is0734 - is1256);
+    DOROUND(temp1);
+    DOROUND(temp2);
+    temp1 >>= SHIFT_BITS;
+
+    temp2 >>= SHIFT_BITS;
+    op[0 * 8] = (temp1 + FINAL_ROUNDING) >> FINAL_SHIFT;
+    op[4 * 8] = (temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+    // Define inputs to rotation for outputs 2 and 6
+    irot_input_x = id12 - id56;
+    irot_input_y = is07 - is34;
+
+    // Apply rotation for outputs 2 and 6.
+    temp1 = xC6S2 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC2S6 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    op[2 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+    temp1 = xC6S2 * irot_input_y;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC2S6 * irot_input_x;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    op[6 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+    // Define inputs to rotation for outputs 1 and 7
+    irot_input_x = icommon_product1 + id07;
+    irot_input_y = -(id34 + icommon_product2);
+
+    // Apply rotation for outputs 1 and 7.
+    temp1 = xC1S7 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC7S1 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    op[1 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+    temp1 = xC7S1 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC1S7 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    op[7 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+    // Define inputs to rotation for outputs 3 and 5
+    irot_input_x = id07 - icommon_product1;
+    irot_input_y = id34 - icommon_product2;
+
+    // Apply rotation for outputs 3 and 5.
+    temp1 = xC3S5 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC5S3 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    op[3 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+
+    temp1 = xC5S3 * irot_input_x;
+    DOROUND(temp1);
+    temp1 >>= SHIFT_BITS;
+    temp2 = xC3S5 * irot_input_y;
+    DOROUND(temp2);
+    temp2 >>= SHIFT_BITS;
+    op[5 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;
+
+    // Increment data pointer for next column.
+    ip++;
+    op++;
+  }
+}
+
+void vp9_short_fhaar2x2_c(short *input, short *output, int pitch) {
+  /* 2x2 Haar transform based on [1 1; 1 -1], applied to the values at */
+  /* positions 0, 1, 4 and 8 of the 4x4 block. */
+  int i;
+  short *ip1 = input;
+  short *op1 = output;
+  for (i = 0; i < 16; i++) {
+    op1[i] = 0;
+  }
+
+  op1[0] = (ip1[0] + ip1[1] + ip1[4] + ip1[8] + 1) >> 1;
+  op1[1] = (ip1[0] - ip1[1] + ip1[4] - ip1[8]) >> 1;
+  op1[4] = (ip1[0] + ip1[1] - ip1[4] - ip1[8]) >> 1;
+  op1[8] = (ip1[0] - ip1[1] - ip1[4] + ip1[8]) >> 1;
+}
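+
+// Worked example: for ip1[0,1,4,8] = {1, 2, 3, 4} the outputs are
+// op1[0] = (1+2+3+4+1)>>1 = 5, op1[1] = (1-2+3-4)>>1 = -1,
+// op1[4] = (1+2-3-4)>>1 = -2, op1[8] = (1-2-3+4)>>1 = 0;
+// only the DC term gets the +1 rounding before the shift.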
+
+/* Test switch: build with the integer (1) or floating-point (0) transform. */
+#define TEST_INT 1
+#if TEST_INT
+#define vp9_fht_int_c vp9_fht_c
+#else
+#define vp9_fht_float_c vp9_fht_c
+#endif
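+
+// With TEST_INT set, the define above renames vp9_fht_int_c to vp9_fht_c at
+// compile time, so callers of vp9_fht_c transparently get the integer
+// implementation; with TEST_INT unset they get the floating-point one.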
+
+void vp9_fht_float_c(const int16_t *input, int pitch, int16_t *output,
+                     TX_TYPE tx_type, int tx_dim) {
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+  {
+    int i, j, k;
+    float bufa[256], bufb[256];  // buffers for the floating-point test;
+                                 // the implementation could be simplified
+                                 // in conjunction with the integer transform
+    const int16_t *ip = input;
+    int16_t *op = output;
+
+    float *pfa = &bufa[0];
+    float *pfb = &bufb[0];
+
+    // pointers to vertical and horizontal transforms
+    const float *ptv, *pth;
+
+    assert(tx_type != DCT_DCT);
+    // load and convert residual array into floating-point
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
+        pfa[i] = (float)ip[i];
+      }
+      pfa += tx_dim;
+      ip  += pitch / 2;
+    }
+
+    // vertical transformation
+    pfa = &bufa[0];
+    pfb = &bufb[0];
+
+    switch (tx_type) {
+      case ADST_ADST :
+      case ADST_DCT  :
+        ptv = (tx_dim == 4) ? &adst_4[0] :
+                              ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
+        break;
+
+      default :
+        ptv = (tx_dim == 4) ? &dct_4[0] :
+                              ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
+        break;
+    }
+
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
+        pfb[i] = 0;
+        for (k = 0; k < tx_dim; k++) {
+          pfb[i] += ptv[k] * pfa[(k * tx_dim)];
+        }
+        pfa += 1;
+      }
+      pfb += tx_dim;
+      ptv += tx_dim;
+      pfa = &bufa[0];
+    }
+
+    // horizontal transformation
+    pfa = &bufa[0];
+    pfb = &bufb[0];
+
+    switch (tx_type) {
+      case ADST_ADST :
+      case  DCT_ADST :
+        pth = (tx_dim == 4) ? &adst_4[0] :
+                              ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
+        break;
+
+      default :
+        pth = (tx_dim == 4) ? &dct_4[0] :
+                              ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
+        break;
+    }
+
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
+        pfa[i] = 0;
+        for (k = 0; k < tx_dim; k++) {
+          pfa[i] += pfb[k] * pth[k];
+        }
+        pth += tx_dim;
+      }
+
+      pfa += tx_dim;
+      pfb += tx_dim;
+      // pth -= tx_dim * tx_dim;
+
+      switch (tx_type) {
+        case ADST_ADST :
+        case  DCT_ADST :
+          pth = (tx_dim == 4) ? &adst_4[0] :
+                                ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
+          break;
+
+        default :
+          pth = (tx_dim == 4) ? &dct_4[0] :
+                                ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
+          break;
+      }
+    }
+
+    // convert to short integer format and load BLOCKD buffer
+    op = output;
+    pfa = &bufa[0];
+
+    for (j = 0; j < tx_dim; j++) {
+      for (i = 0; i < tx_dim; i++) {
+        op[i] = (pfa[i] > 0) ? (int16_t)(8 * pfa[i] + 0.49) :
+                              -(int16_t)(-8 * pfa[i] + 0.49);
+      }
+      op  += tx_dim;
+      pfa += tx_dim;
+    }
+  }
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+}
+
+/* Fixed-point implementation of the hybrid transform. */
+#define VERTICAL_SHIFT 11
+#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
+#define HORIZONTAL_SHIFT 16
+#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
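+
+// Shift accounting (assuming Q15 coefficient tables): the vertical pass keeps
+// 15 - 11 = 4 extra bits (x16), and the horizontal pass drops 16 of the
+// 15 + 4 = 19 accumulated scale bits, leaving an overall output scale of
+// 2^3 = 8 -- matching the "8 * pfa[i]" scaling of the floating-point path.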
+void vp9_fht_int_c(const int16_t *input, int pitch, int16_t *output,
+                   TX_TYPE tx_type, int tx_dim) {
+  int i, j, k;
+  int16_t imbuf[256];
+
+  const int16_t *ip = input;
+  int16_t *op = output;
+  int16_t *im = &imbuf[0];
+
+  /* pointers to vertical and horizontal transforms. */
+  const int16_t *ptv = NULL, *pth = NULL;
+
+  switch (tx_type) {
+    case ADST_ADST :
+      ptv = pth = (tx_dim == 4) ? &adst_i4[0]
+                                  : ((tx_dim == 8) ? &adst_i8[0]
+                                                     : &adst_i16[0]);
+      break;
+    case ADST_DCT  :
+      ptv = (tx_dim == 4) ? &adst_i4[0]
+                            : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
+      pth = (tx_dim == 4) ? &dct_i4[0]
+                            : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
+      break;
+    case  DCT_ADST :
+      ptv = (tx_dim == 4) ? &dct_i4[0]
+                            : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
+      pth = (tx_dim == 4) ? &adst_i4[0]
+                            : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
+      break;
+    case  DCT_DCT :
+      ptv = pth = (tx_dim == 4) ? &dct_i4[0]
+                                  : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  /* vertical transformation */
+  for (j = 0; j < tx_dim; j++) {
+    for (i = 0; i < tx_dim; i++) {
+      int temp = 0;
+
+      for (k = 0; k < tx_dim; k++) {
+        temp += ptv[k] * ip[(k * (pitch >> 1))];
+      }
+
+      im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
+      ip++;
+    }
+    im += tx_dim;  // advance one row in the tx_dim x tx_dim buffer
+    ptv += tx_dim;
+    ip = input;
+  }
+
+  /* horizontal transformation */
+  im = &imbuf[0];
+
+  for (j = 0; j < tx_dim; j++) {
+    const int16_t *pthc = pth;
+
+    for (i = 0; i < tx_dim; i++) {
+      int temp = 0;
+
+      for (k = 0; k < tx_dim; k++) {
+        temp += im[k] * pthc[k];
+      }
+
+      op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
+      pthc += tx_dim;
+    }
+
+    im += tx_dim;  // advance one row in the tx_dim x tx_dim buffer
+    op += tx_dim;
+  }
+}
+
+void vp9_short_fdct4x4_c(short *input, short *output, int pitch) {
+  int i;
+  int a1, b1, c1, d1;
+  short *ip = input;
+  short *op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ((ip[0] + ip[3]) << 5);
+    b1 = ((ip[1] + ip[2]) << 5);
+    c1 = ((ip[1] - ip[2]) << 5);
+    d1 = ((ip[0] - ip[3]) << 5);
+
+    op[0] = a1 + b1;
+    op[2] = a1 - b1;
+
+    op[1] = (c1 * 2217 + d1 * 5352 +  14500) >> 12;
+    op[3] = (d1 * 2217 - c1 * 5352 +   7500) >> 12;
+
+    ip += pitch / 2;
+    op += 4;
+  }
+  ip = output;
+  op = output;
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] + ip[12];
+    b1 = ip[4] + ip[8];
+    c1 = ip[4] - ip[8];
+    d1 = ip[0] - ip[12];
+
+    op[0]  = (a1 + b1 + 7) >> 4;
+    op[8]  = (a1 - b1 + 7) >> 4;
+
+    op[4]  = ((c1 * 2217 + d1 * 5352 +  12000) >> 16) + (d1 != 0);
+    op[12] = (d1 * 2217 - c1 * 5352 +  51000) >> 16;
+
+    ip++;
+    op++;
+  }
+}
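+
+// The constants 2217 and 5352 appear to be sqrt(2)*sin(pi/8) and
+// sqrt(2)*cos(pi/8) in Q12. Usage sketch (hypothetical values; pitch is in
+// bytes, i.e. twice the number of shorts per row):
+#if 0
+short residual[4 * 4] = {0};   /* 4x4 block of prediction residuals */
+short coeffs[16];
+vp9_short_fdct4x4_c(residual, coeffs, 8);   /* 4 shorts/row * 2 bytes */
+#endif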
+
+void vp9_short_fdct8x4_c(short *input, short *output, int pitch) {
+  vp9_short_fdct4x4_c(input,     output,      pitch);
+  vp9_short_fdct4x4_c(input + 4, output + 16, pitch);
+}
+
+void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
+  int i;
+  int a1, b1, c1, d1;
+  short *ip = input;
+  short *op = output;
+  int pitch_short = pitch >> 1;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
+    b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
+    c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
+    d1 = ip[0 * pitch_short] - ip[3 * pitch_short];
+
+    op[0] = (a1 + b1 + 1) >> 1;
+    op[4] = (c1 + d1) >> 1;
+    op[8] = (a1 - b1) >> 1;
+    op[12] = (d1 - c1) >> 1;
+
+    ip++;
+    op++;
+  }
+  ip = output;
+  op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] + ip[3];
+    b1 = ip[1] + ip[2];
+    c1 = ip[1] - ip[2];
+    d1 = ip[0] - ip[3];
+
+    op[0] = (a1 + b1 + 1) >> 1;
+    op[1] = (c1 + d1) >> 1;
+    op[2] = (a1 - b1) >> 1;
+    op[3] = (d1 - c1) >> 1;
+
+    ip += 4;
+    op += 4;
+  }
+}
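+
+// This is a 4x4 Walsh-Hadamard transform applied down the columns and then
+// across the rows; note that only the first output of each pass gets the +1
+// rounding term before the >> 1 scaling.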
+
+#if CONFIG_LOSSLESS
+void vp9_short_walsh4x4_lossless_c(short *input, short *output, int pitch) {
+  int i;
+  int a1, b1, c1, d1;
+  short *ip = input;
+  short *op = output;
+  int pitch_short = pitch >> 1;
+
+  for (i = 0; i < 4; i++) {
+    a1 = (ip[0 * pitch_short] + ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
+    b1 = (ip[1 * pitch_short] + ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
+    c1 = (ip[1 * pitch_short] - ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
+    d1 = (ip[0 * pitch_short] - ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
+
+    op[0] = (a1 + b1 + 1) >> 1;
+    op[4] = (c1 + d1) >> 1;
+    op[8] = (a1 - b1) >> 1;
+    op[12] = (d1 - c1) >> 1;
+
+    ip++;
+    op++;
+  }
+  ip = output;
+  op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] + ip[3];
+    b1 = ip[1] + ip[2];
+    c1 = ip[1] - ip[2];
+    d1 = ip[0] - ip[3];
+
+    op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+    op[1] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+    op[2] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+    op[3] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
+
+    ip += 4;
+    op += 4;
+  }
+}
+
+void vp9_short_walsh4x4_x8_c(short *input, short *output, int pitch) {
+  int i;
+  int a1, b1, c1, d1;
+  short *ip = input;
+  short *op = output;
+  int pitch_short = pitch >> 1;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
+    b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
+    c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
+    d1 = ip[0 * pitch_short] - ip[3 * pitch_short];
+
+    op[0] = (a1 + b1 + 1) >> 1;
+    op[4] = (c1 + d1) >> 1;
+    op[8] = (a1 - b1) >> 1;
+    op[12] = (d1 - c1) >> 1;
+
+    ip++;
+    op++;
+  }
+  ip = output;
+  op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] + ip[3];
+    b1 = ip[1] + ip[2];
+    c1 = ip[1] - ip[2];
+    d1 = ip[0] - ip[3];
+
+    op[0] = ((a1 + b1 + 1) >> 1) << WHT_UPSCALE_FACTOR;
+    op[1] = ((c1 + d1) >> 1) << WHT_UPSCALE_FACTOR;
+    op[2] = ((a1 - b1) >> 1) << WHT_UPSCALE_FACTOR;
+    op[3] = ((d1 - c1) >> 1) << WHT_UPSCALE_FACTOR;
+
+    ip += 4;
+    op += 4;
+  }
+}
+
+void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) {
+  vp9_short_walsh4x4_x8_c(input,   output,    pitch);
+  vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch);
+}
+#endif
+
+static const double C1 = 0.995184726672197;
+static const double C2 = 0.98078528040323;
+static const double C3 = 0.956940335732209;
+static const double C4 = 0.923879532511287;
+static const double C5 = 0.881921264348355;
+static const double C6 = 0.831469612302545;
+static const double C7 = 0.773010453362737;
+static const double C8 = 0.707106781186548;
+static const double C9 = 0.634393284163646;
+static const double C10 = 0.555570233019602;
+static const double C11 = 0.471396736825998;
+static const double C12 = 0.38268343236509;
+static const double C13 = 0.290284677254462;
+static const double C14 = 0.195090322016128;
+static const double C15 = 0.098017140329561;
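+
+// Ck here appears to be cos(k * pi / 32); e.g. C8 = cos(pi/4) ~= 0.7071 and
+// C15 = cos(15*pi/32) ~= 0.0980, consistent with a 16-point DCT butterfly.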
+
+static void dct16x16_1d(double input[16], double output[16]) {
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    double step[16];
+    double intermediate[16];
+    double temp1, temp2;
+
+    // step 1
+    step[ 0] = input[0] + input[15];
+    step[ 1] = input[1] + input[14];
+    step[ 2] = input[2] + input[13];
+    step[ 3] = input[3] + input[12];
+    step[ 4] = input[4] + input[11];
+    step[ 5] = input[5] + input[10];
+    step[ 6] = input[6] + input[ 9];
+    step[ 7] = input[7] + input[ 8];
+    step[ 8] = input[7] - input[ 8];
+    step[ 9] = input[6] - input[ 9];
+    step[10] = input[5] - input[10];
+    step[11] = input[4] - input[11];
+    step[12] = input[3] - input[12];
+    step[13] = input[2] - input[13];
+    step[14] = input[1] - input[14];
+    step[15] = input[0] - input[15];
+
+    // step 2
+    output[0] = step[0] + step[7];
+    output[1] = step[1] + step[6];
+    output[2] = step[2] + step[5];
+    output[3] = step[3] + step[4];
+    output[4] = step[3] - step[4];
+    output[5] = step[2] - step[5];
+    output[6] = step[1] - step[6];
+    output[7] = step[0] - step[7];
+
+    temp1 = step[ 8]*C7;
+    temp2 = step[15]*C9;
+    output[ 8] = temp1 + temp2;
+
+    temp1 = step[ 9]*C11;
+    temp2 = step[14]*C5;
+    output[ 9] = temp1 - temp2;
+
+    temp1 = step[10]*C3;
+    temp2 = step[13]*C13;
+    output[10] = temp1 + temp2;
+
+    temp1 = step[11]*C15;
+    temp2 = step[12]*C1;
+    output[11] = temp1 - temp2;
+
+    temp1 = step[11]*C1;
+    temp2 = step[12]*C15;
+    output[12] = temp2 + temp1;
+
+    temp1 = step[10]*C13;
+    temp2 = step[13]*C3;
+    output[13] = temp2 - temp1;
+
+    temp1 = step[ 9]*C5;
+    temp2 = step[14]*C11;
+    output[14] = temp2 + temp1;
+
+    temp1 = step[ 8]*C9;
+    temp2 = step[15]*C7;
+    output[15] = temp2 - temp1;
+
+    // step 3
+    step[ 0] = output[0] + output[3];
+    step[ 1] = output[1] + output[2];
+    step[ 2] = output[1] - output[2];
+    step[ 3] = output[0] - output[3];
+
+    temp1 = output[4]*C14;
+    temp2 = output[7]*C2;
+    step[ 4] = temp1 + temp2;
+
+    temp1 = output[5]*C10;
+    temp2 = output[6]*C6;
+    step[ 5] = temp1 + temp2;
+
+    temp1 = output[5]*C6;
+    temp2 = output[6]*C10;
+    step[ 6] = temp2 - temp1;
+
+    temp1 = output[4]*C2;
+    temp2 = output[7]*C14;
+    step[ 7] = temp2 - temp1;
+
+    step[ 8] = output[ 8] + output[11];
+    step[ 9] = output[ 9] + output[10];
+    step[10] = output[ 9] - output[10];
+    step[11] = output[ 8] - output[11];
+
+    step[12] = output[12] + output[15];
+    step[13] = output[13] + output[14];
+    step[14] = output[13] - output[14];
+    step[15] = output[12] - output[15];
+
+    // step 4
+    output[ 0] = (step[ 0] + step[ 1]);
+    output[ 8] = (step[ 0] - step[ 1]);
+
+    temp1 = step[2]*C12;
+    temp2 = step[3]*C4;
+    temp1 = temp1 + temp2;
+    output[ 4] = 2*(temp1*C8);
+
+    temp1 = step[2]*C4;
+    temp2 = step[3]*C12;
+    temp1 = temp2 - temp1;
+    output[12] = 2*(temp1*C8);
+
+    output[ 2] = 2*((step[4] + step[ 5])*C8);
+    output[14] = 2*((step[7] - step[ 6])*C8);
+
+    temp1 = step[4] - step[5];
+    temp2 = step[6] + step[7];
+    output[ 6] = (temp1 + temp2);
+    output[10] = (temp1 - temp2);
+
+    intermediate[8] = step[8] + step[14];
+    intermediate[9] = step[9] + step[15];
+
+    temp1 = intermediate[8]*C12;
+    temp2 = intermediate[9]*C4;
+    temp1 = temp1 - temp2;
+    output[3] = 2*(temp1*C8);
+
+    temp1 = intermediate[8]*C4;
+    temp2 = intermediate[9]*C12;
+    temp1 = temp2 + temp1;
+    output[13] = 2*(temp1*C8);
+
+    output[ 9] = 2*((step[10] + step[11])*C8);
+
+    intermediate[11] = step[10] - step[11];
+    intermediate[12] = step[12] + step[13];
+    intermediate[13] = step[12] - step[13];
+    intermediate[14] = step[ 8] - step[14];
+    intermediate[15] = step[ 9] - step[15];
+
+    output[15] = (intermediate[11] + intermediate[12]);
+    output[ 1] = -(intermediate[11] - intermediate[12]);
+
+    output[ 7] = 2*(intermediate[13]*C8);
+
+    temp1 = intermediate[14]*C12;
+    temp2 = intermediate[15]*C4;
+    temp1 = temp1 - temp2;
+    output[11] = -2*(temp1*C8);
+
+    temp1 = intermediate[14]*C4;
+    temp2 = intermediate[15]*C12;
+    temp1 = temp2 + temp1;
+    output[ 5] = 2*(temp1*C8);
+  }
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
+
+void vp9_short_fdct16x16_c(short *input, short *out, int pitch) {
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+  {
+    int shortpitch = pitch >> 1;
+    int i, j;
+    double output[256];
+    // First transform columns
+    for (i = 0; i < 16; i++) {
+      double temp_in[16], temp_out[16];
+      for (j = 0; j < 16; j++)
+        temp_in[j] = input[j*shortpitch + i];
+      dct16x16_1d(temp_in, temp_out);
+      for (j = 0; j < 16; j++)
+        output[j*16 + i] = temp_out[j];
+    }
+    // Then transform rows
+    for (i = 0; i < 16; ++i) {
+      double temp_in[16], temp_out[16];
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = output[j + i*16];
+      dct16x16_1d(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        output[j + i*16] = temp_out[j];
+    }
+    // Scale by some magic number
+    for (i = 0; i < 256; i++)
+      out[i] = (short)round(output[i]/2);
+  }
+  vp9_clear_system_state(); // Make it simd safe : __asm emms;
+}
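+
+// Usage sketch (hypothetical values): the transform is separable, columns
+// first and then rows, with pitch again given in bytes:
+#if 0
+short residual[16 * 16] = {0};
+short coeffs[256];
+vp9_short_fdct16x16_c(residual, coeffs, 32);  /* 16 shorts/row * 2 bytes */
+#endif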
--- /dev/null
+++ b/vp9/encoder/encodeframe.c
@@ -1,0 +1,2342 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "encodemb.h"
+#include "encodemv.h"
+#include "vp9/common/common.h"
+#include "onyx_int.h"
+#include "vp9/common/extend.h"
+#include "vp9/common/entropymode.h"
+#include "vp9/common/quant_common.h"
+#include "segmentation.h"
+#include "vp9/common/setupintrarecon.h"
+#include "vp9/common/reconintra4x4.h"
+#include "encodeintra.h"
+#include "vp9/common/reconinter.h"
+#include "vp9/common/invtrans.h"
+#include "rdopt.h"
+#include "vp9/common/findnearmv.h"
+#include "vp9/common/reconintra.h"
+#include "vp9/common/seg_common.h"
+#include "vpx_rtcd.h"
+#include <stdio.h>
+#include <math.h>
+#include <limits.h>
+#include "vp9/common/subpixel.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vp9/common/pred_common.h"
+
+#define DBG_PRNT_SEGMAP 0
+#if CONFIG_NEWBESTREFMV
+#include "vp9/common/mvref_common.h"
+#endif
+
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD(x)     &cpi->common.rtcd.x
+#define IF_RTCD(x)  (x)
+#else
+#define RTCD(x)     NULL
+#define IF_RTCD(x)  NULL
+#endif
+
+#ifdef ENC_DEBUG
+int enc_debug = 0;
+int mb_row_debug, mb_col_debug;
+#endif
+
+extern void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex);
+
+extern void vp9_auto_select_speed(VP9_COMP *cpi);
+
+int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                              int recon_yoffset, int recon_uvoffset,
+                              int *returnrate, int *returndistortion);
+
+extern void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
+                                           int recon_yoffset,
+                                           int recon_uvoffset, int *r, int *d);
+
+void vp9_build_block_offsets(MACROBLOCK *x);
+
+void vp9_setup_block_ptrs(MACROBLOCK *x);
+
+void vp9_encode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+                                 int recon_yoffset, int recon_uvoffset,
+                                 int output_enabled);
+
+void vp9_encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+                                 int recon_yoffset, int recon_uvoffset,
+                                 int mb_col, int mb_row);
+
+void vp9_encode_intra_macro_block(VP9_COMP *cpi, MACROBLOCK *x,
+                                  TOKENEXTRA **t, int output_enabled);
+
+void vp9_encode_intra_super_block(VP9_COMP *cpi, MACROBLOCK *x,
+                                  TOKENEXTRA **t, int mb_col);
+
+static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
+
+#ifdef MODE_STATS
+unsigned int inter_y_modes[MB_MODE_COUNT];
+unsigned int inter_uv_modes[VP9_UV_MODES];
+unsigned int inter_b_modes[B_MODE_COUNT];
+unsigned int y_modes[VP9_YMODES];
+unsigned int i8x8_modes[VP9_I8X8_MODES];
+unsigned int uv_modes[VP9_UV_MODES];
+unsigned int uv_modes_y[VP9_YMODES][VP9_UV_MODES];
+unsigned int b_modes[B_MODE_COUNT];
+#endif
+
+
+/* activity_avg must be positive, or flat regions could get a zero weight
+ *  (infinite lambda), which confounds analysis.
+ * This also avoids the need for divide by zero checks in
+ *  vp9_activity_masking().
+ */
+#define VP9_ACTIVITY_AVG_MIN (64)
+
+/* This is used as a reference when computing the source variance for the
+ *  purposes of activity masking.
+ * Eventually this should be replaced by custom no-reference routines,
+ *  which will be faster.
+ */
+static const unsigned char VP9_VAR_OFFS[16] = {
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+
+// Original activity measure from Tim T's code.
+static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) {
+  unsigned int act;
+  unsigned int sse;
+  /* TODO: This could also be done over smaller areas (8x8), but that would
+   *  require extensive changes elsewhere, as lambda is assumed to be fixed
+   *  over an entire MB in most of the code.
+   * Another option is to compute four 8x8 variances, and pick a single
+   *  lambda using a non-linear combination (e.g., the smallest, or second
+   *  smallest, etc.).
+   */
+  act = vp9_variance16x16(x->src.y_buffer, x->src.y_stride, VP9_VAR_OFFS, 0,
+                          &sse);
+  act = act << 4;
+
+  /* If the region is flat, lower the activity some more. */
+  if (act < 8 << 12)
+    act = act < 5 << 12 ? act : 5 << 12;
+
+  return act;
+}
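+
+// In the flatness clamp above, 8 << 12 and 5 << 12 are thresholds on the
+// variance scaled by 16: blocks measuring below 8 << 12 are treated as flat
+// and capped at 5 << 12, so very flat regions get a lower activity score.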
+
+// Stub for alternative experimental activity measures.
+static unsigned int alt_activity_measure(VP9_COMP *cpi,
+                                         MACROBLOCK *x, int use_dc_pred) {
+  return vp9_encode_intra(cpi, x, use_dc_pred);
+}
+
+
+// Measure the activity of the current macroblock.
+// Exactly what is measured is TBD, so it is abstracted into this function.
+#define ALT_ACT_MEASURE 1
+static unsigned int mb_activity_measure(VP9_COMP *cpi, MACROBLOCK *x,
+                                        int mb_row, int mb_col) {
+  unsigned int mb_activity;
+
+  if (ALT_ACT_MEASURE) {
+    int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+
+    // Or use an alternative measure.
+    mb_activity = alt_activity_measure(cpi, x, use_dc_pred);
+  } else {
+    // Original activity measure from Tim T's code.
+    mb_activity = tt_activity_measure(cpi, x);
+  }
+
+  if (mb_activity < VP9_ACTIVITY_AVG_MIN)
+    mb_activity = VP9_ACTIVITY_AVG_MIN;
+
+  return mb_activity;
+}
+
+// Calculate an "average" mb activity value for the frame
+#define ACT_MEDIAN 0
+static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
+#if ACT_MEDIAN
+  // Find median: Simple n^2 algorithm for experimentation
+  {
+    unsigned int median;
+    unsigned int i, j;
+    unsigned int *sortlist;
+    unsigned int tmp;
+
+    // Create a list to sort into
+    CHECK_MEM_ERROR(sortlist,
+                    vpx_calloc(sizeof(unsigned int), cpi->common.MBs));
+
+    // Copy map to sort list
+    vpx_memcpy(sortlist, cpi->mb_activity_map,
+               sizeof(unsigned int) * cpi->common.MBs);
+
+
+    // Ripple each value down to its correct position
+    for (i = 1; i < cpi->common.MBs; i++) {
+      for (j = i; j > 0; j--) {
+        if (sortlist[j] < sortlist[j - 1]) {
+          // Swap values
+          tmp = sortlist[j - 1];
+          sortlist[j - 1] = sortlist[j];
+          sortlist[j] = tmp;
+        } else
+          break;
+      }
+    }
+
+    // Even number of MBs, so estimate the median as the mean of the two
+    // values either side of the midpoint.
+    median = (1 + sortlist[cpi->common.MBs >> 1] +
+              sortlist[(cpi->common.MBs >> 1) + 1]) >> 1;
+
+    cpi->activity_avg = median;
+
+    vpx_free(sortlist);
+  }
+#else
+  // Simple mean for now
+  cpi->activity_avg = (unsigned int)(activity_sum / cpi->common.MBs);
+#endif
+
+  if (cpi->activity_avg < VP9_ACTIVITY_AVG_MIN)
+    cpi->activity_avg = VP9_ACTIVITY_AVG_MIN;
+
+  // Experimental code: return fixed value normalized for several clips
+  if (ALT_ACT_MEASURE)
+    cpi->activity_avg = 100000;
+}
+
+#define USE_ACT_INDEX   0
+#define OUTPUT_NORM_ACT_STATS   0
+
+#if USE_ACT_INDEX
+// Calculate an activity index for each mb
+static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mb_row, mb_col;
+
+  int64_t act;
+  int64_t a;
+  int64_t b;
+
+#if OUTPUT_NORM_ACT_STATS
+  FILE *f = fopen("norm_act.stt", "a");
+  fprintf(f, "\n%12d\n", cpi->activity_avg);
+#endif
+
+  // Reset pointers to start of activity map
+  x->mb_activity_ptr = cpi->mb_activity_map;
+
+  // Calculate normalized mb activity number.
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+    // for each macroblock col in image
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      // Read activity from the map
+      act = *(x->mb_activity_ptr);
+
+      // Calculate a normalized activity number
+      a = act + 4 * cpi->activity_avg;
+      b = 4 * act + cpi->activity_avg;
+
+      if (b >= a)
+        *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1;
+      else
+        *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b);
+
+#if OUTPUT_NORM_ACT_STATS
+      fprintf(f, " %6d", *(x->mb_activity_ptr));
+#endif
+      // Increment activity map pointers
+      x->mb_activity_ptr++;
+    }
+
+#if OUTPUT_NORM_ACT_STATS
+    fprintf(f, "\n");
+#endif
+
+  }
+
+#if OUTPUT_NORM_ACT_STATS
+  fclose(f);
+#endif
+
+}
+#endif
+
+// Loop through all MBs. Note the activity of each, compute the frame
+// average, and calculate a normalized activity for each MB.
+static void build_activity_map(VP9_COMP *cpi) {
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  VP9_COMMON *const cm = &cpi->common;
+
+#if ALT_ACT_MEASURE
+  YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+  int recon_yoffset;
+  int recon_y_stride = new_yv12->y_stride;
+#endif
+
+  int mb_row, mb_col;
+  unsigned int mb_activity;
+  int64_t activity_sum = 0;
+
+  // for each macroblock row in image
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+#if ALT_ACT_MEASURE
+    // reset above block coeffs
+    xd->up_available = (mb_row != 0);
+    recon_yoffset = (mb_row * recon_y_stride * 16);
+#endif
+    // for each macroblock col in image
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+#if ALT_ACT_MEASURE
+      xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
+      xd->left_available = (mb_col != 0);
+      recon_yoffset += 16;
+#endif
+      // Copy current mb to a buffer
+      vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+      // measure activity
+      mb_activity = mb_activity_measure(cpi, x, mb_row, mb_col);
+
+      // Keep frame sum
+      activity_sum += mb_activity;
+
+      // Store MB level activity details.
+      *x->mb_activity_ptr = mb_activity;
+
+      // Increment activity map pointer
+      x->mb_activity_ptr++;
+
+      // adjust to the next column of source macroblocks
+      x->src.y_buffer += 16;
+    }
+
+
+    // adjust to the next row of mbs
+    x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+
+#if ALT_ACT_MEASURE
+    // extend the recon for intra prediction
+    vp9_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
+                      xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+#endif
+
+  }
+
+  // Calculate an "average" MB activity
+  calc_av_activity(cpi, activity_sum);
+
+#if USE_ACT_INDEX
+  // Calculate an activity index number of each mb
+  calc_activity_index(cpi, x);
+#endif
+
+}
+
+// Macroblock activity masking
+void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
+#if USE_ACT_INDEX
+  x->rdmult += *(x->mb_activity_ptr) * (x->rdmult >> 2);
+  x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
+  x->errorperbit += (x->errorperbit == 0);
+#else
+  int64_t a;
+  int64_t b;
+  int64_t act = *(x->mb_activity_ptr);
+
+  // Apply the masking to the RD multiplier.
+  a = act + (2 * cpi->activity_avg);
+  b = (2 * act) + cpi->activity_avg;
+
+  x->rdmult = (unsigned int)(((int64_t)x->rdmult * b + (a >> 1)) / a);
+  x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
+  x->errorperbit += (x->errorperbit == 0);
+#endif
+
+  // Activity based Zbin adjustment
+  adjust_act_zbin(cpi, x);
+}
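+
+// The non-index path above rescales the RD multiplier by b/a, where
+// a = act + 2*avg and b = 2*act + avg; this leaves rdmult unchanged when
+// act == avg, halves it as act -> 0, and at most doubles it as act grows,
+// so unusually active MBs tolerate more distortion per bit.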
+
+static void update_state(VP9_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+  int i;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MODE_INFO *mi = &ctx->mic;
+  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+  int mb_mode = mi->mbmi.mode;
+  int mb_mode_index = ctx->best_mode_index;
+
+#if CONFIG_DEBUG
+  assert(mb_mode < MB_MODE_COUNT);
+  assert(mb_mode_index < MAX_MODES);
+  assert(mi->mbmi.ref_frame < MAX_REF_FRAMES);
+#endif
+
+  // Restore the coding context of the MB to that which was in place
+  // when the mode was picked for it
+  vpx_memcpy(xd->mode_info_context, mi, sizeof(MODE_INFO));
+#if CONFIG_SUPERBLOCKS
+  if (mi->mbmi.encoded_as_sb) {
+    const int mis = cpi->common.mode_info_stride;
+    if (xd->mb_to_right_edge > 0)
+      vpx_memcpy(xd->mode_info_context + 1, mi, sizeof(MODE_INFO));
+    if (xd->mb_to_bottom_edge > 0) {
+      vpx_memcpy(xd->mode_info_context + mis, mi, sizeof(MODE_INFO));
+      if (xd->mb_to_right_edge > 0)
+        vpx_memcpy(xd->mode_info_context + mis + 1, mi, sizeof(MODE_INFO));
+    }
+  }
+#endif
+
+  if (mb_mode == B_PRED) {
+    for (i = 0; i < 16; i++) {
+      xd->block[i].bmi.as_mode = xd->mode_info_context->bmi[i].as_mode;
+      assert(xd->block[i].bmi.as_mode.first < MB_MODE_COUNT);
+    }
+  } else if (mb_mode == I8X8_PRED) {
+    for (i = 0; i < 16; i++) {
+      xd->block[i].bmi = xd->mode_info_context->bmi[i];
+    }
+  } else if (mb_mode == SPLITMV) {
+    vpx_memcpy(x->partition_info, &ctx->partition_info,
+               sizeof(PARTITION_INFO));
+
+    mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
+    mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
+  }
+
+  {
+    int segment_id = mbmi->segment_id;
+    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB)) {
+      for (i = 0; i < NB_TXFM_MODES; i++) {
+        cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i];
+      }
+    }
+  }
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    // Restore the coding modes to that held in the coding context
+    // if (mb_mode == B_PRED)
+    //    for (i = 0; i < 16; i++)
+    //    {
+    //        xd->block[i].bmi.as_mode =
+    //                          xd->mode_info_context->bmi[i].as_mode;
+    //        assert(xd->mode_info_context->bmi[i].as_mode < MB_MODE_COUNT);
+    //    }
+#if CONFIG_INTERNAL_STATS
+    static const int kf_mode_index[] = {
+      THR_DC /*DC_PRED*/,
+      THR_V_PRED /*V_PRED*/,
+      THR_H_PRED /*H_PRED*/,
+      THR_D45_PRED /*D45_PRED*/,
+      THR_D135_PRED /*D135_PRED*/,
+      THR_D117_PRED /*D117_PRED*/,
+      THR_D153_PRED /*D153_PRED*/,
+      THR_D27_PRED /*D27_PRED*/,
+      THR_D63_PRED /*D63_PRED*/,
+      THR_TM /*TM_PRED*/,
+      THR_I8X8_PRED /*I8X8_PRED*/,
+      THR_B_PRED /*B_PRED*/,
+    };
+    cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++;
+#endif
+  } else {
+    /*
+            // Reduce the activation RD thresholds for the best choice mode
+            if ((cpi->rd_baseline_thresh[mb_mode_index] > 0) &&
+                (cpi->rd_baseline_thresh[mb_mode_index] < (INT_MAX >> 2)))
+            {
+                int best_adjustment = (cpi->rd_thresh_mult[mb_mode_index] >> 2);
+
+                cpi->rd_thresh_mult[mb_mode_index] =
+                        (cpi->rd_thresh_mult[mb_mode_index]
+                         >= (MIN_THRESHMULT + best_adjustment)) ?
+                                cpi->rd_thresh_mult[mb_mode_index] - best_adjustment :
+                                MIN_THRESHMULT;
+                cpi->rd_threshes[mb_mode_index] =
+                        (cpi->rd_baseline_thresh[mb_mode_index] >> 7)
+                        * cpi->rd_thresh_mult[mb_mode_index];
+
+            }
+    */
+    // Note how often each mode chosen as best
+    cpi->mode_chosen_counts[mb_mode_index]++;
+
+    cpi->prediction_error += ctx->distortion;
+    cpi->intra_error += ctx->intra_error;
+
+    cpi->rd_comp_pred_diff[0] += ctx->single_pred_diff;
+    cpi->rd_comp_pred_diff[1] += ctx->comp_pred_diff;
+    cpi->rd_comp_pred_diff[2] += ctx->hybrid_pred_diff;
+  }
+}
+
+static void pick_mb_modes(VP9_COMP *cpi,
+                          VP9_COMMON *cm,
+                          int mb_row,
+                          int mb_col,
+                          MACROBLOCK  *x,
+                          MACROBLOCKD *xd,
+                          TOKENEXTRA **tp,
+                          int *totalrate,
+                          int *totaldist) {
+  int i;
+  int map_index;
+  int recon_yoffset, recon_uvoffset;
+  int ref_fb_idx = cm->lst_fb_idx;
+  int dst_fb_idx = cm->new_fb_idx;
+  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+  ENTROPY_CONTEXT_PLANES left_context[2];
+  ENTROPY_CONTEXT_PLANES above_context[2];
+  ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
+                                                      + mb_col;
+
+  // Offsets to move pointers from MB to MB within a SB in raster order
+  int row_delta[4] = { 0, +1,  0, -1};
+  int col_delta[4] = { +1, -1, +1, +1};
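+  // Applied after each MB is coded, these deltas walk the 2x2 SB in raster
+  // order: (+1 col), (+1 row, -1 col), (+1 col), then (-1 row, +1 col),
+  // leaving the pointers at the top-left MB of the next SB.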
+
+  /* Function should not modify L & A contexts; save and restore on exit */
+  vpx_memcpy(left_context,
+             cm->left_context,
+             sizeof(left_context));
+  vpx_memcpy(above_context,
+             initial_above_context_ptr,
+             sizeof(above_context));
+
+  /* Encode MBs in raster order within the SB */
+  for (i = 0; i < 4; i++) {
+    int dy = row_delta[i];
+    int dx = col_delta[i];
+    int offset_unextended = dy * cm->mb_cols + dx;
+    int offset_extended   = dy * xd->mode_info_stride + dx;
+    MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+
+    // TODO Many of the index items here can be computed more efficiently!
+
+    if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
+      // MB lies outside frame, move on
+      mb_row += dy;
+      mb_col += dx;
+
+      // Update pointers
+      x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
+      x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
+      x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
+
+      x->gf_active_ptr += offset_unextended;
+      x->partition_info += offset_extended;
+      xd->mode_info_context += offset_extended;
+      xd->prev_mode_info_context += offset_extended;
+#if CONFIG_DEBUG
+      assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
+             (xd->mode_info_context - cpi->common.mip));
+#endif
+      continue;
+    }
+
+    // Index of the MB in the SB 0..3
+    xd->mb_index = i;
+
+    map_index = (mb_row * cpi->common.mb_cols) + mb_col;
+    x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+    // set above context pointer
+    xd->above_context = cm->above_context + mb_col;
+
+    // Restore the appropriate left context depending on which row of
+    // the SB the MB is situated in
+    xd->left_context = cm->left_context + (i >> 1);
+
+    // Set up distance of MB to edge of frame in 1/8th pel units
+    xd->mb_to_top_edge    = -((mb_row * 16) << 3);
+    xd->mb_to_left_edge   = -((mb_col * 16) << 3);
+    xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+    xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+    // Set up limit values for MV components to prevent them from
+    // extending beyond the UMV borders assuming 16x16 block size
+    x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+    x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+    x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+                     (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+    x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+                     (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+
+    xd->up_available   = (mb_row != 0);
+    xd->left_available = (mb_col != 0);
+
+    recon_yoffset  = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+    recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col *  8);
+
+    xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+    xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+    xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+
+    // Copy current MB to a work buffer
+    vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+    x->rddiv = cpi->RDDIV;
+    x->rdmult = cpi->RDMULT;
+
+    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+      vp9_activity_masking(cpi, x);
+
+    // Is segmentation enabled
+    if (xd->segmentation_enabled) {
+      // Code to set segment id in xd->mbmi.segment_id
+      if (xd->update_mb_segmentation_map)
+        mbmi->segment_id = cpi->segmentation_map[map_index];
+      else
+        mbmi->segment_id = cm->last_frame_seg_map[map_index];
+      if (mbmi->segment_id > 3)
+        mbmi->segment_id = 0;
+
+      vp9_mb_init_quantizer(cpi, x);
+    } else
+      // Set to Segment 0 by default
+      mbmi->segment_id = 0;
+
+    x->active_ptr = cpi->active_map + map_index;
+
+#if CONFIG_SUPERBLOCKS
+    xd->mode_info_context->mbmi.encoded_as_sb = 0;
+#endif
+
+    cpi->update_context = 0;    // TODO Do we need this now??
+
+    vp9_intra_prediction_down_copy(xd);
+
+    // Find best coding mode & reconstruct the MB so it is available
+    // as a predictor for MBs that follow in the SB
+    if (cm->frame_type == KEY_FRAME) {
+      int r, d;
+      vp9_rd_pick_intra_mode(cpi, x, &r, &d);
+      *totalrate += r;
+      *totaldist += d;
+
+      // Dummy encode, do not do the tokenization
+      vp9_encode_intra_macro_block(cpi, x, tp, 0);
+      // Note the encoder may have changed the segment_id
+
+      // Save the coding context
+      vpx_memcpy(&x->mb_context[i].mic, xd->mode_info_context,
+                 sizeof(MODE_INFO));
+    } else {
+      int seg_id, r, d;
+
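+      // seg0_progress is a 16.16 fixed-point fraction of how far through
+      // the frame's segment-0 MBs the encoder is. When segment 1 is pinned
+      // to exactly one reference frame, track it per segment-0 MB coded;
+      // otherwise estimate it from the raster position.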
+      if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
+          !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
+          vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
+          vp9_check_segref(xd, 1, INTRA_FRAME)  +
+          vp9_check_segref(xd, 1, LAST_FRAME)   +
+          vp9_check_segref(xd, 1, GOLDEN_FRAME) +
+          vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
+        cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
+      } else {
+        cpi->seg0_progress =
+            (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols + i) << 16) /
+            cm->MBs;
+      }
+
+      vp9_pick_mode_inter_macroblock(cpi, x, recon_yoffset,
+                                     recon_uvoffset, &r, &d);
+      *totalrate += r;
+      *totaldist += d;
+
+      // Dummy encode, do not do the tokenization
+      vp9_encode_inter_macroblock(cpi, x, tp,
+                                  recon_yoffset, recon_uvoffset, 0);
+
+      seg_id = mbmi->segment_id;
+      if (cpi->mb.e_mbd.segmentation_enabled && seg_id == 0) {
+        cpi->seg0_idx++;
+      }
+      if (!xd->segmentation_enabled ||
+          !vp9_segfeature_active(xd, seg_id, SEG_LVL_REF_FRAME) ||
+          vp9_check_segref(xd, seg_id, INTRA_FRAME)  +
+          vp9_check_segref(xd, seg_id, LAST_FRAME)   +
+          vp9_check_segref(xd, seg_id, GOLDEN_FRAME) +
+          vp9_check_segref(xd, seg_id, ALTREF_FRAME) > 1) {
+        // Get the prediction context and status
+        int pred_flag = vp9_get_pred_flag(xd, PRED_REF);
+        int pred_context = vp9_get_pred_context(cm, xd, PRED_REF);
+
+        // Count prediction success
+        cpi->ref_pred_count[pred_context][pred_flag]++;
+      }
+    }
+
+    // Next MB
+    mb_row += dy;
+    mb_col += dx;
+
+    x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
+    x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
+    x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
+
+    x->gf_active_ptr += offset_unextended;
+    x->partition_info += offset_extended;
+    xd->mode_info_context += offset_extended;
+    xd->prev_mode_info_context += offset_extended;
+
+#if CONFIG_DEBUG
+    assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
+           (xd->mode_info_context - cpi->common.mip));
+#endif
+  }
+
+  /* Restore L & A coding context to those in place on entry */
+  vpx_memcpy(cm->left_context,
+             left_context,
+             sizeof(left_context));
+  vpx_memcpy(initial_above_context_ptr,
+             above_context,
+             sizeof(above_context));
+}
+
+#if CONFIG_SUPERBLOCKS
+static void pick_sb_modes(VP9_COMP *cpi,
+                          VP9_COMMON *cm,
+                          int mb_row,
+                          int mb_col,
+                          MACROBLOCK  *x,
+                          MACROBLOCKD *xd,
+                          TOKENEXTRA **tp,
+                          int *totalrate,
+                          int *totaldist) {
+  int map_index;
+  int recon_yoffset, recon_uvoffset;
+  int ref_fb_idx = cm->lst_fb_idx;
+  int dst_fb_idx = cm->new_fb_idx;
+  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
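+  /* A 32x32 SB spans two MB rows and two MB columns, so two planes of
+   * left and above entropy context are saved */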
+  ENTROPY_CONTEXT_PLANES left_context[2];
+  ENTROPY_CONTEXT_PLANES above_context[2];
+  ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
+    + mb_col;
+
+  /* Function should not modify L & A contexts; save and restore on exit */
+  vpx_memcpy(left_context,
+             cm->left_context,
+             sizeof(left_context));
+  vpx_memcpy(above_context,
+             initial_above_context_ptr,
+             sizeof(above_context));
+
+  map_index = (mb_row * cpi->common.mb_cols) + mb_col;
+  x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+  /* set above context pointer */
+  xd->above_context = cm->above_context + mb_col;
+
+  /* The SB spans both rows of the left context, so point at the start
+   * of the row's context */
+  xd->left_context = cm->left_context;
+
+  // Set up distance of MB to edge of frame in 1/8th pel units
+  xd->mb_to_top_edge    = -((mb_row * 16) << 3);
+  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
+  xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+  xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+  /* Set up limit values for MV components to prevent them from
+   * extending beyond the UMV borders, assuming a 32x32 block size */
+  x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+  x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+  x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+                   (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+  x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+                   (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+
+  xd->up_available   = (mb_row != 0);
+  xd->left_available = (mb_col != 0);
+
+  recon_yoffset  = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+  recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col *  8);
+
+  xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+  xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+  xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+#if 0 // FIXME
+  /* Copy current MB to a work buffer */
+  vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+#endif
+  x->rddiv = cpi->RDDIV;
+  x->rdmult = cpi->RDMULT;
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+    vp9_activity_masking(cpi, x);
+  /* Is segmentation enabled */
+  if (xd->segmentation_enabled) {
+    /* The SB is given segment id 1 only if all four of its MBs have a
+     * nonzero segment id; otherwise it falls back to segment 0 */
+    if (xd->update_mb_segmentation_map)
+      xd->mode_info_context->mbmi.segment_id =
+            cpi->segmentation_map[map_index] &&
+            cpi->segmentation_map[map_index + 1] &&
+            cpi->segmentation_map[map_index + cm->mb_cols] &&
+            cpi->segmentation_map[map_index + cm->mb_cols + 1];
+    else
+      xd->mode_info_context->mbmi.segment_id =
+            cm->last_frame_seg_map[map_index] &&
+            cm->last_frame_seg_map[map_index + 1] &&
+            cm->last_frame_seg_map[map_index + cm->mb_cols] &&
+            cm->last_frame_seg_map[map_index + cm->mb_cols + 1];
+    if (xd->mode_info_context->mbmi.segment_id > 3)
+      xd->mode_info_context->mbmi.segment_id = 0;
+
+    vp9_mb_init_quantizer(cpi, x);
+  } else
+    /* Set to Segment 0 by default */
+    xd->mode_info_context->mbmi.segment_id = 0;
+
+  x->active_ptr = cpi->active_map + map_index;
+
+  cpi->update_context = 0;    // TODO: do we need this now?
+
+  /* Find best coding mode & reconstruct the MB so it is available
+   * as a predictor for MBs that follow in the SB */
+  if (cm->frame_type == KEY_FRAME) {
+    vp9_rd_pick_intra_mode_sb(cpi, x,
+                              totalrate,
+                              totaldist);
+
+    /* Save the coding context */
+    vpx_memcpy(&x->sb_context[0].mic, xd->mode_info_context,
+               sizeof(MODE_INFO));
+  } else {
+    if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
+        !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
+        vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
+        vp9_check_segref(xd, 1, INTRA_FRAME)  +
+        vp9_check_segref(xd, 1, LAST_FRAME)   +
+        vp9_check_segref(xd, 1, GOLDEN_FRAME) +
+        vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
+      cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
+    } else {
+      cpi->seg0_progress =
+        (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols) << 16) / cm->MBs;
+    }
+
+    vp9_rd_pick_inter_mode_sb(cpi, x,
+                              recon_yoffset,
+                              recon_uvoffset,
+                              totalrate,
+                              totaldist);
+  }
+
+  /* Restore L & A coding context to those in place on entry */
+  vpx_memcpy(cm->left_context,
+             left_context,
+             sizeof(left_context));
+  vpx_memcpy(initial_above_context_ptr,
+             above_context,
+             sizeof(above_context));
+}
+#endif
+
+static void encode_sb(VP9_COMP *cpi,
+                      VP9_COMMON *cm,
+                      int mbrow,
+                      int mbcol,
+                      MACROBLOCK  *x,
+                      MACROBLOCKD *xd,
+                      TOKENEXTRA **tp) {
+  int i;
+  int map_index;
+  int mb_row, mb_col;
+  int recon_yoffset, recon_uvoffset;
+  int ref_fb_idx = cm->lst_fb_idx;
+  int dst_fb_idx = cm->new_fb_idx;
+  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+  int row_delta[4] = { 0, +1,  0, -1};
+  int col_delta[4] = { +1, -1, +1, +1};
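+  // Applied after each MB, these deltas walk the SB's four MBs in raster
+  // order: right, then down and back left, then right; the final step
+  // (up and right) leaves the pointers at the next SB.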
+
+  mb_row = mbrow;
+  mb_col = mbcol;
+
+  /* Encode MBs in raster order within the SB */
+  for (i = 0; i < 4; i++) {
+    int dy = row_delta[i];
+    int dx = col_delta[i];
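+    // The mode-info arrays carry a border entry per row (the stride is
+    // mb_cols + 1), while gf_active_flags does not; hence two step sizes.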
+    int offset_extended   = dy * xd->mode_info_stride + dx;
+    int offset_unextended = dy * cm->mb_cols + dx;
+    MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+
+    if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
+      // MB lies outside frame, move on
+      mb_row += dy;
+      mb_col += dx;
+
+      x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
+      x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
+      x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
+
+      x->gf_active_ptr      += offset_unextended;
+      x->partition_info     += offset_extended;
+      xd->mode_info_context += offset_extended;
+      xd->prev_mode_info_context += offset_extended;
+
+#if CONFIG_DEBUG
+      assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
+             (xd->mode_info_context - cpi->common.mip));
+#endif
+      continue;
+    }
+
+    xd->mb_index = i;
+
+#ifdef ENC_DEBUG
+    enc_debug = (cpi->common.current_video_frame == 0 &&
+                 mb_row == 0 && mb_col == 0);
+    mb_col_debug = mb_col;
+    mb_row_debug = mb_row;
+#endif
+
+    // Restore MB state to that when it was picked
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      update_state(cpi, x, &x->sb_context[i]);
+      cpi->sb_count++;
+    } else
+#endif
+      update_state(cpi, x, &x->mb_context[i]);
+
+    map_index = (mb_row * cpi->common.mb_cols) + mb_col;
+    x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+    // reset above block coeffs
+    xd->above_context = cm->above_context + mb_col;
+    xd->left_context  = cm->left_context + (i >> 1);
+
+    // Set up distance of MB to edge of the frame in 1/8th pel units
+    xd->mb_to_top_edge    = -((mb_row * 16) << 3);
+    xd->mb_to_left_edge   = -((mb_col * 16) << 3);
+    xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+    xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      // Set up limit values for MV components to prevent them from
+      // extending beyond the UMV borders assuming 32x32 block size
+      x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+      x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+      x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+                       (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+      x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+                       (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+    } else {
+#endif
+      // Set up limit values for MV components to prevent them from
+      // extending beyond the UMV borders assuming 16x16 block size
+      x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+      x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+      x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+                       (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+      x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+                       (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+#if CONFIG_SUPERBLOCKS
+    }
+#endif
+
+    xd->up_available = (mb_row != 0);
+    xd->left_available = (mb_col != 0);
+
+    recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+    recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
+
+    xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+    xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+    xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+
+    // Copy current MB to a work buffer
+    vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+      vp9_activity_masking(cpi, x);
+
+    // Is segmentation enabled
+    if (xd->segmentation_enabled) {
+      vp9_mb_init_quantizer(cpi, x);
+    }
+
+    x->active_ptr = cpi->active_map + map_index;
+
+    cpi->update_context = 0;
+
+#if CONFIG_SUPERBLOCKS
+    if (!xd->mode_info_context->mbmi.encoded_as_sb)
+#endif
+      vp9_intra_prediction_down_copy(xd);
+
+    if (cm->frame_type == KEY_FRAME) {
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb)
+        vp9_encode_intra_super_block(cpi, x, tp, mb_col);
+      else
+#endif
+        vp9_encode_intra_macro_block(cpi, x, tp, 1);
+        // Note the encoder may have changed the segment_id
+
+#ifdef MODE_STATS
+      y_modes[mbmi->mode]++;
+#endif
+    } else {
+      unsigned char *segment_id;
+      int seg_ref_active;
+
+      if (xd->mode_info_context->mbmi.ref_frame) {
+        unsigned char pred_context;
+
+        pred_context = vp9_get_pred_context(cm, xd, PRED_COMP);
+
+        if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME)
+          cpi->single_pred_count[pred_context]++;
+        else
+          cpi->comp_pred_count[pred_context]++;
+      }
+
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb)
+        vp9_encode_inter_superblock(cpi, x, tp, recon_yoffset, recon_uvoffset,
+                                    mb_col, mb_row);
+      else
+#endif
+        vp9_encode_inter_macroblock(cpi, x, tp,
+                                    recon_yoffset, recon_uvoffset, 1);
+        // Note the encoder may have changed the segment_id
+
+#ifdef MODE_STATS
+      inter_y_modes[mbmi->mode]++;
+
+      if (mbmi->mode == SPLITMV) {
+        int b;
+
+        for (b = 0; b < x->partition_info->count; b++) {
+          inter_b_modes[x->partition_info->bmi[b].mode]++;
+        }
+      }
+
+#endif
+
+      // If only a single reference frame is coded for a segment, exclude
+      // it from the reference frame counts used to work out the
+      // probabilities. NOTE: at the moment we don't support custom trees
+      // for the reference frame coding of each segment, but this is a
+      // possible future action.
+      segment_id = &mbmi->segment_id;
+      seg_ref_active = vp9_segfeature_active(xd, *segment_id,
+                                             SEG_LVL_REF_FRAME);
+      if (!seg_ref_active ||
+          ((vp9_check_segref(xd, *segment_id, INTRA_FRAME) +
+            vp9_check_segref(xd, *segment_id, LAST_FRAME) +
+            vp9_check_segref(xd, *segment_id, GOLDEN_FRAME) +
+            vp9_check_segref(xd, *segment_id, ALTREF_FRAME)) > 1)) {
+        cpi->count_mb_ref_frame_usage[mbmi->ref_frame]++;
+      }
+
+      // Count of last ref frame 0,0 usage
+      if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
+        cpi->inter_zz_count++;
+    }
+
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      x->src.y_buffer += 32;
+      x->src.u_buffer += 16;
+      x->src.v_buffer += 16;
+
+      x->gf_active_ptr      += 2;
+      x->partition_info     += 2;
+      xd->mode_info_context += 2;
+      xd->prev_mode_info_context += 2;
+
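+      // An EOSB token marks where this superblock's token run ends for
+      // the bitstream writer.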
+      (*tp)->Token = EOSB_TOKEN;
+      (*tp)++;
+      if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp;
+      break;
+    }
+#endif
+
+    // Next MB
+    mb_row += dy;
+    mb_col += dx;
+
+    x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
+    x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
+    x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
+
+    x->gf_active_ptr      += offset_unextended;
+    x->partition_info     += offset_extended;
+    xd->mode_info_context += offset_extended;
+    xd->prev_mode_info_context += offset_extended;
+
+#if CONFIG_DEBUG
+    assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
+           (xd->mode_info_context - cpi->common.mip));
+#endif
+    (*tp)->Token = EOSB_TOKEN;
+    (*tp)++;
+    if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp;
+  }
+
+  // debug output
+#if DBG_PRNT_SEGMAP
+  {
+    FILE *statsfile;
+    statsfile = fopen("segmap2.stt", "a");
+    fprintf(statsfile, "\n");
+    fclose(statsfile);
+  }
+#endif
+}
+
+static void encode_sb_row(VP9_COMP *cpi,
+                          VP9_COMMON *cm,
+                          int mb_row,
+                          MACROBLOCK  *x,
+                          MACROBLOCKD *xd,
+                          TOKENEXTRA **tp,
+                          int *totalrate) {
+  int mb_col;
+  int mb_cols = cm->mb_cols;
+
+  // Initialize the left context for the new SB row
+  vpx_memset(cm->left_context, 0, sizeof(cm->left_context));
+
+  // Code each SB in the row
+  for (mb_col = 0; mb_col < mb_cols; mb_col += 2) {
+    int mb_rate = 0, mb_dist = 0;
+#if CONFIG_SUPERBLOCKS
+    int sb_rate = INT_MAX, sb_dist;
+#endif
+
+#if CONFIG_DEBUG
+    MODE_INFO *mic = xd->mode_info_context;
+    PARTITION_INFO *pi = x->partition_info;
+    signed char  *gfa = x->gf_active_ptr;
+    unsigned char *yb = x->src.y_buffer;
+    unsigned char *ub = x->src.u_buffer;
+    unsigned char *vb = x->src.v_buffer;
+#endif
+
+#if CONFIG_SUPERBLOCKS
+    // Pick modes assuming the SB is coded as 4 independent MBs
+    xd->mode_info_context->mbmi.encoded_as_sb = 0;
+#endif
+    pick_mb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &mb_rate, &mb_dist);
+#if CONFIG_SUPERBLOCKS
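+    // Include the cost of the bit that signals "not coded as a SB"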
+    mb_rate += vp9_cost_bit(cm->sb_coded, 0);
+#endif
+
+    x->src.y_buffer -= 32;
+    x->src.u_buffer -= 16;
+    x->src.v_buffer -= 16;
+
+    x->gf_active_ptr -= 2;
+    x->partition_info -= 2;
+    xd->mode_info_context -= 2;
+    xd->prev_mode_info_context -= 2;
+
+#if CONFIG_DEBUG
+    assert(x->gf_active_ptr == gfa);
+    assert(x->partition_info == pi);
+    assert(xd->mode_info_context == mic);
+    assert(x->src.y_buffer == yb);
+    assert(x->src.u_buffer == ub);
+    assert(x->src.v_buffer == vb);
+#endif
+
+#if CONFIG_SUPERBLOCKS
+    if (!(((    mb_cols & 1) && mb_col ==     mb_cols - 1) ||
+          ((cm->mb_rows & 1) && mb_row == cm->mb_rows - 1))) {
+      /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
+      xd->mode_info_context->mbmi.encoded_as_sb = 1;
+      pick_sb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &sb_rate, &sb_dist);
+      sb_rate += vp9_cost_bit(cm->sb_coded, 1);
+    }
+
+    /* Decide whether to encode as a SB or 4xMBs */
+    if (sb_rate < INT_MAX &&
+        RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) <
+          RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist)) {
+      xd->mode_info_context->mbmi.encoded_as_sb = 1;
+      xd->mode_info_context[1].mbmi.encoded_as_sb = 1;
+      xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 1;
+      xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 1;
+      *totalrate += sb_rate;
+    } else
+#endif
+    {
+#if CONFIG_SUPERBLOCKS
+      xd->mode_info_context->mbmi.encoded_as_sb = 0;
+      if (cm->mb_cols - 1 > mb_col)
+        xd->mode_info_context[1].mbmi.encoded_as_sb = 0;
+      if (cm->mb_rows - 1 > mb_row) {
+        xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 0;
+        if (cm->mb_cols - 1 > mb_col)
+          xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 0;
+      }
+#endif
+      *totalrate += mb_rate;
+    }
+
+    /* Encode SB using best computed mode(s) */
+    encode_sb(cpi, cm, mb_row, mb_col, x, xd, tp);
+
+#if CONFIG_DEBUG
+    assert(x->gf_active_ptr == gfa + 2);
+    assert(x->partition_info == pi + 2);
+    assert(xd->mode_info_context == mic + 2);
+    assert(x->src.y_buffer == yb + 32);
+    assert(x->src.u_buffer == ub + 16);
+    assert(x->src.v_buffer == vb + 16);
+#endif
+  }
+
+  // Advance pointers to the start of the next SB row (two MB rows down);
+  // the mode-info arrays carry a one-entry border column, gf_active does not
+  x->gf_active_ptr += mb_cols - (mb_cols & 0x1);
+  x->partition_info += xd->mode_info_stride + 1 - (mb_cols & 0x1);
+  xd->mode_info_context += xd->mode_info_stride + 1 - (mb_cols & 0x1);
+  xd->prev_mode_info_context += xd->mode_info_stride + 1 - (mb_cols & 0x1);
+
+#if CONFIG_DEBUG
+  assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
+         (xd->mode_info_context - cpi->common.mip));
+#endif
+}
+
+static void init_encode_frame_mb_context(VP9_COMP *cpi) {
+  MACROBLOCK *const x = &cpi->mb;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  // GF active flags data structure
+  x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
+
+  // Activity map pointer
+  x->mb_activity_ptr = cpi->mb_activity_map;
+
+  x->act_zbin_adj = 0;
+  cpi->seg0_idx = 0;
+  vpx_memset(cpi->ref_pred_count, 0, sizeof(cpi->ref_pred_count));
+
+  x->partition_info = x->pi;
+
+  xd->mode_info_context = cm->mi;
+  xd->mode_info_stride = cm->mode_info_stride;
+  xd->prev_mode_info_context = cm->prev_mi;
+
+  xd->frame_type = cm->frame_type;
+
+  xd->frames_since_golden = cm->frames_since_golden;
+  xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;
+
+  // reset intra mode contexts
+  if (cm->frame_type == KEY_FRAME)
+    vp9_init_mbmode_probs(cm);
+
+  // Copy data over into macro block data structures.
+  x->src = * cpi->Source;
+  xd->pre = cm->yv12_fb[cm->lst_fb_idx];
+  xd->dst = cm->yv12_fb[cm->new_fb_idx];
+
+  // set up frame for intra coded blocks
+  vp9_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);
+
+  vp9_build_block_offsets(x);
+
+  vp9_setup_block_dptrs(&x->e_mbd);
+
+  vp9_setup_block_ptrs(x);
+
+  xd->mode_info_context->mbmi.mode = DC_PRED;
+  xd->mode_info_context->mbmi.uv_mode = DC_PRED;
+
+  vp9_zero(cpi->count_mb_ref_frame_usage)
+  vp9_zero(cpi->bmode_count)
+  vp9_zero(cpi->ymode_count)
+  vp9_zero(cpi->i8x8_mode_count)
+  vp9_zero(cpi->y_uv_mode_count)
+  vp9_zero(cpi->sub_mv_ref_count)
+  vp9_zero(cpi->mbsplit_count)
+  vp9_zero(cpi->common.fc.mv_ref_ct)
+  vp9_zero(cpi->common.fc.mv_ref_ct_a)
+#if CONFIG_SUPERBLOCKS
+  vp9_zero(cpi->sb_ymode_count)
+  cpi->sb_count = 0;
+#endif
+
+  vpx_memset(cm->above_context, 0,
+             sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
+
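+  // Clearing the three fractional (1/8-pel) MV bits forces full-pixel
+  // motion when the frame is coded in full-pel mode.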
+  xd->fullpixel_mask = 0xffffffff;
+  if (cm->full_pixel)
+    xd->fullpixel_mask = 0xfffffff8;
+}
+
+static void encode_frame_internal(VP9_COMP *cpi) {
+  int mb_row;
+  MACROBLOCK *const x = &cpi->mb;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  TOKENEXTRA *tp = cpi->tok;
+  int totalrate;
+
+  //printf("encode_frame_internal\n");
+
+  // Compute a modified set of reference frame probabilities to use when
+  // prediction fails. These are based on the current general estimates for
+  // this frame which may be updated with each iteration of the recode loop.
+  vp9_compute_mod_refprobs(cm);
+
+#if CONFIG_NEW_MVREF
+  // temp stats reset
+  vp9_zero( cpi->best_ref_index_counts );
+#endif
+
+  // debug output
+#if DBG_PRNT_SEGMAP
+  {
+    FILE *statsfile;
+    statsfile = fopen("segmap2.stt", "a");
+    fprintf(statsfile, "\n");
+    fclose(statsfile);
+  }
+#endif
+
+  totalrate = 0;
+
+  // Functions setup for all frame types so we can use MC in AltRef
+  vp9_setup_interp_filters(xd, cm->mcomp_filter_type, cm);
+
+  // Reset frame count of inter 0,0 motion vector usage.
+  cpi->inter_zz_count = 0;
+
+  cpi->prediction_error = 0;
+  cpi->intra_error = 0;
+  cpi->skip_true_count[0] = cpi->skip_true_count[1] =
+      cpi->skip_true_count[2] = 0;
+  cpi->skip_false_count[0] = cpi->skip_false_count[1] =
+      cpi->skip_false_count[2] = 0;
+
+#if CONFIG_PRED_FILTER
+  if (cm->current_video_frame == 0) {
+    // Initially assume that we'll signal the prediction filter
+    // state at the frame level and that it is off.
+    cpi->common.pred_filter_mode = 0;
+    cpi->common.prob_pred_filter_off = 128;
+  }
+  cpi->pred_filter_on_count = 0;
+  cpi->pred_filter_off_count = 0;
+#endif
+  vp9_zero(cpi->switchable_interp_count);
+
+  xd->mode_info_context = cm->mi;
+  xd->prev_mode_info_context = cm->prev_mi;
+
+  vp9_zero(cpi->NMVcount);
+  vp9_zero(cpi->coef_counts);
+  vp9_zero(cpi->hybrid_coef_counts);
+  vp9_zero(cpi->coef_counts_8x8);
+  vp9_zero(cpi->hybrid_coef_counts_8x8);
+  vp9_zero(cpi->coef_counts_16x16);
+  vp9_zero(cpi->hybrid_coef_counts_16x16);
+
+  vp9_frame_init_quantizer(cpi);
+
+  vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
+  vp9_initialize_me_consts(cpi, cm->base_qindex);
+
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+    // Initialize encode frame context.
+    init_encode_frame_mb_context(cpi);
+
+    // Build a frame level activity map
+    build_activity_map(cpi);
+  }
+
+  // Re-initialize the encode frame context.
+  init_encode_frame_mb_context(cpi);
+
+  vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff));
+  vpx_memset(cpi->single_pred_count, 0, sizeof(cpi->single_pred_count));
+  vpx_memset(cpi->comp_pred_count, 0, sizeof(cpi->comp_pred_count));
+  vpx_memset(cpi->txfm_count, 0, sizeof(cpi->txfm_count));
+  vpx_memset(cpi->txfm_count_8x8p, 0, sizeof(cpi->txfm_count_8x8p));
+  vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff));
+  {
+    struct vpx_usec_timer  emr_timer;
+    vpx_usec_timer_start(&emr_timer);
+
+    {
+      // For each row of SBs in the frame
+      for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
+        int offset = (cm->mb_cols + 1) & ~0x1;
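+        // offset = MB columns stepped over while coding the SB row,
+        // i.e. mb_cols rounded up to an even count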
+
+        encode_sb_row(cpi, cm, mb_row, x, xd, &tp, &totalrate);
+
+        // adjust to the next row of SBs
+        x->src.y_buffer += 32 * x->src.y_stride - 16 * offset;
+        x->src.u_buffer += 16 * x->src.uv_stride - 8 * offset;
+        x->src.v_buffer += 16 * x->src.uv_stride - 8 * offset;
+      }
+
+      cpi->tok_count = tp - cpi->tok;
+    }
+
+    vpx_usec_timer_mark(&emr_timer);
+    cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
+
+  }
+
+  // 256 rate units to the bit,
+  // projected_frame_size in units of BYTES
+  cpi->projected_frame_size = totalrate >> 8;
+
+
+#if 0
+  // Keep record of the total distortion this time around for future use
+  cpi->last_frame_distortion = cpi->frame_distortion;
+#endif
+
+}
+
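+// Check whether more than one reference frame is actually usable this
+// frame, honouring any reference-frame restriction on segment 1; compound
+// prediction is only considered when it is.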
+static int check_dual_ref_flags(VP9_COMP *cpi) {
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  int ref_flags = cpi->ref_frame_flags;
+
+  if (vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
+    if ((ref_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) ==
+            (VP9_LAST_FLAG | VP9_GOLD_FLAG) &&
+        vp9_check_segref(xd, 1, LAST_FRAME))
+      return 1;
+    if ((ref_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) ==
+            (VP9_GOLD_FLAG | VP9_ALT_FLAG) &&
+        vp9_check_segref(xd, 1, GOLDEN_FRAME))
+      return 1;
+    if ((ref_flags & (VP9_ALT_FLAG | VP9_LAST_FLAG)) ==
+            (VP9_ALT_FLAG | VP9_LAST_FLAG) &&
+        vp9_check_segref(xd, 1, ALTREF_FRAME))
+      return 1;
+    return 0;
+  } else {
+    return (!!(ref_flags & VP9_GOLD_FLAG) +
+            !!(ref_flags & VP9_LAST_FLAG) +
+            !!(ref_flags & VP9_ALT_FLAG)) >= 2;
+  }
+}
+
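+// Once the frame-level transform mode has been narrowed, clamp the
+// recorded txfm_size of any MB that signalled a larger size; the assert
+// checks that this only happens for MBs with no coded coefficients.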
+static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
+  VP9_COMMON *cm = &cpi->common;
+  int mb_row, mb_col, mis = cm->mode_info_stride, segment_id;
+  MODE_INFO *mi, *mi_ptr = cm->mi;
+#if CONFIG_SUPERBLOCKS
+  MODE_INFO *sb_mi_ptr = cm->mi, *sb_mi;
+  MB_MODE_INFO *sb_mbmi;
+#endif
+  MB_MODE_INFO *mbmi;
+  MACROBLOCK *x = &cpi->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++, mi_ptr += mis) {
+    mi = mi_ptr;
+#if CONFIG_SUPERBLOCKS
+    sb_mi = sb_mi_ptr;
+#endif
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++, mi++) {
+      mbmi = &mi->mbmi;
+#if CONFIG_SUPERBLOCKS
+      sb_mbmi = &sb_mi->mbmi;
+#endif
+      if (
+#if CONFIG_SUPERBLOCKS
+          !sb_mbmi->encoded_as_sb &&
+#endif
+          mbmi->txfm_size > txfm_max) {
+        segment_id = mbmi->segment_id;
+        xd->mode_info_context = mi;
+        assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+                vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
+               (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
+        mbmi->txfm_size = txfm_max;
+      }
+#if CONFIG_SUPERBLOCKS
+      if (mb_col & 1)
+        sb_mi += 2;
+#endif
+    }
+#if CONFIG_SUPERBLOCKS
+    if (mb_row & 1)
+      sb_mi_ptr += 2 * mis;
+#endif
+  }
+}
+
+void vp9_encode_frame(VP9_COMP *cpi) {
+  if (cpi->sf.RD) {
+    int i, frame_type, pred_type;
+    TXFM_MODE txfm_type;
+
+    /*
+     * This code does a single RD pass over the whole frame assuming
+     * either compound, single or hybrid prediction as per whatever has
+     * worked best for that type of frame in the past.
+     * It also predicts whether another coding mode would have worked
+     * better than this coding mode. If that is the case, it remembers
+     * that for subsequent frames.
+     * It does the same analysis for transform size selection.
+     */
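+    /* frame_type buckets used for the adaptive thresholds:
+     * 0 = key frame, 1 = golden/alt-ref update, 2 = normal inter frame,
+     * 3 = alt-ref overlay (the source frame is the alt-ref) */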
+    if (cpi->common.frame_type == KEY_FRAME)
+      frame_type = 0;
+    else if (cpi->is_src_frame_alt_ref && cpi->common.refresh_golden_frame)
+      frame_type = 3;
+    else if (cpi->common.refresh_golden_frame ||
+             cpi->common.refresh_alt_ref_frame)
+      frame_type = 1;
+    else
+      frame_type = 2;
+
+    /* prediction (compound, single or hybrid) mode selection */
+    if (frame_type == 3)
+      pred_type = SINGLE_PREDICTION_ONLY;
+    else if (cpi->rd_prediction_type_threshes[frame_type][1] >
+                 cpi->rd_prediction_type_threshes[frame_type][0] &&
+             cpi->rd_prediction_type_threshes[frame_type][1] >
+                 cpi->rd_prediction_type_threshes[frame_type][2] &&
+             check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
+      pred_type = COMP_PREDICTION_ONLY;
+    else if (cpi->rd_prediction_type_threshes[frame_type][0] >
+                 cpi->rd_prediction_type_threshes[frame_type][2])
+      pred_type = SINGLE_PREDICTION_ONLY;
+    else
+      pred_type = HYBRID_PREDICTION;
+
+    /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */
+#if CONFIG_LOSSLESS
+    if (cpi->oxcf.lossless) {
+      txfm_type = ONLY_4X4;
+    } else
+#endif
+    /* FIXME (rbultje)
+     * this is a hack (no really), basically to work around the complete
+     * nonsense coefficient cost prediction for keyframes. The probabilities
+     * are reset to defaults, and thus we basically have no idea how expensive
+     * a 4x4 vs. 8x8 will really be. The result is that any estimate at which
+     * of the two is better is utterly bogus.
+     * I'd like to eventually remove this hack, but in order to do that, we
+     * need to move the frame reset code from the frame encode init to the
+     * bitstream write code, or alternatively keep a backup of the previous
+     * keyframe's probabilities as an estimate of what the current keyframe's
+     * coefficient cost distributions may look like. */
+    if (frame_type == 0) {
+      txfm_type = ALLOW_16X16;
+    } else
+#if 0
+    /* FIXME (rbultje)
+     * this code is disabled for a similar reason as the code above; the
+     * problem is that each time we "revert" to 4x4 only (or even 8x8 only),
+     * the coefficient probabilities for 16x16 (and 8x8) start lagging behind,
+     * thus leading to them lagging further behind and not being chosen for
+     * subsequent frames either. This is essentially a local minimum problem
+     * that we can probably fix by estimating real costs more closely within
+     * a frame, perhaps by re-calculating costs on-the-fly as frame encoding
+     * progresses. */
+    if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+            cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
+        cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+            cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&
+        cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+            cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
+      txfm_type = TX_MODE_SELECT;
+    } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
+                  cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]
+            && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
+                  cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]
+               ) {
+      txfm_type = ONLY_4X4;
+    } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
+                  cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
+      txfm_type = ALLOW_16X16;
+    } else
+      txfm_type = ALLOW_8X8;
+#else
+    txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
+                    cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
+                ALLOW_16X16 : TX_MODE_SELECT;
+#endif
+    cpi->common.txfm_mode = txfm_type;
+    if (txfm_type != TX_MODE_SELECT) {
+      cpi->common.prob_tx[0] = 128;
+      cpi->common.prob_tx[1] = 128;
+    }
+    cpi->common.comp_pred_mode = pred_type;
+    encode_frame_internal(cpi);
+
+    for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+      const int diff = cpi->rd_comp_pred_diff[i] / cpi->common.MBs;
+      cpi->rd_prediction_type_threshes[frame_type][i] += diff;
+      cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
+    }
+
+    for (i = 0; i < NB_TXFM_MODES; ++i) {
+      int64_t pd = cpi->rd_tx_select_diff[i];
+      int diff;
+      if (i == TX_MODE_SELECT)
+        pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZE_MAX - 1), 0);
+      diff = pd / cpi->common.MBs;
+      cpi->rd_tx_select_threshes[frame_type][i] += diff;
+      cpi->rd_tx_select_threshes[frame_type][i] /= 2;
+    }
+
+    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+      int single_count_zero = 0;
+      int comp_count_zero = 0;
+
+      for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
+        single_count_zero += cpi->single_pred_count[i];
+        comp_count_zero += cpi->comp_pred_count[i];
+      }
+
+      if (comp_count_zero == 0) {
+        cpi->common.comp_pred_mode = SINGLE_PREDICTION_ONLY;
+      } else if (single_count_zero == 0) {
+        cpi->common.comp_pred_mode = COMP_PREDICTION_ONLY;
+      }
+    }
+
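+    // If the per-MB selection collapsed to a single size in practice,
+    // fall back to a frame-level mode so no per-MB bits are spent.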
+    if (cpi->common.txfm_mode == TX_MODE_SELECT) {
+      const int count4x4 = cpi->txfm_count[TX_4X4] + cpi->txfm_count_8x8p[TX_4X4];
+      const int count8x8 = cpi->txfm_count[TX_8X8];
+      const int count8x8_8x8p = cpi->txfm_count_8x8p[TX_8X8];
+      const int count16x16 = cpi->txfm_count[TX_16X16];
+
+      if (count4x4 == 0 && count16x16 == 0) {
+        cpi->common.txfm_mode = ALLOW_8X8;
+        reset_skip_txfm_size(cpi, TX_8X8);
+      } else if (count8x8 == 0 && count16x16 == 0 && count8x8_8x8p == 0) {
+        cpi->common.txfm_mode = ONLY_4X4;
+        reset_skip_txfm_size(cpi, TX_4X4);
+      } else if (count8x8 == 0 && count4x4 == 0) {
+        cpi->common.txfm_mode = ALLOW_16X16;
+      }
+    }
+  } else {
+    encode_frame_internal(cpi);
+  }
+
+}
+
+void vp9_setup_block_ptrs(MACROBLOCK *x) {
+  int r, c;
+  int i;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      x->block[r * 4 + c].src_diff = x->src_diff + r * 4 * 16 + c * 4;
+    }
+  }
+
+  for (r = 0; r < 2; r++) {
+    for (c = 0; c < 2; c++) {
+      x->block[16 + r * 2 + c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4;
+    }
+  }
+
+
+  for (r = 0; r < 2; r++) {
+    for (c = 0; c < 2; c++) {
+      x->block[20 + r * 2 + c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4;
+    }
+  }
+
+  x->block[24].src_diff = x->src_diff + 384;
+
+
+  for (i = 0; i < 25; i++) {
+    x->block[i].coeff = x->coeff + i * 16;
+  }
+}
+
+void vp9_build_block_offsets(MACROBLOCK *x) {
+  int block = 0;
+  int br, bc;
+
+  vp9_build_block_doffsets(&x->e_mbd);
+
+  // y blocks
+  x->thismb_ptr = &x->thismb[0];
+  for (br = 0; br < 4; br++) {
+    for (bc = 0; bc < 4; bc++) {
+      BLOCK *this_block = &x->block[block];
+      // this_block->base_src = &x->src.y_buffer;
+      // this_block->src_stride = x->src.y_stride;
+      // this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+      this_block->base_src = &x->thismb_ptr;
+      this_block->src_stride = 16;
+      this_block->src = 4 * br * 16 + 4 * bc;
+      ++block;
+    }
+  }
+
+  // u blocks
+  for (br = 0; br < 2; br++) {
+    for (bc = 0; bc < 2; bc++) {
+      BLOCK *this_block = &x->block[block];
+      this_block->base_src = &x->src.u_buffer;
+      this_block->src_stride = x->src.uv_stride;
+      this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+      ++block;
+    }
+  }
+
+  // v blocks
+  for (br = 0; br < 2; br++) {
+    for (bc = 0; bc < 2; bc++) {
+      BLOCK *this_block = &x->block[block];
+      this_block->base_src = &x->src.v_buffer;
+      this_block->src_stride = x->src.uv_stride;
+      this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+      ++block;
+    }
+  }
+}
+
+static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
+  const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode;
+
+#ifdef MODE_STATS
+  const int is_key = cpi->common.frame_type == KEY_FRAME;
+
+  ++ (is_key ? uv_modes : inter_uv_modes)[uvm];
+  ++ uv_modes_y[m][uvm];
+
+  if (m == B_PRED) {
+    unsigned int *const bct = is_key ? b_modes : inter_b_modes;
+
+    int b = 0;
+
+    do {
+      ++ bct[xd->block[b].bmi.as_mode.first];
+    } while (++b < 16);
+  }
+
+  if (m == I8X8_PRED) {
+    i8x8_modes[xd->block[0].bmi.as_mode.first]++;
+    i8x8_modes[xd->block[2].bmi.as_mode.first]++;
+    i8x8_modes[xd->block[8].bmi.as_mode.first]++;
+    i8x8_modes[xd->block[10].bmi.as_mode.first]++;
+  }
+#endif
+
+#if CONFIG_SUPERBLOCKS
+  if (xd->mode_info_context->mbmi.encoded_as_sb) {
+    ++cpi->sb_ymode_count[m];
+  } else
+#endif
+    ++cpi->ymode_count[m];
+  if (m != I8X8_PRED)
+    ++cpi->y_uv_mode_count[m][uvm];
+  else {
+    cpi->i8x8_mode_count[xd->block[0].bmi.as_mode.first]++;
+    cpi->i8x8_mode_count[xd->block[2].bmi.as_mode.first]++;
+    cpi->i8x8_mode_count[xd->block[8].bmi.as_mode.first]++;
+    cpi->i8x8_mode_count[xd->block[10].bmi.as_mode.first]++;
+  }
+  if (m == B_PRED) {
+    int b = 0;
+    do {
+      ++ cpi->bmode_count[xd->block[b].bmi.as_mode.first];
+    } while (++b < 16);
+  }
+}
+
+// Experimental stub function to create a per MB zbin adjustment based on
+// some previously calculated measure of MB activity.
+static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) {
+#if USE_ACT_INDEX
+  x->act_zbin_adj = *(x->mb_activity_ptr);
+#else
+  int64_t a;
+  int64_t b;
+  int64_t act = *(x->mb_activity_ptr);
+
+  // Derive a zbin adjustment from this MB's activity relative to the
+  // frame average.
+  a = act + 4 * cpi->activity_avg;
+  b = 4 * act + cpi->activity_avg;
+
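+  // For act > activity_avg, b > a, so the rounded ratio exceeds one and
+  // busy MBs get a positive zbin boost (and quiet MBs a negative one).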
+  if (act > cpi->activity_avg)
+    x->act_zbin_adj = (int)(((int64_t)b + (a >> 1)) / a) - 1;
+  else
+    x->act_zbin_adj = 1 - (int)(((int64_t)a + (b >> 1)) / b);
+#endif
+}
+
+#if CONFIG_SUPERBLOCKS
+static void update_sb_skip_coeff_state(VP9_COMP *cpi,
+                                       MACROBLOCK *x,
+                                       ENTROPY_CONTEXT_PLANES ta[4],
+                                       ENTROPY_CONTEXT_PLANES tl[4],
+                                       TOKENEXTRA *t[4],
+                                       TOKENEXTRA **tp,
+                                       int skip[4]) {
+  TOKENEXTRA tokens[4][16 * 24];
+  int n_tokens[4], n;
+
+  // if there were no skips, we don't need to do anything
+  if (!skip[0] && !skip[1] && !skip[2] && !skip[3])
+    return;
+
+  // if we don't do coeff skipping for this frame, we don't
+  // need to do anything here
+  if (!cpi->common.mb_no_coeff_skip)
+    return;
+
+  // if all 4 MBs skipped coeff coding, nothing to be done
+  if (skip[0] && skip[1] && skip[2] && skip[3])
+    return;
+
+  // so the situation now is that we want to skip coeffs
+  // for some MBs, but not all, and we didn't code EOB
+  // coefficients for them. However, the skip flag for this
+  // SB will be 0 overall, so we need to insert EOBs in the
+  // middle of the token tree. Do so here.
+  n_tokens[0] = t[1] - t[0];
+  n_tokens[1] = t[2] - t[1];
+  n_tokens[2] = t[3] - t[2];
+  n_tokens[3] = *tp  - t[3];
+  if (n_tokens[0])
+    memcpy(tokens[0], t[0], n_tokens[0] * sizeof(*t[0]));
+  if (n_tokens[1])
+    memcpy(tokens[1], t[1], n_tokens[1] * sizeof(*t[0]));
+  if (n_tokens[2])
+    memcpy(tokens[2], t[2], n_tokens[2] * sizeof(*t[0]));
+  if (n_tokens[3])
+    memcpy(tokens[3], t[3], n_tokens[3] * sizeof(*t[0]));
+
+  // reset pointer, stuff EOBs where necessary
+  *tp = t[0];
+  for (n = 0; n < 4; n++) {
+    if (skip[n]) {
+      x->e_mbd.above_context = &ta[n];
+      x->e_mbd.left_context  = &tl[n];
+      vp9_stuff_mb(cpi, &x->e_mbd, tp, 0);
+    } else {
+      if (n_tokens[n]) {
+        memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
+      }
+      (*tp) += n_tokens[n];
+    }
+  }
+}
+
+void vp9_encode_intra_super_block(VP9_COMP *cpi,
+                                  MACROBLOCK *x,
+                                  TOKENEXTRA **t,
+                                  int mb_col) {
+  const int output_enabled = 1;
+  int n;
+  MACROBLOCKD *xd = &x->e_mbd;
+  VP9_COMMON *cm = &cpi->common;
+  const uint8_t *src = x->src.y_buffer;
+  uint8_t *dst = xd->dst.y_buffer;
+  const uint8_t *usrc = x->src.u_buffer;
+  uint8_t *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer;
+  uint8_t *vdst = xd->dst.v_buffer;
+  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  const VP9_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
+  TOKENEXTRA *tp[4];
+  int skip[4];
+  MODE_INFO *mi = x->e_mbd.mode_info_context;
+  ENTROPY_CONTEXT_PLANES ta[4], tl[4];
+
+  if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) {
+    adjust_act_zbin(cpi, x);
+    vp9_update_zbin_extra(cpi, x);
+  }
+
+  vp9_build_intra_predictors_sby_s(&x->e_mbd);
+  vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
+
+  assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
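+  // Code the SB as four 16x16 MBs in raster order; x_idx selects the MB
+  // column within the SB and y_idx the MB row.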
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    xd->above_context = cm->above_context + mb_col + (n & 1);
+    xd->left_context = cm->left_context + (n >> 1);
+
+    vp9_subtract_mby_s_c(x->src_diff,
+                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
+                         src_y_stride,
+                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+                         dst_y_stride);
+    vp9_subtract_mbuv_s_c(x->src_diff,
+                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          src_uv_stride,
+                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          dst_uv_stride);
+    vp9_transform_mb_8x8(x);
+    vp9_quantize_mb_8x8(x);
+    if (x->optimize) {
+      vp9_optimize_mby_8x8(x, rtcd);
+      vp9_optimize_mbuv_8x8(x, rtcd);
+    }
+    vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    vp9_recon_mby_s_c(&x->e_mbd, dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
+    vp9_recon_mbuv_s_c(&x->e_mbd,
+                       udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                       vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
+
+    if (output_enabled) {
+      memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+      memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+      tp[n] = *t;
+      xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
+      vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
+      skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+    }
+  }
+
+  if (output_enabled) {
+    // Tokenize
+    xd->mode_info_context = mi;
+    sum_intra_stats(cpi, x);
+    update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+  }
+}
+#endif /* CONFIG_SUPERBLOCKS */
+
+void vp9_encode_intra_macro_block(VP9_COMP *cpi,
+                                  MACROBLOCK *x,
+                                  TOKENEXTRA **t,
+                                  int output_enabled) {
+  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+  if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) {
+    adjust_act_zbin(cpi, x);
+    vp9_update_zbin_extra(cpi, x);
+  }
+  if (mbmi->mode == I8X8_PRED) {
+    vp9_encode_intra8x8mby(IF_RTCD(&cpi->rtcd), x);
+    vp9_encode_intra8x8mbuv(IF_RTCD(&cpi->rtcd), x);
+  } else if (mbmi->mode == B_PRED) {
+    vp9_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
+  } else {
+    vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+  }
+
+  if (mbmi->mode != I8X8_PRED) {
+    vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+  }
+
+  if (output_enabled) {
+    int segment_id = mbmi->segment_id;
+
+    // Tokenize
+    sum_intra_stats(cpi, x);
+    vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
+
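+    // The per-MB transform size is only counted when the MB actually has
+    // coded coefficients; for skipped MBs, force the largest size the
+    // frame's transform mode allows.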
+    if (cpi->common.txfm_mode == TX_MODE_SELECT &&
+        !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
+          (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) &&
+           vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
+      if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED) {
+        cpi->txfm_count[mbmi->txfm_size]++;
+      } else if (mbmi->mode == I8X8_PRED) {
+        cpi->txfm_count_8x8p[mbmi->txfm_size]++;
+      }
+    } else if (cpi->common.txfm_mode >= ALLOW_16X16 && mbmi->mode <= TM_PRED) {
+      mbmi->txfm_size = TX_16X16;
+    } else if (cpi->common.txfm_mode >= ALLOW_8X8 && mbmi->mode != B_PRED) {
+      mbmi->txfm_size = TX_8X8;
+    } else {
+      mbmi->txfm_size = TX_4X4;
+    }
+  }
+#if CONFIG_NEWBESTREFMV
+  else
+    vp9_tokenize_mb(cpi, &x->e_mbd, t, 1);
+#endif
+}
+
+extern void vp9_fix_contexts(MACROBLOCKD *xd);
+
+void vp9_encode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
+                                 TOKENEXTRA **t, int recon_yoffset,
+                                 int recon_uvoffset, int output_enabled) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+  unsigned char *segment_id = &mbmi->segment_id;
+  int seg_ref_active;
+  unsigned char ref_pred_flag;
+
+  x->skip = 0;
+#if CONFIG_SUPERBLOCKS
+  assert(!xd->mode_info_context->mbmi.encoded_as_sb);
+#endif
+
+  vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+    // Adjust the zbin based on this MB rate.
+    adjust_act_zbin(cpi, x);
+  }
+
+  {
+    // Experimental code. Special case for gf and arf zeromv modes.
+    // Increase zbin size to suppress noise
+    cpi->zbin_mode_boost = 0;
+    if (cpi->zbin_mode_boost_enabled) {
+      if (mbmi->ref_frame != INTRA_FRAME) {
+        if (mbmi->mode == ZEROMV) {
+          if (mbmi->ref_frame != LAST_FRAME)
+            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+          else
+            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+        } else if (mbmi->mode == SPLITMV)
+          cpi->zbin_mode_boost = 0;
+        else
+          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+      }
+    }
+
+    vp9_update_zbin_extra(cpi, x);
+  }
+
+  seg_ref_active = vp9_segfeature_active(xd, *segment_id, SEG_LVL_REF_FRAME);
+
+  // SET VARIOUS PREDICTION FLAGS
+
+  // Did the chosen reference frame match its predicted value?
+  ref_pred_flag = (mbmi->ref_frame == vp9_get_pred_ref(cm, xd));
+  vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
+
+  if (mbmi->ref_frame == INTRA_FRAME) {
+    if (mbmi->mode == B_PRED) {
+      vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+      vp9_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
+    } else if (mbmi->mode == I8X8_PRED) {
+      vp9_encode_intra8x8mby(IF_RTCD(&cpi->rtcd), x);
+      vp9_encode_intra8x8mbuv(IF_RTCD(&cpi->rtcd), x);
+    } else {
+      vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+      vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+    }
+
+    if (output_enabled)
+      sum_intra_stats(cpi, x);
+  } else {
+    int ref_fb_idx;
+
+    if (mbmi->ref_frame == LAST_FRAME)
+      ref_fb_idx = cpi->common.lst_fb_idx;
+    else if (mbmi->ref_frame == GOLDEN_FRAME)
+      ref_fb_idx = cpi->common.gld_fb_idx;
+    else
+      ref_fb_idx = cpi->common.alt_fb_idx;
+
+    xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+    xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+    xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+    if (mbmi->second_ref_frame) {
+      int second_ref_fb_idx;
+
+      if (mbmi->second_ref_frame == LAST_FRAME)
+        second_ref_fb_idx = cpi->common.lst_fb_idx;
+      else if (mbmi->second_ref_frame == GOLDEN_FRAME)
+        second_ref_fb_idx = cpi->common.gld_fb_idx;
+      else
+        second_ref_fb_idx = cpi->common.alt_fb_idx;
+
+      xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +
+                                recon_yoffset;
+      xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +
+                                recon_uvoffset;
+      xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +
+                                recon_uvoffset;
+    }
+
+    if (!x->skip) {
+      vp9_encode_inter16x16(IF_RTCD(&cpi->rtcd), x);
+
+      // Clear mb_skip_coeff if mb_no_coeff_skip is not set
+      if (!cpi->common.mb_no_coeff_skip)
+        mbmi->mb_skip_coeff = 0;
+
+    } else {
+      vp9_build_1st_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+                                             xd->dst.u_buffer, xd->dst.v_buffer,
+                                             xd->dst.y_stride,
+                                             xd->dst.uv_stride);
+    }
+  }
+
+  if (!x->skip) {
+#ifdef ENC_DEBUG
+    if (enc_debug) {
+      int i;
+      printf("Segment=%d [%d, %d]: %d %d:\n", mbmi->segment_id, mb_col_debug,
+             mb_row_debug, xd->mb_to_left_edge, xd->mb_to_top_edge);
+      for (i = 0; i < 400; i++) {
+        printf("%3d ", xd->qcoeff[i]);
+        if (i % 16 == 15) printf("\n");
+      }
+      printf("\n");
+      printf("eobs = ");
+      for (i = 0; i < 25; i++)
+        printf("%d:%d ", i, xd->block[i].eob);
+      printf("\n");
+      fflush(stdout);
+    }
+#endif
+
+    vp9_tokenize_mb(cpi, xd, t, !output_enabled);
+
+#ifdef ENC_DEBUG
+    if (enc_debug) {
+      printf("Tokenized\n");
+      fflush(stdout);
+    }
+#endif
+  } else {
+    int mb_skip_context = cpi->common.mb_no_coeff_skip ?
+        (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
+            (x->e_mbd.mode_info_context -
+             cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
+        0;
+    if (cpi->common.mb_no_coeff_skip) {
+      mbmi->mb_skip_coeff = 1;
+      if (output_enabled)
+        cpi->skip_true_count[mb_skip_context]++;
+      vp9_fix_contexts(xd);
+    } else {
+      vp9_stuff_mb(cpi, xd, t, !output_enabled);
+      mbmi->mb_skip_coeff = 0;
+      if (output_enabled)
+        cpi->skip_false_count[mb_skip_context]++;
+    }
+  }
+
+  if (output_enabled) {
+    int segment_id = mbmi->segment_id;
+    if (cpi->common.txfm_mode == TX_MODE_SELECT &&
+        !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
+          (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) &&
+           vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
+      if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
+          mbmi->mode != SPLITMV) {
+        cpi->txfm_count[mbmi->txfm_size]++;
+      } else if (mbmi->mode == I8X8_PRED ||
+                 (mbmi->mode == SPLITMV &&
+                  mbmi->partitioning != PARTITIONING_4X4)) {
+        cpi->txfm_count_8x8p[mbmi->txfm_size]++;
+      }
+    } else if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
+        mbmi->mode != SPLITMV && cpi->common.txfm_mode >= ALLOW_16X16) {
+      mbmi->txfm_size = TX_16X16;
+    } else if (mbmi->mode != B_PRED &&
+               !(mbmi->mode == SPLITMV &&
+                 mbmi->partitioning == PARTITIONING_4X4) &&
+               cpi->common.txfm_mode >= ALLOW_8X8) {
+      mbmi->txfm_size = TX_8X8;
+    } else {
+      mbmi->txfm_size = TX_4X4;
+    }
+  }
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_encode_inter_superblock(VP9_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+                                 int recon_yoffset, int recon_uvoffset,
+                                 int mb_col, int mb_row) {
+  const int output_enabled = 1;
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const uint8_t *src = x->src.y_buffer;
+  uint8_t *dst = xd->dst.y_buffer;
+  const uint8_t *usrc = x->src.u_buffer;
+  uint8_t *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer;
+  uint8_t *vdst = xd->dst.v_buffer;
+  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  const VP9_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
+  unsigned int segment_id = xd->mode_info_context->mbmi.segment_id;
+  int seg_ref_active;
+  unsigned char ref_pred_flag;
+  int n;
+  TOKENEXTRA *tp[4];
+  int skip[4];
+  MODE_INFO *mi = x->e_mbd.mode_info_context;
+  ENTROPY_CONTEXT_PLANES ta[4], tl[4];
+
+  x->skip = 0;
+
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+    // Adjust the zbin based on this MB rate.
+    adjust_act_zbin(cpi, x);
+  }
+
+  {
+    // Experimental code. Special case for gf and arf zeromv modes.
+    // Increase zbin size to suppress noise
+    cpi->zbin_mode_boost = 0;
+    if (cpi->zbin_mode_boost_enabled) {
+      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
+        if (xd->mode_info_context->mbmi.mode == ZEROMV) {
+          if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
+            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+          else
+            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+        } else if (xd->mode_info_context->mbmi.mode == SPLITMV)
+          cpi->zbin_mode_boost = 0;
+        else
+          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+      }
+    }
+
+    vp9_update_zbin_extra(cpi, x);
+  }
+
+  seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
+
+  // SET VARIOUS PREDICTION FLAGS
+
+  // Did the chosen reference frame match its predicted value?
+  ref_pred_flag = (xd->mode_info_context->mbmi.ref_frame ==
+                   vp9_get_pred_ref(cm, xd));
+  vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
+
+  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+    vp9_build_intra_predictors_sby_s(&x->e_mbd);
+    vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
+  } else {
+    int ref_fb_idx;
+
+    if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
+      ref_fb_idx = cpi->common.lst_fb_idx;
+    else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
+      ref_fb_idx = cpi->common.gld_fb_idx;
+    else
+      ref_fb_idx = cpi->common.alt_fb_idx;
+
+    xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+    xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+    xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+    if (xd->mode_info_context->mbmi.second_ref_frame) {
+      int second_ref_fb_idx;
+
+      if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
+        second_ref_fb_idx = cpi->common.lst_fb_idx;
+      else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
+        second_ref_fb_idx = cpi->common.gld_fb_idx;
+      else
+        second_ref_fb_idx = cpi->common.alt_fb_idx;
+
+      xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +
+                                    recon_yoffset;
+      xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +
+                                    recon_uvoffset;
+      xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +
+                                    recon_uvoffset;
+    }
+
+    vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+                                       xd->dst.u_buffer, xd->dst.v_buffer,
+                                       xd->dst.y_stride, xd->dst.uv_stride);
+  }
+
+  assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
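+  // Encode the 32x32 superblock as four 16x16 quadrants in raster order;
+  // x_idx/y_idx locate each quadrant within the source and reconstruction
+  // buffers.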
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    vp9_subtract_mby_s_c(x->src_diff,
+                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
+                         src_y_stride,
+                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+                         dst_y_stride);
+    vp9_subtract_mbuv_s_c(x->src_diff,
+                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          src_uv_stride,
+                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          dst_uv_stride);
+    vp9_transform_mb_8x8(x);
+    vp9_quantize_mb_8x8(x);
+    if (x->optimize) {
+      vp9_optimize_mby_8x8(x, rtcd);
+      vp9_optimize_mbuv_8x8(x, rtcd);
+    }
+    vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    vp9_recon_mby_s_c(&x->e_mbd,
+                      dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
+    vp9_recon_mbuv_s_c(&x->e_mbd,
+                       udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                       vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
+
+    if (!x->skip) {
+      if (output_enabled) {
+        xd->left_context = cm->left_context + (n >> 1);
+        xd->above_context = cm->above_context + mb_col + (n & 1);
+        memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+        memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+        tp[n] = *t;
+        xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
+        vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
+        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+      }
+    } else {
+      int mb_skip_context =
+        cpi->common.mb_no_coeff_skip ?
+          (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
+            (x->e_mbd.mode_info_context -
+             cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
+          0;
+      if (cpi->common.mb_no_coeff_skip) {
+        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+        xd->left_context = cm->left_context + (n >> 1);
+        xd->above_context = cm->above_context + mb_col + (n & 1);
+        memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+        memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+        tp[n] = *t;
+        cpi->skip_true_count[mb_skip_context]++;
+        vp9_fix_contexts(xd);
+      } else {
+        vp9_stuff_mb(cpi, xd, t, 0);
+        xd->mode_info_context->mbmi.mb_skip_coeff = 0;
+        cpi->skip_false_count[mb_skip_context]++;
+      }
+    }
+  }
+
+  xd->mode_info_context = mi;
+  update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+}
+#endif
--- /dev/null
+++ b/vp9/encoder/encodeintra.c
@@ -1,0 +1,289 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "vpx_rtcd.h"
+#include "vp9/common/idct.h"
+#include "quantize.h"
+#include "vp9/common/reconintra.h"
+#include "vp9/common/reconintra4x4.h"
+#include "encodemb.h"
+#include "vp9/common/invtrans.h"
+#include "encodeintra.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
+  int i;
+  int intra_pred_var = 0;
+  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+  (void) cpi;
+
+  if (use_16x16_pred) {
+    mbmi->mode = DC_PRED;
+#if CONFIG_COMP_INTRA_PRED
+    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+    mbmi->uv_mode = DC_PRED;
+    mbmi->ref_frame = INTRA_FRAME;
+
+    vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+  } else {
+    for (i = 0; i < 16; i++) {
+      x->e_mbd.block[i].bmi.as_mode.first = B_DC_PRED;
+      vp9_encode_intra4x4block(IF_RTCD(&cpi->rtcd), x, i);
+    }
+  }
+
+  intra_pred_var = vp9_get_mb_ss(x->src_diff);
+
+  return intra_pred_var;
+}
+
+void vp9_encode_intra4x4block(const VP9_ENCODER_RTCD *rtcd,
+                              MACROBLOCK *x, int ib) {
+  BLOCKD *b = &x->e_mbd.block[ib];
+  BLOCK *be = &x->block[ib];
+  TX_TYPE tx_type;
+
+#if CONFIG_COMP_INTRA_PRED
+  if (b->bmi.as_mode.second == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
+#endif
+    vp9_intra4x4_predict(b, b->bmi.as_mode.first, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+  } else {
+    vp9_comp_intra4x4_predict(b, b->bmi.as_mode.first, b->bmi.as_mode.second,
+                              b->predictor);
+  }
+#endif
+
+  vp9_subtract_b(be, b, 16);
+
+  tx_type = get_tx_type(&x->e_mbd, b);
+  if (tx_type != DCT_DCT) {
+    vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
+    vp9_ht_quantize_b_4x4(be, b, tx_type);
+    vp9_ihtllm_c(b->dqcoeff, b->diff, 32, tx_type, 4);
+  } else {
+    x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+    x->quantize_b_4x4(be, b);
+    vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 32);
+  }
+
+  vp9_recon_b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+}
+
+void vp9_encode_intra4x4mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *mb) {
+  int i;
+
+  for (i = 0; i < 16; i++)
+    vp9_encode_intra4x4block(rtcd, mb, i);
+}
+
+void vp9_encode_intra16x16mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  BLOCK *b = &x->block[0];
+  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+  TX_TYPE tx_type;
+
+#if CONFIG_COMP_INTRA_PRED
+  if (xd->mode_info_context->mbmi.second_mode ==
+      (MB_PREDICTION_MODE)(DC_PRED - 1))
+#endif
+    vp9_build_intra_predictors_mby(xd);
+#if CONFIG_COMP_INTRA_PRED
+  else
+    vp9_build_comp_intra_predictors_mby(xd);
+#endif
+
+  vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
+
+  if (tx_size == TX_16X16) {
+    BLOCKD  *bd = &xd->block[0];
+    tx_type = get_tx_type(xd, bd);
+    if (tx_type != DCT_DCT) {
+      vp9_fht(b->src_diff, 32, b->coeff, tx_type, 16);
+      vp9_quantize_mby_16x16(x);
+      if (x->optimize)
+        vp9_optimize_mby_16x16(x, rtcd);
+      vp9_ihtllm_c(bd->dqcoeff, bd->diff, 32, tx_type, 16);
+    } else {
+      vp9_transform_mby_16x16(x);
+      vp9_quantize_mby_16x16(x);
+      if (x->optimize)
+        vp9_optimize_mby_16x16(x, rtcd);
+      vp9_inverse_transform_mby_16x16(IF_RTCD(&rtcd->common->idct), xd);
+    }
+  } else if (tx_size == TX_8X8) {
+    vp9_transform_mby_8x8(x);
+    vp9_quantize_mby_8x8(x);
+    if (x->optimize)
+      vp9_optimize_mby_8x8(x, rtcd);
+    vp9_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), xd);
+  } else {
+    vp9_transform_mby_4x4(x);
+    vp9_quantize_mby_4x4(x);
+    if (x->optimize)
+      vp9_optimize_mby_4x4(x, rtcd);
+    vp9_inverse_transform_mby_4x4(IF_RTCD(&rtcd->common->idct), xd);
+  }
+
+  vp9_recon_mby(xd);
+}
+
+void vp9_encode_intra16x16mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+
+#if CONFIG_COMP_INTRA_PRED
+  if (xd->mode_info_context->mbmi.second_uv_mode ==
+      (MB_PREDICTION_MODE)(DC_PRED - 1)) {
+#endif
+    vp9_build_intra_predictors_mbuv(xd);
+#if CONFIG_COMP_INTRA_PRED
+  } else {
+    vp9_build_comp_intra_predictors_mbuv(xd);
+  }
+#endif
+
+  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                    xd->predictor, x->src.uv_stride);
+
+  if (tx_size == TX_4X4) {
+    vp9_transform_mbuv_4x4(x);
+    vp9_quantize_mbuv_4x4(x);
+    if (x->optimize)
+      vp9_optimize_mbuv_4x4(x, rtcd);
+    vp9_inverse_transform_mbuv_4x4(IF_RTCD(&rtcd->common->idct), xd);
+  } else /* 16x16 or 8x8 */ {
+    vp9_transform_mbuv_8x8(x);
+    vp9_quantize_mbuv_8x8(x);
+    if (x->optimize)
+      vp9_optimize_mbuv_8x8(x, rtcd);
+    vp9_inverse_transform_mbuv_8x8(IF_RTCD(&rtcd->common->idct), xd);
+  }
+
+  vp9_recon_intra_mbuv(xd);
+}
+
+void vp9_encode_intra8x8(const VP9_ENCODER_RTCD *rtcd,
+                         MACROBLOCK *x, int ib) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  BLOCKD *b = &xd->block[ib];
+  BLOCK *be = &x->block[ib];
+  const int iblock[4] = {0, 1, 4, 5};
+  int i;
+  TX_TYPE tx_type;
+
+#if CONFIG_COMP_INTRA_PRED
+  if (b->bmi.as_mode.second == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
+#endif
+    vp9_intra8x8_predict(b, b->bmi.as_mode.first, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+  } else {
+    vp9_comp_intra8x8_predict(b, b->bmi.as_mode.first, b->bmi.as_mode.second,
+                              b->predictor);
+  }
+#endif
+
+  if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
+    int idx = (ib & 0x02) ? (ib + 2) : ib;
+
+    // generate residual blocks
+    vp9_subtract_4b_c(be, b, 16);
+
+    tx_type = get_tx_type(xd, xd->block + idx);
+    if (tx_type != DCT_DCT) {
+      vp9_fht(be->src_diff, 32, (x->block + idx)->coeff,
+                tx_type, 8);
+      x->quantize_b_8x8(x->block + idx, xd->block + idx);
+      vp9_ihtllm_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,
+                   tx_type, 8);
+    } else {
+      x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
+      x->quantize_b_8x8(x->block + idx, xd->block + idx);
+      vp9_idct_idct8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
+    }
+  } else {
+    for (i = 0; i < 4; i++) {
+      b = &xd->block[ib + iblock[i]];
+      be = &x->block[ib + iblock[i]];
+      vp9_subtract_b(be, b, 16);
+      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+      x->quantize_b_4x4(be, b);
+      vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 32);
+    }
+  }
+
+  // reconstruct submacroblock
+  for (i = 0; i < 4; i++) {
+    b = &xd->block[ib + iblock[i]];
+    vp9_recon_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
+                  b->dst_stride);
+  }
+}
+
+void vp9_encode_intra8x8mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+  int i, ib;
+
+  for (i = 0; i < 4; i++) {
+    ib = vp9_i8x8_block[i];
+    vp9_encode_intra8x8(rtcd, x, ib);
+  }
+}
+
+void vp9_encode_intra_uv4x4(const VP9_ENCODER_RTCD *rtcd,
+                            MACROBLOCK *x, int ib,
+                            int mode, int second) {
+  BLOCKD *b = &x->e_mbd.block[ib];
+  BLOCK *be = &x->block[ib];
+
+#if CONFIG_COMP_INTRA_PRED
+  if (second == -1) {
+#endif
+    vp9_intra_uv4x4_predict(b, mode, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+  } else {
+    vp9_comp_intra_uv4x4_predict(b, mode, second, b->predictor);
+  }
+#endif
+
+  vp9_subtract_b(be, b, 8);
+
+  x->vp9_short_fdct4x4(be->src_diff, be->coeff, 16);
+  x->quantize_b_4x4(be, b);
+  vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 16);
+
+  vp9_recon_uv_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
+                   b->dst_stride);
+}
+
+void vp9_encode_intra8x8mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+  int i, ib, mode, second;
+  BLOCKD *b;
+
+  for (i = 0; i < 4; i++) {
+    ib = vp9_i8x8_block[i];
+    b = &x->e_mbd.block[ib];
+    mode = b->bmi.as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+    second = b->bmi.as_mode.second;
+#else
+    second = -1;
+#endif
+    /* u */
+    vp9_encode_intra_uv4x4(rtcd, x, i + 16, mode, second);
+    /* v */
+    vp9_encode_intra_uv4x4(rtcd, x, i + 20, mode, second);
+  }
+}
--- /dev/null
+++ b/vp9/encoder/encodeintra.h
@@ -1,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __ENCODEINTRA_H_
+#define __ENCODEINTRA_H_
+
+#include "onyx_int.h"
+
+int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
+void vp9_encode_intra16x16mby(const VP9_ENCODER_RTCD *, MACROBLOCK *x);
+void vp9_encode_intra16x16mbuv(const VP9_ENCODER_RTCD *, MACROBLOCK *x);
+void vp9_encode_intra4x4mby(const VP9_ENCODER_RTCD *, MACROBLOCK *mb);
+void vp9_encode_intra4x4block(const VP9_ENCODER_RTCD *rtcd,
+                              MACROBLOCK *x, int ib);
+void vp9_encode_intra8x8mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+void vp9_encode_intra8x8mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+void vp9_encode_intra8x8(const VP9_ENCODER_RTCD *rtcd,
+                         MACROBLOCK *x, int ib);
+
+#endif  // __ENCODEINTRA_H_
--- /dev/null
+++ b/vp9/encoder/encodemb.c
@@ -1,0 +1,950 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "encodemb.h"
+#include "vp9/common/reconinter.h"
+#include "quantize.h"
+#include "tokenize.h"
+#include "vp9/common/invtrans.h"
+#include "vp9/common/reconintra.h"
+#include "vpx_mem/vpx_mem.h"
+#include "rdopt.h"
+#include "vp9/common/systemdependent.h"
+#include "vpx_rtcd.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) {
+  unsigned char *src_ptr = (*(be->base_src) + be->src);
+  short *diff_ptr = be->src_diff;
+  unsigned char *pred_ptr = bd->predictor;
+  int src_stride = be->src_stride;
+
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      diff_ptr[c] = src_ptr[c] - pred_ptr[c];
+    }
+
+    diff_ptr += pitch;
+    pred_ptr += pitch;
+    src_ptr  += src_stride;
+  }
+}
+
+void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) {
+  unsigned char *src_ptr = (*(be->base_src) + be->src);
+  short *diff_ptr = be->src_diff;
+  unsigned char *pred_ptr = bd->predictor;
+  int src_stride = be->src_stride;
+  int r, c;
+
+  for (r = 0; r < 8; r++) {
+    for (c = 0; c < 8; c++) {
+      diff_ptr[c] = src_ptr[c] - pred_ptr[c];
+    }
+    diff_ptr += pitch;
+    pred_ptr += pitch;
+    src_ptr  += src_stride;
+  }
+}
+
+void vp9_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
+                           const unsigned char *vsrc, int src_stride,
+                           const unsigned char *upred,
+                           const unsigned char *vpred, int dst_stride) {
+  short *udiff = diff + 256;
+  short *vdiff = diff + 320;
+  int r, c;
+
+  for (r = 0; r < 8; r++) {
+    for (c = 0; c < 8; c++) {
+      udiff[c] = usrc[c] - upred[c];
+    }
+
+    udiff += 8;
+    upred += dst_stride;
+    usrc  += src_stride;
+  }
+
+  for (r = 0; r < 8; r++) {
+    for (c = 0; c < 8; c++) {
+      vdiff[c] = vsrc[c] - vpred[c];
+    }
+
+    vdiff += 8;
+    vpred += dst_stride;
+    vsrc  += src_stride;
+  }
+}
+
+void vp9_subtract_mbuv_c(short *diff, unsigned char *usrc,
+                         unsigned char *vsrc, unsigned char *pred, int stride) {
+  unsigned char *upred = pred + 256;
+  unsigned char *vpred = pred + 320;
+
+  vp9_subtract_mbuv_s_c(diff, usrc, vsrc, stride, upred, vpred, 8);
+}
+
+void vp9_subtract_mby_s_c(short *diff, const unsigned char *src, int src_stride,
+                          const unsigned char *pred, int dst_stride) {
+  int r, c;
+
+  for (r = 0; r < 16; r++) {
+    for (c = 0; c < 16; c++) {
+      diff[c] = src[c] - pred[c];
+    }
+
+    diff += 16;
+    pred += dst_stride;
+    src  += src_stride;
+  }
+}
+
+void vp9_subtract_mby_c(short *diff, unsigned char *src,
+                        unsigned char *pred, int stride) {
+  vp9_subtract_mby_s_c(diff, src, stride, pred, 16);
+}
+
+static void subtract_mb(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+  BLOCK *b = &x->block[0];
+
+  vp9_subtract_mby(x->src_diff, *(b->base_src), x->e_mbd.predictor,
+                   b->src_stride);
+  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                    x->e_mbd.predictor, x->src.uv_stride);
+}
+
+static void build_dcblock_4x4(MACROBLOCK *x) {
+  short *src_diff_ptr = &x->src_diff[384];
+  int i;
+
+  for (i = 0; i < 16; i++) {
+    src_diff_ptr[i] = x->coeff[i * 16];
+  }
+}
+
+void vp9_transform_mby_4x4(MACROBLOCK *x) {
+  int i;
+
+  for (i = 0; i < 16; i += 2) {
+    x->vp9_short_fdct8x4(&x->block[i].src_diff[0],
+                         &x->block[i].coeff[0], 32);
+  }
+
+  if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
+    // build dc block from 16 y dc values
+    build_dcblock_4x4(x);
+
+    // do 2nd order transform on the dc block
+    x->short_walsh4x4(&x->block[24].src_diff[0],
+                      &x->block[24].coeff[0], 8);
+  }
+}
+
+void vp9_transform_mbuv_4x4(MACROBLOCK *x) {
+  int i;
+
+  for (i = 16; i < 24; i += 2) {
+    x->vp9_short_fdct8x4(&x->block[i].src_diff[0],
+                         &x->block[i].coeff[0], 16);
+  }
+}
+
+static void transform_mb_4x4(MACROBLOCK *x) {
+  vp9_transform_mby_4x4(x);
+  vp9_transform_mbuv_4x4(x);
+}
+
+static void build_dcblock_8x8(MACROBLOCK *x) {
+  int16_t *src_diff_ptr = x->block[24].src_diff;
+  int i;
+
+  for (i = 0; i < 16; i++) {
+    src_diff_ptr[i] = 0;
+  }
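+  // Scatter the four 8x8 DC terms (coefficient blocks 0, 4, 8, 12) to the
+  // input positions read by the 2x2 second order Haar transform.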
+  src_diff_ptr[0] = x->coeff[0 * 16];
+  src_diff_ptr[1] = x->coeff[4 * 16];
+  src_diff_ptr[4] = x->coeff[8 * 16];
+  src_diff_ptr[8] = x->coeff[12 * 16];
+}
+
+void vp9_transform_mby_8x8(MACROBLOCK *x) {
+  int i;
+
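+  // The four luma 8x8 transforms read from blocks 0, 2, 8 and 10 (the
+  // top-left 4x4 of each 8x8) and each write 64 coefficients, landing at
+  // coefficient-block offsets 0, 4, 8 and 12.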
+  for (i = 0; i < 9; i += 8) {
+    x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
+                         &x->block[i].coeff[0], 32);
+  }
+  for (i = 2; i < 11; i += 8) {
+    x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
+                         &x->block[i + 2].coeff[0], 32);
+  }
+
+  if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
+    // build dc block from 2x2 y dc values
+    build_dcblock_8x8(x);
+
+    // do 2nd order transform on the dc block
+    x->short_fhaar2x2(&x->block[24].src_diff[0],
+                      &x->block[24].coeff[0], 8);
+  }
+}
+
+void vp9_transform_mbuv_8x8(MACROBLOCK *x) {
+  int i;
+
+  for (i = 16; i < 24; i += 4) {
+    x->vp9_short_fdct8x8(&x->block[i].src_diff[0],
+                         &x->block[i].coeff[0], 16);
+  }
+}
+
+void vp9_transform_mb_8x8(MACROBLOCK *x) {
+  vp9_transform_mby_8x8(x);
+  vp9_transform_mbuv_8x8(x);
+}
+
+void vp9_transform_mby_16x16(MACROBLOCK *x) {
+  vp9_clear_system_state();
+  x->vp9_short_fdct16x16(&x->block[0].src_diff[0],
+                         &x->block[0].coeff[0], 32);
+}
+
+void vp9_transform_mb_16x16(MACROBLOCK *x) {
+  vp9_transform_mby_16x16(x);
+  vp9_transform_mbuv_8x8(x);
+}
+
+#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
+#define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
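+// Deterministic tie-breaker applied when two RD costs compare equal; only
+// the rate term affects the result (the distortion arguments are unused).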
+typedef struct vp9_token_state vp9_token_state;
+
+struct vp9_token_state {
+  int           rate;
+  int           error;
+  int           next;
+  signed char   token;
+  short         qc;
+};
+
+// TODO: run experiments to find the optimal multiplier values
+#define Y1_RD_MULT 4
+#define UV_RD_MULT 2
+#define Y2_RD_MULT 4
+
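+// Per-plane RD error weights, indexed by PLANE_TYPE
+// (Y without DC, Y2, UV, Y with DC).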
+static const int plane_rd_mult[4] = {
+  Y1_RD_MULT,
+  Y2_RD_MULT,
+  UV_RD_MULT,
+  Y1_RD_MULT
+};
+
+#define UPDATE_RD_COST()\
+{\
+  rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
+  rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
+  if (rd_cost0 == rd_cost1) {\
+    rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
+    rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
+  }\
+}
+
+static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
+                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                       const VP9_ENCODER_RTCD *rtcd, int tx_size) {
+  BLOCK *b;
+  BLOCKD *d;
+  vp9_token_state tokens[65][2];
+  uint64_t best_mask[2];
+  const short *dequant_ptr;
+  const short *coeff_ptr;
+  short *qcoeff_ptr;
+  short *dqcoeff_ptr;
+  int eob;
+  int i0;
+  int rc;
+  int x;
+  int sz = 0;
+  int next;
+  int rdmult;
+  int rddiv;
+  int final_eob;
+  int64_t rd_cost0, rd_cost1;
+  int rate0, rate1;
+  int error0, error1;
+  int t0, t1;
+  int best;
+  int band;
+  int pt;
+  int err_mult = plane_rd_mult[type];
+  int default_eob;
+  int const *scan, *bands;
+
+  b = &mb->block[i];
+  d = &mb->e_mbd.block[i];
+  switch (tx_size) {
+    default:
+    case TX_4X4:
+      scan = vp9_default_zig_zag1d;
+      bands = vp9_coef_bands;
+      default_eob = 16;
+      // TODO: this isn't called (for intra4x4 modes), but will be left in
+      // since it could be used later
+      {
+        TX_TYPE tx_type = get_tx_type(&mb->e_mbd, d);
+        if (tx_type != DCT_DCT) {
+          switch (tx_type) {
+            case ADST_DCT:
+              scan = vp9_row_scan;
+              break;
+
+            case DCT_ADST:
+              scan = vp9_col_scan;
+              break;
+
+            default:
+              scan = vp9_default_zig_zag1d;
+              break;
+          }
+        } else {
+          scan = vp9_default_zig_zag1d;
+        }
+      }
+      break;
+    case TX_8X8:
+      scan = vp9_default_zig_zag1d_8x8;
+      bands = vp9_coef_bands_8x8;
+      default_eob = 64;
+      break;
+  }
+
+  dequant_ptr = d->dequant;
+  coeff_ptr = b->coeff;
+  qcoeff_ptr = d->qcoeff;
+  dqcoeff_ptr = d->dqcoeff;
+  i0 = (type == PLANE_TYPE_Y_NO_DC);
+  eob = d->eob;
+
+  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
+  rdmult = mb->rdmult * err_mult;
+  if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    rdmult = (rdmult * 9) >> 4;
+  rddiv = mb->rddiv;
+  best_mask[0] = best_mask[1] = 0;
+  /* Initialize the sentinel node of the trellis. */
+  tokens[eob][0].rate = 0;
+  tokens[eob][0].error = 0;
+  tokens[eob][0].next = default_eob;
+  tokens[eob][0].token = DCT_EOB_TOKEN;
+  tokens[eob][0].qc = 0;
+  *(tokens[eob] + 1) = *(tokens[eob] + 0);
+  next = eob;
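+  /* Each trellis node holds two states per coefficient position: state 0
+   * keeps the quantized value as-is, state 1 evaluates the value pulled one
+   * step towards zero.
+   */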
+  for (i = eob; i-- > i0;) {
+    int base_bits;
+    int d2;
+    int dx;
+
+    rc = scan[i];
+    x = qcoeff_ptr[rc];
+    /* Only add a trellis state for non-zero coefficients. */
+    if (x) {
+      int shortcut = 0;
+      error0 = tokens[next][0].error;
+      error1 = tokens[next][1].error;
+      /* Evaluate the first possibility for this state. */
+      rate0 = tokens[next][0].rate;
+      rate1 = tokens[next][1].rate;
+      t0 = (vp9_dct_value_tokens_ptr + x)->Token;
+      /* Consider both possible successor states. */
+      if (next < default_eob) {
+        band = bands[i + 1];
+        pt = vp9_prev_token_class[t0];
+        rate0 +=
+          mb->token_costs[tx_size][type][band][pt][tokens[next][0].token];
+        rate1 +=
+          mb->token_costs[tx_size][type][band][pt][tokens[next][1].token];
+      }
+      UPDATE_RD_COST();
+      /* And pick the best. */
+      best = rd_cost1 < rd_cost0;
+      base_bits = *(vp9_dct_value_cost_ptr + x);
+      dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
+      d2 = dx * dx;
+      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
+      tokens[i][0].error = d2 + (best ? error1 : error0);
+      tokens[i][0].next = next;
+      tokens[i][0].token = t0;
+      tokens[i][0].qc = x;
+      best_mask[0] |= best << i;
+      /* Evaluate the second possibility for this state. */
+      rate0 = tokens[next][0].rate;
+      rate1 = tokens[next][1].rate;
+
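+      /* A one-step reduction is only worth evaluating when the dequantized
+       * magnitude overshoots the original coefficient by less than one
+       * quantizer step; the distortion change can then be updated
+       * incrementally below.
+       */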
+      if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff_ptr[rc])) &&
+          (abs(x) * dequant_ptr[rc != 0] <
+           abs(coeff_ptr[rc]) + dequant_ptr[rc != 0]))
+        shortcut = 1;
+      else
+        shortcut = 0;
+
+      if (shortcut) {
+        sz = -(x < 0);
+        x -= 2 * sz + 1;
+      }
+
+      /* Consider both possible successor states. */
+      if (!x) {
+        /* If we reduced this coefficient to zero, check to see if
+         *  we need to move the EOB back here.
+         */
+        t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
+             DCT_EOB_TOKEN : ZERO_TOKEN;
+        t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
+             DCT_EOB_TOKEN : ZERO_TOKEN;
+      } else {
+        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;
+      }
+      if (next < default_eob) {
+        band = bands[i + 1];
+        if (t0 != DCT_EOB_TOKEN) {
+          pt = vp9_prev_token_class[t0];
+          rate0 += mb->token_costs[tx_size][type][band][pt][
+              tokens[next][0].token];
+        }
+        if (t1 != DCT_EOB_TOKEN) {
+          pt = vp9_prev_token_class[t1];
+          rate1 += mb->token_costs[tx_size][type][band][pt][
+              tokens[next][1].token];
+        }
+      }
+
+      UPDATE_RD_COST();
+      /* And pick the best. */
+      best = rd_cost1 < rd_cost0;
+      base_bits = *(vp9_dct_value_cost_ptr + x);
+
+      if (shortcut) {
+        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+        d2 = dx * dx;
+      }
+      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
+      tokens[i][1].error = d2 + (best ? error1 : error0);
+      tokens[i][1].next = next;
+      tokens[i][1].token = best ? t1 : t0;
+      tokens[i][1].qc = x;
+      best_mask[1] |= best << i;
+      /* Finally, make this the new head of the trellis. */
+      next = i;
+    }
+    /* There's no choice to make for a zero coefficient, so we don't
+     *  add a new trellis node, but we do need to update the costs.
+     */
+    else {
+      band = bands[i + 1];
+      t0 = tokens[next][0].token;
+      t1 = tokens[next][1].token;
+      /* Update the cost of each path if we're past the EOB token. */
+      if (t0 != DCT_EOB_TOKEN) {
+        tokens[next][0].rate += mb->token_costs[tx_size][type][band][0][t0];
+        tokens[next][0].token = ZERO_TOKEN;
+      }
+      if (t1 != DCT_EOB_TOKEN) {
+        tokens[next][1].rate += mb->token_costs[tx_size][type][band][0][t1];
+        tokens[next][1].token = ZERO_TOKEN;
+      }
+      /* Don't update next, because we didn't add a new node. */
+    }
+  }
+
+  /* Now pick the best path through the whole trellis. */
+  band = bands[i + 1];
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  rate0 = tokens[next][0].rate;
+  rate1 = tokens[next][1].rate;
+  error0 = tokens[next][0].error;
+  error1 = tokens[next][1].error;
+  t0 = tokens[next][0].token;
+  t1 = tokens[next][1].token;
+  rate0 += mb->token_costs[tx_size][type][band][pt][t0];
+  rate1 += mb->token_costs[tx_size][type][band][pt][t1];
+  UPDATE_RD_COST();
+  best = rd_cost1 < rd_cost0;
+  final_eob = i0 - 1;
+  for (i = next; i < eob; i = next) {
+    x = tokens[i][best].qc;
+    if (x)
+      final_eob = i;
+    rc = scan[i];
+    qcoeff_ptr[rc] = x;
+    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]);
+
+    next = tokens[i][best].next;
+    best = (best_mask[best] >> i) & 1;
+  }
+  final_eob++;
+
+  d->eob = final_eob;
+  *a = *l = (d->eob != !type);
+}
+
+/**************************************************************************
+Our inverse Hadamard transform is effectively a weighted sum of all 16
+inputs, with each weight either +1 or -1, and a final scaling stage of
+(sum + 1) >> 2. The DC-only idct is (dc + 16) >> 5. So if every sum lies
+between -65 and 63, the output after the inverse WHT and idct will be all
+zero. A sum of absolute values smaller than 65 guarantees that all 16
+(+1/-1)-weighted sums in the WHT fall between -65 and +65.
+**************************************************************************/
+#define SUM_2ND_COEFF_THRESH 65
+
+static void check_reset_2nd_coeffs(MACROBLOCKD *xd,
+                                   ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
+  int sum = 0;
+  int i;
+  BLOCKD *bd = &xd->block[24];
+  if (bd->dequant[0] >= SUM_2ND_COEFF_THRESH
+      && bd->dequant[1] >= SUM_2ND_COEFF_THRESH)
+    return;
+
+  for (i = 0; i < bd->eob; i++) {
+    int coef = bd->dqcoeff[vp9_default_zig_zag1d[i]];
+    sum += (coef >= 0) ? coef : -coef;
+    if (sum >= SUM_2ND_COEFF_THRESH)
+      return;
+  }
+
+  if (sum < SUM_2ND_COEFF_THRESH) {
+    for (i = 0; i < bd->eob; i++) {
+      int rc = vp9_default_zig_zag1d[i];
+      bd->qcoeff[rc] = 0;
+      bd->dqcoeff[rc] = 0;
+    }
+    bd->eob = 0;
+    *a = *l = (bd->eob != 0);
+  }
+}
+
+#define SUM_2ND_COEFF_THRESH_8X8 32
+static void check_reset_8x8_2nd_coeffs(MACROBLOCKD *xd,
+                                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
+  int sum = 0;
+  BLOCKD *bd = &xd->block[24];
+  int coef;
+
+  coef = bd->dqcoeff[0];
+  sum += (coef >= 0) ? coef : -coef;
+  coef = bd->dqcoeff[1];
+  sum += (coef >= 0) ? coef : -coef;
+  coef = bd->dqcoeff[4];
+  sum += (coef >= 0) ? coef : -coef;
+  coef = bd->dqcoeff[8];
+  sum += (coef >= 0) ? coef : -coef;
+
+  if (sum < SUM_2ND_COEFF_THRESH_8X8) {
+    bd->qcoeff[0] = 0;
+    bd->dqcoeff[0] = 0;
+    bd->qcoeff[1] = 0;
+    bd->dqcoeff[1] = 0;
+    bd->qcoeff[4] = 0;
+    bd->dqcoeff[4] = 0;
+    bd->qcoeff[8] = 0;
+    bd->dqcoeff[8] = 0;
+    bd->eob = 0;
+    *a = *l = (bd->eob != 0);
+  }
+}
+
+void vp9_optimize_mby_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  int b;
+  PLANE_TYPE type;
+  int has_2nd_order;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta;
+  ENTROPY_CONTEXT *tl;
+  MB_PREDICTION_MODE mode = x->e_mbd.mode_info_context->mbmi.mode;
+
+  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+    return;
+
+  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+
+  has_2nd_order = (mode != B_PRED && mode != I8X8_PRED && mode != SPLITMV);
+  type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
+
+  for (b = 0; b < 16; b++) {
+    optimize_b(x, b, type,
+               ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
+  }
+
+  if (has_2nd_order) {
+    b = 24;
+    optimize_b(x, b, PLANE_TYPE_Y2,
+               ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
+    check_reset_2nd_coeffs(&x->e_mbd,
+                           ta + vp9_block2above[b], tl + vp9_block2left[b]);
+  }
+}
+
+void vp9_optimize_mbuv_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  int b;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta;
+  ENTROPY_CONTEXT *tl;
+
+  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+    return;
+
+  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+
+  for (b = 16; b < 24; b++) {
+    optimize_b(x, b, PLANE_TYPE_UV,
+               ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
+  }
+}
+
+static void optimize_mb_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  vp9_optimize_mby_4x4(x, rtcd);
+  vp9_optimize_mbuv_4x4(x, rtcd);
+}
+
+void vp9_optimize_mby_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  int b;
+  PLANE_TYPE type;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta;
+  ENTROPY_CONTEXT *tl;
+  int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
+
+  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+    return;
+
+  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+  type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
+  for (b = 0; b < 16; b += 4) {
+    optimize_b(x, b, type,
+               ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
+               rtcd, TX_8X8);
+    ta[vp9_block2above_8x8[b] + 1] = ta[vp9_block2above_8x8[b]];
+    tl[vp9_block2left_8x8[b] + 1]  = tl[vp9_block2left_8x8[b]];
+  }
+
+  // 8x8 transforms always have a 2nd order Haar block.
+  if (has_2nd_order) {
+    check_reset_8x8_2nd_coeffs(&x->e_mbd,
+                               ta + vp9_block2above_8x8[24],
+                               tl + vp9_block2left_8x8[24]);
+  }
+}
+
+void vp9_optimize_mbuv_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  int b;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta;
+  ENTROPY_CONTEXT *tl;
+
+  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+    return;
+
+  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+
+  for (b = 16; b < 24; b += 4) {
+    optimize_b(x, b, PLANE_TYPE_UV,
+               ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
+               rtcd, TX_8X8);
+    ta[vp9_block2above_8x8[b] + 1] = ta[vp9_block2above_8x8[b]];
+    tl[vp9_block2left_8x8[b] + 1]  = tl[vp9_block2left_8x8[b]];
+  }
+}
+
+static void optimize_mb_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  vp9_optimize_mby_8x8(x, rtcd);
+  vp9_optimize_mbuv_8x8(x, rtcd);
+}
+
+static void optimize_b_16x16(MACROBLOCK *mb, int i, PLANE_TYPE type,
+                             ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                             const VP9_ENCODER_RTCD *rtcd) {
+  BLOCK *b = &mb->block[i];
+  BLOCKD *d = &mb->e_mbd.block[i];
+  vp9_token_state tokens[257][2];
+  unsigned best_index[257][2];
+  const short *dequant_ptr = d->dequant, *coeff_ptr = b->coeff;
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  int eob = d->eob, final_eob, sz = 0;
+  int rc, x, next;
+  int64_t rdmult, rddiv, rd_cost0, rd_cost1;
+  int rate0, rate1, error0, error1, t0, t1;
+  int best, band, pt;
+  int err_mult = plane_rd_mult[type];
+
+  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
+  rdmult = mb->rdmult * err_mult;
+  if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    rdmult = (rdmult * 9) >> 4;
+  rddiv = mb->rddiv;
+  memset(best_index, 0, sizeof(best_index));
+  /* Initialize the sentinel node of the trellis. */
+  tokens[eob][0].rate = 0;
+  tokens[eob][0].error = 0;
+  tokens[eob][0].next = 256;
+  tokens[eob][0].token = DCT_EOB_TOKEN;
+  tokens[eob][0].qc = 0;
+  *(tokens[eob] + 1) = *(tokens[eob] + 0);
+  next = eob;
+  for (i = eob; i-- > 0;) {
+    int base_bits, d2, dx;
+
+    rc = vp9_default_zig_zag1d_16x16[i];
+    x = qcoeff_ptr[rc];
+    /* Only add a trellis state for non-zero coefficients. */
+    if (x) {
+      int shortcut = 0;
+      error0 = tokens[next][0].error;
+      error1 = tokens[next][1].error;
+      /* Evaluate the first possibility for this state. */
+      rate0 = tokens[next][0].rate;
+      rate1 = tokens[next][1].rate;
+      t0 = (vp9_dct_value_tokens_ptr + x)->Token;
+      /* Consider both possible successor states. */
+      if (next < 256) {
+        band = vp9_coef_bands_16x16[i + 1];
+        pt = vp9_prev_token_class[t0];
+        rate0 += mb->token_costs[TX_16X16][type][band][pt]
+            [tokens[next][0].token];
+        rate1 += mb->token_costs[TX_16X16][type][band][pt]
+            [tokens[next][1].token];
+      }
+      UPDATE_RD_COST();
+      /* And pick the best. */
+      best = rd_cost1 < rd_cost0;
+      base_bits = *(vp9_dct_value_cost_ptr + x);
+      dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
+      d2 = dx * dx;
+      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
+      tokens[i][0].error = d2 + (best ? error1 : error0);
+      tokens[i][0].next = next;
+      tokens[i][0].token = t0;
+      tokens[i][0].qc = x;
+      best_index[i][0] = best;
+      /* Evaluate the second possibility for this state. */
+      rate0 = tokens[next][0].rate;
+      rate1 = tokens[next][1].rate;
+
+      if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff_ptr[rc])) &&
+          (abs(x) * dequant_ptr[rc != 0] <
+           abs(coeff_ptr[rc]) + dequant_ptr[rc != 0]))
+        shortcut = 1;
+      else
+        shortcut = 0;
+
+      if (shortcut) {
+        sz = -(x < 0);
+        x -= 2 * sz + 1;
+      }
+
+      /* Consider both possible successor states. */
+      if (!x) {
+        /* If we reduced this coefficient to zero, check to see if
+         *  we need to move the EOB back here.
+         */
+        t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
+             DCT_EOB_TOKEN : ZERO_TOKEN;
+        t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
+             DCT_EOB_TOKEN : ZERO_TOKEN;
+      } else {
+        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;
+      }
+      if (next < 256) {
+        band = vp9_coef_bands_16x16[i + 1];
+        if (t0 != DCT_EOB_TOKEN) {
+          pt = vp9_prev_token_class[t0];
+          rate0 += mb->token_costs[TX_16X16][type][band][pt]
+              [tokens[next][0].token];
+        }
+        if (t1 != DCT_EOB_TOKEN) {
+          pt = vp9_prev_token_class[t1];
+          rate1 += mb->token_costs[TX_16X16][type][band][pt]
+              [tokens[next][1].token];
+        }
+      }
+      UPDATE_RD_COST();
+      /* And pick the best. */
+      best = rd_cost1 < rd_cost0;
+      base_bits = *(vp9_dct_value_cost_ptr + x);
+
+      if (shortcut) {
+        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+        d2 = dx * dx;
+      }
+      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
+      tokens[i][1].error = d2 + (best ? error1 : error0);
+      tokens[i][1].next = next;
+      tokens[i][1].token = best ? t1 : t0;
+      tokens[i][1].qc = x;
+      best_index[i][1] = best;
+      /* Finally, make this the new head of the trellis. */
+      next = i;
+    }
+    /* There's no choice to make for a zero coefficient, so we don't
+     *  add a new trellis node, but we do need to update the costs.
+     */
+    else {
+      band = vp9_coef_bands_16x16[i + 1];
+      t0 = tokens[next][0].token;
+      t1 = tokens[next][1].token;
+      /* Update the cost of each path if we're past the EOB token. */
+      if (t0 != DCT_EOB_TOKEN) {
+        tokens[next][0].rate += mb->token_costs[TX_16X16][type][band][0][t0];
+        tokens[next][0].token = ZERO_TOKEN;
+      }
+      if (t1 != DCT_EOB_TOKEN) {
+        tokens[next][1].rate += mb->token_costs[TX_16X16][type][band][0][t1];
+        tokens[next][1].token = ZERO_TOKEN;
+      }
+      /* Don't update next, because we didn't add a new node. */
+    }
+  }
+
+  /* Now pick the best path through the whole trellis. */
+  band = vp9_coef_bands_16x16[i + 1];
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  rate0 = tokens[next][0].rate;
+  rate1 = tokens[next][1].rate;
+  error0 = tokens[next][0].error;
+  error1 = tokens[next][1].error;
+  t0 = tokens[next][0].token;
+  t1 = tokens[next][1].token;
+  rate0 += mb->token_costs[TX_16X16][type][band][pt][t0];
+  rate1 += mb->token_costs[TX_16X16][type][band][pt][t1];
+  UPDATE_RD_COST();
+  best = rd_cost1 < rd_cost0;
+  final_eob = -1;
+
+  for (i = next; i < eob; i = next) {
+    x = tokens[i][best].qc;
+    if (x)
+      final_eob = i;
+    rc = vp9_default_zig_zag1d_16x16[i];
+    qcoeff_ptr[rc] = x;
+    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]);
+
+    next = tokens[i][best].next;
+    best = best_index[i][best];
+  }
+  final_eob++;
+
+  d->eob = final_eob;
+  *a = *l = (d->eob != !type);
+}
+
+void vp9_optimize_mby_16x16(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta, *tl;
+
+  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+    return;
+
+  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+  optimize_b_16x16(x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, rtcd);
+}
+
+static void optimize_mb_16x16(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+  vp9_optimize_mby_16x16(x, rtcd);
+  vp9_optimize_mbuv_8x8(x, rtcd);
+}
+
+void vp9_encode_inter16x16(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+
+  vp9_build_inter_predictors_mb(xd);
+  subtract_mb(rtcd, x);
+
+  if (tx_size == TX_16X16) {
+    vp9_transform_mb_16x16(x);
+    vp9_quantize_mb_16x16(x);
+    if (x->optimize)
+      optimize_mb_16x16(x, rtcd);
+    vp9_inverse_transform_mb_16x16(IF_RTCD(&rtcd->common->idct), xd);
+  } else if (tx_size == TX_8X8) {
+    if (xd->mode_info_context->mbmi.mode == SPLITMV) {
+      assert(xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4);
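+      // An 8x8 luma partition maps to a 4x4 chroma area, so chroma keeps
+      // the 4x4 transform even though luma uses 8x8.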
+      vp9_transform_mby_8x8(x);
+      vp9_transform_mbuv_4x4(x);
+      vp9_quantize_mby_8x8(x);
+      vp9_quantize_mbuv_4x4(x);
+      if (x->optimize) {
+        vp9_optimize_mby_8x8(x, rtcd);
+        vp9_optimize_mbuv_4x4(x, rtcd);
+      }
+      vp9_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), xd);
+      vp9_inverse_transform_mbuv_4x4(IF_RTCD(&rtcd->common->idct), xd);
+    } else {
+      vp9_transform_mb_8x8(x);
+      vp9_quantize_mb_8x8(x);
+      if (x->optimize)
+        optimize_mb_8x8(x, rtcd);
+      vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), xd);
+    }
+  } else {
+    transform_mb_4x4(x);
+    vp9_quantize_mb_4x4(x);
+    if (x->optimize)
+      optimize_mb_4x4(x, rtcd);
+    vp9_inverse_transform_mb_4x4(IF_RTCD(&rtcd->common->idct), xd);
+  }
+
+  vp9_recon_mb(xd);
+}
+
+/* this function is used by first pass only */
+void vp9_encode_inter16x16y(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  BLOCK *b = &x->block[0];
+
+#if CONFIG_PRED_FILTER
+  // Disable the prediction filter for firstpass
+  xd->mode_info_context->mbmi.pred_filter_enabled = 0;
+#endif
+
+  vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
+
+  vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
+
+  vp9_transform_mby_4x4(x);
+  vp9_quantize_mby_4x4(x);
+  vp9_inverse_transform_mby_4x4(IF_RTCD(&rtcd->common->idct), xd);
+
+  vp9_recon_mby(xd);
+}
--- /dev/null
+++ b/vp9/encoder/encodemb.h
@@ -1,0 +1,70 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENCODEMB_H
+#define __INC_ENCODEMB_H
+
+#include "vpx_ports/config.h"
+#include "block.h"
+
+typedef struct {
+  MB_PREDICTION_MODE mode;
+  MV_REFERENCE_FRAME ref_frame;
+  MV_REFERENCE_FRAME second_ref_frame;
+#if CONFIG_PRED_FILTER
+  int pred_filter_flag;
+#endif
+} MODE_DEFINITION;
+
+
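+// With runtime CPU detection enabled, encodemb ops dispatch through the
+// RTCD context; otherwise they bind at compile time to the corresponding
+// vp9_encodemb_* functions.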
+#if CONFIG_RUNTIME_CPU_DETECT
+#define ENCODEMB_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define ENCODEMB_INVOKE(ctx,fn) vp9_encodemb_##fn
+#endif
+
+#include "onyx_int.h"
+struct VP9_ENCODER_RTCD;
+void vp9_encode_inter16x16(const struct VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+
+void vp9_transform_mbuv_4x4(MACROBLOCK *x);
+void vp9_transform_mby_4x4(MACROBLOCK *x);
+
+void vp9_optimize_mby_4x4(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+void vp9_optimize_mbuv_4x4(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+void vp9_encode_inter16x16y(const struct VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+
+void vp9_transform_mb_8x8(MACROBLOCK *mb);
+void vp9_transform_mby_8x8(MACROBLOCK *x);
+void vp9_transform_mbuv_8x8(MACROBLOCK *x);
+void vp9_build_dcblock_8x8(MACROBLOCK *b);
+void vp9_optimize_mby_8x8(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+void vp9_optimize_mbuv_8x8(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+
+void vp9_transform_mb_16x16(MACROBLOCK *mb);
+void vp9_transform_mby_16x16(MACROBLOCK *x);
+void vp9_optimize_mby_16x16(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+
+void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
+
+#if CONFIG_SUPERBLOCKS
+void vp9_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
+                           const unsigned char *vsrc, int src_stride,
+                           const unsigned char *upred,
+                           const unsigned char *vpred, int dst_stride);
+void vp9_subtract_mby_s_c(short *diff, const unsigned char *src,
+                          int src_stride, const unsigned char *pred,
+                          int dst_stride);
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/encoder/encodemv.c
@@ -1,0 +1,547 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/common.h"
+#include "encodemv.h"
+#include "vp9/common/entropymode.h"
+#include "vp9/common/systemdependent.h"
+
+#include <math.h>
+
+#ifdef ENTROPY_STATS
+extern unsigned int active_section;
+#endif
+
+#ifdef NMV_STATS
+nmv_context_counts tnmvcounts;
+#endif
+
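+/* A motion vector component is coded as a sign bit, a magnitude class and
+ * the integer offset within that class; the fractional and high-precision
+ * bits are written separately by encode_nmv_component_fp() below.
+ */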
+static void encode_nmv_component(vp9_writer* const bc,
+                                 int v,
+                                 int r,
+                                 const nmv_component* const mvcomp) {
+  int s, z, c, o, d;
+  assert(v != 0);             /* should not be zero */
+  s = v < 0;
+  vp9_write(bc, s, mvcomp->sign);
+  z = (s ? -v : v) - 1;       /* magnitude - 1 */
+
+  c = vp9_get_mv_class(z, &o);
+
+  write_token(bc, vp9_mv_class_tree, mvcomp->classes,
+              vp9_mv_class_encodings + c);
+
+  d = (o >> 3);               /* int mv data */
+
+  if (c == MV_CLASS_0) {
+    write_token(bc, vp9_mv_class0_tree, mvcomp->class0,
+                vp9_mv_class0_encodings + d);
+  } else {
+    int i, b;
+    b = c + CLASS0_BITS - 1;  /* number of bits */
+    for (i = 0; i < b; ++i)
+      vp9_write(bc, ((d >> i) & 1), mvcomp->bits[i]);
+  }
+}
+
+static void encode_nmv_component_fp(vp9_writer *bc,
+                                    int v,
+                                    int r,
+                                    const nmv_component* const mvcomp,
+                                    int usehp) {
+  int s, z, c, o, d, f, e;
+  assert(v != 0);             /* should not be zero */
+  s = v < 0;
+  z = (s ? -v : v) - 1;       /* magnitude - 1 */
+
+  c = vp9_get_mv_class(z, &o);
+
+  d = (o >> 3);               /* int mv data */
+  f = (o >> 1) & 3;           /* fractional pel mv data */
+  e = (o & 1);                /* high precision mv data */
+
+  /* Code the fractional pel bits */
+  if (c == MV_CLASS_0) {
+    write_token(bc, vp9_mv_fp_tree, mvcomp->class0_fp[d],
+                vp9_mv_fp_encodings + f);
+  } else {
+    write_token(bc, vp9_mv_fp_tree, mvcomp->fp,
+                vp9_mv_fp_encodings + f);
+  }
+  /* Code the high precision bit */
+  if (usehp) {
+    if (c == MV_CLASS_0) {
+      vp9_write(bc, e, mvcomp->class0_hp);
+    } else {
+      vp9_write(bc, e, mvcomp->hp);
+    }
+  }
+}
+
+static void build_nmv_component_cost_table(int *mvcost,
+                                           const nmv_component* const mvcomp,
+                                           int usehp) {
+  int i, v;
+  int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
+  int bits_cost[MV_OFFSET_BITS][2];
+  int class0_fp_cost[CLASS0_SIZE][4], fp_cost[4];
+  int class0_hp_cost[2], hp_cost[2];
+
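+  /* Precompute the cost of each magnitude once; mvcost[v] and mvcost[-v]
+   * then differ only in the sign-bit cost.
+   */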
+  sign_cost[0] = vp9_cost_zero(mvcomp->sign);
+  sign_cost[1] = vp9_cost_one(mvcomp->sign);
+  vp9_cost_tokens(class_cost, mvcomp->classes, vp9_mv_class_tree);
+  vp9_cost_tokens(class0_cost, mvcomp->class0, vp9_mv_class0_tree);
+  for (i = 0; i < MV_OFFSET_BITS; ++i) {
+    bits_cost[i][0] = vp9_cost_zero(mvcomp->bits[i]);
+    bits_cost[i][1] = vp9_cost_one(mvcomp->bits[i]);
+  }
+
+  for (i = 0; i < CLASS0_SIZE; ++i)
+    vp9_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], vp9_mv_fp_tree);
+  vp9_cost_tokens(fp_cost, mvcomp->fp, vp9_mv_fp_tree);
+
+  if (usehp) {
+    class0_hp_cost[0] = vp9_cost_zero(mvcomp->class0_hp);
+    class0_hp_cost[1] = vp9_cost_one(mvcomp->class0_hp);
+    hp_cost[0] = vp9_cost_zero(mvcomp->hp);
+    hp_cost[1] = vp9_cost_one(mvcomp->hp);
+  }
+  mvcost[0] = 0;
+  for (v = 1; v <= MV_MAX; ++v) {
+    int z, c, o, d, e, f, cost = 0;
+    z = v - 1;
+    c = vp9_get_mv_class(z, &o);
+    cost += class_cost[c];
+    d = (o >> 3);               /* int mv data */
+    f = (o >> 1) & 3;           /* fractional pel mv data */
+    e = (o & 1);                /* high precision mv data */
+    if (c == MV_CLASS_0) {
+      cost += class0_cost[d];
+    } else {
+      int i, b;
+      b = c + CLASS0_BITS - 1;  /* number of bits */
+      for (i = 0; i < b; ++i)
+        cost += bits_cost[i][((d >> i) & 1)];
+    }
+    if (c == MV_CLASS_0) {
+      cost += class0_fp_cost[d][f];
+    } else {
+      cost += fp_cost[f];
+    }
+    if (usehp) {
+      if (c == MV_CLASS_0) {
+        cost += class0_hp_cost[e];
+      } else {
+        cost += hp_cost[e];
+      }
+    }
+    mvcost[v] = cost + sign_cost[0];
+    mvcost[-v] = cost + sign_cost[1];
+  }
+}
+
+static int update_nmv_savings(const unsigned int ct[2],
+                              const vp9_prob cur_p,
+                              const vp9_prob new_p,
+                              const vp9_prob upd_p) {
+
+#ifdef LOW_PRECISION_MV_UPDATE
+  vp9_prob mod_p = new_p | 1;
+#else
+  vp9_prob mod_p = new_p;
+#endif
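+  /* Signalling an update costs 7 literal bits for the new probability
+   * (8 without the low-precision form) plus the differential cost of the
+   * update flag itself, all in the 1/256-bit units used by vp9_cost_*.
+   */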
+  const int cur_b = cost_branch256(ct, cur_p);
+  const int mod_b = cost_branch256(ct, mod_p);
+  const int cost = 7 * 256 +
+#ifndef LOW_PRECISION_MV_UPDATE
+      256 +
+#endif
+      (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
+  if (cur_b - mod_b - cost > 0) {
+    return cur_b - mod_b - cost;
+  } else {
+    return -vp9_cost_zero(upd_p);
+  }
+}
+
+static int update_nmv(
+  vp9_writer *const bc,
+  const unsigned int ct[2],
+  vp9_prob *const cur_p,
+  const vp9_prob new_p,
+  const vp9_prob upd_p) {
+
+#ifdef LOW_PRECISION_MV_UPDATE
+  vp9_prob mod_p = new_p | 1;
+#else
+  vp9_prob mod_p = new_p;
+#endif
+
+  const int cur_b = cost_branch256(ct, *cur_p);
+  const int mod_b = cost_branch256(ct, mod_p);
+  const int cost = 7 * 256 +
+#ifndef LOW_PRECISION_MV_UPDATE
+      256 +
+#endif
+      (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
+
+  if (cur_b - mod_b > cost) {
+    *cur_p = mod_p;
+    vp9_write(bc, 1, upd_p);
+#ifdef LOW_PRECISION_MV_UPDATE
+    vp9_write_literal(bc, mod_p >> 1, 7);
+#else
+    vp9_write_literal(bc, mod_p, 8);
+#endif
+    return 1;
+  } else {
+    vp9_write(bc, 0, upd_p);
+    return 0;
+  }
+}
+
+#ifdef NMV_STATS
+void init_nmvstats() {
+  vp9_zero(tnmvcounts);
+}
+
+void print_nmvstats() {
+  nmv_context prob;
+  unsigned int branch_ct_joint[MV_JOINTS - 1][2];
+  unsigned int branch_ct_sign[2][2];
+  unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
+  unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
+  unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
+  unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
+  unsigned int branch_ct_fp[2][4 - 1][2];
+  unsigned int branch_ct_class0_hp[2][2];
+  unsigned int branch_ct_hp[2][2];
+  int i, j, k;
+  vp9_counts_to_nmv_context(&tnmvcounts, &prob, 1,
+                            branch_ct_joint, branch_ct_sign, branch_ct_classes,
+                            branch_ct_class0, branch_ct_bits,
+                            branch_ct_class0_fp, branch_ct_fp,
+                            branch_ct_class0_hp, branch_ct_hp);
+
+  printf("\nCounts =\n  { ");
+  for (j = 0; j < MV_JOINTS; ++j)
+    printf("%d, ", tnmvcounts.joints[j]);
+  printf("},\n");
+  for (i = 0; i < 2; ++i) {
+    printf("  {\n");
+    printf("    %d/%d,\n", tnmvcounts.comps[i].sign[0],
+                           tnmvcounts.comps[i].sign[1]);
+    printf("    { ");
+    for (j = 0; j < MV_CLASSES; ++j)
+      printf("%d, ", tnmvcounts.comps[i].classes[j]);
+    printf("},\n");
+    printf("    { ");
+    for (j = 0; j < CLASS0_SIZE; ++j)
+      printf("%d, ", tnmvcounts.comps[i].class0[j]);
+    printf("},\n");
+    printf("    { ");
+    for (j = 0; j < MV_OFFSET_BITS; ++j)
+      printf("%d/%d, ", tnmvcounts.comps[i].bits[j][0],
+                        tnmvcounts.comps[i].bits[j][1]);
+    printf("},\n");
+
+    printf("    {");
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      printf("{");
+      for (k = 0; k < 4; ++k)
+        printf("%d, ", tnmvcounts.comps[i].class0_fp[j][k]);
+      printf("}, ");
+    }
+    printf("},\n");
+
+    printf("    { ");
+    for (j = 0; j < 4; ++j)
+      printf("%d, ", tnmvcounts.comps[i].fp[j]);
+    printf("},\n");
+
+    printf("    %d/%d,\n",
+           tnmvcounts.comps[i].class0_hp[0],
+           tnmvcounts.comps[i].class0_hp[1]);
+    printf("    %d/%d,\n",
+           tnmvcounts.comps[i].hp[0],
+           tnmvcounts.comps[i].hp[1]);
+    printf("  },\n");
+  }
+
+  printf("\nProbs =\n  { ");
+  for (j = 0; j < MV_JOINTS - 1; ++j)
+    printf("%d, ", prob.joints[j]);
+  printf("},\n");
+  for (i = 0; i < 2; ++i) {
+    printf("  {\n");
+    printf("    %d,\n", prob.comps[i].sign);
+    printf("    { ");
+    for (j = 0; j < MV_CLASSES - 1; ++j)
+      printf("%d, ", prob.comps[i].classes[j]);
+    printf("},\n");
+    printf("    { ");
+    for (j = 0; j < CLASS0_SIZE - 1; ++j)
+      printf("%d, ", prob.comps[i].class0[j]);
+    printf("},\n");
+    printf("    { ");
+    for (j = 0; j < MV_OFFSET_BITS; ++j)
+      printf("%d, ", prob.comps[i].bits[j]);
+    printf("},\n");
+    printf("    { ");
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      printf("{");
+      for (k = 0; k < 3; ++k)
+        printf("%d, ", prob.comps[i].class0_fp[j][k]);
+      printf("}, ");
+    }
+    printf("},\n");
+    printf("    { ");
+    for (j = 0; j < 3; ++j)
+      printf("%d, ", prob.comps[i].fp[j]);
+    printf("},\n");
+
+    printf("    %d,\n", prob.comps[i].class0_hp);
+    printf("    %d,\n", prob.comps[i].hp);
+    printf("  },\n");
+  }
+}
+
+static void add_nmvcount(nmv_context_counts* const dst,
+                         const nmv_context_counts* const src) {
+  int i, j, k;
+  for (j = 0; j < MV_JOINTS; ++j) {
+    dst->joints[j] += src->joints[j];
+  }
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < MV_VALS; ++j) {
+      dst->comps[i].mvcount[j] += src->comps[i].mvcount[j];
+    }
+    dst->comps[i].sign[0] += src->comps[i].sign[0];
+    dst->comps[i].sign[1] += src->comps[i].sign[1];
+    for (j = 0; j < MV_CLASSES; ++j) {
+      dst->comps[i].classes[j] += src->comps[i].classes[j];
+    }
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      dst->comps[i].class0[j] += src->comps[i].class0[j];
+    }
+    for (j = 0; j < MV_OFFSET_BITS; ++j) {
+      dst->comps[i].bits[j][0] += src->comps[i].bits[j][0];
+      dst->comps[i].bits[j][1] += src->comps[i].bits[j][1];
+    }
+  }
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      for (k = 0; k < 4; ++k) {
+        dst->comps[i].class0_fp[j][k] += src->comps[i].class0_fp[j][k];
+      }
+    }
+    for (j = 0; j < 4; ++j) {
+      dst->comps[i].fp[j] += src->comps[i].fp[j];
+    }
+    dst->comps[i].class0_hp[0] += src->comps[i].class0_hp[0];
+    dst->comps[i].class0_hp[1] += src->comps[i].class0_hp[1];
+    dst->comps[i].hp[0] += src->comps[i].hp[0];
+    dst->comps[i].hp[1] += src->comps[i].hp[1];
+  }
+}
+#endif
+
+void vp9_write_nmvprobs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
+  int i, j;
+  nmv_context prob;
+  unsigned int branch_ct_joint[MV_JOINTS - 1][2];
+  unsigned int branch_ct_sign[2][2];
+  unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
+  unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
+  unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
+  unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
+  unsigned int branch_ct_fp[2][4 - 1][2];
+  unsigned int branch_ct_class0_hp[2][2];
+  unsigned int branch_ct_hp[2][2];
+  int savings = 0;
+
+#ifdef NMV_STATS
+  if (!cpi->dummy_packing)
+    add_nmvcount(&tnmvcounts, &cpi->NMVcount);
+#endif
+  vp9_counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
+                            branch_ct_joint, branch_ct_sign, branch_ct_classes,
+                            branch_ct_class0, branch_ct_bits,
+                            branch_ct_class0_fp, branch_ct_fp,
+                            branch_ct_class0_hp, branch_ct_hp);
+  /* write updates if they help */
+#ifdef MV_GROUP_UPDATE
+  for (j = 0; j < MV_JOINTS - 1; ++j) {
+    savings += update_nmv_savings(branch_ct_joint[j],
+                                  cpi->common.fc.nmvc.joints[j],
+                                  prob.joints[j],
+                                  VP9_NMV_UPDATE_PROB);
+  }
+  for (i = 0; i < 2; ++i) {
+    savings += update_nmv_savings(branch_ct_sign[i],
+                                  cpi->common.fc.nmvc.comps[i].sign,
+                                  prob.comps[i].sign,
+                                  VP9_NMV_UPDATE_PROB);
+    for (j = 0; j < MV_CLASSES - 1; ++j) {
+      savings += update_nmv_savings(branch_ct_classes[i][j],
+                                    cpi->common.fc.nmvc.comps[i].classes[j],
+                                    prob.comps[i].classes[j],
+                                    VP9_NMV_UPDATE_PROB);
+    }
+    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
+      savings += update_nmv_savings(branch_ct_class0[i][j],
+                                    cpi->common.fc.nmvc.comps[i].class0[j],
+                                    prob.comps[i].class0[j],
+                                    VP9_NMV_UPDATE_PROB);
+    }
+    for (j = 0; j < MV_OFFSET_BITS; ++j) {
+      savings += update_nmv_savings(branch_ct_bits[i][j],
+                                    cpi->common.fc.nmvc.comps[i].bits[j],
+                                    prob.comps[i].bits[j],
+                                    VP9_NMV_UPDATE_PROB);
+    }
+  }
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      int k;
+      for (k = 0; k < 3; ++k) {
+        savings += update_nmv_savings(branch_ct_class0_fp[i][j][k],
+                                      cpi->common.fc.nmvc.comps[i].class0_fp[j][k],
+                                      prob.comps[i].class0_fp[j][k],
+                                      VP9_NMV_UPDATE_PROB);
+      }
+    }
+    for (j = 0; j < 3; ++j) {
+      savings += update_nmv_savings(branch_ct_fp[i][j],
+                                    cpi->common.fc.nmvc.comps[i].fp[j],
+                                    prob.comps[i].fp[j],
+                                    VP9_NMV_UPDATE_PROB);
+    }
+  }
+  if (usehp) {
+    for (i = 0; i < 2; ++i) {
+      savings += update_nmv_savings(branch_ct_class0_hp[i],
+                                    cpi->common.fc.nmvc.comps[i].class0_hp,
+                                    prob.comps[i].class0_hp,
+                                    VP9_NMV_UPDATE_PROB);
+      savings += update_nmv_savings(branch_ct_hp[i],
+                                    cpi->common.fc.nmvc.comps[i].hp,
+                                    prob.comps[i].hp,
+                                    VP9_NMV_UPDATE_PROB);
+    }
+  }
+  if (savings <= 0) {
+    vp9_write_bit(bc, 0);
+    return;
+  }
+  vp9_write_bit(bc, 1);
+#endif
+
+  for (j = 0; j < MV_JOINTS - 1; ++j) {
+    update_nmv(bc, branch_ct_joint[j],
+               &cpi->common.fc.nmvc.joints[j],
+               prob.joints[j],
+               VP9_NMV_UPDATE_PROB);
+  }
+  for (i = 0; i < 2; ++i) {
+    update_nmv(bc, branch_ct_sign[i],
+               &cpi->common.fc.nmvc.comps[i].sign,
+               prob.comps[i].sign,
+               VP9_NMV_UPDATE_PROB);
+    for (j = 0; j < MV_CLASSES - 1; ++j) {
+      update_nmv(bc, branch_ct_classes[i][j],
+                 &cpi->common.fc.nmvc.comps[i].classes[j],
+                 prob.comps[i].classes[j],
+                 VP9_NMV_UPDATE_PROB);
+    }
+    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
+      update_nmv(bc, branch_ct_class0[i][j],
+                 &cpi->common.fc.nmvc.comps[i].class0[j],
+                 prob.comps[i].class0[j],
+                 VP9_NMV_UPDATE_PROB);
+    }
+    for (j = 0; j < MV_OFFSET_BITS; ++j) {
+      update_nmv(bc, branch_ct_bits[i][j],
+                 &cpi->common.fc.nmvc.comps[i].bits[j],
+                 prob.comps[i].bits[j],
+                 VP9_NMV_UPDATE_PROB);
+    }
+  }
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      int k;
+      for (k = 0; k < 3; ++k) {
+        update_nmv(bc, branch_ct_class0_fp[i][j][k],
+                   &cpi->common.fc.nmvc.comps[i].class0_fp[j][k],
+                   prob.comps[i].class0_fp[j][k],
+                   VP9_NMV_UPDATE_PROB);
+      }
+    }
+    for (j = 0; j < 3; ++j) {
+      update_nmv(bc, branch_ct_fp[i][j],
+                 &cpi->common.fc.nmvc.comps[i].fp[j],
+                 prob.comps[i].fp[j],
+                 VP9_NMV_UPDATE_PROB);
+    }
+  }
+  if (usehp) {
+    for (i = 0; i < 2; ++i) {
+      update_nmv(bc, branch_ct_class0_hp[i],
+                 &cpi->common.fc.nmvc.comps[i].class0_hp,
+                 prob.comps[i].class0_hp,
+                 VP9_NMV_UPDATE_PROB);
+      update_nmv(bc, branch_ct_hp[i],
+                 &cpi->common.fc.nmvc.comps[i].hp,
+                 prob.comps[i].hp,
+                 VP9_NMV_UPDATE_PROB);
+    }
+  }
+}
+
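+// Encode one motion vector relative to its reference. The joint symbol
+// signals which of the row / column components are non-zero; only the
+// non-zero components are then coded (row via comps[0], col via comps[1]).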
+void vp9_encode_nmv(vp9_writer* const bc, const MV* const mv,
+                    const MV* const ref, const nmv_context* const mvctx) {
+  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
+  write_token(bc, vp9_mv_joint_tree, mvctx->joints,
+              vp9_mv_joint_encodings + j);
+  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
+    encode_nmv_component(bc, mv->row, ref->row, &mvctx->comps[0]);
+  }
+  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
+    encode_nmv_component(bc, mv->col, ref->col, &mvctx->comps[1]);
+  }
+}
+
+void vp9_encode_nmv_fp(vp9_writer* const bc, const MV* const mv,
+                       const MV* const ref, const nmv_context* const mvctx,
+                       int usehp) {
+  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
+  usehp = usehp && vp9_use_nmv_hp(ref);
+  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
+    encode_nmv_component_fp(bc, mv->row, ref->row, &mvctx->comps[0], usehp);
+  }
+  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
+    encode_nmv_component_fp(bc, mv->col, ref->col, &mvctx->comps[1], usehp);
+  }
+}
+
+void vp9_build_nmv_cost_table(int *mvjoint,
+                              int *mvcost[2],
+                              const nmv_context* const mvctx,
+                              int usehp,
+                              int mvc_flag_v,
+                              int mvc_flag_h) {
+  vp9_clear_system_state();
+  vp9_cost_tokens(mvjoint, mvctx->joints, vp9_mv_joint_tree);
+  if (mvc_flag_v)
+    build_nmv_component_cost_table(mvcost[0], &mvctx->comps[0], usehp);
+  if (mvc_flag_h)
+    build_nmv_component_cost_table(mvcost[1], &mvctx->comps[1], usehp);
+}
--- /dev/null
+++ b/vp9/encoder/encodemv.h
@@ -1,0 +1,30 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENCODEMV_H
+#define __INC_ENCODEMV_H
+
+#include "onyx_int.h"
+
+void vp9_write_nmvprobs(VP9_COMP* const, int usehp, vp9_writer* const);
+void vp9_encode_nmv(vp9_writer* const w, const MV* const mv,
+                    const MV* const ref, const nmv_context* const mvctx);
+void vp9_encode_nmv_fp(vp9_writer* const w, const MV* const mv,
+                       const MV* const ref, const nmv_context *mvctx,
+                       int usehp);
+void vp9_build_nmv_cost_table(int *mvjoint,
+                              int *mvcost[2],
+                              const nmv_context *mvctx,
+                              int usehp,
+                              int mvc_flag_v,
+                              int mvc_flag_h);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/firstpass.c
@@ -1,0 +1,2533 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "math.h"
+#include "limits.h"
+#include "block.h"
+#include "onyx_int.h"
+#include "variance.h"
+#include "encodeintra.h"
+#include "vp9/common/setupintrarecon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "vpx_scale/vpxscale.h"
+#include "encodemb.h"
+#include "vp9/common/extend.h"
+#include "vp9/common/systemdependent.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/swapyv12buffer.h"
+#include <stdio.h>
+#include "rdopt.h"
+#include "ratectrl.h"
+#include "vp9/common/quant_common.h"
+#include "vp9/common/entropymv.h"
+#include "encodemv.h"
+
+#define OUTPUT_FPF 0
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+extern void vp9_build_block_offsets(MACROBLOCK *x);
+
+extern void vp9_setup_block_ptrs(MACROBLOCK *x);
+
+extern void vp9_frame_init_quantizer(VP9_COMP *cpi);
+
+extern void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb,
+                                   int_mv *mv);
+
+extern void vp9_alloc_compressor_data(VP9_COMP *cpi);
+
+#define IIFACTOR   12.5
+#define IIKFACTOR1 12.5
+#define IIKFACTOR2 15.0
+#define RMAX       128.0
+#define GF_RMAX    96.0
+#define ERR_DIVISOR   150.0
+
+#define KF_MB_INTRA_MIN 300
+#define GF_MB_INTRA_MIN 200
+
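+// Nudge a value slightly away from zero so it is safe to use as a divisor.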
+#define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
+
+#define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
+#define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
+
+static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame);
+
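+// Map a quantizer index to a constrained-quality level: take roughly
+// 0.58x the real quantizer value implied by qindex (plus 1.0) as the
+// target, and return the lowest index whose quantizer reaches it.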
+static int select_cq_level(int qindex) {
+  int ret_val = QINDEX_RANGE - 1;
+  int i;
+
+  double target_q = (vp9_convert_qindex_to_q(qindex) * 0.5847) + 1.0;
+
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    if (target_q <= vp9_convert_qindex_to_q(i)) {
+      ret_val = i;
+      break;
+    }
+  }
+
+  return ret_val;
+}
+
+// Reset the first pass stats "file" position to the given point
+static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *Position) {
+  cpi->twopass.stats_in = Position;
+}
+
+static int lookup_next_frame_stats(VP9_COMP *cpi, FIRSTPASS_STATS *next_frame) {
+  if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
+    return EOF;
+
+  *next_frame = *cpi->twopass.stats_in;
+  return 1;
+}
+
+// Read frame stats at an offset from the current position
+static int read_frame_stats(VP9_COMP *cpi,
+                            FIRSTPASS_STATS *frame_stats,
+                            int offset) {
+  FIRSTPASS_STATS *fps_ptr = cpi->twopass.stats_in;
+
+  // Check legality of offset
+  if (offset >= 0) {
+    if (&fps_ptr[offset] >= cpi->twopass.stats_in_end)
+      return EOF;
+  } else if (offset < 0) {
+    if (&fps_ptr[offset] < cpi->twopass.stats_in_start)
+      return EOF;
+  }
+
+  *frame_stats = fps_ptr[offset];
+  return 1;
+}
+
+static int input_stats(VP9_COMP *cpi, FIRSTPASS_STATS *fps) {
+  if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
+    return EOF;
+
+  *fps = *cpi->twopass.stats_in;
+  cpi->twopass.stats_in =
+    (void *)((char *)cpi->twopass.stats_in + sizeof(FIRSTPASS_STATS));
+  return 1;
+}
+
+static void output_stats(const VP9_COMP            *cpi,
+                         struct vpx_codec_pkt_list *pktlist,
+                         FIRSTPASS_STATS            *stats) {
+  struct vpx_codec_cx_pkt pkt;
+  pkt.kind = VPX_CODEC_STATS_PKT;
+  pkt.data.twopass_stats.buf = stats;
+  pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
+  vpx_codec_pkt_list_add(pktlist, &pkt);
+
+// TEMP debug code
+#if OUTPUT_FPF
+
+  {
+    FILE *fpfile;
+    fpfile = fopen("firstpass.stt", "a");
+
+    fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
+            "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
+            "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
+            stats->frame,
+            stats->intra_error,
+            stats->coded_error,
+            stats->sr_coded_error,
+            stats->ssim_weighted_pred_err,
+            stats->pcnt_inter,
+            stats->pcnt_motion,
+            stats->pcnt_second_ref,
+            stats->pcnt_neutral,
+            stats->MVr,
+            stats->mvr_abs,
+            stats->MVc,
+            stats->mvc_abs,
+            stats->MVrv,
+            stats->MVcv,
+            stats->mv_in_out_count,
+            stats->new_mv_count,
+            stats->count,
+            stats->duration);
+    fclose(fpfile);
+  }
+#endif
+}
+
+static void zero_stats(FIRSTPASS_STATS *section) {
+  section->frame      = 0.0;
+  section->intra_error = 0.0;
+  section->coded_error = 0.0;
+  section->sr_coded_error = 0.0;
+  section->ssim_weighted_pred_err = 0.0;
+  section->pcnt_inter  = 0.0;
+  section->pcnt_motion  = 0.0;
+  section->pcnt_second_ref = 0.0;
+  section->pcnt_neutral = 0.0;
+  section->MVr        = 0.0;
+  section->mvr_abs     = 0.0;
+  section->MVc        = 0.0;
+  section->mvc_abs     = 0.0;
+  section->MVrv       = 0.0;
+  section->MVcv       = 0.0;
+  section->mv_in_out_count  = 0.0;
+  section->new_mv_count = 0.0;
+  section->count      = 0.0;
+  section->duration   = 1.0;
+}
+
+static void accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) {
+  section->frame += frame->frame;
+  section->intra_error += frame->intra_error;
+  section->coded_error += frame->coded_error;
+  section->sr_coded_error += frame->sr_coded_error;
+  section->ssim_weighted_pred_err += frame->ssim_weighted_pred_err;
+  section->pcnt_inter  += frame->pcnt_inter;
+  section->pcnt_motion += frame->pcnt_motion;
+  section->pcnt_second_ref += frame->pcnt_second_ref;
+  section->pcnt_neutral += frame->pcnt_neutral;
+  section->MVr        += frame->MVr;
+  section->mvr_abs     += frame->mvr_abs;
+  section->MVc        += frame->MVc;
+  section->mvc_abs     += frame->mvc_abs;
+  section->MVrv       += frame->MVrv;
+  section->MVcv       += frame->MVcv;
+  section->mv_in_out_count  += frame->mv_in_out_count;
+  section->new_mv_count += frame->new_mv_count;
+  section->count      += frame->count;
+  section->duration   += frame->duration;
+}
+
+static void subtract_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) {
+  section->frame -= frame->frame;
+  section->intra_error -= frame->intra_error;
+  section->coded_error -= frame->coded_error;
+  section->sr_coded_error -= frame->sr_coded_error;
+  section->ssim_weighted_pred_err -= frame->ssim_weighted_pred_err;
+  section->pcnt_inter  -= frame->pcnt_inter;
+  section->pcnt_motion -= frame->pcnt_motion;
+  section->pcnt_second_ref -= frame->pcnt_second_ref;
+  section->pcnt_neutral -= frame->pcnt_neutral;
+  section->MVr        -= frame->MVr;
+  section->mvr_abs     -= frame->mvr_abs;
+  section->MVc        -= frame->MVc;
+  section->mvc_abs     -= frame->mvc_abs;
+  section->MVrv       -= frame->MVrv;
+  section->MVcv       -= frame->MVcv;
+  section->mv_in_out_count  -= frame->mv_in_out_count;
+  section->new_mv_count -= frame->new_mv_count;
+  section->count      -= frame->count;
+  section->duration   -= frame->duration;
+}
+
+static void avg_stats(FIRSTPASS_STATS *section) {
+  if (section->count < 1.0)
+    return;
+
+  section->intra_error /= section->count;
+  section->coded_error /= section->count;
+  section->sr_coded_error /= section->count;
+  section->ssim_weighted_pred_err /= section->count;
+  section->pcnt_inter  /= section->count;
+  section->pcnt_second_ref /= section->count;
+  section->pcnt_neutral /= section->count;
+  section->pcnt_motion /= section->count;
+  section->MVr        /= section->count;
+  section->mvr_abs     /= section->count;
+  section->MVc        /= section->count;
+  section->mvc_abs     /= section->count;
+  section->MVrv       /= section->count;
+  section->MVcv       /= section->count;
+  section->mv_in_out_count   /= section->count;
+  section->duration   /= section->count;
+}
+
+// Calculate a modified error, used when distributing bits between easier
+// and harder frames.
+static double calculate_modified_err(VP9_COMP *cpi,
+                                     FIRSTPASS_STATS *this_frame) {
+  double av_err = (cpi->twopass.total_stats->ssim_weighted_pred_err /
+                   cpi->twopass.total_stats->count);
+  double this_err = this_frame->ssim_weighted_pred_err;
+  double modified_err;
+
+  if (this_err > av_err)
+    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1);
+  else
+    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2);
+
+  return modified_err;
+}
+
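+// Per-pixel weights indexed by luma value: very dark pixels (below 32)
+// contribute only 0.02, the weight then ramps linearly up to 1.0 at 64,
+// and brighter pixels count fully. simple_weight() below averages this
+// over the Y plane, so dark or near-black frames end up with a reduced
+// ssim_weighted_pred_err.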
+static const double weight_table[256] = {
+  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+  0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750,
+  0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750,
+  0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
+  0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
+};
+
+static double simple_weight(YV12_BUFFER_CONFIG *source) {
+  int i, j;
+
+  unsigned char *src = source->y_buffer;
+  double sum_weights = 0.0;
+
+  // Loop through the raw Y plane, examining pixel levels to create
+  // a weight for the image.
+  i = source->y_height;
+  do {
+    j = source->y_width;
+    do {
+      sum_weights += weight_table[*src];
+      src++;
+    } while (--j);
+    src -= source->y_width;
+    src += source->y_stride;
+  } while (--i);
+
+  sum_weights /= (source->y_height * source->y_width);
+
+  return sum_weights;
+}
+
+// This function returns the current per frame maximum bitrate target
+static int frame_max_bits(VP9_COMP *cpi) {
+  // Max allocation for a single frame based on the max section guidelines
+  // passed in and how many bits are left.
+  int max_bits;
+
+  // For VBR base this on the bits and frames left plus the
+  // two_pass_vbrmax_section rate passed in by the user.
+  max_bits = (int)(((double)cpi->twopass.bits_left /
+                    (cpi->twopass.total_stats->count -
+                     (double)cpi->common.current_video_frame)) *
+                   ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+
+  // Trap case where we are out of bits
+  if (max_bits < 0)
+    max_bits = 0;
+
+  return max_bits;
+}
+
+void vp9_init_first_pass(VP9_COMP *cpi) {
+  zero_stats(cpi->twopass.total_stats);
+}
+
+void vp9_end_first_pass(VP9_COMP *cpi) {
+  output_stats(cpi, cpi->output_pkt_list, cpi->twopass.total_stats);
+}
+
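+// Error for zero motion: the 16x16 MSE between the source macroblock and
+// the co-located block in the reconstruction buffer, with no mv overhead.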
+static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                             YV12_BUFFER_CONFIG *recon_buffer,
+                             int *best_motion_err, int recon_yoffset) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *b = &x->block[0];
+  BLOCKD *d = &x->e_mbd.block[0];
+
+  unsigned char *src_ptr = (*(b->base_src) + b->src);
+  int src_stride = b->src_stride;
+  unsigned char *ref_ptr;
+  int ref_stride = d->pre_stride;
+
+  // Set up pointers for this macro block recon buffer
+  xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+
+  ref_ptr = (unsigned char *)(*(d->base_pre) + d->pre);
+
+  vp9_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
+               (unsigned int *)(best_motion_err));
+}
+
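+// First pass motion search: an initial diamond search centred on the
+// full-pel reference mv, followed by further diamond searches at
+// incremented step_param (num00 lets repeated centre results be skipped).
+// Each candidate error is charged a fixed new-mv penalty so a plain 0,0
+// match wins when the errors are close.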
+static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                     int_mv *ref_mv, MV *best_mv,
+                                     YV12_BUFFER_CONFIG *recon_buffer,
+                                     int *best_motion_err, int recon_yoffset) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *b = &x->block[0];
+  BLOCKD *d = &x->e_mbd.block[0];
+  int num00;
+
+  int_mv tmp_mv;
+  int_mv ref_mv_full;
+
+  int tmp_err;
+  int step_param = 3;
+  int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+  int n;
+  vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
+  int new_mv_mode_penalty = 256;
+
+  // override the default variance function to use MSE
+  v_fn_ptr.vf = vp9_mse16x16;
+
+  // Set up pointers for this macro block recon buffer
+  xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+
+  // Initial step/diamond search centred on best mv
+  tmp_mv.as_int = 0;
+  ref_mv_full.as_mv.col = ref_mv->as_mv.col >> 3;
+  ref_mv_full.as_mv.row = ref_mv->as_mv.row >> 3;
+  tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv, step_param,
+                                    x->sadperbit16, &num00, &v_fn_ptr,
+                                    XMVCOST, ref_mv);
+  if (tmp_err < INT_MAX - new_mv_mode_penalty)
+    tmp_err += new_mv_mode_penalty;
+
+  if (tmp_err < *best_motion_err) {
+    *best_motion_err = tmp_err;
+    best_mv->row = tmp_mv.as_mv.row;
+    best_mv->col = tmp_mv.as_mv.col;
+  }
+
+  // Further step/diamond searches as necessary
+  n = num00;
+  num00 = 0;
+
+  while (n < further_steps) {
+    n++;
+
+    if (num00)
+      num00--;
+    else {
+      tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv,
+                                        step_param + n, x->sadperbit16,
+                                        &num00, &v_fn_ptr,
+                                        XMVCOST, ref_mv);
+      if (tmp_err < INT_MAX - new_mv_mode_penalty)
+        tmp_err += new_mv_mode_penalty;
+
+      if (tmp_err < *best_motion_err) {
+        *best_motion_err = tmp_err;
+        best_mv->row = tmp_mv.as_mv.row;
+        best_mv->col = tmp_mv.as_mv.col;
+      }
+    }
+  }
+}
+
+void vp9_first_pass(VP9_COMP *cpi) {
+  int mb_row, mb_col;
+  MACROBLOCK *const x = &cpi->mb;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  int recon_yoffset, recon_uvoffset;
+  YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
+  YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+  YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
+  int recon_y_stride = lst_yv12->y_stride;
+  int recon_uv_stride = lst_yv12->uv_stride;
+  int64_t intra_error = 0;
+  int64_t coded_error = 0;
+  int64_t sr_coded_error = 0;
+
+  int sum_mvr = 0, sum_mvc = 0;
+  int sum_mvr_abs = 0, sum_mvc_abs = 0;
+  int sum_mvrs = 0, sum_mvcs = 0;
+  int mvcount = 0;
+  int intercount = 0;
+  int second_ref_count = 0;
+  int intrapenalty = 256;
+  int neutral_count = 0;
+  int new_mv_count = 0;
+  int sum_in_vectors = 0;
+  uint32_t lastmv_as_int = 0;
+
+  int_mv zero_ref_mv;
+
+  zero_ref_mv.as_int = 0;
+
+  vp9_clear_system_state();  // __asm emms;
+
+  x->src = *cpi->Source;
+  xd->pre = *lst_yv12;
+  xd->dst = *new_yv12;
+
+  x->partition_info = x->pi;
+
+  xd->mode_info_context = cm->mi;
+
+  vp9_build_block_offsets(x);
+
+  vp9_setup_block_dptrs(&x->e_mbd);
+
+  vp9_setup_block_ptrs(x);
+
+  // set up the new frame for intra coded blocks
+  vp9_setup_intra_recon(new_yv12);
+  vp9_frame_init_quantizer(cpi);
+
+  // Initialise the MV cost table to the defaults
+  // if( cm->current_video_frame == 0)
+  // if ( 0 )
+  {
+    int flag[2] = {1, 1};
+    vp9_init_mv_probs(cm);
+    vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
+  }
+
+  // for each macroblock row in image
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+    int_mv best_ref_mv;
+
+    best_ref_mv.as_int = 0;
+
+    // reset above block coeffs
+    xd->up_available = (mb_row != 0);
+    recon_yoffset = (mb_row * recon_y_stride * 16);
+    recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+    // Set up limit values for motion vectors to prevent them extending
+    // outside the UMV borders.
+    x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+    x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+    // for each macroblock col in image
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      int this_error;
+      int gf_motion_error = INT_MAX;
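+      // DC prediction is used on the top row or left column of the frame,
+      // but not at the top-left corner and not in the frame interior.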
+      int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+
+      xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
+      xd->dst.u_buffer = new_yv12->u_buffer + recon_uvoffset;
+      xd->dst.v_buffer = new_yv12->v_buffer + recon_uvoffset;
+      xd->left_available = (mb_col != 0);
+
+      // Copy current mb to a buffer
+      vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+      // do intra 16x16 prediction
+      this_error = vp9_encode_intra(cpi, x, use_dc_pred);
+
+      // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)
+      // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv.
+      // When the error score is very low this causes us to pick all or lots of INTRA modes and throw lots of key frames.
+      // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+      this_error += intrapenalty;
+
+      // Cumulative intra error total
+      intra_error += (int64_t)this_error;
+
+      // Set up limit values for motion vectors to prevent them extending
+      // outside the UMV borders.
+      x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+      x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) +
+                      (VP8BORDERINPIXELS - 16);
+
+      // Other than for the first frame do a motion search
+      if (cm->current_video_frame > 0) {
+        int tmp_err;
+        int motion_error = INT_MAX;
+        int_mv mv, tmp_mv;
+
+        // Simple 0,0 motion with no mv overhead
+        zz_motion_search(cpi, x, lst_yv12, &motion_error, recon_yoffset);
+        mv.as_int = tmp_mv.as_int = 0;
+
+        // Test last reference frame using the previous best mv as the
+        // starting point (best reference) for the search
+        first_pass_motion_search(cpi, x, &best_ref_mv,
+                                 &mv.as_mv, lst_yv12,
+                                 &motion_error, recon_yoffset);
+
+        // If the current best reference mv is not centred on 0,0 then do
+        // a 0,0 based search as well.
+        if (best_ref_mv.as_int) {
+          tmp_err = INT_MAX;
+          first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv,
+                                   lst_yv12, &tmp_err, recon_yoffset);
+
+          if (tmp_err < motion_error) {
+            motion_error = tmp_err;
+            mv.as_int = tmp_mv.as_int;
+          }
+        }
+
+        // Experimental search in an older reference frame
+        if (cm->current_video_frame > 1) {
+          // Simple 0,0 motion with no mv overhead
+          zz_motion_search(cpi, x, gld_yv12,
+                           &gf_motion_error, recon_yoffset);
+
+          first_pass_motion_search(cpi, x, &zero_ref_mv,
+                                   &tmp_mv.as_mv, gld_yv12,
+                                   &gf_motion_error, recon_yoffset);
+
+          if ((gf_motion_error < motion_error) &&
+              (gf_motion_error < this_error)) {
+            second_ref_count++;
+          }
+
+          // Reset to last frame as reference buffer
+          xd->pre.y_buffer = lst_yv12->y_buffer + recon_yoffset;
+          xd->pre.u_buffer = lst_yv12->u_buffer + recon_uvoffset;
+          xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
+
+          // In accumulating a score for the older reference frame take
+          // the best of the motion predicted score and the intra coded
+          // error (just as will be done for accumulation of "coded_error"
+          // for the last frame).
+          if (gf_motion_error < this_error)
+            sr_coded_error += gf_motion_error;
+          else
+            sr_coded_error += this_error;
+        } else
+          sr_coded_error += motion_error;
+
+        /* Intra assumed best */
+        best_ref_mv.as_int = 0;
+
+        if (motion_error <= this_error) {
+          // Keep a count of cases where the inter and intra were
+          // very close and very low. This helps with scene cut
+          // detection for example in cropped clips with black bars
+          // at the sides or top and bottom.
+          if ((((this_error - intrapenalty) * 9) <=
+               (motion_error * 10)) &&
+              (this_error < (2 * intrapenalty))) {
+            neutral_count++;
+          }
+
+          mv.as_mv.row <<= 3;
+          mv.as_mv.col <<= 3;
+          this_error = motion_error;
+          vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
+          xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+          vp9_encode_inter16x16y(IF_RTCD(&cpi->rtcd), x);
+          sum_mvr += mv.as_mv.row;
+          sum_mvr_abs += abs(mv.as_mv.row);
+          sum_mvc += mv.as_mv.col;
+          sum_mvc_abs += abs(mv.as_mv.col);
+          sum_mvrs += mv.as_mv.row * mv.as_mv.row;
+          sum_mvcs += mv.as_mv.col * mv.as_mv.col;
+          intercount++;
+
+          best_ref_mv.as_int = mv.as_int;
+
+          // Was the vector non-zero
+          if (mv.as_int) {
+            mvcount++;
+
+            // Was it different from the last non-zero vector?
+            if (mv.as_int != lastmv_as_int)
+              new_mv_count++;
+            lastmv_as_int = mv.as_int;
+
+            // Does the Row vector point inwards or outwards
+            if (mb_row < cm->mb_rows / 2) {
+              if (mv.as_mv.row > 0)
+                sum_in_vectors--;
+              else if (mv.as_mv.row < 0)
+                sum_in_vectors++;
+            } else if (mb_row > cm->mb_rows / 2) {
+              if (mv.as_mv.row > 0)
+                sum_in_vectors++;
+              else if (mv.as_mv.row < 0)
+                sum_in_vectors--;
+            }
+
+            // Does the Column vector point inwards or outwards
+            if (mb_col < cm->mb_cols / 2) {
+              if (mv.as_mv.col > 0)
+                sum_in_vectors--;
+              else if (mv.as_mv.col < 0)
+                sum_in_vectors++;
+            } else if (mb_col > cm->mb_cols / 2) {
+              if (mv.as_mv.col > 0)
+                sum_in_vectors++;
+              else if (mv.as_mv.col < 0)
+                sum_in_vectors--;
+            }
+          }
+        }
+      } else
+        sr_coded_error += (int64_t)this_error;
+
+      coded_error += (int64_t)this_error;
+
+      // adjust to the next column of macroblocks
+      x->src.y_buffer += 16;
+      x->src.u_buffer += 8;
+      x->src.v_buffer += 8;
+
+      recon_yoffset += 16;
+      recon_uvoffset += 8;
+    }
+
+    // adjust to the next row of mbs
+    x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+    x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+    x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+
+    // extend the recon for intra prediction
+    vp9_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
+                      xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+    vp9_clear_system_state();  // __asm emms;
+  }
+
+  vp9_clear_system_state();  // __asm emms;
+  {
+    double weight = 0.0;
+
+    FIRSTPASS_STATS fps;
+
+    fps.frame      = cm->current_video_frame;
+    fps.intra_error = intra_error >> 8;
+    fps.coded_error = coded_error >> 8;
+    fps.sr_coded_error = sr_coded_error >> 8;
+    weight = simple_weight(cpi->Source);
+
+    if (weight < 0.1)
+      weight = 0.1;
+
+    fps.ssim_weighted_pred_err = fps.coded_error * weight;
+
+    fps.pcnt_inter  = 0.0;
+    fps.pcnt_motion = 0.0;
+    fps.MVr        = 0.0;
+    fps.mvr_abs     = 0.0;
+    fps.MVc        = 0.0;
+    fps.mvc_abs     = 0.0;
+    fps.MVrv       = 0.0;
+    fps.MVcv       = 0.0;
+    fps.mv_in_out_count  = 0.0;
+    fps.new_mv_count = 0.0;
+    fps.count      = 1.0;
+
+    fps.pcnt_inter   = 1.0 * (double)intercount / cm->MBs;
+    fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
+    fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs;
+
+    if (mvcount > 0) {
+      fps.MVr = (double)sum_mvr / (double)mvcount;
+      fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount;
+      fps.MVc = (double)sum_mvc / (double)mvcount;
+      fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount;
+      fps.MVrv = ((double)sum_mvrs -
+                  (fps.MVr * fps.MVr / (double)mvcount)) / (double)mvcount;
+      fps.MVcv = ((double)sum_mvcs -
+                  (fps.MVc * fps.MVc / (double)mvcount)) / (double)mvcount;
+      fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2);
+      fps.new_mv_count = new_mv_count;
+
+      fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
+    }
+
+    // TODO: handle the case when duration is set to 0, or something less
+    // than the full time between subsequent cpi->source_time_stamp values.
+    fps.duration = cpi->source->ts_end - cpi->source->ts_start;
+
+    // don't want to do output stats with a stack variable!
+    memcpy(cpi->twopass.this_frame_stats,
+           &fps,
+           sizeof(FIRSTPASS_STATS));
+    output_stats(cpi, cpi->output_pkt_list, cpi->twopass.this_frame_stats);
+    accumulate_stats(cpi->twopass.total_stats, &fps);
+  }
+
+  // Copy the previous Last Frame back into gf and arf buffers if the
+  // prediction is good enough... but also don't allow it to lag too far.
+  if ((cpi->twopass.sr_update_lag > 3) ||
+      ((cm->current_video_frame > 0) &&
+       (cpi->twopass.this_frame_stats->pcnt_inter > 0.20) &&
+       ((cpi->twopass.this_frame_stats->intra_error /
+         cpi->twopass.this_frame_stats->coded_error) > 2.0))) {
+    vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
+    cpi->twopass.sr_update_lag = 1;
+  } else
+    cpi->twopass.sr_update_lag++;
+
+  // swap frame pointers so last frame refers to the frame we just compressed
+  vp9_swap_yv12_buffer(lst_yv12, new_yv12);
+  vp8_yv12_extend_frame_borders(lst_yv12);
+
+  // Special case for the first frame. Copy into the GF buffer as a
+  // second reference.
+  if (cm->current_video_frame == 0) {
+    vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
+  }
+
+  // use this to see what the first pass reconstruction looks like
+  if (0) {
+    char filename[512];
+    FILE *recon_file;
+    sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+
+    if (cm->current_video_frame == 0)
+      recon_file = fopen(filename, "wb");
+    else
+      recon_file = fopen(filename, "ab");
+
+    (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1,
+                 recon_file);
+    fclose(recon_file);
+  }
+
+  cm->current_video_frame++;
+}
+
+// Estimate a cost per mb attributable to overheads such as the coding of
+// modes and motion vectors.
+// Currently simplistic in its assumptions for testing.
+
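+// Ideal cost, in bits, of coding an event of probability 'prob':
+// -log2(prob).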
+static double bitcost(double prob) {
+  return -(log(prob) / log(2.0));
+}
+
+static long long estimate_modemvcost(VP9_COMP *cpi,
+                                     FIRSTPASS_STATS *fpstats) {
+  int mv_cost;
+  int mode_cost;
+
+  double av_pct_inter = fpstats->pcnt_inter / fpstats->count;
+  double av_pct_motion = fpstats->pcnt_motion / fpstats->count;
+  double av_intra = (1.0 - av_pct_inter);
+
+  double zz_cost;
+  double motion_cost;
+  double intra_cost;
+
+  zz_cost = bitcost(av_pct_inter - av_pct_motion);
+  motion_cost = bitcost(av_pct_motion);
+  intra_cost = bitcost(av_intra);
+
+  // Estimate of extra bits per mv overhead for mbs
+  // << 9 is the normalization to the (bits * 512) used in vp9_bits_per_mb
+  mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
+
+  // Crude estimate of overhead cost from modes
+  // << 9 is the normalization to (bits * 512) used in vp9_bits_per_mb
+  mode_cost =
+    (int)((((av_pct_inter - av_pct_motion) * zz_cost) +
+           (av_pct_motion * motion_cost) +
+           (av_intra * intra_cost)) * cpi->common.MBs) << 9;
+
+  // return mv_cost + mode_cost;
+  // TODO PGW Fix overhead costs for extended Q range
+  return 0;
+}
+
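+// Correction factor applied to the baseline bits-per-mb estimate:
+// pow(err_per_mb / err_divisor, power_term), where power_term grows with
+// the actual quantizer (capped at pt_high) and the result is clipped to
+// the range [0.05, 2.0].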
+static double calc_correction_factor(double err_per_mb,
+                                     double err_divisor,
+                                     double pt_low,
+                                     double pt_high,
+                                     int Q) {
+  double power_term;
+  double error_term = err_per_mb / err_divisor;
+  double correction_factor;
+
+  // Adjustment based on actual quantizer to power term.
+  power_term = (vp9_convert_qindex_to_q(Q) * 0.01) + pt_low;
+  power_term = (power_term > pt_high) ? pt_high : power_term;
+
+  // Adjustments to error term
+  // TBD
+
+  // Calculate correction factor
+  correction_factor = pow(error_term, power_term);
+
+  // Clip range
+  correction_factor =
+    (correction_factor < 0.05)
+    ? 0.05 : (correction_factor > 2.0) ? 2.0 : correction_factor;
+
+  return correction_factor;
+}
+
+// Given a current maxQ value sets a range for future values.
+// PGW TODO..
+// This code removes direct dependency on QIndex to determine the range
+// (now uses the actual quantizer) but has not been tuned.
+static void adjust_maxq_qrange(VP9_COMP *cpi) {
+  int i;
+  double q;
+
+  // Set the max corresponding to cpi->avg_q * 2.0
+  q = cpi->avg_q * 2.0;
+  cpi->twopass.maxq_max_limit = cpi->worst_quality;
+  for (i = cpi->best_quality; i <= cpi->worst_quality; i++) {
+    cpi->twopass.maxq_max_limit = i;
+    if (vp9_convert_qindex_to_q(i) >= q)
+      break;
+  }
+
+  // Set the min corresponding to cpi->avg_q * 0.5
+  q = cpi->avg_q * 0.5;
+  cpi->twopass.maxq_min_limit = cpi->best_quality;
+  for (i = cpi->worst_quality; i >= cpi->best_quality; i--) {
+    cpi->twopass.maxq_min_limit = i;
+    if (vp9_convert_qindex_to_q(i) <= q)
+      break;
+  }
+}
+
+static int estimate_max_q(VP9_COMP *cpi,
+                          FIRSTPASS_STATS *fpstats,
+                          int section_target_bandwitdh,
+                          int overhead_bits) {
+  int Q;
+  int num_mbs = cpi->common.MBs;
+  int target_norm_bits_per_mb;
+
+  double section_err = (fpstats->coded_error / fpstats->count);
+  double sr_err_diff;
+  double sr_correction;
+  double err_per_mb = section_err / num_mbs;
+  double err_correction_factor;
+  double speed_correction = 1.0;
+  int overhead_bits_per_mb;
+
+  if (section_target_bandwitdh <= 0)
+    return cpi->twopass.maxq_max_limit;  // Highest value allowed
+
+  target_norm_bits_per_mb =
+    (section_target_bandwitdh < (1 << 20))
+    ? (512 * section_target_bandwitdh) / num_mbs
+    : 512 * (section_target_bandwitdh / num_mbs);
+
+  // Look at the drop in prediction quality between the last frame
+  // and the GF buffer (which contained an older frame).
+  sr_err_diff =
+    (fpstats->sr_coded_error - fpstats->coded_error) /
+    (fpstats->count * cpi->common.MBs);
+  sr_correction = (sr_err_diff / 32.0);
+  sr_correction = pow(sr_correction, 0.25);
+  if (sr_correction < 0.75)
+    sr_correction = 0.75;
+  else if (sr_correction > 1.25)
+    sr_correction = 1.25;
+
+  // Calculate a corrective factor based on a rolling ratio of bits spent
+  // vs target bits
+  if ((cpi->rolling_target_bits > 0) &&
+      (cpi->active_worst_quality < cpi->worst_quality)) {
+    double rolling_ratio;
+
+    rolling_ratio = (double)cpi->rolling_actual_bits /
+                    (double)cpi->rolling_target_bits;
+
+    if (rolling_ratio < 0.95)
+      cpi->twopass.est_max_qcorrection_factor -= 0.005;
+    else if (rolling_ratio > 1.05)
+      cpi->twopass.est_max_qcorrection_factor += 0.005;
+
+    cpi->twopass.est_max_qcorrection_factor =
+      (cpi->twopass.est_max_qcorrection_factor < 0.1)
+      ? 0.1
+      : (cpi->twopass.est_max_qcorrection_factor > 10.0)
+      ? 10.0 : cpi->twopass.est_max_qcorrection_factor;
+  }
+
+  // Corrections for higher compression speed settings
+  // (reduced compression expected)
+  if (cpi->compressor_speed == 1) {
+    if (cpi->oxcf.cpu_used <= 5)
+      speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+    else
+      speed_correction = 1.25;
+  }
+
+  // Estimate of overhead bits per mb
+  // Correction to overhead bits for min allowed Q.
+  // PGW TODO.. This code is broken for the extended Q range
+  //            for now overhead set to 0.
+  overhead_bits_per_mb = overhead_bits / num_mbs;
+  overhead_bits_per_mb *= pow(0.98, (double)cpi->twopass.maxq_min_limit);
+
+  // Try and pick a max Q that will be high enough to encode the
+  // content at the given rate.
+  for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++) {
+    int bits_per_mb_at_this_q;
+
+    err_correction_factor =
+      calc_correction_factor(err_per_mb, ERR_DIVISOR, 0.4, 0.90, Q) *
+      sr_correction * speed_correction *
+      cpi->twopass.est_max_qcorrection_factor;
+
+    if (err_correction_factor < 0.05)
+      err_correction_factor = 0.05;
+    else if (err_correction_factor > 5.0)
+      err_correction_factor = 5.0;
+
+    bits_per_mb_at_this_q =
+      vp9_bits_per_mb(INTER_FRAME, Q) + overhead_bits_per_mb;
+
+    bits_per_mb_at_this_q = (int)(.5 + err_correction_factor *
+                                  (double)bits_per_mb_at_this_q);
+
+    // Mode and motion overhead
+    // As Q rises in real encode loop rd code will force overhead down
+    // We make a crude adjustment for this here as *.98 per Q step.
+    // PGW TODO.. This code is broken for the extended Q range
+    //            for now overhead set to 0.
+    // overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
+
+    if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+      break;
+  }
+
+  // Restriction on active max q for constrained quality mode.
+  if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+      (Q < cpi->cq_target_quality)) {
+    Q = cpi->cq_target_quality;
+  }
+
+  // Adjust maxq_min_limit and maxq_max_limit limits based on
+  // average q observed in clip for non kf/gf/arf frames.
+  // Give average a chance to settle though.
+  // PGW TODO.. This code is broken for the extended Q range
+  if ((cpi->ni_frames >
+       ((unsigned int)cpi->twopass.total_stats->count >> 8)) &&
+      (cpi->ni_frames > 150)) {
+    adjust_maxq_qrange(cpi);
+  }
+
+  return Q;
+}
+
+// For cq mode estimate a cq level that matches the observed
+// complexity and data rate.
+static int estimate_cq(VP9_COMP *cpi,
+                       FIRSTPASS_STATS *fpstats,
+                       int section_target_bandwitdh,
+                       int overhead_bits) {
+  int Q;
+  int num_mbs = cpi->common.MBs;
+  int target_norm_bits_per_mb;
+
+  double section_err = (fpstats->coded_error / fpstats->count);
+  double err_per_mb = section_err / num_mbs;
+  double err_correction_factor;
+  double sr_err_diff;
+  double sr_correction;
+  double speed_correction = 1.0;
+  double clip_iiratio;
+  double clip_iifactor;
+  int overhead_bits_per_mb;
+
+  target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20))
+                            ? (512 * section_target_bandwitdh) / num_mbs
+                            : 512 * (section_target_bandwitdh / num_mbs);
+
+  // Estimate of overhead bits per mb
+  overhead_bits_per_mb = overhead_bits / num_mbs;
+
+  // Corrections for higher compression speed settings
+  // (reduced compression expected)
+  if (cpi->compressor_speed == 1) {
+    if (cpi->oxcf.cpu_used <= 5)
+      speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+    else
+      speed_correction = 1.25;
+  }
+
+  // Look at the drop in prediction quality between the last frame
+  // and the GF buffer (which contained an older frame).
+  sr_err_diff =
+    (fpstats->sr_coded_error - fpstats->coded_error) /
+    (fpstats->count * cpi->common.MBs);
+  sr_correction = (sr_err_diff / 32.0);
+  sr_correction = pow(sr_correction, 0.25);
+  if (sr_correction < 0.75)
+    sr_correction = 0.75;
+  else if (sr_correction > 1.25)
+    sr_correction = 1.25;
+
+  // II ratio correction factor for clip as a whole
+  clip_iiratio = cpi->twopass.total_stats->intra_error /
+                 DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats->coded_error);
+  clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
+  if (clip_iifactor < 0.80)
+    clip_iifactor = 0.80;
+
+  // Try and pick a Q that can encode the content at the given rate.
+  for (Q = 0; Q < MAXQ; Q++) {
+    int bits_per_mb_at_this_q;
+
+    // Error per MB based correction factor
+    err_correction_factor =
+      calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, Q) *
+      sr_correction * speed_correction * clip_iifactor;
+
+    if (err_correction_factor < 0.05)
+      err_correction_factor = 0.05;
+    else if (err_correction_factor > 5.0)
+      err_correction_factor = 5.0;
+
+    bits_per_mb_at_this_q =
+      vp9_bits_per_mb(INTER_FRAME, Q) + overhead_bits_per_mb;
+
+    bits_per_mb_at_this_q = (int)(.5 + err_correction_factor *
+                                  (double)bits_per_mb_at_this_q);
+
+    // Mode and motion overhead
+    // As Q rises in real encode loop rd code will force overhead down
+    // We make a crude adjustment for this here as *.98 per Q step.
+    // PGW TODO.. This code is broken for the extended Q range
+    //            for now overhead set to 0.
+    overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
+
+    if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+      break;
+  }
+
+  // Clip value to range "best allowed to (worst allowed - 1)"
+  Q = select_cq_level(Q);
+  if (Q >= cpi->worst_quality)
+    Q = cpi->worst_quality - 1;
+  if (Q < cpi->best_quality)
+    Q = cpi->best_quality;
+
+  return Q;
+}
+
+extern void vp9_new_frame_rate(VP9_COMP *cpi, double framerate);
+
+void vp9_init_second_pass(VP9_COMP *cpi) {
+  FIRSTPASS_STATS this_frame;
+  FIRSTPASS_STATS *start_pos;
+
+  double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate;
+  double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
+                                      * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+  if (two_pass_min_rate < lower_bounds_min_rate)
+    two_pass_min_rate = lower_bounds_min_rate;
+
+  zero_stats(cpi->twopass.total_stats);
+  zero_stats(cpi->twopass.total_left_stats);
+
+  if (!cpi->twopass.stats_in_end)
+    return;
+
+  *cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
+  *cpi->twopass.total_left_stats = *cpi->twopass.total_stats;
+
+  // Each frame can have a different duration, as the frame rate in the
+  // source isn't guaranteed to be constant. The frame rate prior to the
+  // first frame encoded in the second pass is a guess. However, the sum
+  // duration is not; it is calculated based on the actual durations of
+  // all frames from the first pass.
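+  // (The 10000000.0 factors below suggest timestamps are kept in units of
+  // 1/10,000,000 of a second.)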
+  vp9_new_frame_rate(cpi,
+                     10000000.0 * cpi->twopass.total_stats->count /
+                     cpi->twopass.total_stats->duration);
+
+  cpi->output_frame_rate = cpi->oxcf.frame_rate;
+  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration *
+                                     cpi->oxcf.target_bandwidth / 10000000.0);
+  cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration *
+                                      two_pass_min_rate / 10000000.0);
+
+  // Calculate a minimum intra value to be used in determining the IIratio
+  // scores used in the second pass. We have this minimum to make sure
+  // that clips that are static but "low complexity" in the intra domain
+  // are still boosted appropriately for KF/GF/ARF
+  cpi->twopass.kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
+  cpi->twopass.gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
+
+  // This variable monitors how far behind the second ref update is lagging
+  cpi->twopass.sr_update_lag = 1;
+
+  // Scan the first pass file and calculate an average Intra / Inter error
+  // score ratio for the sequence.
+  {
+    double sum_iiratio = 0.0;
+    double IIRatio;
+
+    start_pos = cpi->twopass.stats_in;  // Note starting "file" position
+
+    while (input_stats(cpi, &this_frame) != EOF) {
+      IIRatio = this_frame.intra_error /
+                DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
+      IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
+      sum_iiratio += IIRatio;
+    }
+
+    cpi->twopass.avg_iiratio =
+      sum_iiratio /
+      DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats->count);
+
+    // Reset file position
+    reset_fpf_position(cpi, start_pos);
+  }
+
+  // Scan the first pass file and calculate a modified total error based
+  // upon the bias/power function used to allocate bits.
+  {
+    start_pos = cpi->twopass.stats_in;  // Note starting "file" position
+
+    cpi->twopass.modified_error_total = 0.0;
+    cpi->twopass.modified_error_used = 0.0;
+
+    while (input_stats(cpi, &this_frame) != EOF) {
+      cpi->twopass.modified_error_total +=
+        calculate_modified_err(cpi, &this_frame);
+    }
+    cpi->twopass.modified_error_left = cpi->twopass.modified_error_total;
+
+    // Reset file position
+    reset_fpf_position(cpi, start_pos);
+  }
+}
+
+void vp9_end_second_pass(VP9_COMP *cpi) {
+}
+
+// This function gives an estimate of how badly we believe the prediction
+// quality is decaying from frame to frame.
+static double get_prediction_decay_rate(VP9_COMP *cpi,
+                                        FIRSTPASS_STATS *next_frame) {
+  double prediction_decay_rate;
+  double second_ref_decay;
+  double mb_sr_err_diff;
+
+  // Initial basis is the % mbs inter coded
+  prediction_decay_rate = next_frame->pcnt_inter;
+
+  // Look at the observed drop in prediction quality between the last frame
+  // and the GF buffer (which contains an older frame).
+  mb_sr_err_diff =
+    (next_frame->sr_coded_error - next_frame->coded_error) /
+    (cpi->common.MBs);
+  second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
+  second_ref_decay = pow(second_ref_decay, 0.5);
+  if (second_ref_decay < 0.85)
+    second_ref_decay = 0.85;
+  else if (second_ref_decay > 1.0)
+    second_ref_decay = 1.0;
+
+  if (second_ref_decay < prediction_decay_rate)
+    prediction_decay_rate = second_ref_decay;
+
+  return prediction_decay_rate;
+}
+
+// Function to test for a condition where a complex transition is followed
+// by a static section. For example in slide shows where there is a fade
+// between slides. This is to help with more optimal kf and gf positioning.
+static int detect_transition_to_still(
+  VP9_COMP *cpi,
+  int frame_interval,
+  int still_interval,
+  double loop_decay_rate,
+  double last_decay_rate) {
+  BOOL trans_to_still = FALSE;
+
+  // Break clause to detect very still sections after motion
+  // For example a static image after a fade or other transition
+  // instead of a clean scene cut.
+  if ((frame_interval > MIN_GF_INTERVAL) &&
+      (loop_decay_rate >= 0.999) &&
+      (last_decay_rate < 0.9)) {
+    int j;
+    FIRSTPASS_STATS *position = cpi->twopass.stats_in;
+    FIRSTPASS_STATS tmp_next_frame;
+    double zz_inter;
+
+    // Look ahead a few frames to see if static condition
+    // persists...
+    for (j = 0; j < still_interval; j++) {
+      if (EOF == input_stats(cpi, &tmp_next_frame))
+        break;
+
+      zz_inter =
+        (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion);
+      if (zz_inter < 0.999)
+        break;
+    }
+    // Reset file position
+    reset_fpf_position(cpi, position);
+
+    // Only if it does do we signal a transition to still
+    if (j == still_interval)
+      trans_to_still = TRUE;
+  }
+
+  return trans_to_still;
+}
+
+// This function detects a flash through a high relative pcnt_second_ref
+// score in the frame following a flash frame. The offset passed in should
+// reflect this.
+static BOOL detect_flash(VP9_COMP *cpi, int offset) {
+  FIRSTPASS_STATS next_frame;
+
+  BOOL flash_detected = FALSE;
+
+  // Read the frame data.
+  // The return is FALSE (no flash detected) if not a valid frame
+  if (read_frame_stats(cpi, &next_frame, offset) != EOF) {
+    // What we are looking for here is a situation where there is a
+    // brief break in prediction (such as a flash) but subsequent frames
+    // are reasonably well predicted by an earlier (pre flash) frame.
+    // The recovery after a flash is indicated by a high pcnt_second_ref
+    // compared to pcnt_inter.
+    if ((next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
+        (next_frame.pcnt_second_ref >= 0.5)) {
+      flash_detected = TRUE;
+    }
+  }
+
+  return flash_detected;
+}
+
+// Update the motion related elements to the GF arf boost calculation
+static void accumulate_frame_motion_stats(
+  VP9_COMP *cpi,
+  FIRSTPASS_STATS *this_frame,
+  double *this_frame_mv_in_out,
+  double *mv_in_out_accumulator,
+  double *abs_mv_in_out_accumulator,
+  double *mv_ratio_accumulator) {
+  // double this_frame_mv_in_out;
+  double this_frame_mvr_ratio;
+  double this_frame_mvc_ratio;
+  double motion_pct;
+
+  // Accumulate motion stats.
+  motion_pct = this_frame->pcnt_motion;
+
+  // Accumulate Motion In/Out of frame stats
+  *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct;
+  *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct;
+  *abs_mv_in_out_accumulator +=
+    fabs(this_frame->mv_in_out_count * motion_pct);
+
+  // Accumulate a measure of how uniform (or conversely how random)
+  // the motion field is. (A ratio of absmv / mv)
+  if (motion_pct > 0.05) {
+    this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
+                           DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr));
+
+    this_frame_mvc_ratio = fabs(this_frame->mvc_abs) /
+                           DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVc));
+
+    *mv_ratio_accumulator +=
+      (this_frame_mvr_ratio < this_frame->mvr_abs)
+      ? (this_frame_mvr_ratio * motion_pct)
+      : this_frame->mvr_abs * motion_pct;
+
+    *mv_ratio_accumulator +=
+      (this_frame_mvc_ratio < this_frame->mvc_abs)
+      ? (this_frame_mvc_ratio * motion_pct)
+      : this_frame->mvc_abs * motion_pct;
+
+  }
+}
+
+// Calculate a baseline boost number for the current frame.
+static double calc_frame_boost(
+  VP9_COMP *cpi,
+  FIRSTPASS_STATS *this_frame,
+  double this_frame_mv_in_out) {
+  double frame_boost;
+
+  // Underlying boost factor is based on inter intra error ratio
+  if (this_frame->intra_error > cpi->twopass.gf_intra_err_min)
+    frame_boost = (IIFACTOR * this_frame->intra_error /
+                   DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
+  else
+    frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min /
+                   DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
+
+  // Increase boost for frames where new data is coming into the frame
+  // (e.g. zoom out). Slightly reduce boost if there is a net balance
+  // of motion out of the frame (zoom in).
+  // The range for this_frame_mv_in_out is -1.0 to +1.0.
+  if (this_frame_mv_in_out > 0.0)
+    frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+  // In the extreme case the boost is halved.
+  else
+    frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
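+  // e.g. this_frame_mv_in_out = +1.0 triples the boost, while -1.0
+  // halves it.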
+
+  // Clip to maximum
+  if (frame_boost > GF_RMAX)
+    frame_boost = GF_RMAX;
+
+  return frame_boost;
+}
+
+static int calc_arf_boost(
+  VP9_COMP *cpi,
+  int offset,
+  int f_frames,
+  int b_frames,
+  int *f_boost,
+  int *b_boost) {
+  FIRSTPASS_STATS this_frame;
+
+  int i;
+  double boost_score = 0.0;
+  double mv_ratio_accumulator = 0.0;
+  double decay_accumulator = 1.0;
+  double this_frame_mv_in_out = 0.0;
+  double mv_in_out_accumulator = 0.0;
+  double abs_mv_in_out_accumulator = 0.0;
+  int arf_boost;
+  BOOL flash_detected = FALSE;
+
+  // Search forward from the proposed arf/next gf position
+  for (i = 0; i < f_frames; i++) {
+    if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF)
+      break;
+
+    // Update the motion related elements to the boost calculation
+    accumulate_frame_motion_stats(cpi, &this_frame,
+                                  &this_frame_mv_in_out, &mv_in_out_accumulator,
+                                  &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+    // We want to discount the flash frame itself and the recovery
+    // frame that follows as both will have poor scores.
+    flash_detected = detect_flash(cpi, (i + offset)) ||
+                     detect_flash(cpi, (i + offset + 1));
+
+    // Cumulative effect of prediction quality decay
+    if (!flash_detected) {
+      decay_accumulator =
+        decay_accumulator *
+        get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator =
+        decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+    }
+
+    boost_score += (decay_accumulator *
+                    calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
+  }
+
+  *f_boost = (int)boost_score;
+
+  // Reset for backward looking loop
+  boost_score = 0.0;
+  mv_ratio_accumulator = 0.0;
+  decay_accumulator = 1.0;
+  this_frame_mv_in_out = 0.0;
+  mv_in_out_accumulator = 0.0;
+  abs_mv_in_out_accumulator = 0.0;
+
+  // Search backward towards last gf position
+  for (i = -1; i >= -b_frames; i--) {
+    if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF)
+      break;
+
+    // Update the motion related elements to the boost calculation
+    accumulate_frame_motion_stats(cpi, &this_frame,
+                                  &this_frame_mv_in_out, &mv_in_out_accumulator,
+                                  &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+    // We want to discount the flash frame itself and the recovery
+    // frame that follows as both will have poor scores.
+    flash_detected = detect_flash(cpi, (i + offset)) ||
+                     detect_flash(cpi, (i + offset + 1));
+
+    // Cumulative effect of prediction quality decay
+    if (!flash_detected) {
+      decay_accumulator =
+        decay_accumulator *
+        get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator =
+        decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+    }
+
+    boost_score += (decay_accumulator *
+                    calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
+
+  }
+  *b_boost = (int)boost_score;
+
+  arf_boost = (*f_boost + *b_boost);
+  if (arf_boost < ((b_frames + f_frames) * 20))
+    arf_boost = ((b_frames + f_frames) * 20);
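+  // e.g. with f_frames = b_frames = 7 the combined boost is floored
+  // at 280.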
+
+  return arf_boost;
+}
+
+static void configure_arnr_filter(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  int half_gf_int;
+  int frames_after_arf;
+  int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
+  int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
+
+  // Define the arnr filter width for this group of frames:
+  // We only filter frames that lie within a distance of half
+  // the GF interval from the ARF frame. We also have to trap
+  // cases where the filter extends beyond the end of the clip.
+  // Note: this_frame->frame has been updated in the loop
+  // so it now points at the ARF frame.
+  half_gf_int = cpi->baseline_gf_interval >> 1;
+  frames_after_arf = cpi->twopass.total_stats->count -
+                     this_frame->frame - 1;
+
+  switch (cpi->oxcf.arnr_type) {
+    case 1: // Backward filter
+      frames_fwd = 0;
+      if (frames_bwd > half_gf_int)
+        frames_bwd = half_gf_int;
+      break;
+
+    case 2: // Forward filter
+      if (frames_fwd > half_gf_int)
+        frames_fwd = half_gf_int;
+      if (frames_fwd > frames_after_arf)
+        frames_fwd = frames_after_arf;
+      frames_bwd = 0;
+      break;
+
+    case 3: // Centered filter
+    default:
+      frames_fwd >>= 1;
+      if (frames_fwd > frames_after_arf)
+        frames_fwd = frames_after_arf;
+      if (frames_fwd > half_gf_int)
+        frames_fwd = half_gf_int;
+
+      frames_bwd = frames_fwd;
+
+      // For even length filter there is one more frame backward
+      // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+      if (frames_bwd < half_gf_int)
+        frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1;
+      break;
+  }
+
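+  // e.g. arnr_max_frames = 7 with the centered filter and half_gf_int = 6
+  // gives frames_fwd = frames_bwd = 3, i.e. 7 active filter frames
+  // (assuming enough frames remain after the ARF).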
+  cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
+}
+
+// Analyse and define a gf/arf group.
+static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  FIRSTPASS_STATS next_frame;
+  FIRSTPASS_STATS *start_pos;
+  int i;
+  double boost_score = 0.0;
+  double old_boost_score = 0.0;
+  double gf_group_err = 0.0;
+  double gf_first_frame_err = 0.0;
+  double mod_frame_err = 0.0;
+
+  double mv_ratio_accumulator = 0.0;
+  double decay_accumulator = 1.0;
+  double zero_motion_accumulator = 1.0;
+
+  double loop_decay_rate = 1.00;          // Starting decay rate
+  double last_loop_decay_rate = 1.00;
+
+  double this_frame_mv_in_out = 0.0;
+  double mv_in_out_accumulator = 0.0;
+  double abs_mv_in_out_accumulator = 0.0;
+
+  int max_bits = frame_max_bits(cpi);     // Max for a single frame
+
+  unsigned int allow_alt_ref =
+    cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
+
+  int f_boost = 0;
+  int b_boost = 0;
+  BOOL flash_detected;
+
+  cpi->twopass.gf_group_bits = 0;
+
+  vp9_clear_system_state();  // __asm emms;
+
+  start_pos = cpi->twopass.stats_in;
+
+  vpx_memset(&next_frame, 0, sizeof(next_frame)); // ensure a clean start
+
+  // Load stats for the current frame.
+  mod_frame_err = calculate_modified_err(cpi, this_frame);
+
+  // Note the error of the frame at the start of the group (this will be
+  // the GF frame error if we code a normal gf).
+  gf_first_frame_err = mod_frame_err;
+
+  // Special treatment if the current frame is a key frame (which is also
+  // a gf). If it is, then its error score (and hence bit allocation) needs
+  // to be subtracted out from the calculation for the GF group.
+  if (cpi->common.frame_type == KEY_FRAME)
+    gf_group_err -= gf_first_frame_err;
+
+  // Scan forward to try and work out how many frames the next gf group
+  // should contain and what level of boost is appropriate for the GF
+  // or ARF that will be coded with the group.
+  i = 0;
+
+  while (((i < cpi->twopass.static_scene_max_gf_interval) ||
+          ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)) &&
+         (i < cpi->twopass.frames_to_key)) {
+    i++;    // Increment the loop counter
+
+    // Accumulate error score of frames in this gf group
+    mod_frame_err = calculate_modified_err(cpi, this_frame);
+    gf_group_err += mod_frame_err;
+
+    if (EOF == input_stats(cpi, &next_frame))
+      break;
+
+    // Test for the case where there is a brief flash but the prediction
+    // quality back to an earlier frame is then restored.
+    flash_detected = detect_flash(cpi, 0);
+
+    // Update the motion related elements to the boost calculation
+    accumulate_frame_motion_stats(cpi, &next_frame,
+                                  &this_frame_mv_in_out, &mv_in_out_accumulator,
+                                  &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+    // Cumulative effect of prediction quality decay
+    if (!flash_detected) {
+      last_loop_decay_rate = loop_decay_rate;
+      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+      decay_accumulator = decay_accumulator * loop_decay_rate;
+
+      // Monitor for static sections.
+      if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
+          zero_motion_accumulator) {
+        zero_motion_accumulator =
+          (next_frame.pcnt_inter - next_frame.pcnt_motion);
+      }
+
+      // Break clause to detect very still sections after motion
+      // (for example a static image after a fade or other transition).
+      if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
+                                     last_loop_decay_rate)) {
+        allow_alt_ref = FALSE;
+        break;
+      }
+    }
+
+    // Calculate a boost number for this frame
+    boost_score +=
+      (decay_accumulator *
+       calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out));
+
+    // Break out conditions.
+    if (
+      // Break at cpi->max_gf_interval unless almost totally static
+      (i >= cpi->max_gf_interval && (zero_motion_accumulator < 0.995)) ||
+      (
+      // Don't break out with a very short interval
+        (i > MIN_GF_INTERVAL) &&
+      // Don't break out very close to a key frame
+        ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
+        ((boost_score > 125.0) || (next_frame.pcnt_inter < 0.75)) &&
+        (!flash_detected) &&
+        ((mv_ratio_accumulator > 100.0) ||
+         (abs_mv_in_out_accumulator > 3.0) ||
+         (mv_in_out_accumulator < -2.0) ||
+         ((boost_score - old_boost_score) < 12.5))
+      )) {
+      boost_score = old_boost_score;
+      break;
+    }
+
+    vpx_memcpy(this_frame, &next_frame, sizeof(*this_frame));
+
+    old_boost_score = boost_score;
+  }
+
+  // Don't allow a gf too near the next kf
+  if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) {
+    while (i < cpi->twopass.frames_to_key) {
+      i++;
+
+      if (EOF == input_stats(cpi, this_frame))
+        break;
+
+      if (i < cpi->twopass.frames_to_key) {
+        mod_frame_err = calculate_modified_err(cpi, this_frame);
+        gf_group_err += mod_frame_err;
+      }
+    }
+  }
+
+  // Set the interval till the next gf or arf.
+  cpi->baseline_gf_interval = i;
+
+  // Should we use the alternate reference frame?
+  if (allow_alt_ref &&
+      (i < cpi->oxcf.lag_in_frames) &&
+      (i >= MIN_GF_INTERVAL) &&
+      // Don't use an ARF very near the next kf
+      (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) &&
+      ((next_frame.pcnt_inter > 0.75) ||
+       (next_frame.pcnt_second_ref > 0.5)) &&
+      ((mv_in_out_accumulator / (double)i > -0.2) ||
+       (mv_in_out_accumulator > -2.0)) &&
+      (boost_score > 100)) {
+    // Alternative boost calculation for alt ref
+    cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
+    cpi->source_alt_ref_pending = TRUE;
+
+    configure_arnr_filter(cpi, this_frame);
+  } else {
+    cpi->gfu_boost = (int)boost_score;
+    cpi->source_alt_ref_pending = FALSE;
+  }
+
+  // Now decide how many bits should be allocated to the GF group as a
+  // proportion of those remaining in the kf group.
+  // The final key frame group in the clip is treated as a special case
+  // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
+  // This is also important for short clips where there may only be one
+  // key frame.
+  if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats->count -
+                                          cpi->common.current_video_frame)) {
+    cpi->twopass.kf_group_bits =
+      (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
+  }
+
+  // Calculate the bits to be allocated to the group as a whole
+  if ((cpi->twopass.kf_group_bits > 0) &&
+      (cpi->twopass.kf_group_error_left > 0)) {
+    cpi->twopass.gf_group_bits =
+      (int)((double)cpi->twopass.kf_group_bits *
+            (gf_group_err / (double)cpi->twopass.kf_group_error_left));
+  } else
+    cpi->twopass.gf_group_bits = 0;
+
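+  // Clamp the group allocation to the range [0, kf_group_bits].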
+  cpi->twopass.gf_group_bits =
+    (cpi->twopass.gf_group_bits < 0)
+    ? 0
+    : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
+    ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits;
+
+  // Clip cpi->twopass.gf_group_bits based on user supplied data rate
+  // variability limit (cpi->oxcf.two_pass_vbrmax_section)
+  if (cpi->twopass.gf_group_bits > max_bits * cpi->baseline_gf_interval)
+    cpi->twopass.gf_group_bits = max_bits * cpi->baseline_gf_interval;
+
+  // Reset the file position
+  reset_fpf_position(cpi, start_pos);
+
+  // Update the record of error used so far (only done once per gf group)
+  cpi->twopass.modified_error_used += gf_group_err;
+
+  // Assign bits to the arf or gf.
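+  // When an ARF is pending on a non-key frame this loop runs twice:
+  // pass 0 allocates the ARF bits and pass 1 the GF bits. Otherwise a
+  // single pass covers the frame being coded.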
+  for (i = 0;
+       i <= (cpi->source_alt_ref_pending &&
+             cpi->common.frame_type != KEY_FRAME);
+       i++) {
+    int boost;
+    int allocation_chunks;
+    int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+    int gf_bits;
+
+    boost = (cpi->gfu_boost * vp9_gfboost_qadjust(Q)) / 100;
+
+    // Set max and minimum boost and hence minimum allocation
+    if (boost > ((cpi->baseline_gf_interval + 1) * 200))
+      boost = ((cpi->baseline_gf_interval + 1) * 200);
+    else if (boost < 125)
+      boost = 125;
+
+    if (cpi->source_alt_ref_pending && i == 0)
+      allocation_chunks =
+        ((cpi->baseline_gf_interval + 1) * 100) + boost;
+    else
+      allocation_chunks =
+        (cpi->baseline_gf_interval * 100) + (boost - 100);
+
+    // Prevent overflow
+    if (boost > 1028) {
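+      // e.g. boost = 4096: divisor = 4, so boost drops to 1024 and
+      // allocation_chunks shrinks by the same factor, preserving
+      // the boost / allocation_chunks ratio.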
+      int divisor = boost >> 10;
+      boost /= divisor;
+      allocation_chunks /= divisor;
+    }
+
+    // Calculate the number of bits to be spent on the gf or arf based on
+    // the boost number
+    gf_bits = (int)((double)boost *
+                    (cpi->twopass.gf_group_bits /
+                     (double)allocation_chunks));
+
+    // If the frame that is to be boosted is simpler than the average for
+    // the gf/arf group then use an alternative calculation
+    // based on the error score of the frame itself
+    if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval) {
+      double  alt_gf_grp_bits;
+      int     alt_gf_bits;
+
+      alt_gf_grp_bits =
+        (double)cpi->twopass.kf_group_bits  *
+        (mod_frame_err * (double)cpi->baseline_gf_interval) /
+        DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left);
+
+      alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
+                                           (double)allocation_chunks));
+
+      if (gf_bits > alt_gf_bits) {
+        gf_bits = alt_gf_bits;
+      }
+    }
+    // Else if it is harder than other frames in the group, make sure it
+    // at least receives an allocation in keeping with its relative error
+    // score, otherwise it may be worse off than an "un-boosted" frame.
+    else {
+      int alt_gf_bits =
+        (int)((double)cpi->twopass.kf_group_bits *
+              mod_frame_err /
+              DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left));
+
+      if (alt_gf_bits > gf_bits) {
+        gf_bits = alt_gf_bits;
+      }
+    }
+
+    // Don't allow a negative value for gf_bits
+    if (gf_bits < 0)
+      gf_bits = 0;
+
+    gf_bits += cpi->min_frame_bandwidth;  // Add in minimum for a frame
+
+    if (i == 0) {
+      cpi->twopass.gf_bits = gf_bits;
+    }
+    if (i == 1 || (!cpi->source_alt_ref_pending &&
+                   (cpi->common.frame_type != KEY_FRAME))) {
+      cpi->per_frame_bandwidth = gf_bits;  // Per frame bit target for this frame
+    }
+  }
+
+  {
+    // Adjust KF group bits and error remaining.
+    cpi->twopass.kf_group_error_left -= gf_group_err;
+    cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits;
+
+    if (cpi->twopass.kf_group_bits < 0)
+      cpi->twopass.kf_group_bits = 0;
+
+    // Note the error score left in the remaining frames of the group.
+    // For normal GFs we want to remove the error score for the first frame
+    // of the group (except in Key frame case where this has already
+    // happened)
+    if (!cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME)
+      cpi->twopass.gf_group_error_left = gf_group_err - gf_first_frame_err;
+    else
+      cpi->twopass.gf_group_error_left = gf_group_err;
+
+    cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits - cpi->min_frame_bandwidth;
+
+    if (cpi->twopass.gf_group_bits < 0)
+      cpi->twopass.gf_group_bits = 0;
+
+    // This condition could fail if there are two kfs very close together
+    // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
+    // calculation of cpi->twopass.alt_extra_bits.
+    if (cpi->baseline_gf_interval >= 3) {
+      int boost = (cpi->source_alt_ref_pending)
+                  ? b_boost : cpi->gfu_boost;
+
+      if (boost >= 150) {
+        int pct_extra;
+
+        pct_extra = (boost - 100) / 50;
+        pct_extra = (pct_extra > 20) ? 20 : pct_extra;
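+        // e.g. boost = 600 gives pct_extra = 10 (10% of the group bits);
+        // the cap of 20 is reached once boost hits 1100.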
+
+        cpi->twopass.alt_extra_bits =
+          (cpi->twopass.gf_group_bits * pct_extra) / 100;
+        cpi->twopass.gf_group_bits -= cpi->twopass.alt_extra_bits;
+        cpi->twopass.alt_extra_bits /=
+          ((cpi->baseline_gf_interval - 1) >> 1);
+      } else
+        cpi->twopass.alt_extra_bits = 0;
+    } else
+      cpi->twopass.alt_extra_bits = 0;
+  }
+
+  if (cpi->common.frame_type != KEY_FRAME) {
+    FIRSTPASS_STATS sectionstats;
+
+    zero_stats(&sectionstats);
+    reset_fpf_position(cpi, start_pos);
+
+    for (i = 0; i < cpi->baseline_gf_interval; i++) {
+      input_stats(cpi, &next_frame);
+      accumulate_stats(&sectionstats, &next_frame);
+    }
+
+    avg_stats(&sectionstats);
+
+    cpi->twopass.section_intra_rating =
+      sectionstats.intra_error /
+      DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+
+    reset_fpf_position(cpi, start_pos);
+  }
+}
+
+// Allocate bits to a normal frame that is neither a gf, an arf, nor a
+// key frame.
+static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  int    target_frame_size;
+
+  double modified_err;
+  double err_fraction;   // Portion of the remaining GF group error used by this frame
+
+  int max_bits = frame_max_bits(cpi);    // Max for a single frame
+
+  // Calculate modified prediction error used in bit allocation
+  modified_err = calculate_modified_err(cpi, this_frame);
+
+  if (cpi->twopass.gf_group_error_left > 0)
+    err_fraction = modified_err / cpi->twopass.gf_group_error_left;
+  else
+    err_fraction = 0.0;
+
+  // How many of the bits available for allocation should this frame get?
+  target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction);
+
+  // Clip the target size to 0 at the bottom and max_bits (or
+  // cpi->twopass.gf_group_bits) at the top end.
+  if (target_frame_size < 0)
+    target_frame_size = 0;
+  else {
+    if (target_frame_size > max_bits)
+      target_frame_size = max_bits;
+
+    if (target_frame_size > cpi->twopass.gf_group_bits)
+      target_frame_size = cpi->twopass.gf_group_bits;
+  }
+
+  cpi->twopass.gf_group_error_left -= modified_err;  // Adjust error remaining
+  cpi->twopass.gf_group_bits -= target_frame_size;   // Adjust bits remaining
+
+  if (cpi->twopass.gf_group_bits < 0)
+    cpi->twopass.gf_group_bits = 0;
+
+  // Add in the minimum number of bits that is set aside for every frame.
+  target_frame_size += cpi->min_frame_bandwidth;
+
+  // Per frame bit target for this frame.
+  cpi->per_frame_bandwidth = target_frame_size;
+}
+
+// Make a damped adjustment to the active max q.
+static int adjust_active_maxq(int old_maxqi, int new_maxqi) {
+  int i;
+  int ret_val = new_maxqi;
+  double old_q;
+  double new_q;
+  double target_q;
+
+  old_q = vp9_convert_qindex_to_q(old_maxqi);
+  new_q = vp9_convert_qindex_to_q(new_maxqi);
+
+  target_q = ((old_q * 7.0) + new_q) / 8.0;
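+  // e.g. old_q = 40.0, new_q = 80.0 gives target_q = 45.0, so only about
+  // one eighth of the requested change is applied per call.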
+
+  if (target_q > old_q) {
+    for (i = old_maxqi; i <= new_maxqi; i++) {
+      if (vp9_convert_qindex_to_q(i) >= target_q) {
+        ret_val = i;
+        break;
+      }
+    }
+  } else {
+    for (i = old_maxqi; i >= new_maxqi; i--) {
+      if (vp9_convert_qindex_to_q(i) <= target_q) {
+        ret_val = i;
+        break;
+      }
+    }
+  }
+
+  return ret_val;
+}
+
+void vp9_second_pass(VP9_COMP *cpi) {
+  int tmp_q;
+  int frames_left = (int)(cpi->twopass.total_stats->count -
+                          cpi->common.current_video_frame);
+
+  FIRSTPASS_STATS this_frame;
+  FIRSTPASS_STATS this_frame_copy;
+
+  double this_frame_error;
+  double this_frame_intra_error;
+  double this_frame_coded_error;
+
+  FIRSTPASS_STATS *start_pos;
+
+  int overhead_bits;
+
+  if (!cpi->twopass.stats_in) {
+    return;
+  }
+
+  vp9_clear_system_state();
+
+  vpx_memset(&this_frame, 0, sizeof(FIRSTPASS_STATS));
+
+  if (EOF == input_stats(cpi, &this_frame))
+    return;
+
+  this_frame_error = this_frame.ssim_weighted_pred_err;
+  this_frame_intra_error = this_frame.intra_error;
+  this_frame_coded_error = this_frame.coded_error;
+
+  start_pos = cpi->twopass.stats_in;
+
+  // Keyframe and section processing.
+  if (cpi->twopass.frames_to_key == 0) {
+    // Define next KF group and assign bits to it
+    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+    find_next_key_frame(cpi, &this_frame_copy);
+  }
+
+  // Is this a GF / ARF? (Note that a KF is always also a GF.)
+  if (cpi->frames_till_gf_update_due == 0) {
+    // Define next gf group and assign bits to it
+    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+    define_gf_group(cpi, &this_frame_copy);
+
+    // If we are going to code an altref frame at the end of the group
+    // and the current frame is not a key frame:
+    // if the previous group used an arf, this frame has already benefited
+    // from that arf boost and should not be given extra bits; if the
+    // previous group was NOT coded using an arf we may want to apply some
+    // boost to this GF as well.
+    if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)) {
+      // Assign a standard frame's worth of bits from those allocated
+      // to the GF group.
+      int bak = cpi->per_frame_bandwidth;
+      vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+      assign_std_frame_bits(cpi, &this_frame_copy);
+      cpi->per_frame_bandwidth = bak;
+    }
+  }
+
+  // Otherwise this is an ordinary frame
+  else {
+    // Assign bits from those allocated to the GF group
+    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+    assign_std_frame_bits(cpi, &this_frame_copy);
+  }
+
+  // Keep a globally available copy of this and the next frame's iiratio.
+  cpi->twopass.this_iiratio = this_frame_intra_error /
+                              DOUBLE_DIVIDE_CHECK(this_frame_coded_error);
+  {
+    FIRSTPASS_STATS next_frame;
+    if (lookup_next_frame_stats(cpi, &next_frame) != EOF) {
+      cpi->twopass.next_iiratio = next_frame.intra_error /
+                                  DOUBLE_DIVIDE_CHECK(next_frame.coded_error);
+    }
+  }
+
+  // Set nominal per second bandwidth for this frame
+  cpi->target_bandwidth = cpi->per_frame_bandwidth * cpi->output_frame_rate;
+  if (cpi->target_bandwidth < 0)
+    cpi->target_bandwidth = 0;
+
+
+  // Account for mv, mode and other overheads.
+  overhead_bits = estimate_modemvcost(
+                    cpi, cpi->twopass.total_left_stats);
+
+  // Special case code for first frame.
+  if (cpi->common.current_video_frame == 0) {
+    cpi->twopass.est_max_qcorrection_factor = 1.0;
+
+    // Set a cq_level in constrained quality mode.
+    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+      int est_cq;
+
+      est_cq =
+        estimate_cq(cpi,
+                    cpi->twopass.total_left_stats,
+                    (int)(cpi->twopass.bits_left / frames_left),
+                    overhead_bits);
+
+      cpi->cq_target_quality = cpi->oxcf.cq_level;
+      if (est_cq > cpi->cq_target_quality)
+        cpi->cq_target_quality = est_cq;
+    }
+
+    // Guess at the maxq needed in the second pass.
+    cpi->twopass.maxq_max_limit = cpi->worst_quality;
+    cpi->twopass.maxq_min_limit = cpi->best_quality;
+
+    tmp_q = estimate_max_q(
+              cpi,
+              cpi->twopass.total_left_stats,
+              (int)(cpi->twopass.bits_left / frames_left),
+              overhead_bits);
+
+    cpi->active_worst_quality         = tmp_q;
+    cpi->ni_av_qi                     = tmp_q;
+    cpi->avg_q                        = vp9_convert_qindex_to_q(tmp_q);
+
+    // Limit the maxq value returned subsequently.
+    // This increases the risk of overspend or underspend if the initial
+    // estimate for the clip is bad, but helps prevent excessive
+    // variation in Q, especially near the end of a clip
+    // where for example a small overspend may cause Q to crash
+    adjust_maxq_qrange(cpi);
+  }
+
+  // The last few frames of a clip almost always have too few or too many
+  // bits, and we don't want to make radical adjustments to the allowed
+  // quantizer range, in pursuit of over-exact rate control, just to use
+  // up a few surplus bits or get beneath the target rate.
+  else if ((cpi->common.current_video_frame <
+            (((unsigned int)cpi->twopass.total_stats->count * 255) >> 8)) &&
+           ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
+            (unsigned int)cpi->twopass.total_stats->count)) {
+    if (frames_left < 1)
+      frames_left = 1;
+
+    tmp_q = estimate_max_q(
+              cpi,
+              cpi->twopass.total_left_stats,
+              (int)(cpi->twopass.bits_left / frames_left),
+              overhead_bits);
+
+    // Make a damped adjustment to active max Q
+    cpi->active_worst_quality =
+      adjust_active_maxq(cpi->active_worst_quality, tmp_q);
+  }
+
+  cpi->twopass.frames_to_key--;
+
+  // Update the total stats remaining structure.
+  subtract_stats(cpi->twopass.total_left_stats, &this_frame);
+}
+
+
+static BOOL test_candidate_kf(VP9_COMP *cpi,
+                              FIRSTPASS_STATS *last_frame,
+                              FIRSTPASS_STATS *this_frame,
+                              FIRSTPASS_STATS *next_frame) {
+  BOOL is_viable_kf = FALSE;
+
+  // Does the frame satisfy the primary criteria of a key frame?
+  // If so, then examine how well it predicts subsequent frames.
+  if ((this_frame->pcnt_second_ref < 0.10) &&
+      (next_frame->pcnt_second_ref < 0.10) &&
+      ((this_frame->pcnt_inter < 0.05) ||
+       (
+         ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) &&
+         ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
+         ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
+          (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
+          ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5)
+         )
+       )
+      )
+     ) {
+    int i;
+    FIRSTPASS_STATS *start_pos;
+
+    FIRSTPASS_STATS local_next_frame;
+
+    double boost_score = 0.0;
+    double old_boost_score = 0.0;
+    double decay_accumulator = 1.0;
+    double next_iiratio;
+
+    vpx_memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
+
+    // Note the starting file position so we can reset to it
+    start_pos = cpi->twopass.stats_in;
+
+    // Examine how well the key frame predicts subsequent frames
+    for (i = 0; i < 16; i++) {
+      next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error /
+                      DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+
+      if (next_iiratio > RMAX)
+        next_iiratio = RMAX;
+
+      // Cumulative effect of decay in prediction quality
+      if (local_next_frame.pcnt_inter > 0.85)
+        decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
+      else
+        decay_accumulator = decay_accumulator *
+                            ((0.85 + local_next_frame.pcnt_inter) / 2.0);
+
+
+      // Keep a running total
+      boost_score += (decay_accumulator * next_iiratio);
+
+      // Test various breakout clauses
+      if ((local_next_frame.pcnt_inter < 0.05) ||
+          (next_iiratio < 1.5) ||
+          (((local_next_frame.pcnt_inter -
+             local_next_frame.pcnt_neutral) < 0.20) &&
+           (next_iiratio < 3.0)) ||
+          ((boost_score - old_boost_score) < 3.0) ||
+          (local_next_frame.intra_error < 200)
+         ) {
+        break;
+      }
+
+      old_boost_score = boost_score;
+
+      // Get the next frame details
+      if (EOF == input_stats(cpi, &local_next_frame))
+        break;
+    }
+
+    // If there is tolerable prediction for at least the next 3 frames
+    // then break out, else discard this potential key frame and move on.
+    if (boost_score > 30.0 && (i > 3))
+      is_viable_kf = TRUE;
+    else {
+      // Reset the file position
+      reset_fpf_position(cpi, start_pos);
+
+      is_viable_kf = FALSE;
+    }
+  }
+
+  return is_viable_kf;
+}
+
+static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  int i, j;
+  FIRSTPASS_STATS last_frame;
+  FIRSTPASS_STATS first_frame;
+  FIRSTPASS_STATS next_frame;
+  FIRSTPASS_STATS *start_position;
+
+  double decay_accumulator = 1.0;
+  double zero_motion_accumulator = 1.0;
+  double boost_score = 0;
+  double old_boost_score = 0.0;
+  double loop_decay_rate;
+
+  double kf_mod_err = 0.0;
+  double kf_group_err = 0.0;
+  double kf_group_intra_err = 0.0;
+  double kf_group_coded_err = 0.0;
+  double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+
+  vpx_memset(&next_frame, 0, sizeof(next_frame)); // ensure a clean start
+
+  vp9_clear_system_state();  // __asm emms;
+  start_position = cpi->twopass.stats_in;
+
+  cpi->common.frame_type = KEY_FRAME;
+
+  // Is this a key frame forced by the key frame interval?
+  cpi->this_key_frame_forced = cpi->next_key_frame_forced;
+
+  // Clear the alt ref active flag as this can never be active on a key frame
+  cpi->source_alt_ref_active = FALSE;
+
+  // Kf is always a gf so clear frames till next gf counter
+  cpi->frames_till_gf_update_due = 0;
+
+  cpi->twopass.frames_to_key = 1;
+
+  // Take a copy of the initial frame details
+  vpx_memcpy(&first_frame, this_frame, sizeof(*this_frame));
+
+  cpi->twopass.kf_group_bits = 0;        // Total bits available to kf group
+  cpi->twopass.kf_group_error_left = 0;  // Group modified error score.
+
+  kf_mod_err = calculate_modified_err(cpi, this_frame);
+
+  // find the next keyframe
+  i = 0;
+  while (cpi->twopass.stats_in < cpi->twopass.stats_in_end) {
+    // Accumulate kf group error
+    kf_group_err += calculate_modified_err(cpi, this_frame);
+
+    // These figures keep intra and coded error counts for all frames
+    // including key frames in the group. The effect of the key frame
+    // itself can be subtracted out using the first_frame data collected
+    // above.
+    kf_group_intra_err += this_frame->intra_error;
+    kf_group_coded_err += this_frame->coded_error;
+
+    // Load the next frame's stats.
+    vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
+    input_stats(cpi, this_frame);
+
+    // Provided that we are not at the end of the file...
+    if (cpi->oxcf.auto_key
+        && lookup_next_frame_stats(cpi, &next_frame) != EOF) {
+      // Normal scene cut check
+      if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) {
+        break;
+      }
+
+      // How fast is prediction quality decaying
+      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+      // We want to know something about the recent past... rather than,
+      // as used elsewhere, where we are concerned with decay in prediction
+      // quality since the last GF or KF.
+      recent_loop_decay[i % 8] = loop_decay_rate;
+      decay_accumulator = 1.0;
+      for (j = 0; j < 8; j++) {
+        decay_accumulator = decay_accumulator * recent_loop_decay[j];
+      }
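+      // decay_accumulator now holds the product of the last (up to) 8
+      // loop decay rates: a short-horizon measure of prediction decay.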
+
+      // Special check for a transition or high motion followed by a
+      // static scene.
+      if (detect_transition_to_still(cpi, i,
+                                     (cpi->key_frame_frequency - i),
+                                     loop_decay_rate,
+                                     decay_accumulator)) {
+        break;
+      }
+
+      // Step on to the next frame
+      cpi->twopass.frames_to_key++;
+
+      // If we don't have a real key frame within the next two
+      // key frame frequency intervals then break out of the loop.
+      if (cpi->twopass.frames_to_key >= 2 * (int)cpi->key_frame_frequency)
+        break;
+    } else
+      cpi->twopass.frames_to_key++;
+
+    i++;
+  }
+
+  // If there is a max kf interval set by the user we must obey it.
+  // We already break out of the loop above at 2x max.
+  // This code centers the extra kf if the actual natural
+  // interval is between 1x and 2x.
+  if (cpi->oxcf.auto_key
+      && cpi->twopass.frames_to_key > (int)cpi->key_frame_frequency) {
+    FIRSTPASS_STATS *current_pos = cpi->twopass.stats_in;
+    FIRSTPASS_STATS tmp_frame;
+
+    cpi->twopass.frames_to_key /= 2;
+
+    // Copy first frame details
+    vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
+
+    // Reset to the start of the group
+    reset_fpf_position(cpi, start_position);
+
+    kf_group_err = 0;
+    kf_group_intra_err = 0;
+    kf_group_coded_err = 0;
+
+    // Rescan to get the correct error data for the forced kf group
+    for (i = 0; i < cpi->twopass.frames_to_key; i++) {
+      // Accumulate kf group errors
+      kf_group_err += calculate_modified_err(cpi, &tmp_frame);
+      kf_group_intra_err += tmp_frame.intra_error;
+      kf_group_coded_err += tmp_frame.coded_error;
+
+      // Load the next frame's stats.
+      input_stats(cpi, &tmp_frame);
+    }
+
+    // Reset to the start of the group
+    reset_fpf_position(cpi, current_pos);
+
+    cpi->next_key_frame_forced = TRUE;
+  } else
+    cpi->next_key_frame_forced = FALSE;
+
+  // Special case for the last frame of the file
+  if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) {
+    // Accumulate kf group error
+    kf_group_err += calculate_modified_err(cpi, this_frame);
+
+    // These figures keep intra and coded error counts for all frames
+    // including key frames in the group. The effect of the key frame
+    // itself can be subtracted out using the first_frame data collected
+    // above.
+    kf_group_intra_err += this_frame->intra_error;
+    kf_group_coded_err += this_frame->coded_error;
+  }
+
+  // Calculate the number of bits that should be assigned to the kf group.
+  if ((cpi->twopass.bits_left > 0) && (cpi->twopass.modified_error_left > 0.0)) {
+    // Max for a single normal frame (not key frame)
+    int max_bits = frame_max_bits(cpi);
+
+    // Maximum bits for the kf group
+    int64_t max_grp_bits;
+
+    // Default allocation based on bits left and relative
+    // complexity of the section
+    cpi->twopass.kf_group_bits = (int64_t)(cpi->twopass.bits_left *
+                                           (kf_group_err /
+                                            cpi->twopass.modified_error_left));
+
+    // Clip based on maximum per frame rate defined by the user.
+    max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key;
+    if (cpi->twopass.kf_group_bits > max_grp_bits)
+      cpi->twopass.kf_group_bits = max_grp_bits;
+  } else
+    cpi->twopass.kf_group_bits = 0;
+
+  // Reset the first pass file position
+  reset_fpf_position(cpi, start_position);
+
+  // Determine how big to make this keyframe based on how well the
+  // subsequent frames use inter blocks.
+  decay_accumulator = 1.0;
+  boost_score = 0.0;
+  loop_decay_rate = 1.00;       // Starting decay rate
+
+  for (i = 0; i < cpi->twopass.frames_to_key; i++) {
+    double r;
+
+    if (EOF == input_stats(cpi, &next_frame))
+      break;
+
+    if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
+      r = (IIKFACTOR2 * next_frame.intra_error /
+           DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+    else
+      r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min /
+           DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+
+    if (r > RMAX)
+      r = RMAX;
+
+    // Monitor for static sections.
+    if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
+        zero_motion_accumulator) {
+      zero_motion_accumulator =
+        (next_frame.pcnt_inter - next_frame.pcnt_motion);
+    }
+
+    // How fast is prediction quality decaying
+    if (!detect_flash(cpi, 0)) {
+      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+      decay_accumulator = decay_accumulator * loop_decay_rate;
+      decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+    }
+
+    boost_score += (decay_accumulator * r);
+
+    if ((i > MIN_GF_INTERVAL) &&
+        ((boost_score - old_boost_score) < 6.25)) {
+      break;
+    }
+
+    old_boost_score = boost_score;
+  }
+
+  {
+    FIRSTPASS_STATS sectionstats;
+
+    zero_stats(&sectionstats);
+    reset_fpf_position(cpi, start_position);
+
+    for (i = 0; i < cpi->twopass.frames_to_key; i++) {
+      input_stats(cpi, &next_frame);
+      accumulate_stats(&sectionstats, &next_frame);
+    }
+
+    avg_stats(&sectionstats);
+
+    cpi->twopass.section_intra_rating =
+      sectionstats.intra_error
+      / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+  }
+
+  // Reset the first pass file position
+  reset_fpf_position(cpi, start_position);
+
+  // Work out how many bits to allocate for the key frame itself
+  if (1) {
+    int kf_boost = (int)boost_score;
+    int allocation_chunks;
+    int alt_kf_bits;
+
+    if (kf_boost < 300) {
+      kf_boost += (cpi->twopass.frames_to_key * 3);
+      if (kf_boost > 300)
+        kf_boost = 300;
+    }
+
+    if (kf_boost < 250)  // Min KF boost
+      kf_boost = 250;
+
+    // Make a note of baseline boost and the zero motion
+    // accumulator value for use elsewhere.
+    cpi->kf_boost = kf_boost;
+    cpi->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+
+    // We do three calculations for kf size.
+    // The first is based on the error score for the whole kf group.
+    // The second (optionally) on the key frame's own error if this is
+    // smaller than the average for the group.
+    // The final one ensures that the frame receives at least the
+    // allocation it would have received based on its own error score vs
+    // the error score remaining.
+    // Special case if the sequence appears almost totally static:
+    // in this case we want to spend almost all of the bits on the
+    // key frame.
+    // cpi->twopass.frames_to_key-1 because the key frame itself is taken
+    // care of by kf_boost.
+    if (zero_motion_accumulator >= 0.99) {
+      allocation_chunks =
+        ((cpi->twopass.frames_to_key - 1) * 10) + kf_boost;
+    } else {
+      allocation_chunks =
+        ((cpi->twopass.frames_to_key - 1) * 100) + kf_boost;
+    }
+
+    // Prevent overflow
+    if (kf_boost > 1028) {
+      int divisor = kf_boost >> 10;
+      kf_boost /= divisor;
+      allocation_chunks /= divisor;
+    }
+
+    cpi->twopass.kf_group_bits =
+      (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
+
+    // Calculate the number of bits to be spent on the key frame.
+    cpi->twopass.kf_bits =
+      (int)((double)kf_boost *
+            ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
+
+    // If the key frame is actually easier than the average for the
+    // kf group (which does sometimes happen, e.g. a blank intro frame)
+    // then use an alternate calculation based on the kf error score
+    // which should give a smaller key frame.
+    if (kf_mod_err < kf_group_err / cpi->twopass.frames_to_key) {
+      double  alt_kf_grp_bits =
+        ((double)cpi->twopass.bits_left *
+         (kf_mod_err * (double)cpi->twopass.frames_to_key) /
+         DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left));
+
+      alt_kf_bits = (int)((double)kf_boost *
+                          (alt_kf_grp_bits / (double)allocation_chunks));
+
+      if (cpi->twopass.kf_bits > alt_kf_bits) {
+        cpi->twopass.kf_bits = alt_kf_bits;
+      }
+    }
+    // Else if it is much harder than other frames in the group, make
+    // sure it at least receives an allocation in keeping with its
+    // relative error score.
+    else {
+      alt_kf_bits =
+        (int)((double)cpi->twopass.bits_left *
+              (kf_mod_err /
+               DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left)));
+
+      if (alt_kf_bits > cpi->twopass.kf_bits) {
+        cpi->twopass.kf_bits = alt_kf_bits;
+      }
+    }
+
+    cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
+    cpi->twopass.kf_bits += cpi->min_frame_bandwidth;  // Add in the minimum frame allowance
+
+    cpi->per_frame_bandwidth = cpi->twopass.kf_bits;   // Per frame bit target for this frame
+    // Convert to a per second bitrate.
+    cpi->target_bandwidth = cpi->twopass.kf_bits * cpi->output_frame_rate;
+  }
+
+  // Note the total error score of the kf group minus the key frame itself
+  cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+
+  // Adjust the count of total modified error left.
+  // The count of bits left is adjusted elsewhere based on real coded
+  // frame sizes.
+  cpi->twopass.modified_error_left -= kf_group_err;
+}
--- /dev/null
+++ b/vp9/encoder/firstpass.h
@@ -1,0 +1,23 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#if !defined __INC_FIRSTPASS_H
+#define      __INC_FIRSTPASS_H
+
+extern void vp9_init_first_pass(VP9_COMP *cpi);
+extern void vp9_first_pass(VP9_COMP *cpi);
+extern void vp9_end_first_pass(VP9_COMP *cpi);
+
+extern void vp9_init_second_pass(VP9_COMP *cpi);
+extern void vp9_second_pass(VP9_COMP *cpi);
+extern void vp9_end_second_pass(VP9_COMP *cpi);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/generic/csystemdependent.c
@@ -1,0 +1,48 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/encoder/onyx_int.h"
+
+
+void vp9_arch_x86_encoder_init(VP9_COMP *cpi);
+void vp9_arch_arm_encoder_init(VP9_COMP *cpi);
+
+void (*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc,
+                                        YV12_BUFFER_CONFIG *dst_ybc,
+                                        int fraction);
+extern void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
+                                        YV12_BUFFER_CONFIG *dst_ybc,
+                                        int fraction);
+
+void vp9_cmachine_specific_config(VP9_COMP *cpi) {
+#if CONFIG_RUNTIME_CPU_DETECT
+  cpi->rtcd.common                    = &cpi->common.rtcd;
+
+  cpi->rtcd.search.full_search             = vp9_full_search_sad;
+  cpi->rtcd.search.refining_search         = vp9_refining_search_sad;
+  cpi->rtcd.search.diamond_search          = vp9_diamond_search_sad;
+  cpi->rtcd.temporal.apply                 = vp9_temporal_filter_apply_c;
+#endif
+
+  vp9_yv12_copy_partial_frame_ptr = vp9_yv12_copy_partial_frame;
+
+#if ARCH_X86 || ARCH_X86_64
+  vp9_arch_x86_encoder_init(cpi);
+#endif
+
+#if ARCH_ARM
+  vp9_arch_arm_encoder_init(cpi);
+#endif
+
+}
--- /dev/null
+++ b/vp9/encoder/lookahead.c
@@ -1,0 +1,191 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include "vpx_config.h"
+#include "lookahead.h"
+#include "vp9/common/extend.h"
+
+#define MAX_LAG_BUFFERS 25
+
+struct lookahead_ctx {
+  unsigned int max_sz;         /* Absolute size of the queue */
+  unsigned int sz;             /* Number of buffers currently in the queue */
+  unsigned int read_idx;       /* Read index */
+  unsigned int write_idx;      /* Write index */
+  struct lookahead_entry *buf; /* Buffer list */
+};
+
+
+/* Return the buffer at the given absolute index and increment the index */
+static struct lookahead_entry *
+pop(struct lookahead_ctx *ctx,
+    unsigned int         *idx) {
+  unsigned int            index = *idx;
+  struct lookahead_entry *buf = ctx->buf + index;
+
+  assert(index < ctx->max_sz);
+  if (++index >= ctx->max_sz)
+    index -= ctx->max_sz;
+  *idx = index;
+  return buf;
+}
+
+
+void
+vp9_lookahead_destroy(struct lookahead_ctx *ctx) {
+  if (ctx) {
+    if (ctx->buf) {
+      int i;
+
+      for (i = 0; i < ctx->max_sz; i++)
+        vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
+      free(ctx->buf);
+    }
+    free(ctx);
+  }
+}
+
+
+struct lookahead_ctx *
+vp9_lookahead_init(unsigned int width,
+                   unsigned int height,
+                   unsigned int depth) {
+  struct lookahead_ctx *ctx = NULL;
+  int i;
+
+  /* Clamp the lookahead queue depth */
+  if (depth < 1)
+    depth = 1;
+  else if (depth > MAX_LAG_BUFFERS)
+    depth = MAX_LAG_BUFFERS;
+
+  /* Align the buffer dimensions */
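+  /* e.g. width 177 becomes 192 (the next multiple of 16, one macroblock) */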
+  width = (width + 15) & ~15;
+  height = (height + 15) & ~15;
+
+  /* Allocate the lookahead structures */
+  ctx = calloc(1, sizeof(*ctx));
+  if (ctx) {
+    ctx->max_sz = depth;
+    ctx->buf = calloc(depth, sizeof(*ctx->buf));
+    if (!ctx->buf)
+      goto bail;
+    for (i = 0; i < depth; i++)
+      if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img,
+                                      width, height, VP8BORDERINPIXELS))
+        goto bail;
+  }
+  return ctx;
+bail:
+  vp9_lookahead_destroy(ctx);
+  return NULL;
+}
+
+
+int
+vp9_lookahead_push(struct lookahead_ctx *ctx,
+                   YV12_BUFFER_CONFIG   *src,
+                   int64_t               ts_start,
+                   int64_t               ts_end,
+                   unsigned int          flags,
+                   unsigned char        *active_map) {
+  struct lookahead_entry *buf;
+  int row, col, active_end;
+  int mb_rows = (src->y_height + 15) >> 4;
+  int mb_cols = (src->y_width + 15) >> 4;
+
+  if (ctx->sz + 1 > ctx->max_sz)
+    return 1;
+  ctx->sz++;
+  buf = pop(ctx, &ctx->write_idx);
+
+  // Only do this partial copy if the following conditions are all met:
+  // 1. Lookahead queue has a size of 1.
+  // 2. Active map is provided.
+  // 3. This is not a key frame, golden frame, or altref frame.
+  if (ctx->max_sz == 1 && active_map && !flags) {
+    for (row = 0; row < mb_rows; ++row) {
+      col = 0;
+
+      while (1) {
+        // Find the first active macroblock in this row.
+        for (; col < mb_cols; ++col) {
+          if (active_map[col])
+            break;
+        }
+
+        // No more active macroblocks in this row.
+        if (col == mb_cols)
+          break;
+
+        // Find the end of active region in this row.
+        active_end = col;
+
+        for (; active_end < mb_cols; ++active_end) {
+          if (!active_map[active_end])
+            break;
+        }
+
+        // Only copy this active region.
+        vp9_copy_and_extend_frame_with_rect(src, &buf->img,
+                                            row << 4,
+                                            col << 4, 16,
+                                            (active_end - col) << 4);
+
+        // Start again from the end of this active region.
+        col = active_end;
+      }
+
+      active_map += mb_cols;
+    }
+  } else {
+    vp9_copy_and_extend_frame(src, &buf->img);
+  }
+  buf->ts_start = ts_start;
+  buf->ts_end = ts_end;
+  buf->flags = flags;
+  return 0;
+}
+
+
+struct lookahead_entry *
+vp9_lookahead_pop(struct lookahead_ctx *ctx,
+                  int                   drain) {
+  struct lookahead_entry *buf = NULL;
+
+  if (ctx->sz && (drain || ctx->sz == ctx->max_sz)) {
+    buf = pop(ctx, &ctx->read_idx);
+    ctx->sz--;
+  }
+  return buf;
+}
+
+
+struct lookahead_entry *
+vp9_lookahead_peek(struct lookahead_ctx *ctx,
+                   int                   index) {
+  struct lookahead_entry *buf = NULL;
+
+  assert(index < ctx->max_sz);
+  if (index < ctx->sz) {
+    index += ctx->read_idx;
+    if (index >= ctx->max_sz)
+      index -= ctx->max_sz;
+    buf = ctx->buf + index;
+  }
+  return buf;
+}
+
+
+unsigned int
+vp9_lookahead_depth(struct lookahead_ctx *ctx) {
+  return ctx->sz;
+}
--- /dev/null
+++ b/vp9/encoder/lookahead.h
@@ -1,0 +1,105 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef LOOKAHEAD_H
+#define LOOKAHEAD_H
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+
+struct lookahead_entry {
+  YV12_BUFFER_CONFIG  img;
+  int64_t             ts_start;
+  int64_t             ts_end;
+  unsigned int        flags;
+};
+
+
+struct lookahead_ctx;
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ *
+ *
+ */
+struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
+                                         unsigned int height,
+                                         unsigned int depth
+                                        );
+
+
+/**\brief Destroys the lookahead stage
+ *
+ */
+void vp9_lookahead_destroy(struct lookahead_ctx *ctx);
+
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * If active_map is non-NULL and the queue depth is one, then only
+ * active macroblocks are copied.
+ *
+ * \param[in] ctx         Pointer to the lookahead context
+ * \param[in] src         Pointer to the image to enqueue
+ * \param[in] ts_start    Timestamp for the start of this frame
+ * \param[in] ts_end      Timestamp for the end of this frame
+ * \param[in] flags       Flags set on this frame
+ * \param[in] active_map  Map that specifies which macroblock is active
+ */
+int
+vp9_lookahead_push(struct lookahead_ctx *ctx,
+                   YV12_BUFFER_CONFIG   *src,
+                   int64_t               ts_start,
+                   int64_t               ts_end,
+                   unsigned int          flags,
+                   unsigned char        *active_map);
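+
+/* A minimal usage sketch (illustrative only: src is an assumed,
+ * already-populated YV12_BUFFER_CONFIG, and the timestamps are
+ * placeholders; error handling elided):
+ *
+ *   struct lookahead_ctx *la = vp9_lookahead_init(640, 480, 8);
+ *   if (la && !vp9_lookahead_push(la, &src, ts_start, ts_end, 0, NULL)) {
+ *     struct lookahead_entry *e = vp9_lookahead_pop(la, 0);
+ *     // e is NULL until the queue reaches its configured depth
+ *   }
+ *   vp9_lookahead_destroy(la);
+ */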
+
+
+/**\brief Get the next source buffer to encode
+ *
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ * \param[in] drain     Flag indicating the buffer should be drained
+ *                      (return a buffer regardless of the current queue depth)
+ *
+ * \retval NULL, if drain set and queue is empty
+ * \retval NULL, if drain not set and queue not of the configured depth
+ *
+ */
+struct lookahead_entry *
+vp9_lookahead_pop(struct lookahead_ctx *ctx,
+                  int                   drain);
+
+
+/**\brief Get a future source buffer to encode
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ * \param[in] index     Index of the frame to be returned, 0 == next frame
+ *
+ * \retval NULL, if no buffer exists at the specified index
+ *
+ */
+struct lookahead_entry *
+vp9_lookahead_peek(struct lookahead_ctx *ctx,
+                   int                   index);
+
+
+/**\brief Get the number of frames currently in the lookahead queue
+ *
+ * \param[in] ctx       Pointer to the lookahead context
+ */
+unsigned int
+vp9_lookahead_depth(struct lookahead_ctx *ctx);
+
+
+#endif
--- /dev/null
+++ b/vp9/encoder/mbgraph.c
@@ -1,0 +1,480 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <vp9/encoder/encodeintra.h>
+#include <vp9/encoder/rdopt.h>
+#include <vp9/common/setupintrarecon.h>
+#include <vp9/common/blockd.h>
+#include <vp9/common/reconinter.h>
+#include <vp9/common/systemdependent.h>
+#include <vpx_mem/vpx_mem.h>
+#include <vp9/encoder/segmentation.h>
+
+static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
+                                              int_mv *ref_mv,
+                                              int_mv *dst_mv) {
+  MACROBLOCK   *const x  = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *b  = &x->block[0];
+  BLOCKD *d = &xd->block[0];
+  vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
+  unsigned int best_err;
+  int step_param, further_steps;
+
+  int tmp_col_min = x->mv_col_min;
+  int tmp_col_max = x->mv_col_max;
+  int tmp_row_min = x->mv_row_min;
+  int tmp_row_max = x->mv_row_max;
+  int_mv ref_full;
+
+  // Further step/diamond searches as necessary
+  if (cpi->Speed < 8) {
+    step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0);
+    further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+  } else {
+    step_param = cpi->sf.first_step + 2;
+    further_steps = 0;
+  }
+
+  vp9_clamp_mv_min_max(x, ref_mv);
+
+  ref_full.as_mv.col = ref_mv->as_mv.col >> 3;
+  ref_full.as_mv.row = ref_mv->as_mv.row >> 3;
+
+  /*cpi->sf.search_method == HEX*/
+  best_err = vp9_hex_search(
+      x, b, d,
+      &ref_full, dst_mv,
+      step_param,
+      x->errorperbit,
+      &v_fn_ptr,
+      NULLMVCOST,
+      NULLMVCOST,
+      ref_mv);
+
+  // Try sub-pixel MC
+  // if (bestsme > error_thresh && bestsme < INT_MAX)
+  {
+    int distortion;
+    unsigned int sse;
+    best_err = cpi->find_fractional_mv_step(
+        x, b, d,
+        dst_mv, ref_mv,
+        x->errorperbit, &v_fn_ptr,
+        NULLMVCOST,
+        & distortion, &sse);
+  }
+
+#if CONFIG_PRED_FILTER
+  // Disable the prediction filter
+  xd->mode_info_context->mbmi.pred_filter_enabled = 0;
+#endif
+
+  vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv);
+  vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
+  best_err = vp9_sad16x16(xd->dst.y_buffer, xd->dst.y_stride,
+                          xd->predictor, 16, INT_MAX);
+
+  /* restore UMV window */
+  x->mv_col_min = tmp_col_min;
+  x->mv_col_max = tmp_col_max;
+  x->mv_row_min = tmp_row_min;
+  x->mv_row_max = tmp_row_max;
+
+  return best_err;
+}
+
+static int do_16x16_motion_search(VP9_COMP *cpi,
+                                  int_mv *ref_mv,
+                                  int_mv *dst_mv,
+                                  YV12_BUFFER_CONFIG *buf,
+                                  int buf_mb_y_offset,
+                                  YV12_BUFFER_CONFIG *ref,
+                                  int mb_y_offset) {
+  MACROBLOCK   *const x  = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  unsigned int err, tmp_err;
+  int_mv tmp_mv;
+  int n;
+
+  for (n = 0; n < 16; n++) {
+    BLOCKD *d = &xd->block[n];
+    BLOCK *b  = &x->block[n];
+
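+    // Sub-block n covers row (n >> 2), column (n & 3) of the MB's 4x4 grid;
+    // (n & 12) == (n >> 2) * 4, so the offsets below address each 4x4
+    // sub-block's pixel position within the 16x16 macroblock.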
+    b->base_src   = &buf->y_buffer;
+    b->src_stride = buf->y_stride;
+    b->src        = buf->y_stride * (n & 12) + (n & 3) * 4 + buf_mb_y_offset;
+
+    d->base_pre   = &ref->y_buffer;
+    d->pre_stride = ref->y_stride;
+    d->pre        = ref->y_stride * (n & 12) + (n & 3) * 4 + mb_y_offset;
+  }
+
+  // Try zero MV first
+  // FIXME should really use something like near/nearest MV and/or MV prediction
+  xd->pre.y_buffer = ref->y_buffer + mb_y_offset;
+  xd->pre.y_stride = ref->y_stride;
+  err = vp9_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride,
+                     xd->dst.y_buffer, xd->dst.y_stride, INT_MAX);
+  dst_mv->as_int = 0;
+
+  // Test last reference frame using the previous best mv as the
+  // starting point (best reference) for the search
+  tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv);
+  if (tmp_err < err) {
+    err            = tmp_err;
+    dst_mv->as_int = tmp_mv.as_int;
+  }
+
+  // If the current best reference mv is not centered on 0,0, then do a
+  // 0,0-based search as well.
+  if (ref_mv->as_int) {
+    int tmp_err;
+    int_mv zero_ref_mv, tmp_mv;
+
+    zero_ref_mv.as_int = 0;
+    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv);
+    if (tmp_err < err) {
+      dst_mv->as_int = tmp_mv.as_int;
+      err = tmp_err;
+    }
+  }
+
+  return err;
+}
+
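+// Error of the zero-MV prediction only: no motion search is performed;
+// dst_mv is always set to 0 and the SAD of the co-located reference block
+// against the source is returned.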
+static int do_16x16_zerozero_search(VP9_COMP *cpi,
+                                    int_mv *dst_mv,
+                                    YV12_BUFFER_CONFIG *buf,
+                                    int buf_mb_y_offset,
+                                    YV12_BUFFER_CONFIG *ref,
+                                    int mb_y_offset) {
+  MACROBLOCK   *const x  = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  unsigned int err;
+  int n;
+
+  for (n = 0; n < 16; n++) {
+    BLOCKD *d = &xd->block[n];
+    BLOCK *b  = &x->block[n];
+
+    b->base_src   = &buf->y_buffer;
+    b->src_stride = buf->y_stride;
+    b->src        = buf->y_stride * (n & 12) + (n & 3) * 4 + buf_mb_y_offset;
+
+    d->base_pre   = &ref->y_buffer;
+    d->pre_stride = ref->y_stride;
+    d->pre        = ref->y_stride * (n & 12) + (n & 3) * 4 + mb_y_offset;
+  }
+
+  // Try zero MV first
+  // FIXME should really use something like near/nearest MV and/or MV prediction
+  xd->pre.y_buffer = ref->y_buffer + mb_y_offset;
+  xd->pre.y_stride = ref->y_stride;
+  // VARIANCE_INVOKE(&cpi->rtcd.variance, satd16x16)
+  err = vp9_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride,
+                     xd->dst.y_buffer, xd->dst.y_stride, INT_MAX);
+
+  dst_mv->as_int = 0;
+
+  return err;
+}
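+
+// Pick the best whole-MB intra mode (DC_PRED through TM_PRED) by SAD and
+// return its error; optionally report the winning mode via pbest_mode.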
+static int find_best_16x16_intra(VP9_COMP *cpi,
+                                 YV12_BUFFER_CONFIG *buf,
+                                 int mb_y_offset,
+                                 MB_PREDICTION_MODE *pbest_mode) {
+  MACROBLOCK   *const x  = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_PREDICTION_MODE best_mode = -1, mode;
+  int best_err = INT_MAX;
+
+  // calculate SAD for each intra prediction mode;
+  // we're intentionally not doing 4x4, we just want a rough estimate
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    unsigned int err;
+
+    xd->mode_info_context->mbmi.mode = mode;
+    vp9_build_intra_predictors_mby(xd);
+    err = vp9_sad16x16(xd->predictor, 16, buf->y_buffer + mb_y_offset,
+                       buf->y_stride, best_err);
+    // find best
+    if (err < best_err) {
+      best_err  = err;
+      best_mode = mode;
+    }
+  }
+
+  if (pbest_mode)
+    *pbest_mode = best_mode;
+
+  return best_err;
+}
+
+static void update_mbgraph_mb_stats(VP9_COMP *cpi,
+                                    MBGRAPH_MB_STATS *stats,
+                                    YV12_BUFFER_CONFIG *buf,
+                                    int mb_y_offset,
+                                    YV12_BUFFER_CONFIG *golden_ref,
+                                    int_mv *prev_golden_ref_mv,
+                                    int gld_y_offset,
+                                    YV12_BUFFER_CONFIG *alt_ref,
+                                    int_mv *prev_alt_ref_mv,
+                                    int arf_y_offset) {
+  MACROBLOCK   *const x  = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int intra_error;
+
+  // FIXME in practice we're completely ignoring chroma here
+  xd->dst.y_buffer = buf->y_buffer + mb_y_offset;
+
+  // do intra 16x16 prediction
+  intra_error = find_best_16x16_intra(cpi, buf, mb_y_offset,
+                                      &stats->ref[INTRA_FRAME].m.mode);
+  if (intra_error <= 0)
+    intra_error = 1;
+  stats->ref[INTRA_FRAME].err = intra_error;
+
+  // Golden frame MV search, if it exists and is different from the last frame
+  if (golden_ref) {
+    int g_motion_error = do_16x16_motion_search(cpi, prev_golden_ref_mv,
+                                                &stats->ref[GOLDEN_FRAME].m.mv,
+                                                buf, mb_y_offset,
+                                                golden_ref, gld_y_offset);
+    stats->ref[GOLDEN_FRAME].err = g_motion_error;
+  } else {
+    stats->ref[GOLDEN_FRAME].err = INT_MAX;
+    stats->ref[GOLDEN_FRAME].m.mv.as_int = 0;
+  }
+
+  // Alt-ref frame MV search, if it exists and is different from the
+  // last/golden frame
+  if (alt_ref) {
+    // int a_motion_error = do_16x16_motion_search(cpi, prev_alt_ref_mv,
+    //                                            &stats->ref[ALTREF_FRAME].m.mv,
+    //                                            buf, mb_y_offset,
+    //                                            alt_ref, arf_y_offset);
+
+    int a_motion_error =
+      do_16x16_zerozero_search(cpi,
+                               &stats->ref[ALTREF_FRAME].m.mv,
+                               buf, mb_y_offset,
+                               alt_ref, arf_y_offset);
+
+    stats->ref[ALTREF_FRAME].err = a_motion_error;
+  } else {
+    stats->ref[ALTREF_FRAME].err = INT_MAX;
+    stats->ref[ALTREF_FRAME].m.mv.as_int = 0;
+  }
+}
+
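+// Collect per-MB stats for a whole frame in raster order, propagating each
+// MB's best golden/alt-ref MVs to its right neighbor (and the first
+// column's MVs down to the next row) as search predictors.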
+static void update_mbgraph_frame_stats(VP9_COMP *cpi,
+                                       MBGRAPH_FRAME_STATS *stats,
+                                       YV12_BUFFER_CONFIG *buf,
+                                       YV12_BUFFER_CONFIG *golden_ref,
+                                       YV12_BUFFER_CONFIG *alt_ref) {
+  MACROBLOCK   *const x  = &cpi->mb;
+  VP9_COMMON   *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int mb_col, mb_row, offset = 0;
+  int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
+  int_mv arf_top_mv, gld_top_mv;
+  MODE_INFO mi_local;
+
+  // Set up limit values for motion vectors to prevent them from extending
+  // outside the UMV borders.
+  arf_top_mv.as_int = 0;
+  gld_top_mv.as_int = 0;
+  x->mv_row_min     = -(VP8BORDERINPIXELS - 16 - INTERP_EXTEND);
+  x->mv_row_max     = (cm->mb_rows - 1) * 16 + VP8BORDERINPIXELS - 16 - INTERP_EXTEND;
+  xd->up_available  = 0;
+  xd->dst.y_stride  = buf->y_stride;
+  xd->pre.y_stride  = buf->y_stride;
+  xd->dst.uv_stride = buf->uv_stride;
+  xd->mode_info_context = &mi_local;
+
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+    int_mv arf_left_mv, gld_left_mv;
+    int mb_y_in_offset  = mb_y_offset;
+    int arf_y_in_offset = arf_y_offset;
+    int gld_y_in_offset = gld_y_offset;
+
+    // Set up limit values for motion vectors to prevent them from extending
+    // outside the UMV borders.
+    arf_left_mv.as_int = arf_top_mv.as_int;
+    gld_left_mv.as_int = gld_top_mv.as_int;
+    x->mv_col_min      = -(VP8BORDERINPIXELS - 16 - INTERP_EXTEND);
+    x->mv_col_max      = (cm->mb_cols - 1) * 16 + VP8BORDERINPIXELS - 16 - INTERP_EXTEND;
+    xd->left_available = 0;
+
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
+
+      update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset,
+                              golden_ref, &gld_left_mv, gld_y_in_offset,
+                              alt_ref,    &arf_left_mv, arf_y_in_offset);
+      arf_left_mv.as_int = mb_stats->ref[ALTREF_FRAME].m.mv.as_int;
+      gld_left_mv.as_int = mb_stats->ref[GOLDEN_FRAME].m.mv.as_int;
+      if (mb_col == 0) {
+        arf_top_mv.as_int = arf_left_mv.as_int;
+        gld_top_mv.as_int = gld_left_mv.as_int;
+      }
+      xd->left_available = 1;
+      mb_y_in_offset    += 16;
+      gld_y_in_offset   += 16;
+      arf_y_in_offset   += 16;
+      x->mv_col_min     -= 16;
+      x->mv_col_max     -= 16;
+    }
+    xd->up_available = 1;
+    mb_y_offset     += buf->y_stride * 16;
+    gld_y_offset    += golden_ref->y_stride * 16;
+    if (alt_ref)
+      arf_y_offset    += alt_ref->y_stride * 16;
+    x->mv_row_min   -= 16;
+    x->mv_row_max   -= 16;
+    offset          += cm->mb_cols;
+  }
+}
+
+// void separate_arf_mbs_byzz
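+// Split the frame's MBs into two segments: segment 1 holds MBs whose
+// zero-MV alt-ref prediction stayed good across the whole GF group,
+// segment 0 holds everything else.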
+static void separate_arf_mbs(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mb_col, mb_row, offset, i;
+  int ncnt[4];
+  int n_frames = cpi->mbgraph_n_frames;
+
+  int *arf_not_zz;
+
+  CHECK_MEM_ERROR(arf_not_zz,
+                  vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
+
+  vpx_memset(arf_not_zz, 0,
+             cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz));
+
+  // We are not interested in results beyond the alt ref itself.
+  if (n_frames > cpi->frames_till_gf_update_due)
+    n_frames = cpi->frames_till_gf_update_due;
+
+  // defer cost to reference frames
+  for (i = n_frames - 1; i >= 0; i--) {
+    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+
+    for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
+         offset += cm->mb_cols, mb_row++) {
+      for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+        MBGRAPH_MB_STATS *mb_stats =
+          &frame_stats->mb_stats[offset + mb_col];
+
+        int altref_err = mb_stats->ref[ALTREF_FRAME].err;
+        int intra_err  = mb_stats->ref[INTRA_FRAME ].err;
+        int golden_err = mb_stats->ref[GOLDEN_FRAME].err;
+
+        // Mark the MB if the zero-MV alt-ref prediction was poor, either in
+        // absolute terms or relative to the intra/golden predictions.
+        if ((altref_err > 1000) ||
+            (altref_err > intra_err) ||
+            (altref_err > golden_err)) {
+          arf_not_zz[offset + mb_col]++;
+        }
+      }
+    }
+  }
+
+  vpx_memset(ncnt, 0, sizeof(ncnt));
+  for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
+       offset += cm->mb_cols, mb_row++) {
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      // If any of the blocks in the sequence failed then the MB
+      // goes in segment 0
+      if (arf_not_zz[offset + mb_col]) {
+        ncnt[0]++;
+        cpi->segmentation_map[offset + mb_col] = 0;
+      } else {
+        ncnt[1]++;
+        cpi->segmentation_map[offset + mb_col] = 1;
+      }
+    }
+  }
+
+  // Only bother with segmentation if over 10% of the MBs are in the static
+  // segment
+  // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) )
+  if (1) {
+    // Note % of blocks that are marked as static
+    if (cm->MBs)
+      cpi->static_mb_pct = (ncnt[1] * 100) / cm->MBs;
+
+    // This error case should not be reachable as this function should
+    // never be called with the common data structure uninitialized.
+    else
+      cpi->static_mb_pct = 0;
+
+    cpi->seg0_cnt = ncnt[0];
+    vp9_enable_segmentation((VP9_PTR) cpi);
+  } else {
+    cpi->static_mb_pct = 0;
+    vp9_disable_segmentation((VP9_PTR) cpi);
+  }
+
+  // Free locally allocated storage
+  vpx_free(arf_not_zz);
+}
+
+void vp9_update_mbgraph_stats(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int i, n_frames = vp9_lookahead_depth(cpi->lookahead);
+  YV12_BUFFER_CONFIG *golden_ref = &cm->yv12_fb[cm->gld_fb_idx];
+
+  // We need to look ahead beyond where the ARF transitions into being a GF,
+  // so exit if we don't look ahead beyond that point.
+  if (n_frames <= cpi->frames_till_gf_update_due)
+    return;
+  if (n_frames > cpi->common.frames_till_alt_ref_frame)
+    n_frames = cpi->common.frames_till_alt_ref_frame;
+  if (n_frames > MAX_LAG_BUFFERS)
+    n_frames = MAX_LAG_BUFFERS;
+
+  cpi->mbgraph_n_frames = n_frames;
+  for (i = 0; i < n_frames; i++) {
+    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+    vpx_memset(frame_stats->mb_stats, 0,
+               cm->mb_rows * cm->mb_cols *
+               sizeof(*cpi->mbgraph_stats[i].mb_stats));
+  }
+
+  // do motion search to find contribution of each reference to data
+  // later on in this GF group
+  // FIXME really, the GF/last MC search should be done forward, and
+  // the ARF MC search backwards, to get optimal results for MV caching
+  for (i = 0; i < n_frames; i++) {
+    MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+    struct lookahead_entry *q_cur =
+      vp9_lookahead_peek(cpi->lookahead, i);
+
+    assert(q_cur != NULL);
+
+    update_mbgraph_frame_stats(cpi, frame_stats, &q_cur->img,
+                               golden_ref, cpi->Source);
+  }
+
+  vp9_clear_system_state();  // __asm emms;
+
+  separate_arf_mbs(cpi);
+}
--- /dev/null
+++ b/vp9/encoder/mbgraph.h
@@ -1,0 +1,16 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_MBGRAPH_H__
+#define __INC_MBGRAPH_H__ 1
+
+extern void vp9_update_mbgraph_stats(VP9_COMP *cpi);
+
+#endif /* __INC_MBGRAPH_H__ */
--- /dev/null
+++ b/vp9/encoder/mcomp.c
@@ -1,0 +1,2203 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/encoder/onyx_int.h"
+#include "mcomp.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/config.h"
+#include <stdio.h>
+#include <limits.h>
+#include <math.h>
+#include "vp9/common/findnearmv.h"
+
+#ifdef ENTROPY_STATS
+static int mv_ref_ct[31][4][2];
+static int mv_mode_cts[4][2];
+#endif
+
+void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) {
+  int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL +
+      ((ref_mv->as_mv.col & 7) ? 1 : 0);
+  int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL +
+      ((ref_mv->as_mv.row & 7) ? 1 : 0);
+  int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL;
+  int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL;
+
+  /* Get the intersection of the UMV window and the valid MV window to
+   * reduce the number of checks in the diamond search. */
+  if (x->mv_col_min < col_min)
+    x->mv_col_min = col_min;
+  if (x->mv_col_max > col_max)
+    x->mv_col_max = col_max;
+  if (x->mv_row_min < row_min)
+    x->mv_row_min = row_min;
+  if (x->mv_row_max > row_max)
+    x->mv_row_max = row_max;
+}
+
+int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
+                    int Weight, int ishp) {
+  MV v;
+  v.row = (mv->as_mv.row - ref->as_mv.row);
+  v.col = (mv->as_mv.col - ref->as_mv.col);
+  return ((mvjcost[vp9_get_mv_joint(v)] +
+           mvcost[0][v.row] + mvcost[1][v.col]) *
+          Weight) >> 7;
+}
+
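+// Rate cost of coding (mv - ref): joint + per-component rates, scaled by
+// error_per_bit with Q8 rounding. Returns 0 when no cost tables are given.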
+static int mv_err_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
+                       int error_per_bit, int ishp) {
+  if (mvcost) {
+    MV v;
+    v.row = (mv->as_mv.row - ref->as_mv.row);
+    v.col = (mv->as_mv.col - ref->as_mv.col);
+    return ((mvjcost[vp9_get_mv_joint(v)] +
+             mvcost[0][v.row] + mvcost[1][v.col]) *
+            error_per_bit + 128) >> 8;
+  }
+  return 0;
+}
+
+static int mvsad_err_cost(int_mv *mv, int_mv *ref, DEC_MVSADCOSTS,
+                          int error_per_bit) {
+  if (mvsadcost) {
+    MV v;
+    v.row = (mv->as_mv.row - ref->as_mv.row);
+    v.col = (mv->as_mv.col - ref->as_mv.col);
+    return ((mvjsadcost[vp9_get_mv_joint(v)] +
+             mvsadcost[0][v.row] + mvsadcost[1][v.col]) *
+            error_per_bit + 128) >> 8;
+  }
+  return 0;
+}
+
+void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) {
+  int Len;
+  int search_site_count = 0;
+
+  // Generate offsets for 4 search sites per step.
+  Len = MAX_FIRST_STEP;
+  x->ss[search_site_count].mv.col = 0;
+  x->ss[search_site_count].mv.row = 0;
+  x->ss[search_site_count].offset = 0;
+  search_site_count++;
+
+  while (Len > 0) {
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = 0;
+    x->ss[search_site_count].mv.row = -Len;
+    x->ss[search_site_count].offset = -Len * stride;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = 0;
+    x->ss[search_site_count].mv.row = Len;
+    x->ss[search_site_count].offset = Len * stride;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = -Len;
+    x->ss[search_site_count].mv.row = 0;
+    x->ss[search_site_count].offset = -Len;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = Len;
+    x->ss[search_site_count].mv.row = 0;
+    x->ss[search_site_count].offset = Len;
+    search_site_count++;
+
+    // Contract.
+    Len /= 2;
+  }
+
+  x->ss_count = search_site_count;
+  x->searches_per_step = 4;
+}
+
+void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
+  int Len;
+  int search_site_count = 0;
+
+  // Generate offsets for 8 search sites per step.
+  Len = MAX_FIRST_STEP;
+  x->ss[search_site_count].mv.col = 0;
+  x->ss[search_site_count].mv.row = 0;
+  x->ss[search_site_count].offset = 0;
+  search_site_count++;
+
+  while (Len > 0) {
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = 0;
+    x->ss[search_site_count].mv.row = -Len;
+    x->ss[search_site_count].offset = -Len * stride;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = 0;
+    x->ss[search_site_count].mv.row = Len;
+    x->ss[search_site_count].offset = Len * stride;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = -Len;
+    x->ss[search_site_count].mv.row = 0;
+    x->ss[search_site_count].offset = -Len;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = Len;
+    x->ss[search_site_count].mv.row = 0;
+    x->ss[search_site_count].offset = Len;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = -Len;
+    x->ss[search_site_count].mv.row = -Len;
+    x->ss[search_site_count].offset = -Len * stride - Len;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = Len;
+    x->ss[search_site_count].mv.row = -Len;
+    x->ss[search_site_count].offset = -Len * stride + Len;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = -Len;
+    x->ss[search_site_count].mv.row = Len;
+    x->ss[search_site_count].offset = Len * stride - Len;
+    search_site_count++;
+
+    // Compute offsets for search sites.
+    x->ss[search_site_count].mv.col = Len;
+    x->ss[search_site_count].mv.row = Len;
+    x->ss[search_site_count].offset = Len * stride + Len;
+    search_site_count++;
+
+    // Contract.
+    Len /= 2;
+  }
+
+  x->ss_count = search_site_count;
+  x->searches_per_step = 8;
+}
+
+/*
+ * To avoid the penalty of a read crossing a cache line, preload the
+ * reference area into a small aligned buffer, so that reads from that
+ * buffer never cross a cache line. This reduces the CPU cycles spent
+ * reading ref data in the sub-pixel filter functions.
+ * TODO: Currently, since the sub-pixel search range here is -3 to +3, we
+ * copy a 22-row x 32-col area, which is enough for a 16x16 macroblock.
+ * Later, for SPLITMV, we could reduce the area.
+ */
+
+/* estimated cost of a motion vector (r,c) */
+#define MVC(r, c)                                       \
+    (mvcost ?                                           \
+     ((mvjcost[((r) != rr) * 2 + ((c) != rc)] +         \
+       mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) * \
+      error_per_bit + 128) >> 8 : 0)
+
+#define SP(x) (((x) & 7) << 1)  // convert motion vector component to offset
+                                // for svf calc
+
+#define IFMVCV(r, c, s, e)                                \
+    if (c >= minc && c <= maxc && r >= minr && r <= maxr) \
+      s                                                   \
+    else                                                  \
+      e;
+
+/* pointer to predictor base of a motionvector */
+#define PRE(r, c) (y + (((r) >> 3) * y_stride + ((c) >> 3) -(offset)))
+
+/* returns subpixel variance error function */
+#define DIST(r, c) \
+    vfp->svf(PRE(r, c), y_stride, SP(c), SP(r), z, b->src_stride, &sse)
+
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER(v, r, c) \
+    IFMVCV(r, c, {                                                       \
+      thismse = (DIST(r, c));                                            \
+      if ((v = MVC(r, c) + thismse) < besterr) {                         \
+        besterr = v;                                                     \
+        br = r;                                                          \
+        bc = c;                                                          \
+        *distortion = thismse;                                           \
+        *sse1 = sse;                                                     \
+      }                                                                  \
+    },                                                                   \
+    v = INT_MAX;)
+
+#define MIN(x,y) (((x)<(y))?(x):(y))
+#define MAX(x,y) (((x)>(y))?(x):(y))
+
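+// Iterative sub-pixel refinement: starting from the best full-pel MV, run a
+// bounded number of iterations at 1/2-, 1/4- and (when high-precision MVs
+// are allowed) 1/8-pel, testing the four axial points plus the most
+// promising diagonal via CHECK_BETTER at each step.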
+int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                                             int_mv *bestmv, int_mv *ref_mv,
+                                             int error_per_bit,
+                                             const vp9_variance_fn_ptr_t *vfp,
+                                             DEC_MVCOSTS,
+                                             int *distortion,
+                                             unsigned int *sse1) {
+  unsigned char *z = (*(b->base_src) + b->src);
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int rr, rc, br, bc, hstep;
+  int tr, tc;
+  unsigned int besterr = INT_MAX;
+  unsigned int left, right, up, down, diag;
+  unsigned int sse;
+  unsigned int whichdir;
+  unsigned int halfiters = 4;
+  unsigned int quarteriters = 4;
+  unsigned int eighthiters = 4;
+  int thismse;
+  int maxc, minc, maxr, minr;
+  int y_stride;
+  int offset;
+  int usehp = xd->allow_high_precision_mv;
+
+#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
+  unsigned char *y0 = *(d->base_pre) + d->pre +
+      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+  unsigned char *y;
+  int buf_r1, buf_r2, buf_c1, buf_c2;
+
+  // Clamping to avoid out-of-range data access
+  buf_r1 = ((bestmv->as_mv.row - INTERP_EXTEND) < x->mv_row_min) ?
+      (bestmv->as_mv.row - x->mv_row_min) : INTERP_EXTEND - 1;
+  buf_r2 = ((bestmv->as_mv.row + INTERP_EXTEND) > x->mv_row_max) ?
+      (x->mv_row_max - bestmv->as_mv.row) : INTERP_EXTEND - 1;
+  buf_c1 = ((bestmv->as_mv.col - INTERP_EXTEND) < x->mv_col_min) ?
+      (bestmv->as_mv.col - x->mv_col_min) : INTERP_EXTEND - 1;
+  buf_c2 = ((bestmv->as_mv.col + INTERP_EXTEND) > x->mv_col_max) ?
+      (x->mv_col_max - bestmv->as_mv.col) : INTERP_EXTEND - 1;
+  y_stride = 32;
+
+  /* Copy to intermediate buffer before searching. */
+  vfp->copymem(y0 - buf_c1 - d->pre_stride * buf_r1, d->pre_stride,
+               xd->y_buf, y_stride, 16 + buf_r1 + buf_r2);
+  y = xd->y_buf + y_stride * buf_r1 + buf_c1;
+#else
+  unsigned char *y = *(d->base_pre) + d->pre +
+      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+  y_stride = d->pre_stride;
+#endif
+
+  rr = ref_mv->as_mv.row;
+  rc = ref_mv->as_mv.col;
+  br = bestmv->as_mv.row << 3;
+  bc = bestmv->as_mv.col << 3;
+  hstep = 4;
+  minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) - ((1 << MV_MAX_BITS) - 1));
+  maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) + ((1 << MV_MAX_BITS) - 1));
+  minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) - ((1 << MV_MAX_BITS) - 1));
+  maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) + ((1 << MV_MAX_BITS) - 1));
+
+  tr = br;
+  tc = bc;
+
+  offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
+  // central mv
+  bestmv->as_mv.row <<= 3;
+  bestmv->as_mv.col <<= 3;
+
+  // calculate central point error
+  besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, MVCOSTS,
+                         error_per_bit, xd->allow_high_precision_mv);
+
+  // TODO: Each subsequent iteration checks at least one point in common
+  // with the last iteration (two, if the diagonal was selected).
+  while (--halfiters) {
+    // 1/2 pel
+    CHECK_BETTER(left, tr, tc - hstep);
+    CHECK_BETTER(right, tr, tc + hstep);
+    CHECK_BETTER(up, tr - hstep, tc);
+    CHECK_BETTER(down, tr + hstep, tc);
+
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+
+    // no reason to check the same one again.
+    if (tr == br && tc == bc)
+      break;
+
+    tr = br;
+    tc = bc;
+  }
+
+  // TODO: Each subsequent iteration checks at least one point in common
+  // with the last iteration (two, if the diagonal was selected).
+
+  // 1/4 pel
+  hstep >>= 1;
+  while (--quarteriters) {
+    CHECK_BETTER(left, tr, tc - hstep);
+    CHECK_BETTER(right, tr, tc + hstep);
+    CHECK_BETTER(up, tr - hstep, tc);
+    CHECK_BETTER(down, tr + hstep, tc);
+
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+
+    // no reason to check the same one again.
+    if (tr == br && tc == bc)
+      break;
+
+    tr = br;
+    tc = bc;
+  }
+
+  if (xd->allow_high_precision_mv) {
+    usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+  } else {
+    usehp = 0;
+  }
+
+  if (usehp) {
+    hstep >>= 1;
+    while (--eighthiters) {
+      CHECK_BETTER(left, tr, tc - hstep);
+      CHECK_BETTER(right, tr, tc + hstep);
+      CHECK_BETTER(up, tr - hstep, tc);
+      CHECK_BETTER(down, tr + hstep, tc);
+
+      whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+      switch (whichdir) {
+        case 0:
+          CHECK_BETTER(diag, tr - hstep, tc - hstep);
+          break;
+        case 1:
+          CHECK_BETTER(diag, tr - hstep, tc + hstep);
+          break;
+        case 2:
+          CHECK_BETTER(diag, tr + hstep, tc - hstep);
+          break;
+        case 3:
+          CHECK_BETTER(diag, tr + hstep, tc + hstep);
+          break;
+      }
+
+      // no reason to check the same one again.
+      if (tr == br && tc == bc)
+        break;
+
+      tr = br;
+      tc = bc;
+    }
+  }
+  bestmv->as_mv.row = br;
+  bestmv->as_mv.col = bc;
+
+  if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+#undef MVC
+#undef PRE
+#undef DIST
+#undef IFMVCV
+#undef CHECK_BETTER
+#undef MIN
+#undef MAX
+
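+// Single-pass sub-pixel search: test half-pel left/right/up/down plus one
+// diagonal, then repeat the same pattern at quarter-pel and, if
+// high-precision MVs are in use, at eighth-pel around the running best.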
+int vp9_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                                 int_mv *bestmv, int_mv *ref_mv,
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 DEC_MVCOSTS, int *distortion,
+                                 unsigned int *sse1) {
+  int bestmse = INT_MAX;
+  int_mv startmv;
+  int_mv this_mv;
+  int_mv orig_mv;
+  int yrow_movedback = 0, ycol_movedback = 0;
+  unsigned char *z = (*(b->base_src) + b->src);
+  int left, right, up, down, diag;
+  unsigned int sse;
+  int whichdir;
+  int thismse;
+  int y_stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int usehp = xd->allow_high_precision_mv;
+
+#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
+  unsigned char *y0 = *(d->base_pre) + d->pre +
+      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+  unsigned char *y;
+
+  y_stride = 32;
+  /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
+  vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
+  y = xd->y_buf + y_stride + 1;
+#else
+  unsigned char *y = *(d->base_pre) + d->pre +
+      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+  y_stride = d->pre_stride;
+#endif
+
+  // central mv
+  bestmv->as_mv.row <<= 3;
+  bestmv->as_mv.col <<= 3;
+  startmv = *bestmv;
+  orig_mv = *bestmv;
+
+  // calculate central point error
+  bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+  *distortion = bestmse;
+  bestmse += mv_err_cost(bestmv, ref_mv, MVCOSTS, error_per_bit,
+                         xd->allow_high_precision_mv);
+
+  // go left then right and check error
+  this_mv.as_mv.row = startmv.as_mv.row;
+  this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
+  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+  left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (left < bestmse) {
+    *bestmv = this_mv;
+    bestmse = left;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  this_mv.as_mv.col += 8;
+  thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+  right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                                xd->allow_high_precision_mv);
+
+  if (right < bestmse) {
+    *bestmv = this_mv;
+    bestmse = right;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  // go up then down and check error
+  this_mv.as_mv.col = startmv.as_mv.col;
+  this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
+  thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+  up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                             xd->allow_high_precision_mv);
+
+  if (up < bestmse) {
+    *bestmv = this_mv;
+    bestmse = up;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  this_mv.as_mv.row += 8;
+  thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+  down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (down < bestmse) {
+    *bestmv = this_mv;
+    bestmse = down;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  // now check 1 more diagonal
+  whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+  // for(whichdir =0;whichdir<4;whichdir++)
+  // {
+  this_mv = startmv;
+
+  switch (whichdir) {
+    case 0:
+      this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+      this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
+      break;
+    case 1:
+      this_mv.as_mv.col += 4;
+      this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+      thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+      break;
+    case 2:
+      this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+      this_mv.as_mv.row += 4;
+      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+      break;
+    case 3:
+    default:
+      this_mv.as_mv.col += 4;
+      this_mv.as_mv.row += 4;
+      thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+      break;
+  }
+
+  diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (diag < bestmse) {
+    *bestmv = this_mv;
+    bestmse = diag;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+//  }
+
+  // time to check quarter pels.
+  if (bestmv->as_mv.row < startmv.as_mv.row) {
+    y -= y_stride;
+    yrow_movedback = 1;
+  }
+
+  if (bestmv->as_mv.col < startmv.as_mv.col) {
+    y--;
+    ycol_movedback = 1;
+  }
+
+  startmv = *bestmv;
+
+  // go left then right and check error
+  this_mv.as_mv.row = startmv.as_mv.row;
+
+  if (startmv.as_mv.col & 7) {
+    this_mv.as_mv.col = startmv.as_mv.col - 2;
+    thismse = vfp->svf(y, y_stride,
+                       SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                       z, b->src_stride, &sse);
+  } else {
+    this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+    thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
+                       b->src_stride, &sse);
+  }
+
+  left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (left < bestmse) {
+    *bestmv = this_mv;
+    bestmse = left;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  this_mv.as_mv.col += 4;
+  thismse = vfp->svf(y, y_stride,
+                     SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                     z, b->src_stride, &sse);
+  right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                                xd->allow_high_precision_mv);
+
+  if (right < bestmse) {
+    *bestmv = this_mv;
+    bestmse = right;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  // go up then down and check error
+  this_mv.as_mv.col = startmv.as_mv.col;
+
+  if (startmv.as_mv.row & 7) {
+    this_mv.as_mv.row = startmv.as_mv.row - 2;
+    thismse = vfp->svf(y, y_stride,
+                       SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                       z, b->src_stride, &sse);
+  } else {
+    this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+    thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6),
+                       z, b->src_stride, &sse);
+  }
+
+  up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                             xd->allow_high_precision_mv);
+
+  if (up < bestmse) {
+    *bestmv = this_mv;
+    bestmse = up;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  this_mv.as_mv.row += 4;
+  thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                     z, b->src_stride, &sse);
+  down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (down < bestmse) {
+    *bestmv = this_mv;
+    bestmse = down;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  // now check 1 more diagonal
+  whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+//  for(whichdir=0;whichdir<4;whichdir++)
+//  {
+  this_mv = startmv;
+
+  switch (whichdir) {
+    case 0:
+
+      if (startmv.as_mv.row & 7) {
+        this_mv.as_mv.row -= 2;
+
+        if (startmv.as_mv.col & 7) {
+          this_mv.as_mv.col -= 2;
+          thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+        } else {
+          this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+          thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row),
+                             z, b->src_stride, &sse);
+        }
+      } else {
+        this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+
+        if (startmv.as_mv.col & 7) {
+          this_mv.as_mv.col -= 2;
+          thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), z, b->src_stride, &sse);
+        } else {
+          this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+          thismse = vfp->svf(y - y_stride - 1, y_stride, SP(6), SP(6), z, b->src_stride, &sse);
+        }
+      }
+
+      break;
+    case 1:
+      this_mv.as_mv.col += 2;
+
+      if (startmv.as_mv.row & 7) {
+        this_mv.as_mv.row -= 2;
+        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+      } else {
+        this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+        thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), z, b->src_stride, &sse);
+      }
+
+      break;
+    case 2:
+      this_mv.as_mv.row += 2;
+
+      if (startmv.as_mv.col & 7) {
+        this_mv.as_mv.col -= 2;
+        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                           z, b->src_stride, &sse);
+      } else {
+        this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+        thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
+                           b->src_stride, &sse);
+      }
+
+      break;
+    case 3:
+      this_mv.as_mv.col += 2;
+      this_mv.as_mv.row += 2;
+      thismse = vfp->svf(y, y_stride,
+                         SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                         z, b->src_stride, &sse);
+      break;
+  }
+
+  diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (diag < bestmse) {
+    *bestmv = this_mv;
+    bestmse = diag;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  if (x->e_mbd.allow_high_precision_mv) {
+    usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+  } else {
+    usehp = 0;
+  }
+  if (!usehp)
+    return bestmse;
+
+  /* Now do 1/8th pixel */
+  if (bestmv->as_mv.row < orig_mv.as_mv.row && !yrow_movedback) {
+    y -= y_stride;
+    yrow_movedback = 1;
+  }
+
+  if (bestmv->as_mv.col < orig_mv.as_mv.col && !ycol_movedback) {
+    y--;
+    ycol_movedback = 1;
+  }
+
+  startmv = *bestmv;
+
+  // go left then right and check error
+  this_mv.as_mv.row = startmv.as_mv.row;
+
+  if (startmv.as_mv.col & 7) {
+    this_mv.as_mv.col = startmv.as_mv.col - 1;
+    thismse = vfp->svf(y, y_stride,
+                       SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                       z, b->src_stride, &sse);
+  } else {
+    this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
+    thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row),
+                       z, b->src_stride, &sse);
+  }
+
+  left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (left < bestmse) {
+    *bestmv = this_mv;
+    bestmse = left;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  this_mv.as_mv.col += 2;
+  thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                     z, b->src_stride, &sse);
+  right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                                xd->allow_high_precision_mv);
+
+  if (right < bestmse) {
+    *bestmv = this_mv;
+    bestmse = right;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  // go up then down and check error
+  this_mv.as_mv.col = startmv.as_mv.col;
+
+  if (startmv.as_mv.row & 7) {
+    this_mv.as_mv.row = startmv.as_mv.row - 1;
+    thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+  } else {
+    this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
+    thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
+  }
+
+  up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                             xd->allow_high_precision_mv);
+
+  if (up < bestmse) {
+    *bestmv = this_mv;
+    bestmse = up;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  this_mv.as_mv.row += 2;
+  thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+  down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (down < bestmse) {
+    *bestmv = this_mv;
+    bestmse = down;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  // now check 1 more diagonal
+  whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+//  for(whichdir=0;whichdir<4;whichdir++)
+//  {
+  this_mv = startmv;
+
+  switch (whichdir) {
+    case 0:
+
+      if (startmv.as_mv.row & 7) {
+        this_mv.as_mv.row -= 1;
+
+        if (startmv.as_mv.col & 7) {
+          this_mv.as_mv.col -= 1;
+          thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+        } else {
+          this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
+          thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row),
+                             z, b->src_stride, &sse);
+        }
+      } else {
+        this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
+
+        if (startmv.as_mv.col & 7) {
+          this_mv.as_mv.col -= 1;
+          thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
+        } else {
+          this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
+          thismse = vfp->svf(y - y_stride - 1, y_stride, SP(7), SP(7), z, b->src_stride, &sse);
+        }
+      }
+
+      break;
+    case 1:
+      this_mv.as_mv.col += 1;
+
+      if (startmv.as_mv.row & 7) {
+        this_mv.as_mv.row -= 1;
+        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+      } else {
+        this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
+        thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
+      }
+
+      break;
+    case 2:
+      this_mv.as_mv.row += 1;
+
+      if (startmv.as_mv.col & 7) {
+        this_mv.as_mv.col -= 1;
+        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+      } else {
+        this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
+        thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+      }
+
+      break;
+    case 3:
+      this_mv.as_mv.col += 1;
+      this_mv.as_mv.row += 1;
+      thismse = vfp->svf(y, y_stride,  SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+      break;
+  }
+
+  diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (diag < bestmse) {
+    *bestmv = this_mv;
+    bestmse = diag;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  return bestmse;
+}
+
+#undef SP
+
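+// Half-pel-only variant of the sub-pixel search above: test the four axial
+// half-pel points plus one diagonal and return the best cost found.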
+int vp9_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                                  int_mv *bestmv, int_mv *ref_mv,
+                                  int error_per_bit,
+                                  const vp9_variance_fn_ptr_t *vfp,
+                                  DEC_MVCOSTS,
+                                  int *distortion,
+                                  unsigned int *sse1) {
+  int bestmse = INT_MAX;
+  int_mv startmv;
+  int_mv this_mv;
+  unsigned char *z = (*(b->base_src) + b->src);
+  int left, right, up, down, diag;
+  unsigned int sse;
+  int whichdir;
+  int thismse;
+  int y_stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
+  unsigned char *y0 = *(d->base_pre) + d->pre +
+      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+  unsigned char *y;
+
+  y_stride = 32;
+  /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
+  vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
+  y = xd->y_buf + y_stride + 1;
+#else
+  unsigned char *y = *(d->base_pre) + d->pre +
+      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+  y_stride = d->pre_stride;
+#endif
+
+  // central mv
+  bestmv->as_mv.row <<= 3;
+  bestmv->as_mv.col <<= 3;
+  startmv = *bestmv;
+
+  // calculate central point error
+  bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+  *distortion = bestmse;
+  bestmse += mv_err_cost(bestmv, ref_mv, MVCOSTS, error_per_bit,
+                         xd->allow_high_precision_mv);
+
+  // go left then right and check error
+  this_mv.as_mv.row = startmv.as_mv.row;
+  this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
+  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+  left = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (left < bestmse) {
+    *bestmv = this_mv;
+    bestmse = left;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  this_mv.as_mv.col += 8;
+  thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+  right = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                                xd->allow_high_precision_mv);
+
+  if (right < bestmse) {
+    *bestmv = this_mv;
+    bestmse = right;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  // go up then down and check error
+  this_mv.as_mv.col = startmv.as_mv.col;
+  this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
+  thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+  up = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                             xd->allow_high_precision_mv);
+
+  if (up < bestmse) {
+    *bestmv = this_mv;
+    bestmse = up;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  this_mv.as_mv.row += 8;
+  thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+  down = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (down < bestmse) {
+    *bestmv = this_mv;
+    bestmse = down;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  // now check 1 more diagonal -
+  whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+  this_mv = startmv;
+
+  switch (whichdir) {
+    case 0:
+      this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+      this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
+      break;
+    case 1:
+      this_mv.as_mv.col += 4;
+      this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+      thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+      break;
+    case 2:
+      this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+      this_mv.as_mv.row += 4;
+      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+      break;
+    case 3:
+    default:
+      this_mv.as_mv.col += 4;
+      this_mv.as_mv.row += 4;
+      thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+      break;
+  }
+
+  diag = thismse + mv_err_cost(&this_mv, ref_mv, MVCOSTS, error_per_bit,
+                               xd->allow_high_precision_mv);
+
+  if (diag < bestmse) {
+    *bestmv = this_mv;
+    bestmse = diag;
+    *distortion = thismse;
+    *sse1 = sse;
+  }
+
+  return bestmse;
+}
+
+#define CHECK_BOUNDS(range) \
+  {\
+    all_in = 1;\
+    all_in &= ((br-range) >= x->mv_row_min);\
+    all_in &= ((br+range) <= x->mv_row_max);\
+    all_in &= ((bc-range) >= x->mv_col_min);\
+    all_in &= ((bc+range) <= x->mv_col_max);\
+  }
+
+#define CHECK_POINT \
+  {\
+    if (this_mv.as_mv.col < x->mv_col_min) continue;\
+    if (this_mv.as_mv.col > x->mv_col_max) continue;\
+    if (this_mv.as_mv.row < x->mv_row_min) continue;\
+    if (this_mv.as_mv.row > x->mv_row_max) continue;\
+  }
+
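+// Accept a candidate only if its raw SAD already beats the current best;
+// the MV rate cost is added before the final comparison, so the cost
+// computation is skipped for clearly worse points.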
+#define CHECK_BETTER \
+  {\
+    if (thissad < bestsad)\
+    {\
+      thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);\
+      if (thissad < bestsad)\
+      {\
+        bestsad = thissad;\
+        best_site = i;\
+      }\
+    }\
+  }
+
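+// After a move in hexagon direction k, only three of the six hexagon points
+// around the new center are unvisited; this table lists them for each k.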
+static const MV next_chkpts[6][3] = {
+  {{ -2, 0}, { -1, -2}, {1, -2}},
+  {{ -1, -2}, {1, -2}, {2, 0}},
+  {{1, -2}, {2, 0}, {1, 2}},
+  {{2, 0}, {1, 2}, { -1, 2}},
+  {{1, 2}, { -1, 2}, { -2, 0}},
+  {{ -1, 2}, { -2, 0}, { -1, -2}}
+};
+
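+// Hexagon-based integer-pel search: evaluate the six hexagon points around
+// the clamped starting MV, then repeatedly step toward the best point,
+// checking only the three new hexagon neighbors each time, and finish with
+// up to 32 rounds of 4-neighbor refinement.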
+int vp9_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                   int_mv *ref_mv, int_mv *best_mv,
+                   int search_param, int sad_per_bit,
+                   const vp9_variance_fn_ptr_t *vfp,
+                   DEC_MVSADCOSTS, DEC_MVCOSTS,
+                   int_mv *center_mv) {
+  MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} };
+  MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
+  int i, j;
+
+  unsigned char *what = (*(b->base_src) + b->src);
+  int what_stride = b->src_stride;
+  int in_what_stride = d->pre_stride;
+  int br, bc;
+  int_mv this_mv;
+  unsigned int bestsad = 0x7fffffff;
+  unsigned int thissad;
+  unsigned char *base_offset;
+  unsigned char *this_offset;
+  int k = -1;
+  int all_in;
+  int best_site = -1;
+
+  int_mv fcenter_mv;
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  // adjust ref_mv to make sure it is within MV range
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  br = ref_mv->as_mv.row;
+  bc = ref_mv->as_mv.col;
+
+  // Work out the start point for the search
+  base_offset = (unsigned char *)(*(d->base_pre) + d->pre);
+  this_offset = base_offset + (br * (d->pre_stride)) + bc;
+  this_mv.as_mv.row = br;
+  this_mv.as_mv.col = bc;
+  bestsad = vfp->sdf(what, what_stride, this_offset,
+                     in_what_stride, 0x7fffffff)
+            + mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+  // hex search
+  // j=0
+  CHECK_BOUNDS(2)
+
+  if (all_in) {
+    for (i = 0; i < 6; i++) {
+      this_mv.as_mv.row = br + hex[i].row;
+      this_mv.as_mv.col = bc + hex[i].col;
+      this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
+      thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+      CHECK_BETTER
+    }
+  } else {
+    for (i = 0; i < 6; i++) {
+      this_mv.as_mv.row = br + hex[i].row;
+      this_mv.as_mv.col = bc + hex[i].col;
+      CHECK_POINT
+      this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
+      thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+      CHECK_BETTER
+    }
+  }
+
+  if (best_site == -1)
+    goto cal_neighbors;
+  else {
+    br += hex[best_site].row;
+    bc += hex[best_site].col;
+    k = best_site;
+  }
+
+  for (j = 1; j < 127; j++) {
+    best_site = -1;
+    CHECK_BOUNDS(2)
+
+    if (all_in) {
+      for (i = 0; i < 3; i++) {
+        this_mv.as_mv.row = br + next_chkpts[k][i].row;
+        this_mv.as_mv.col = bc + next_chkpts[k][i].col;
+        this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+        CHECK_BETTER
+      }
+    } else {
+      for (i = 0; i < 3; i++) {
+        this_mv.as_mv.row = br + next_chkpts[k][i].row;
+        this_mv.as_mv.col = bc + next_chkpts[k][i].col;
+        CHECK_POINT
+        this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+        CHECK_BETTER
+      }
+    }
+
+    if (best_site == -1)
+      break;
+    else {
+      br += next_chkpts[k][best_site].row;
+      bc += next_chkpts[k][best_site].col;
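+      // The new direction is (k + best_site - 1) mod 6; the addition and
+      // conditional subtractions below compute this without a divide.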
+      k += 5 + best_site;
+      if (k >= 12) k -= 12;
+      else if (k >= 6) k -= 6;
+    }
+  }
+
+  // check 4 1-away neighbors
+cal_neighbors:
+  for (j = 0; j < 32; j++) {
+    best_site = -1;
+    CHECK_BOUNDS(1)
+
+    if (all_in) {
+      for (i = 0; i < 4; i++) {
+        this_mv.as_mv.row = br + neighbors[i].row;
+        this_mv.as_mv.col = bc + neighbors[i].col;
+        this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+        CHECK_BETTER
+      }
+    } else {
+      for (i = 0; i < 4; i++) {
+        this_mv.as_mv.row = br + neighbors[i].row;
+        this_mv.as_mv.col = bc + neighbors[i].col;
+        CHECK_POINT
+        this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+        CHECK_BETTER
+      }
+    }
+
+    if (best_site == -1)
+      break;
+    else {
+      br += neighbors[best_site].row;
+      bc += neighbors[best_site].col;
+    }
+  }
+
+  best_mv->as_mv.row = br;
+  best_mv->as_mv.col = bc;
+
+  return bestsad;
+}
+#undef CHECK_BOUNDS
+#undef CHECK_POINT
+#undef CHECK_BETTER
+
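+// Step-down diamond search: starting from the clamped ref_mv, evaluate the
+// precomputed search-site offsets at each step size and recenter on any
+// improvement; num00 counts steps in which the best point stayed at the
+// starting position. Returns the variance-based error of the best MV plus
+// its rate cost.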
+int vp9_diamond_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                           int_mv *ref_mv, int_mv *best_mv,
+                           int search_param, int sad_per_bit, int *num00,
+                           vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
+                           int_mv *center_mv) {
+  int i, j, step;
+
+  unsigned char *what = (*(b->base_src) + b->src);
+  int what_stride = b->src_stride;
+  unsigned char *in_what;
+  int in_what_stride = d->pre_stride;
+  unsigned char *best_address;
+
+  int tot_steps;
+  int_mv this_mv;
+
+  int bestsad = INT_MAX;
+  int best_site = 0;
+  int last_site = 0;
+
+  int ref_row, ref_col;
+  int this_row_offset, this_col_offset;
+  search_site *ss;
+
+  unsigned char *check_here;
+  int thissad;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  ref_row = ref_mv->as_mv.row;
+  ref_col = ref_mv->as_mv.col;
+  *num00 = 0;
+  best_mv->as_mv.row = ref_row;
+  best_mv->as_mv.col = ref_col;
+
+  // Work out the start point for the search
+  in_what = (unsigned char *)(*(d->base_pre) + d->pre +
+                              (ref_row * (d->pre_stride)) + ref_col);
+  best_address = in_what;
+
+  // Check the starting position
+  bestsad = fn_ptr->sdf(what, what_stride, in_what,
+                        in_what_stride, 0x7fffffff)
+            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+  // search_param determines the length of the initial step and hence the
+  // number of iterations:
+  // 0 = initial step (MAX_FIRST_STEP) pel; 1 = (MAX_FIRST_STEP/2) pel;
+  // 2 = (MAX_FIRST_STEP/4) pel, etc.
+  ss = &x->ss[search_param * x->searches_per_step];
+  tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+  i = 1;
+
+  for (step = 0; step < tot_steps; step++) {
+    for (j = 0; j < x->searches_per_step; j++) {
+      // Trap illegal vectors
+      this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
+      this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
+
+      if ((this_col_offset > x->mv_col_min) &&
+          (this_col_offset < x->mv_col_max) &&
+          (this_row_offset > x->mv_row_min) &&
+          (this_row_offset < x->mv_row_max)) {
+        check_here = ss[i].offset + best_address;
+        thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+        if (thissad < bestsad) {
+          this_mv.as_mv.row = this_row_offset;
+          this_mv.as_mv.col = this_col_offset;
+          thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                    MVSADCOSTS, sad_per_bit);
+
+          if (thissad < bestsad) {
+            bestsad = thissad;
+            best_site = i;
+          }
+        }
+      }
+
+      i++;
+    }
+
+    if (best_site != last_site) {
+      best_mv->as_mv.row += ss[best_site].mv.row;
+      best_mv->as_mv.col += ss[best_site].mv.col;
+      best_address += ss[best_site].offset;
+      last_site = best_site;
+    } else if (best_address == in_what)
+      (*num00)++;
+  }
+
+  this_mv.as_mv.row = best_mv->as_mv.row << 3;
+  this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+  if (bestsad == INT_MAX)
+    return INT_MAX;
+
+  return
+      fn_ptr->vf(what, what_stride, best_address, in_what_stride,
+                 (unsigned int *)(&thissad)) +
+      mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+                  xd->allow_high_precision_mv);
+}
+
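+// Same as vp9_diamond_search_sad, but when all four candidate points of a
+// group are within bounds, their SADs are computed in one sdx4df call.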
+int vp9_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                             int_mv *ref_mv, int_mv *best_mv, int search_param,
+                             int sad_per_bit, int *num00,
+                             vp9_variance_fn_ptr_t *fn_ptr,
+                             DEC_MVCOSTS, int_mv *center_mv) {
+  int i, j, step;
+
+  unsigned char *what = (*(b->base_src) + b->src);
+  int what_stride = b->src_stride;
+  unsigned char *in_what;
+  int in_what_stride = d->pre_stride;
+  unsigned char *best_address;
+
+  int tot_steps;
+  int_mv this_mv;
+
+  int bestsad = INT_MAX;
+  int best_site = 0;
+  int last_site = 0;
+
+  int ref_row;
+  int ref_col;
+  int this_row_offset;
+  int this_col_offset;
+  search_site *ss;
+
+  unsigned char *check_here;
+  unsigned int thissad;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  ref_row = ref_mv->as_mv.row;
+  ref_col = ref_mv->as_mv.col;
+  *num00 = 0;
+  best_mv->as_mv.row = ref_row;
+  best_mv->as_mv.col = ref_col;
+
+  // Work out the start point for the search
+  in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
+  best_address = in_what;
+
+  // Check the starting position
+  bestsad = fn_ptr->sdf(what, what_stride,
+                        in_what, in_what_stride, 0x7fffffff)
+            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+  // search_param determines the length of the initial step and hence the
+  // number of iterations:
+  // 0 = initial step (MAX_FIRST_STEP) pel, 1 = (MAX_FIRST_STEP/2) pel,
+  // 2 = (MAX_FIRST_STEP/4) pel... etc.
+  ss = &x->ss[search_param * x->searches_per_step];
+  tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+  i = 1;
+
+  for (step = 0; step < tot_steps; step++) {
+    int all_in = 1, t;
+
+    // To know whether all neighbor points are within the bounds, checking
+    // the 4 bounds once is enough instead of checking them for each point.
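+    // This relies on the search-site tables (x->ss, set up by the
+    // vp9_init_dsmotion_compensation / vp9_init3smotion_compensation
+    // helpers) listing the up/down/left/right extremes first in each step.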
+    all_in &= ((best_mv->as_mv.row + ss[i].mv.row) > x->mv_row_min);
+    all_in &= ((best_mv->as_mv.row + ss[i + 1].mv.row) < x->mv_row_max);
+    all_in &= ((best_mv->as_mv.col + ss[i + 2].mv.col) > x->mv_col_min);
+    all_in &= ((best_mv->as_mv.col + ss[i + 3].mv.col) < x->mv_col_max);
+
+    if (all_in) {
+      unsigned int sad_array[4];
+
+      for (j = 0; j < x->searches_per_step; j += 4) {
+        unsigned char *block_offset[4];
+
+        for (t = 0; t < 4; t++)
+          block_offset[t] = ss[i + t].offset + best_address;
+
+        fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+                       sad_array);
+
+        for (t = 0; t < 4; t++, i++) {
+          if (sad_array[t] < bestsad) {
+            this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row;
+            this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col;
+            sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                           MVSADCOSTS, sad_per_bit);
+
+            if (sad_array[t] < bestsad) {
+              bestsad = sad_array[t];
+              best_site = i;
+            }
+          }
+        }
+      }
+    } else {
+      for (j = 0; j < x->searches_per_step; j++) {
+        // Trap illegal vectors
+        this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
+        this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
+
+        if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+            (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
+          check_here = ss[i].offset + best_address;
+          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+          if (thissad < bestsad) {
+            this_mv.as_mv.row = this_row_offset;
+            this_mv.as_mv.col = this_col_offset;
+            thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                      MVSADCOSTS, sad_per_bit);
+
+            if (thissad < bestsad) {
+              bestsad = thissad;
+              best_site = i;
+            }
+          }
+        }
+        i++;
+      }
+    }
+
+    if (best_site != last_site) {
+      best_mv->as_mv.row += ss[best_site].mv.row;
+      best_mv->as_mv.col += ss[best_site].mv.col;
+      best_address += ss[best_site].offset;
+      last_site = best_site;
+    } else if (best_address == in_what)
+      (*num00)++;
+  }
+
+  this_mv.as_mv.row = best_mv->as_mv.row << 3;
+  this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+  if (bestsad == INT_MAX)
+    return INT_MAX;
+
+  return
+      fn_ptr->vf(what, what_stride, best_address, in_what_stride,
+                 (unsigned int *)(&thissad)) +
+      mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+                  xd->allow_high_precision_mv);
+}
+
+/* do_refine: If last step (1-away) of n-step search doesn't pick the center
+              point as the best match, we will do a final 1-away diamond
+              refining search  */
+int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
+                           BLOCKD *d, int_mv *mvp_full, int step_param,
+                           int sadpb, int further_steps,
+                           int do_refine, vp9_variance_fn_ptr_t *fn_ptr,
+                           int_mv *ref_mv, int_mv *dst_mv) {
+  int_mv temp_mv;
+  int thissme, n, num00;
+  int bestsme = cpi->diamond_search_sad(x, b, d, mvp_full, &temp_mv,
+                                        step_param, sadpb, &num00,
+                                        fn_ptr, XMVCOST, ref_mv);
+  dst_mv->as_int = temp_mv.as_int;
+
+  n = num00;
+  num00 = 0;
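+  // num00 reports how many step sizes left the best match at the search
+  // centre; that many of the subsequent finer-step searches can be skipped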
+
+  /* If there won't be more n-step searches, check whether a refining search
+     is needed. */
+  if (n > further_steps)
+    do_refine = 0;
+
+  while (n < further_steps) {
+    n++;
+
+    if (num00)
+      num00--;
+    else {
+      thissme = cpi->diamond_search_sad(x, b, d, mvp_full, &temp_mv,
+                                        step_param + n, sadpb, &num00,
+                                        fn_ptr, XMVCOST, ref_mv);
+
+      /* check to see if refining search is needed. */
+      if (num00 > (further_steps - n))
+        do_refine = 0;
+
+      if (thissme < bestsme) {
+        bestsme = thissme;
+        dst_mv->as_int = temp_mv.as_int;
+      }
+    }
+  }
+
+  /* final 1-away diamond refining search */
+  if (do_refine == 1) {
+    int search_range = 8;
+    int_mv best_mv;
+    best_mv.as_int = dst_mv->as_int;
+    thissme = cpi->refining_search_sad(x, b, d, &best_mv, sadpb, search_range,
+                                       fn_ptr, XMVCOST, ref_mv);
+
+    if (thissme < bestsme) {
+      bestsme = thissme;
+      dst_mv->as_int = best_mv.as_int;
+    }
+  }
+  return bestsme;
+}
+
+int vp9_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                        int sad_per_bit, int distance,
+                        vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
+                        int_mv *center_mv) {
+  unsigned char *what = (*(b->base_src) + b->src);
+  int what_stride = b->src_stride;
+  unsigned char *in_what;
+  int in_what_stride = d->pre_stride;
+  int mv_stride = d->pre_stride;
+  unsigned char *bestaddress;
+  int_mv *best_mv = &d->bmi.as_mv.first;
+  int_mv this_mv;
+  int bestsad = INT_MAX;
+  int r, c;
+
+  unsigned char *check_here;
+  int thissad;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int ref_row = ref_mv->as_mv.row;
+  int ref_col = ref_mv->as_mv.col;
+
+  int row_min = ref_row - distance;
+  int row_max = ref_row + distance;
+  int col_min = ref_col - distance;
+  int col_max = ref_col + distance;
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  // Work out the mid point for the search
+  in_what = *(d->base_pre) + d->pre;
+  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+  best_mv->as_mv.row = ref_row;
+  best_mv->as_mv.col = ref_col;
+
+  // Baseline value at the centre
+  bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
+                        in_what_stride, 0x7fffffff)
+            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+  // Apply further limits to prevent us from using vectors that stretch
+  // beyond the UMV border
+  if (col_min < x->mv_col_min)
+    col_min = x->mv_col_min;
+
+  if (col_max > x->mv_col_max)
+    col_max = x->mv_col_max;
+
+  if (row_min < x->mv_row_min)
+    row_min = x->mv_row_min;
+
+  if (row_max > x->mv_row_max)
+    row_max = x->mv_row_max;
+
+  for (r = row_min; r < row_max; r++) {
+    this_mv.as_mv.row = r;
+    check_here = r * mv_stride + in_what + col_min;
+
+    for (c = col_min; c < col_max; c++) {
+      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+      this_mv.as_mv.col = c;
+      thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                 MVSADCOSTS, sad_per_bit);
+
+      if (thissad < bestsad) {
+        bestsad = thissad;
+        best_mv->as_mv.row = r;
+        best_mv->as_mv.col = c;
+        bestaddress = check_here;
+      }
+
+      check_here++;
+    }
+  }
+
+  this_mv.as_mv.row = best_mv->as_mv.row << 3;
+  this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+  if (bestsad < INT_MAX)
+    return
+        fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
+                   (unsigned int *)(&thissad)) +
+        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+                    xd->allow_high_precision_mv);
+  else
+    return INT_MAX;
+}
+
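+// sadx3/sadx8 variants of the full search: while enough columns remain in
+// the current row, fn_ptr->sdx3f (or sdx8f) computes the SAD for 3 (or 8)
+// consecutive positions per call, and a scalar loop handles the remainder.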
+int vp9_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                          int sad_per_bit, int distance,
+                          vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
+                          int_mv *center_mv) {
+  unsigned char *what = (*(b->base_src) + b->src);
+  int what_stride = b->src_stride;
+  unsigned char *in_what;
+  int in_what_stride = d->pre_stride;
+  int mv_stride = d->pre_stride;
+  unsigned char *bestaddress;
+  int_mv *best_mv = &d->bmi.as_mv.first;
+  int_mv this_mv;
+  int bestsad = INT_MAX;
+  int r, c;
+
+  unsigned char *check_here;
+  unsigned int thissad;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int ref_row = ref_mv->as_mv.row;
+  int ref_col = ref_mv->as_mv.col;
+
+  int row_min = ref_row - distance;
+  int row_max = ref_row + distance;
+  int col_min = ref_col - distance;
+  int col_max = ref_col + distance;
+
+  unsigned int sad_array[3];
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  // Work out the mid point for the search
+  in_what = *(d->base_pre) + d->pre;
+  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+  best_mv->as_mv.row = ref_row;
+  best_mv->as_mv.col = ref_col;
+
+  // Baseline value at the centre
+  bestsad = fn_ptr->sdf(what, what_stride,
+                        bestaddress, in_what_stride, 0x7fffffff)
+            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+  // Apply further limits to prevent us from using vectors that stretch
+  // beyond the UMV border
+  if (col_min < x->mv_col_min)
+    col_min = x->mv_col_min;
+
+  if (col_max > x->mv_col_max)
+    col_max = x->mv_col_max;
+
+  if (row_min < x->mv_row_min)
+    row_min = x->mv_row_min;
+
+  if (row_max > x->mv_row_max)
+    row_max = x->mv_row_max;
+
+  for (r = row_min; r < row_max; r++) {
+    this_mv.as_mv.row = r;
+    check_here = r * mv_stride + in_what + col_min;
+    c = col_min;
+
+    while ((c + 2) < col_max) {
+      int i;
+
+      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
+
+      for (i = 0; i < 3; i++) {
+        thissad = sad_array[i];
+
+        if (thissad < bestsad) {
+          this_mv.as_mv.col = c;
+          thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                     MVSADCOSTS, sad_per_bit);
+
+          if (thissad < bestsad) {
+            bestsad = thissad;
+            best_mv->as_mv.row = r;
+            best_mv->as_mv.col = c;
+            bestaddress = check_here;
+          }
+        }
+
+        check_here++;
+        c++;
+      }
+    }
+
+    while (c < col_max) {
+      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+      if (thissad < bestsad) {
+        this_mv.as_mv.col = c;
+        thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                   MVSADCOSTS, sad_per_bit);
+
+        if (thissad < bestsad) {
+          bestsad = thissad;
+          best_mv->as_mv.row = r;
+          best_mv->as_mv.col = c;
+          bestaddress = check_here;
+        }
+      }
+
+      check_here++;
+      c++;
+    }
+
+  }
+
+  this_mv.as_mv.row = best_mv->as_mv.row << 3;
+  this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+  if (bestsad < INT_MAX)
+    return
+        fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
+                   (unsigned int *)(&thissad)) +
+        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+                    xd->allow_high_precision_mv);
+  else
+    return INT_MAX;
+}
+
+int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                          int sad_per_bit, int distance,
+                          vp9_variance_fn_ptr_t *fn_ptr,
+                          DEC_MVCOSTS,
+                          int_mv *center_mv) {
+  unsigned char *what = (*(b->base_src) + b->src);
+  int what_stride = b->src_stride;
+  unsigned char *in_what;
+  int in_what_stride = d->pre_stride;
+  int mv_stride = d->pre_stride;
+  unsigned char *bestaddress;
+  int_mv *best_mv = &d->bmi.as_mv.first;
+  int_mv this_mv;
+  int bestsad = INT_MAX;
+  int r, c;
+
+  unsigned char *check_here;
+  unsigned int thissad;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int ref_row = ref_mv->as_mv.row;
+  int ref_col = ref_mv->as_mv.col;
+
+  int row_min = ref_row - distance;
+  int row_max = ref_row + distance;
+  int col_min = ref_col - distance;
+  int col_max = ref_col + distance;
+
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
+  unsigned int sad_array[3];
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  // Work out the mid point for the search
+  in_what = *(d->base_pre) + d->pre;
+  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+  best_mv->as_mv.row = ref_row;
+  best_mv->as_mv.col = ref_col;
+
+  // Baseline value at the centre
+  bestsad = fn_ptr->sdf(what, what_stride,
+                        bestaddress, in_what_stride, 0x7fffffff)
+            + mvsad_err_cost(best_mv, &fcenter_mv, MVSADCOSTS, sad_per_bit);
+
+  // Apply further limits to prevent us from using vectors that stretch
+  // beyond the UMV border
+  if (col_min < x->mv_col_min)
+    col_min = x->mv_col_min;
+
+  if (col_max > x->mv_col_max)
+    col_max = x->mv_col_max;
+
+  if (row_min < x->mv_row_min)
+    row_min = x->mv_row_min;
+
+  if (row_max > x->mv_row_max)
+    row_max = x->mv_row_max;
+
+  for (r = row_min; r < row_max; r++) {
+    this_mv.as_mv.row = r;
+    check_here = r * mv_stride + in_what + col_min;
+    c = col_min;
+
+    while ((c + 7) < col_max) {
+      int i;
+
+      fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
+
+      for (i = 0; i < 8; i++) {
+        thissad = (unsigned int)sad_array8[i];
+
+        if (thissad < bestsad) {
+          this_mv.as_mv.col = c;
+          thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                     MVSADCOSTS, sad_per_bit);
+
+          if (thissad < bestsad) {
+            bestsad = thissad;
+            best_mv->as_mv.row = r;
+            best_mv->as_mv.col = c;
+            bestaddress = check_here;
+          }
+        }
+
+        check_here++;
+        c++;
+      }
+    }
+
+    while ((c + 2) < col_max) {
+      int i;
+
+      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
+
+      for (i = 0; i < 3; i++) {
+        thissad = sad_array[i];
+
+        if (thissad < bestsad) {
+          this_mv.as_mv.col = c;
+          thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                     MVSADCOSTS, sad_per_bit);
+
+          if (thissad < bestsad) {
+            bestsad = thissad;
+            best_mv->as_mv.row = r;
+            best_mv->as_mv.col = c;
+            bestaddress = check_here;
+          }
+        }
+
+        check_here++;
+        c++;
+      }
+    }
+
+    while (c < col_max) {
+      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+      if (thissad < bestsad) {
+        this_mv.as_mv.col = c;
+        thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                   MVSADCOSTS, sad_per_bit);
+
+        if (thissad < bestsad) {
+          bestsad = thissad;
+          best_mv->as_mv.row = r;
+          best_mv->as_mv.col = c;
+          bestaddress = check_here;
+        }
+      }
+
+      check_here++;
+      c++;
+    }
+  }
+
+  this_mv.as_mv.row = best_mv->as_mv.row << 3;
+  this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+  if (bestsad < INT_MAX)
+    return
+        fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
+                   (unsigned int *)(&thissad)) +
+        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+                    xd->allow_high_precision_mv);
+  else
+    return INT_MAX;
+}
+
+int vp9_refining_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                            int error_per_bit, int search_range,
+                            vp9_variance_fn_ptr_t *fn_ptr, DEC_MVCOSTS,
+                            int_mv *center_mv) {
+  MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
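+  // 1-away diamond pattern: above, left, right, below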
+  int i, j;
+  short this_row_offset, this_col_offset;
+
+  int what_stride = b->src_stride;
+  int in_what_stride = d->pre_stride;
+  unsigned char *what = (*(b->base_src) + b->src);
+  unsigned char *best_address = (unsigned char *)(*(d->base_pre) + d->pre +
+                                                  (ref_mv->as_mv.row * (d->pre_stride)) + ref_mv->as_mv.col);
+  unsigned char *check_here;
+  unsigned int thissad;
+  int_mv this_mv;
+  unsigned int bestsad = INT_MAX;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) +
+      mvsad_err_cost(ref_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
+
+  for (i = 0; i < search_range; i++) {
+    int best_site = -1;
+
+    for (j = 0; j < 4; j++) {
+      this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+      this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+      if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+          (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
+        check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
+        thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+        if (thissad < bestsad) {
+          this_mv.as_mv.row = this_row_offset;
+          this_mv.as_mv.col = this_col_offset;
+          thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
+
+          if (thissad < bestsad) {
+            bestsad = thissad;
+            best_site = j;
+          }
+        }
+      }
+    }
+
+    if (best_site == -1)
+      break;
+    else {
+      ref_mv->as_mv.row += neighbors[best_site].row;
+      ref_mv->as_mv.col += neighbors[best_site].col;
+      best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
+    }
+  }
+
+  this_mv.as_mv.row = ref_mv->as_mv.row << 3;
+  this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+
+  if (bestsad < INT_MAX)
+    return
+        fn_ptr->vf(what, what_stride, best_address, in_what_stride,
+                   (unsigned int *)(&thissad)) +
+        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+                    xd->allow_high_precision_mv);
+  else
+    return INT_MAX;
+}
+
+int vp9_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                              int_mv *ref_mv, int error_per_bit,
+                              int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+                              DEC_MVCOSTS, int_mv *center_mv) {
+  MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
+  int i, j;
+  short this_row_offset, this_col_offset;
+
+  int what_stride = b->src_stride;
+  int in_what_stride = d->pre_stride;
+  unsigned char *what = (*(b->base_src) + b->src);
+  unsigned char *best_address = (unsigned char *)(*(d->base_pre) + d->pre +
+                                                  (ref_mv->as_mv.row * (d->pre_stride)) + ref_mv->as_mv.col);
+  unsigned char *check_here;
+  unsigned int thissad;
+  int_mv this_mv;
+  unsigned int bestsad = INT_MAX;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) +
+      mvsad_err_cost(ref_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
+
+  for (i = 0; i < search_range; i++) {
+    int best_site = -1;
+    int all_in = 1;
+
+    all_in &= ((ref_mv->as_mv.row - 1) > x->mv_row_min);
+    all_in &= ((ref_mv->as_mv.row + 1) < x->mv_row_max);
+    all_in &= ((ref_mv->as_mv.col - 1) > x->mv_col_min);
+    all_in &= ((ref_mv->as_mv.col + 1) < x->mv_col_max);
+
+    if (all_in) {
+      unsigned int sad_array[4];
+      unsigned char *block_offset[4];
+      block_offset[0] = best_address - in_what_stride;
+      block_offset[1] = best_address - 1;
+      block_offset[2] = best_address + 1;
+      block_offset[3] = best_address + in_what_stride;
+
+      fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
+
+      for (j = 0; j < 4; j++) {
+        if (sad_array[j] < bestsad) {
+          this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row;
+          this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col;
+          sad_array[j] += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
+
+          if (sad_array[j] < bestsad) {
+            bestsad = sad_array[j];
+            best_site = j;
+          }
+        }
+      }
+    } else {
+      for (j = 0; j < 4; j++) {
+        this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+        this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+        if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+            (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
+          check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
+          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+          if (thissad < bestsad) {
+            this_mv.as_mv.row = this_row_offset;
+            this_mv.as_mv.col = this_col_offset;
+            thissad += mvsad_err_cost(&this_mv, &fcenter_mv, MVSADCOSTS, error_per_bit);
+
+            if (thissad < bestsad) {
+              bestsad = thissad;
+              best_site = j;
+            }
+          }
+        }
+      }
+    }
+
+    if (best_site == -1)
+      break;
+    else {
+      ref_mv->as_mv.row += neighbors[best_site].row;
+      ref_mv->as_mv.col += neighbors[best_site].col;
+      best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
+    }
+  }
+
+  this_mv.as_mv.row = ref_mv->as_mv.row << 3;
+  this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+
+  if (bestsad < INT_MAX)
+    return
+        fn_ptr->vf(what, what_stride, best_address, in_what_stride,
+                   (unsigned int *)(&thissad)) +
+        mv_err_cost(&this_mv, center_mv, MVCOSTS, x->errorperbit,
+                    xd->allow_high_precision_mv);
+  else
+    return INT_MAX;
+}
+
+
+
+#ifdef ENTROPY_STATS
+void print_mode_context(void) {
+  FILE *f = fopen("modecont.c", "a");
+  int i, j;
+
+  // Guard against fopen failure in this debug-stats helper.
+  if (!f)
+    return;
+
+  fprintf(f, "#include \"entropy.h\"\n");
+  fprintf(f, "const int vp9_mode_contexts[6][4] =");
+  fprintf(f, "{\n");
+  for (j = 0; j < 6; j++) {
+    fprintf(f, "  {/* %d */ ", j);
+    fprintf(f, "    ");
+    for (i = 0; i < 4; i++) {
+      int this_prob;
+      int count;
+
+      // context probs
+      count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
+      if (count)
+        this_prob = 256 * mv_ref_ct[j][i][0] / count;
+      else
+        this_prob = 128;
+
+      if (this_prob == 0)
+        this_prob = 1;
+      fprintf(f, "%5d, ", this_prob);
+    }
+    fprintf(f, "  },\n");
+  }
+
+  fprintf(f, "};\n");
+  fclose(f);
+}
+
+/* MV ref count ENTROPY_STATS code */
+void init_mv_ref_counts() {
+  vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
+  vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
+}
+
+void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4]) {
+  if (m == ZEROMV) {
+    ++mv_ref_ct[ct[0]][0][0];
+    ++mv_mode_cts[0][0];
+  } else {
+    ++mv_ref_ct[ct[0]][0][1];
+    ++mv_mode_cts[0][1];
+
+    if (m == NEARESTMV) {
+      ++mv_ref_ct[ct[1]][1][0];
+      ++mv_mode_cts[1][0];
+    } else {
+      ++mv_ref_ct[ct[1]][1][1];
+      ++mv_mode_cts[1][1];
+
+      if (m == NEARMV) {
+        ++mv_ref_ct[ct[2]][2][0];
+        ++mv_mode_cts[2][0];
+      } else {
+        ++mv_ref_ct[ct[2]][2][1];
+        ++mv_mode_cts[2][1];
+
+        if (m == NEWMV) {
+          ++mv_ref_ct[ct[3]][3][0];
+          ++mv_mode_cts[3][0];
+        } else {
+          ++mv_ref_ct[ct[3]][3][1];
+          ++mv_mode_cts[3][1];
+        }
+      }
+    }
+  }
+}
+
+#endif  /* END MV ref count ENTROPY_STATS code */
--- /dev/null
+++ b/vp9/encoder/mcomp.h
@@ -1,0 +1,159 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MCOMP_H
+#define __INC_MCOMP_H
+
+#include "block.h"
+#include "variance.h"
+
+#define MVCOSTS mvjcost, mvcost
+#define MVSADCOSTS mvjsadcost, mvsadcost
+#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+#define DEC_MVSADCOSTS int *mvjsadcost, int *mvsadcost[2]
+#define NULLMVCOST NULL, NULL
+#define XMVCOST x->nmvjointcost, (x->e_mbd.allow_high_precision_mv?x->nmvcost_hp:x->nmvcost)
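+// MVCOSTS/MVSADCOSTS pass on the cost tables that DEC_MVCOSTS/DEC_MVSADCOSTS
+// declare, keeping call sites and prototypes in sync. XMVCOST selects the
+// high-precision nmv cost tables when 1/8-pel motion vectors are allowed.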
+
+#ifdef ENTROPY_STATS
+extern void init_mv_ref_counts();
+extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
+#endif
+
+
+// The maximum number of steps in a step search, given the largest allowed
+// initial step.
+#define MAX_MVSEARCH_STEPS 8
+// Max full pel mv specified in 1 pel units.
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)
+// Maximum size of the first step in full pel units.
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1))
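+// With MAX_MVSEARCH_STEPS == 8, MAX_FULL_PEL_VAL is 255 and MAX_FIRST_STEP
+// is 128 full-pel units.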
+
+extern void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv);
+extern int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, DEC_MVCOSTS,
+                           int Weight, int ishp);
+extern void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
+extern void vp9_init3smotion_compensation(MACROBLOCK *x,  int stride);
+// Runs a sequence of diamond searches with progressively smaller steps for RD
+struct VP9_COMP;
+int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
+                           BLOCKD *d, int_mv *mvp_full, int step_param,
+                           int sadpb, int further_steps, int do_refine,
+                           vp9_variance_fn_ptr_t *fn_ptr,
+                           int_mv *ref_mv, int_mv *dst_mv);
+
+extern int vp9_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+                          int_mv *ref_mv, int_mv *best_mv, int search_param,
+                          int error_per_bit,
+                          const vp9_variance_fn_ptr_t *vf,
+                          DEC_MVSADCOSTS, DEC_MVCOSTS, int_mv *center_mv);
+
+typedef int (fractional_mv_step_fp)
+(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit, const vp9_variance_fn_ptr_t *vfp, DEC_MVCOSTS,
+ int *distortion, unsigned int *sse);
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_step_iteratively;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_step;
+extern fractional_mv_step_fp vp9_find_best_half_pixel_step;
+
+#define prototype_full_search_sad(sym)\
+  int (sym)\
+  (\
+   MACROBLOCK *x, \
+   BLOCK *b, \
+   BLOCKD *d, \
+   int_mv *ref_mv, \
+   int sad_per_bit, \
+   int distance, \
+   vp9_variance_fn_ptr_t *fn_ptr, \
+   DEC_MVSADCOSTS, \
+   int_mv *center_mv \
+  )
+
+#define prototype_refining_search_sad(sym)\
+  int (sym)\
+  (\
+   MACROBLOCK *x, \
+   BLOCK *b, \
+   BLOCKD *d, \
+   int_mv *ref_mv, \
+   int sad_per_bit, \
+   int distance, \
+   vp9_variance_fn_ptr_t *fn_ptr, \
+   DEC_MVSADCOSTS, \
+   int_mv *center_mv \
+  )
+
+#define prototype_diamond_search_sad(sym)\
+  int (sym)\
+  (\
+   MACROBLOCK *x, \
+   BLOCK *b, \
+   BLOCKD *d, \
+   int_mv *ref_mv, \
+   int_mv *best_mv, \
+   int search_param, \
+   int sad_per_bit, \
+   int *num00, \
+   vp9_variance_fn_ptr_t *fn_ptr, \
+   DEC_MVSADCOSTS, \
+   int_mv *center_mv \
+  )
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/mcomp_x86.h"
+#endif
+
+typedef prototype_full_search_sad(*vp9_full_search_fn_t);
+extern prototype_full_search_sad(vp9_full_search_sad);
+extern prototype_full_search_sad(vp9_full_search_sadx3);
+extern prototype_full_search_sad(vp9_full_search_sadx8);
+
+typedef prototype_refining_search_sad(*vp9_refining_search_fn_t);
+extern prototype_refining_search_sad(vp9_refining_search_sad);
+extern prototype_refining_search_sad(vp9_refining_search_sadx4);
+
+typedef prototype_diamond_search_sad(*vp9_diamond_search_fn_t);
+extern prototype_diamond_search_sad(vp9_diamond_search_sad);
+extern prototype_diamond_search_sad(vp9_diamond_search_sadx4);
+
+#ifndef vp9_search_full_search
+#define vp9_search_full_search vp9_full_search_sad
+#endif
+extern prototype_full_search_sad(vp9_search_full_search);
+
+#ifndef vp9_search_refining_search
+#define vp9_search_refining_search vp9_refining_search_sad
+#endif
+extern prototype_refining_search_sad(vp9_search_refining_search);
+
+#ifndef vp9_search_diamond_search
+#define vp9_search_diamond_search vp9_diamond_search_sad
+#endif
+extern prototype_diamond_search_sad(vp9_search_diamond_search);
+
+typedef struct {
+  prototype_full_search_sad(*full_search);
+  prototype_refining_search_sad(*refining_search);
+  prototype_diamond_search_sad(*diamond_search);
+} vp9_search_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define SEARCH_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define SEARCH_INVOKE(ctx,fn) vp9_search_##fn
+#endif
+
+#endif
--- /dev/null
+++ b/vp9/encoder/modecosts.c
@@ -1,0 +1,49 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/blockd.h"
+#include "onyx_int.h"
+#include "treewriter.h"
+#include "vp9/common/entropymode.h"
+
+
+void vp9_init_mode_costs(VP9_COMP *c) {
+  VP9_COMMON *x = &c->common;
+  const vp9_tree_p T = vp9_bmode_tree;
+  int i, j;
+
+  for (i = 0; i < VP9_BINTRAMODES; i++) {
+    for (j = 0; j < VP9_BINTRAMODES; j++) {
+      vp9_cost_tokens((int *)c->mb.bmode_costs[i][j],
+                      x->kf_bmode_prob[i][j], T);
+    }
+  }
+
+  vp9_cost_tokens((int *)c->mb.inter_bmode_costs, x->fc.bmode_prob, T);
+  vp9_cost_tokens((int *)c->mb.inter_bmode_costs,
+                  x->fc.sub_mv_ref_prob[0], vp9_sub_mv_ref_tree);
+
+  vp9_cost_tokens(c->mb.mbmode_cost[1], x->fc.ymode_prob, vp9_ymode_tree);
+  vp9_cost_tokens(c->mb.mbmode_cost[0],
+                  x->kf_ymode_prob[c->common.kf_ymode_probs_index],
+                  vp9_kf_ymode_tree);
+  vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
+                  x->fc.uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
+  vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
+                  x->kf_uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
+  vp9_cost_tokens(c->mb.i8x8_mode_costs,
+                  x->fc.i8x8_mode_prob, vp9_i8x8_mode_tree);
+
+  for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i)
+    vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
+                    x->fc.switchable_interp_prob[i],
+                    vp9_switchable_interp_tree);
+}
--- /dev/null
+++ b/vp9/encoder/modecosts.h
@@ -1,0 +1,17 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MODECOSTS_H
+#define __INC_MODECOSTS_H
+
+void vp9_init_mode_costs(VP9_COMP *x);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/onyx_if.c
@@ -1,0 +1,4486 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp9/common/onyxc_int.h"
+#include "onyx_int.h"
+#include "vp9/common/systemdependent.h"
+#include "quantize.h"
+#include "vp9/common/alloccommon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "psnr.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp9/common/extend.h"
+#include "ratectrl.h"
+#include "vp9/common/quant_common.h"
+#include "segmentation.h"
+#include "vpx_scale/yv12extend.h"
+#if CONFIG_POSTPROC
+#include "vp9/common/postproc.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/swapyv12buffer.h"
+#include "vpx_ports/vpx_timer.h"
+#include "temporal_filter.h"
+
+#include "vp9/common/seg_common.h"
+#include "mbgraph.h"
+#include "vp9/common/pred_common.h"
+#include "vp9/encoder/rdopt.h"
+#include "bitstream.h"
+#include "ratectrl.h"
+
+#if CONFIG_NEWBESTREFMV
+#include "vp9/common/mvref_common.h"
+#endif
+
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+
+#include <math.h>
+#include <stdio.h>
+#include <limits.h>
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#define RTCD(x) &cpi->common.rtcd.x
+#else
+#define IF_RTCD(x) NULL
+#define RTCD(x) NULL
+#endif
+
+extern void vp9_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi);
+
+extern void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val);
+
+extern void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi);
+
+extern void vp9_cmachine_specific_config(VP9_COMP *cpi);
+
+extern void vp9_deblock_frame(YV12_BUFFER_CONFIG *source,
+                              YV12_BUFFER_CONFIG *post,
+                              int filt_lvl, int low_var_thresh, int flag);
+
+extern void print_tree_update_probs();
+
+#if HAVE_ARMV7
+extern void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc,
+                                          YV12_BUFFER_CONFIG *dst_ybc);
+
+extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc,
+                                              YV12_BUFFER_CONFIG *dst_ybc);
+#endif
+
+int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
+
+extern void vp9_temporal_filter_prepare_c(VP9_COMP *cpi, int distance);
+
+static void set_default_lf_deltas(VP9_COMP *cpi);
+
+#define DEFAULT_INTERP_FILTER EIGHTTAP  /* SWITCHABLE for better performance */
+#define SEARCH_BEST_FILTER 0            /* to search exhaustively for
+                                           best filter */
+#define RESET_FOREACH_FILTER 0          /* whether to reset the encoder state
+                                           before trying each new filter */
+#define SHARP_FILTER_QTHRESH 0          /* Q threshold for 8-tap sharp filter */
+
+#define ALTREF_HIGH_PRECISION_MV 1      /* whether to use high precision mv
+                                           for altref computation */
+#define HIGH_PRECISION_MV_QTHRESH 200   /* Q threshold for use of high precision
+                                           mv. Choose a very high value for
+                                           now so that HIGH_PRECISION is always
+                                           chosen */
+
+#if CONFIG_INTERNAL_STATS
+#include "math.h"
+
+extern double vp9_calc_ssim(YV12_BUFFER_CONFIG *source,
+                            YV12_BUFFER_CONFIG *dest, int lumamask,
+                            double *weight);
+
+
+extern double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source,
+                             YV12_BUFFER_CONFIG *dest, double *ssim_y,
+                             double *ssim_u, double *ssim_v);
+
+
+#endif
+
+// #define OUTPUT_YUV_REC
+
+#ifdef OUTPUT_YUV_SRC
+FILE *yuv_file;
+#endif
+#ifdef OUTPUT_YUV_REC
+FILE *yuv_rec_file;
+#endif
+
+#if 0
+FILE *framepsnr;
+FILE *kf_list;
+FILE *keyfile;
+#endif
+
+#if 0
+extern int skip_true_count;
+extern int skip_false_count;
+#endif
+
+
+#ifdef ENTROPY_STATS
+extern int intra_mode_stats[VP9_BINTRAMODES][VP9_BINTRAMODES][VP9_BINTRAMODES];
+#endif
+
+#ifdef NMV_STATS
+extern void init_nmvstats();
+extern void print_nmvstats();
+#endif
+
+#ifdef SPEEDSTATS
+unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+#endif
+
+#if defined(SECTIONBITS_OUTPUT)
+extern unsigned __int64 Sectionbits[500];
+#endif
+#ifdef MODE_STATS
+extern INT64 Sectionbits[500];
+extern unsigned int y_modes[VP9_YMODES];
+extern unsigned int i8x8_modes[VP9_I8X8_MODES];
+extern unsigned int uv_modes[VP9_UV_MODES];
+extern unsigned int uv_modes_y[VP9_YMODES][VP9_UV_MODES];
+extern unsigned int b_modes[B_MODE_COUNT];
+extern unsigned int inter_y_modes[MB_MODE_COUNT];
+extern unsigned int inter_uv_modes[VP9_UV_MODES];
+extern unsigned int inter_b_modes[B_MODE_COUNT];
+#endif
+
+extern void vp9_init_quantizer(VP9_COMP *cpi);
+
+static int base_skip_false_prob[QINDEX_RANGE][3];
+
+// Tables relating active max Q to active min Q
+static int kf_low_motion_minq[QINDEX_RANGE];
+static int kf_high_motion_minq[QINDEX_RANGE];
+static int gf_low_motion_minq[QINDEX_RANGE];
+static int gf_high_motion_minq[QINDEX_RANGE];
+static int inter_minq[QINDEX_RANGE];
+
+// Functions to compute the active minq lookup table entries based on a
+// formulaic approach to facilitate easier adjustment of the Q tables.
+// The formulae were derived from computing a 3rd order polynomial best
+// fit to the original data (after plotting real maxq vs minq (not q index))
+static int calculate_minq_index(double maxq,
+                                double x3, double x2, double x, double c) {
+  int i;
+  double minqtarget;
+  double thisq;
+
+  minqtarget = ((x3 * maxq * maxq * maxq) +
+                (x2 * maxq * maxq) +
+                (x * maxq) +
+                c);
+
+  if (minqtarget > maxq)
+    minqtarget = maxq;
+
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    thisq = vp9_convert_qindex_to_q(i);
+    if (minqtarget <= thisq)
+      return i;
+  }
+  return QINDEX_RANGE - 1;
+}
+
+static void init_minq_luts(void) {
+  int i;
+  double maxq;
+
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    maxq = vp9_convert_qindex_to_q(i);
+
+
+    kf_low_motion_minq[i] = calculate_minq_index(maxq,
+                                                 0.0000003,
+                                                 -0.000015,
+                                                 0.074,
+                                                 0.0);
+    kf_high_motion_minq[i] = calculate_minq_index(maxq,
+                                                  0.0000004,
+                                                  -0.000125,
+                                                  0.14,
+                                                  0.0);
+    gf_low_motion_minq[i] = calculate_minq_index(maxq,
+                                                 0.0000015,
+                                                 -0.0009,
+                                                 0.33,
+                                                 0.0);
+    gf_high_motion_minq[i] = calculate_minq_index(maxq,
+                                                  0.0000021,
+                                                  -0.00125,
+                                                  0.45,
+                                                  0.0);
+    inter_minq[i] = calculate_minq_index(maxq,
+                                         0.00000271,
+                                         -0.00113,
+                                         0.697,
+                                         0.0);
+
+  }
+}
+
+static void init_base_skip_probs(void) {
+  int i;
+  double q;
+  int skip_prob, t;
+
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    q = vp9_convert_qindex_to_q(i);
+
+    // Exponential decay calculation of baseline skip prob with clamping
+    // Based on crude best fit of old table.
+    t = (int)(564.25 * pow(2.71828, (-0.012 * q)));
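+    // e.g. q == 0 gives t == 564 (clamped to 255 below), while q == 100
+    // gives t of roughly 170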
+
+    skip_prob = t;
+    if (skip_prob < 1)
+      skip_prob = 1;
+    else if (skip_prob > 255)
+      skip_prob = 255;
+    base_skip_false_prob[i][1] = skip_prob;
+
+    skip_prob = t * 0.75;
+    if (skip_prob < 1)
+      skip_prob = 1;
+    else if (skip_prob > 255)
+      skip_prob = 255;
+    base_skip_false_prob[i][2] = skip_prob;
+
+    skip_prob = t * 1.25;
+    if (skip_prob < 1)
+      skip_prob = 1;
+    else if (skip_prob > 255)
+      skip_prob = 255;
+    base_skip_false_prob[i][0] = skip_prob;
+  }
+}
+
+static void update_base_skip_probs(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  if (cm->frame_type != KEY_FRAME) {
+    vp9_update_skip_probs(cpi);
+
+    if (cm->refresh_alt_ref_frame) {
+      int k;
+      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+        cpi->last_skip_false_probs[2][k] = cm->mbskip_pred_probs[k];
+      cpi->last_skip_probs_q[2] = cm->base_qindex;
+    } else if (cpi->common.refresh_golden_frame) {
+      int k;
+      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+        cpi->last_skip_false_probs[1][k] = cm->mbskip_pred_probs[k];
+      cpi->last_skip_probs_q[1] = cm->base_qindex;
+    } else {
+      int k;
+      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+        cpi->last_skip_false_probs[0][k] = cm->mbskip_pred_probs[k];
+      cpi->last_skip_probs_q[0] = cm->base_qindex;
+
+      // update the baseline table for the current q
+      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+        cpi->base_skip_false_prob[cm->base_qindex][k] =
+          cm->mbskip_pred_probs[k];
+    }
+  }
+
+}
+
+void vp9_initialize_enc() {
+  static int init_done = 0;
+
+  if (!init_done) {
+    vp8_scale_machine_specific_config();
+    vp9_initialize_common();
+    vp9_tokenize_initialize();
+    vp9_init_quant_tables();
+    vp9_init_me_luts();
+    init_minq_luts();
+    init_base_skip_probs();
+    init_done = 1;
+  }
+}
+#ifdef PACKET_TESTING
+extern FILE *vpxlogc;
+#endif
+
+static void setup_features(VP9_COMP *cpi) {
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+  // Set up default state for MB feature flags
+
+  xd->segmentation_enabled = 0;   // Default segmentation disabled
+
+  xd->update_mb_segmentation_map = 0;
+  xd->update_mb_segmentation_data = 0;
+  vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
+
+  vp9_clearall_segfeatures(xd);
+
+  xd->mode_ref_lf_delta_enabled = 0;
+  xd->mode_ref_lf_delta_update = 0;
+  vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
+  vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+  vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
+  vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+
+  set_default_lf_deltas(cpi);
+
+}
+
+
+static void dealloc_compressor_data(VP9_COMP *cpi) {
+  vpx_free(cpi->tplist);
+  cpi->tplist = NULL;
+
+  // Delete last frame MV storage buffers
+  vpx_free(cpi->lfmv);
+  cpi->lfmv = 0;
+
+  vpx_free(cpi->lf_ref_frame_sign_bias);
+  cpi->lf_ref_frame_sign_bias = 0;
+
+  vpx_free(cpi->lf_ref_frame);
+  cpi->lf_ref_frame = 0;
+
+  // Delete segmentation map
+  vpx_free(cpi->segmentation_map);
+  cpi->segmentation_map = 0;
+  vpx_free(cpi->common.last_frame_seg_map);
+  cpi->common.last_frame_seg_map = 0;
+  vpx_free(cpi->coding_context.last_frame_seg_map_copy);
+  cpi->coding_context.last_frame_seg_map_copy = 0;
+
+  vpx_free(cpi->active_map);
+  cpi->active_map = 0;
+
+  vp9_de_alloc_frame_buffers(&cpi->common);
+
+  vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
+  vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
+#if VP9_TEMPORAL_ALT_REF
+  vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);
+#endif
+  vp9_lookahead_destroy(cpi->lookahead);
+
+  vpx_free(cpi->tok);
+  cpi->tok = 0;
+
+  // Structure used to monitor GF usage
+  vpx_free(cpi->gf_active_flags);
+  cpi->gf_active_flags = 0;
+
+  // Activity mask based per mb zbin adjustments
+  vpx_free(cpi->mb_activity_map);
+  cpi->mb_activity_map = 0;
+  vpx_free(cpi->mb_norm_activity_map);
+  cpi->mb_norm_activity_map = 0;
+
+  vpx_free(cpi->mb.pip);
+  cpi->mb.pip = 0;
+
+  vpx_free(cpi->twopass.total_stats);
+  cpi->twopass.total_stats = 0;
+
+  vpx_free(cpi->twopass.total_left_stats);
+  cpi->twopass.total_left_stats = 0;
+
+  vpx_free(cpi->twopass.this_frame_stats);
+  cpi->twopass.this_frame_stats = 0;
+}
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a target q value.
+static int compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
+  int i;
+  int start_index = cpi->worst_quality;
+  int target_index = cpi->worst_quality;
+
+  // Convert the average q value to an index.
+  for (i = cpi->best_quality; i < cpi->worst_quality; i++) {
+    start_index = i;
+    if (vp9_convert_qindex_to_q(i) >= qstart)
+      break;
+  }
+
+  // Convert the q target to an index
+  for (i = cpi->best_quality; i < cpi->worst_quality; i++) {
+    target_index = i;
+    if (vp9_convert_qindex_to_q(i) >= qtarget)
+      break;
+  }
+
+  return target_index - start_index;
+}
+
+static void init_seg_features(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+  int high_q = (int)(cpi->avg_q > 48.0);
+  int qi_delta;
+
+  // Disable and clear down for KF
+  if (cm->frame_type == KEY_FRAME) {
+    // Clear down the global segmentation map
+    vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
+    xd->update_mb_segmentation_map = 0;
+    xd->update_mb_segmentation_data = 0;
+    cpi->static_mb_pct = 0;
+
+    // Disable segmentation
+    vp9_disable_segmentation((VP9_PTR)cpi);
+
+    // Clear down the segment features.
+    vp9_clearall_segfeatures(xd);
+  }
+
+  // If this is an alt ref frame
+  else if (cm->refresh_alt_ref_frame) {
+    // Clear down the global segmentation map
+    vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
+    xd->update_mb_segmentation_map = 0;
+    xd->update_mb_segmentation_data = 0;
+    cpi->static_mb_pct = 0;
+
+    // Disable segmentation and individual segment features by default
+    vp9_disable_segmentation((VP9_PTR)cpi);
+    vp9_clearall_segfeatures(xd);
+
+    // Scan frames from current to arf frame.
+    // This function re-enables segmentation if appropriate.
+    vp9_update_mbgraph_stats(cpi);
+
+    // If segmentation was enabled set those features needed for the
+    // arf itself.
+    if (xd->segmentation_enabled) {
+      xd->update_mb_segmentation_map = 1;
+      xd->update_mb_segmentation_data = 1;
+
+      qi_delta = compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875));
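+      // qi_delta is negative here (the target q is 87.5% of avg_q), so the
+      // arf segment is coded at a slightly lower (better) q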
+      vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta - 2));
+      vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2);
+
+      vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
+      vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF);
+
+      // Where relevant assume segment data is delta data
+      xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
+
+    }
+  }
+  // All other frames if segmentation has been enabled
+  else if (xd->segmentation_enabled) {
+    // First normal frame in a valid gf or alt ref group
+    if (cpi->common.frames_since_golden == 0) {
+      // Set up segment features for normal frames in an af group
+      if (cpi->source_alt_ref_active) {
+        xd->update_mb_segmentation_map = 0;
+        xd->update_mb_segmentation_data = 1;
+        xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
+
+        qi_delta = compute_qdelta(cpi, cpi->avg_q,
+                                  (cpi->avg_q * 1.125));
+        vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta + 2));
+        vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, 0);
+        vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
+
+        vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2);
+        vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF);
+
+        // Segment coding disabled for compred testing
+        if (high_q || (cpi->static_mb_pct == 100)) {
+          // set_segref(xd, 1, LAST_FRAME);
+          vp9_set_segref(xd, 1, ALTREF_FRAME);
+          vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
+
+          vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV);
+          vp9_enable_segfeature(xd, 1, SEG_LVL_MODE);
+
+          // EOB segment coding not fixed for 8x8 yet
+          vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0);
+          vp9_enable_segfeature(xd, 1, SEG_LVL_EOB);
+        }
+      }
+      // Disable segmentation and clear down features if alt ref
+      // is not active for this group
+      else {
+        vp9_disable_segmentation((VP9_PTR)cpi);
+
+        vpx_memset(cpi->segmentation_map, 0,
+                   (cm->mb_rows * cm->mb_cols));
+
+        xd->update_mb_segmentation_map = 0;
+        xd->update_mb_segmentation_data = 0;
+
+        vp9_clearall_segfeatures(xd);
+      }
+    }
+
+    // Special case where we are coding over the top of a previous
+    // alt ref frame
+    // Segment coding disabled for compred testing
+    else if (cpi->is_src_frame_alt_ref) {
+      // Enable mode and ref frame features for segment 0 as well
+      vp9_enable_segfeature(xd, 0, SEG_LVL_REF_FRAME);
+      vp9_enable_segfeature(xd, 0, SEG_LVL_MODE);
+      vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
+      vp9_enable_segfeature(xd, 1, SEG_LVL_MODE);
+
+      // All mbs should use ALTREF_FRAME, ZEROMV exclusively
+      vp9_clear_segref(xd, 0);
+      vp9_set_segref(xd, 0, ALTREF_FRAME);
+      vp9_clear_segref(xd, 1);
+      vp9_set_segref(xd, 1, ALTREF_FRAME);
+      vp9_set_segdata(xd, 0, SEG_LVL_MODE, ZEROMV);
+      vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV);
+
+      // Skip all MBs if high Q
+      if (high_q) {
+        vp9_enable_segfeature(xd, 0, SEG_LVL_EOB);
+        vp9_set_segdata(xd, 0, SEG_LVL_EOB, 0);
+        vp9_enable_segfeature(xd, 1, SEG_LVL_EOB);
+        vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0);
+      }
+      // Enable data update
+      xd->update_mb_segmentation_data = 1;
+    }
+    // All other frames.
+    else {
+      // No updates.. leave things as they are.
+      xd->update_mb_segmentation_map = 0;
+      xd->update_mb_segmentation_data = 0;
+    }
+  }
+}
+
+// DEBUG: Print out the segment id of each MB in the current frame.
+static void print_seg_map(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  int row, col;
+  int map_index = 0;
+  FILE *statsfile;
+
+  statsfile = fopen("segmap.stt", "a");
+  if (!statsfile)
+    return;
+
+  fprintf(statsfile, "%10d\n", cm->current_video_frame);
+
+  for (row = 0; row < cpi->common.mb_rows; row++) {
+    for (col = 0; col < cpi->common.mb_cols; col++) {
+      fprintf(statsfile, "%10d",
+              cpi->segmentation_map[map_index]);
+      map_index++;
+    }
+    fprintf(statsfile, "\n");
+  }
+  fprintf(statsfile, "\n");
+
+  fclose(statsfile);
+}
+
+static void update_reference_segmentation_map(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  int row, col, sb_rows = (cm->mb_rows + 1) >> 1, sb_cols = (cm->mb_cols + 1) >> 1;
+  MODE_INFO *mi = cm->mi;
+  uint8_t *segmap = cpi->segmentation_map;
+  uint8_t *segcache = cm->last_frame_seg_map;
+
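+  // Each iteration copies the segment ids of one superblock (a 2x2 group of
+  // MBs); the (mb_cols & 1) / (mb_rows & 1) tests skip the out-of-frame
+  // halves of edge superblocks when the frame dimensions are odd.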
+  for (row = 0; row < sb_rows; row++) {
+    for (col = 0; col < sb_cols; col++) {
+      MODE_INFO *miptr = mi + col * 2;
+      uint8_t *cache = segcache + col * 2;
+#if CONFIG_SUPERBLOCKS
+      if (miptr->mbmi.encoded_as_sb) {
+        cache[0] = miptr->mbmi.segment_id;
+        if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+          cache[1] = miptr->mbmi.segment_id;
+        if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
+          cache[cm->mb_cols] = miptr->mbmi.segment_id;
+          if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+            cache[cm->mb_cols + 1] = miptr->mbmi.segment_id;
+        }
+      } else
+#endif
+      {
+        cache[0] = miptr[0].mbmi.segment_id;
+        if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+          cache[1] = miptr[1].mbmi.segment_id;
+        if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
+          cache[cm->mb_cols] = miptr[cm->mode_info_stride].mbmi.segment_id;
+          if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+            cache[cm->mb_cols + 1] =
+                miptr[cm->mode_info_stride + 1].mbmi.segment_id;
+        }
+      }
+    }
+    segmap += 2 * cm->mb_cols;
+    segcache += 2 * cm->mb_cols;
+    mi += 2 * cm->mode_info_stride;
+  }
+}
+
+static void set_default_lf_deltas(VP9_COMP *cpi) {
+  cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
+  cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
+
+  vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+  vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+
+  // Test of ref frame deltas
+  cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2;
+  cpi->mb.e_mbd.ref_lf_deltas[LAST_FRAME] = 0;
+  cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2;
+  cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2;
+
+  cpi->mb.e_mbd.mode_lf_deltas[0] = 4;               // BPRED
+  cpi->mb.e_mbd.mode_lf_deltas[1] = -2;              // Zero
+  cpi->mb.e_mbd.mode_lf_deltas[2] = 2;               // New mv
+  cpi->mb.e_mbd.mode_lf_deltas[3] = 4;               // Split mv
+}
+
+void vp9_set_speed_features(VP9_COMP *cpi) {
+  SPEED_FEATURES *sf = &cpi->sf;
+  int Mode = cpi->compressor_speed;
+  int Speed = cpi->Speed;
+  int i;
+  VP9_COMMON *cm = &cpi->common;
+
+  // Only modes 0 and 1 supported for now in experimental code base
+  if (Mode > 1)
+    Mode = 1;
+
+  // Initialise default mode frequency sampling variables
+  for (i = 0; i < MAX_MODES; i++) {
+    cpi->mode_check_freq[i] = 0;
+    cpi->mode_test_hit_counts[i] = 0;
+    cpi->mode_chosen_counts[i] = 0;
+  }
+
+  // best quality defaults
+  sf->RD = 1;
+  sf->search_method = NSTEP;
+  sf->improved_dct = 1;
+  sf->auto_filter = 1;
+  sf->recode_loop = 1;
+  sf->quarter_pixel_search = 1;
+  sf->half_pixel_search = 1;
+  sf->iterative_sub_pixel = 1;
+#if CONFIG_LOSSLESS
+  sf->optimize_coefficients = 0;
+#else
+  sf->optimize_coefficients = 1;
+#endif
+  sf->no_skip_block4x4_search = 1;
+
+  sf->first_step = 0;
+  sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+  sf->improved_mv_pred = 1;
+
+  // default thresholds to 0
+  for (i = 0; i < MAX_MODES; i++)
+    sf->thresh_mult[i] = 0;
+
+  switch (Mode) {
+    case 0: // best quality mode
+#if CONFIG_PRED_FILTER
+      sf->thresh_mult[THR_ZEROMV        ] = 0;
+      sf->thresh_mult[THR_ZEROMV_FILT   ] = 0;
+      sf->thresh_mult[THR_ZEROG         ] = 0;
+      sf->thresh_mult[THR_ZEROG_FILT    ] = 0;
+      sf->thresh_mult[THR_ZEROA         ] = 0;
+      sf->thresh_mult[THR_ZEROA_FILT    ] = 0;
+      sf->thresh_mult[THR_NEARESTMV     ] = 0;
+      sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
+      sf->thresh_mult[THR_NEARESTG      ] = 0;
+      sf->thresh_mult[THR_NEARESTG_FILT ] = 0;
+      sf->thresh_mult[THR_NEARESTA      ] = 0;
+      sf->thresh_mult[THR_NEARESTA_FILT ] = 0;
+      sf->thresh_mult[THR_NEARMV        ] = 0;
+      sf->thresh_mult[THR_NEARMV_FILT   ] = 0;
+      sf->thresh_mult[THR_NEARG         ] = 0;
+      sf->thresh_mult[THR_NEARG_FILT    ] = 0;
+      sf->thresh_mult[THR_NEARA         ] = 0;
+      sf->thresh_mult[THR_NEARA_FILT    ] = 0;
+
+      sf->thresh_mult[THR_DC       ] = 0;
+
+      sf->thresh_mult[THR_V_PRED   ] = 1000;
+      sf->thresh_mult[THR_H_PRED   ] = 1000;
+      sf->thresh_mult[THR_D45_PRED ] = 1000;
+      sf->thresh_mult[THR_D135_PRED] = 1000;
+      sf->thresh_mult[THR_D117_PRED] = 1000;
+      sf->thresh_mult[THR_D153_PRED] = 1000;
+      sf->thresh_mult[THR_D27_PRED ] = 1000;
+      sf->thresh_mult[THR_D63_PRED ] = 1000;
+      sf->thresh_mult[THR_B_PRED   ] = 2000;
+      sf->thresh_mult[THR_I8X8_PRED] = 2000;
+      sf->thresh_mult[THR_TM       ] = 1000;
+
+      sf->thresh_mult[THR_NEWMV    ] = 1000;
+      sf->thresh_mult[THR_NEWG     ] = 1000;
+      sf->thresh_mult[THR_NEWA     ] = 1000;
+      sf->thresh_mult[THR_NEWMV_FILT    ] = 1000;
+      sf->thresh_mult[THR_NEWG_FILT     ] = 1000;
+      sf->thresh_mult[THR_NEWA_FILT     ] = 1000;
+#else
+      sf->thresh_mult[THR_ZEROMV   ] = 0;
+      sf->thresh_mult[THR_ZEROG    ] = 0;
+      sf->thresh_mult[THR_ZEROA    ] = 0;
+      sf->thresh_mult[THR_NEARESTMV] = 0;
+      sf->thresh_mult[THR_NEARESTG ] = 0;
+      sf->thresh_mult[THR_NEARESTA ] = 0;
+      sf->thresh_mult[THR_NEARMV   ] = 0;
+      sf->thresh_mult[THR_NEARG    ] = 0;
+      sf->thresh_mult[THR_NEARA    ] = 0;
+
+      sf->thresh_mult[THR_DC       ] = 0;
+
+      sf->thresh_mult[THR_V_PRED   ] = 1000;
+      sf->thresh_mult[THR_H_PRED   ] = 1000;
+      sf->thresh_mult[THR_D45_PRED ] = 1000;
+      sf->thresh_mult[THR_D135_PRED] = 1000;
+      sf->thresh_mult[THR_D117_PRED] = 1000;
+      sf->thresh_mult[THR_D153_PRED] = 1000;
+      sf->thresh_mult[THR_D27_PRED ] = 1000;
+      sf->thresh_mult[THR_D63_PRED ] = 1000;
+      sf->thresh_mult[THR_B_PRED   ] = 2000;
+      sf->thresh_mult[THR_I8X8_PRED] = 2000;
+      sf->thresh_mult[THR_TM       ] = 1000;
+
+      sf->thresh_mult[THR_NEWMV    ] = 1000;
+      sf->thresh_mult[THR_NEWG     ] = 1000;
+      sf->thresh_mult[THR_NEWA     ] = 1000;
+#endif
+      sf->thresh_mult[THR_SPLITMV  ] = 2500;
+      sf->thresh_mult[THR_SPLITG   ] = 5000;
+      sf->thresh_mult[THR_SPLITA   ] = 5000;
+
+      sf->thresh_mult[THR_COMP_ZEROLG   ] = 0;
+      sf->thresh_mult[THR_COMP_NEARESTLG] = 0;
+      sf->thresh_mult[THR_COMP_NEARLG   ] = 0;
+      sf->thresh_mult[THR_COMP_ZEROLA   ] = 0;
+      sf->thresh_mult[THR_COMP_NEARESTLA] = 0;
+      sf->thresh_mult[THR_COMP_NEARLA   ] = 0;
+      sf->thresh_mult[THR_COMP_ZEROGA   ] = 0;
+      sf->thresh_mult[THR_COMP_NEARESTGA] = 0;
+      sf->thresh_mult[THR_COMP_NEARGA   ] = 0;
+
+      sf->thresh_mult[THR_COMP_NEWLG    ] = 1000;
+      sf->thresh_mult[THR_COMP_NEWLA    ] = 1000;
+      sf->thresh_mult[THR_COMP_NEWGA    ] = 1000;
+
+      sf->thresh_mult[THR_COMP_SPLITLA  ] = 2500;
+      sf->thresh_mult[THR_COMP_SPLITGA  ] = 5000;
+      sf->thresh_mult[THR_COMP_SPLITLG  ] = 5000;
+
+      sf->first_step = 0;
+      sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+      sf->search_best_filter = SEARCH_BEST_FILTER;
+      break;
+    case 1:
+#if CONFIG_PRED_FILTER
+      sf->thresh_mult[THR_NEARESTMV] = 0;
+      sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
+      sf->thresh_mult[THR_ZEROMV   ] = 0;
+      sf->thresh_mult[THR_ZEROMV_FILT   ] = 0;
+      sf->thresh_mult[THR_DC       ] = 0;
+      sf->thresh_mult[THR_NEARMV   ] = 0;
+      sf->thresh_mult[THR_NEARMV_FILT   ] = 0;
+      sf->thresh_mult[THR_V_PRED   ] = 1000;
+      sf->thresh_mult[THR_H_PRED   ] = 1000;
+      sf->thresh_mult[THR_D45_PRED ] = 1000;
+      sf->thresh_mult[THR_D135_PRED] = 1000;
+      sf->thresh_mult[THR_D117_PRED] = 1000;
+      sf->thresh_mult[THR_D153_PRED] = 1000;
+      sf->thresh_mult[THR_D27_PRED ] = 1000;
+      sf->thresh_mult[THR_D63_PRED ] = 1000;
+      sf->thresh_mult[THR_B_PRED   ] = 2500;
+      sf->thresh_mult[THR_I8X8_PRED] = 2500;
+      sf->thresh_mult[THR_TM       ] = 1000;
+
+      sf->thresh_mult[THR_NEARESTG ] = 1000;
+      sf->thresh_mult[THR_NEARESTG_FILT ] = 1000;
+      sf->thresh_mult[THR_NEARESTA ] = 1000;
+      sf->thresh_mult[THR_NEARESTA_FILT ] = 1000;
+
+      sf->thresh_mult[THR_ZEROG    ] = 1000;
+      sf->thresh_mult[THR_ZEROA    ] = 1000;
+      sf->thresh_mult[THR_NEARG    ] = 1000;
+      sf->thresh_mult[THR_NEARA    ] = 1000;
+      sf->thresh_mult[THR_ZEROG_FILT    ] = 1000;
+      sf->thresh_mult[THR_ZEROA_FILT    ] = 1000;
+      sf->thresh_mult[THR_NEARG_FILT    ] = 1000;
+      sf->thresh_mult[THR_NEARA_FILT    ] = 1000;
+
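+      // Note: the following block resets to 0 several thresholds that were
+      // assigned 1000 just above; the zeros are the values that take effect.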
+      sf->thresh_mult[THR_ZEROMV   ] = 0;
+      sf->thresh_mult[THR_ZEROG    ] = 0;
+      sf->thresh_mult[THR_ZEROA    ] = 0;
+      sf->thresh_mult[THR_NEARESTMV] = 0;
+      sf->thresh_mult[THR_NEARESTG ] = 0;
+      sf->thresh_mult[THR_NEARESTA ] = 0;
+      sf->thresh_mult[THR_NEARMV   ] = 0;
+      sf->thresh_mult[THR_NEARG    ] = 0;
+      sf->thresh_mult[THR_NEARA    ] = 0;
+      sf->thresh_mult[THR_ZEROMV_FILT   ] = 0;
+      sf->thresh_mult[THR_ZEROG_FILT    ] = 0;
+      sf->thresh_mult[THR_ZEROA_FILT    ] = 0;
+      sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
+      sf->thresh_mult[THR_NEARESTG_FILT ] = 0;
+      sf->thresh_mult[THR_NEARESTA_FILT ] = 0;
+      sf->thresh_mult[THR_NEARMV_FILT   ] = 0;
+      sf->thresh_mult[THR_NEARG_FILT    ] = 0;
+      sf->thresh_mult[THR_NEARA_FILT    ] = 0;
+
+      sf->thresh_mult[THR_NEWMV    ] = 1000;
+      sf->thresh_mult[THR_NEWG     ] = 1000;
+      sf->thresh_mult[THR_NEWA     ] = 1000;
+      sf->thresh_mult[THR_NEWMV_FILT    ] = 1000;
+      sf->thresh_mult[THR_NEWG_FILT     ] = 1000;
+      sf->thresh_mult[THR_NEWA_FILT     ] = 1000;
+#else
+      sf->thresh_mult[THR_NEARESTMV] = 0;
+      sf->thresh_mult[THR_ZEROMV   ] = 0;
+      sf->thresh_mult[THR_DC       ] = 0;
+      sf->thresh_mult[THR_NEARMV   ] = 0;
+      sf->thresh_mult[THR_V_PRED   ] = 1000;
+      sf->thresh_mult[THR_H_PRED   ] = 1000;
+      sf->thresh_mult[THR_D45_PRED ] = 1000;
+      sf->thresh_mult[THR_D135_PRED] = 1000;
+      sf->thresh_mult[THR_D117_PRED] = 1000;
+      sf->thresh_mult[THR_D153_PRED] = 1000;
+      sf->thresh_mult[THR_D27_PRED ] = 1000;
+      sf->thresh_mult[THR_D63_PRED ] = 1000;
+      sf->thresh_mult[THR_B_PRED   ] = 2500;
+      sf->thresh_mult[THR_I8X8_PRED] = 2500;
+      sf->thresh_mult[THR_TM       ] = 1000;
+
+      sf->thresh_mult[THR_NEARESTG ] = 1000;
+      sf->thresh_mult[THR_NEARESTA ] = 1000;
+
+      sf->thresh_mult[THR_ZEROG    ] = 1000;
+      sf->thresh_mult[THR_ZEROA    ] = 1000;
+      sf->thresh_mult[THR_NEARG    ] = 1000;
+      sf->thresh_mult[THR_NEARA    ] = 1000;
+
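+      // Note: as in the CONFIG_PRED_FILTER branch, these zeros override the
+      // 1000 defaults assigned just above.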
+      sf->thresh_mult[THR_ZEROMV   ] = 0;
+      sf->thresh_mult[THR_ZEROG    ] = 0;
+      sf->thresh_mult[THR_ZEROA    ] = 0;
+      sf->thresh_mult[THR_NEARESTMV] = 0;
+      sf->thresh_mult[THR_NEARESTG ] = 0;
+      sf->thresh_mult[THR_NEARESTA ] = 0;
+      sf->thresh_mult[THR_NEARMV   ] = 0;
+      sf->thresh_mult[THR_NEARG    ] = 0;
+      sf->thresh_mult[THR_NEARA    ] = 0;
+
+      sf->thresh_mult[THR_NEWMV    ] = 1000;
+      sf->thresh_mult[THR_NEWG     ] = 1000;
+      sf->thresh_mult[THR_NEWA     ] = 1000;
+#endif
+      sf->thresh_mult[THR_SPLITMV  ] = 1700;
+      sf->thresh_mult[THR_SPLITG   ] = 4500;
+      sf->thresh_mult[THR_SPLITA   ] = 4500;
+
+      sf->thresh_mult[THR_COMP_ZEROLG   ] = 0;
+      sf->thresh_mult[THR_COMP_NEARESTLG] = 0;
+      sf->thresh_mult[THR_COMP_NEARLG   ] = 0;
+      sf->thresh_mult[THR_COMP_ZEROLA   ] = 0;
+      sf->thresh_mult[THR_COMP_NEARESTLA] = 0;
+      sf->thresh_mult[THR_COMP_NEARLA   ] = 0;
+      sf->thresh_mult[THR_COMP_ZEROGA   ] = 0;
+      sf->thresh_mult[THR_COMP_NEARESTGA] = 0;
+      sf->thresh_mult[THR_COMP_NEARGA   ] = 0;
+
+      sf->thresh_mult[THR_COMP_NEWLG    ] = 1000;
+      sf->thresh_mult[THR_COMP_NEWLA    ] = 1000;
+      sf->thresh_mult[THR_COMP_NEWGA    ] = 1000;
+
+      sf->thresh_mult[THR_COMP_SPLITLA  ] = 1700;
+      sf->thresh_mult[THR_COMP_SPLITGA  ] = 4500;
+      sf->thresh_mult[THR_COMP_SPLITLG  ] = 4500;
+
+      if (Speed > 0) {
+        /* Disable coefficient optimization above speed 0 */
+        sf->optimize_coefficients = 0;
+        sf->no_skip_block4x4_search = 0;
+
+        sf->first_step = 1;
+
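+        // A nonzero mode_check_freq[i] makes mode i be evaluated on only a
+        // fraction of the macroblocks, skipping the expensive split-mv
+        // searches more often as speed increases.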
+        cpi->mode_check_freq[THR_SPLITG] = 2;
+        cpi->mode_check_freq[THR_SPLITA] = 2;
+        cpi->mode_check_freq[THR_SPLITMV] = 0;
+
+        cpi->mode_check_freq[THR_COMP_SPLITGA] = 2;
+        cpi->mode_check_freq[THR_COMP_SPLITLG] = 2;
+        cpi->mode_check_freq[THR_COMP_SPLITLA] = 0;
+      }
+
+      if (Speed > 1) {
+        cpi->mode_check_freq[THR_SPLITG] = 4;
+        cpi->mode_check_freq[THR_SPLITA] = 4;
+        cpi->mode_check_freq[THR_SPLITMV] = 2;
+
+        cpi->mode_check_freq[THR_COMP_SPLITGA] = 4;
+        cpi->mode_check_freq[THR_COMP_SPLITLG] = 4;
+        cpi->mode_check_freq[THR_COMP_SPLITLA] = 2;
+
+        sf->thresh_mult[THR_TM       ] = 1500;
+        sf->thresh_mult[THR_V_PRED   ] = 1500;
+        sf->thresh_mult[THR_H_PRED   ] = 1500;
+        sf->thresh_mult[THR_D45_PRED ] = 1500;
+        sf->thresh_mult[THR_D135_PRED] = 1500;
+        sf->thresh_mult[THR_D117_PRED] = 1500;
+        sf->thresh_mult[THR_D153_PRED] = 1500;
+        sf->thresh_mult[THR_D27_PRED ] = 1500;
+        sf->thresh_mult[THR_D63_PRED ] = 1500;
+        sf->thresh_mult[THR_B_PRED   ] = 5000;
+        sf->thresh_mult[THR_I8X8_PRED] = 5000;
+
+        if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+          sf->thresh_mult[THR_NEWMV    ] = 2000;
+#if CONFIG_PRED_FILTER
+          sf->thresh_mult[THR_NEWMV_FILT    ] = 2000;
+#endif
+          sf->thresh_mult[THR_SPLITMV  ] = 10000;
+          sf->thresh_mult[THR_COMP_SPLITLG  ] = 20000;
+        }
+
+        if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+          sf->thresh_mult[THR_NEARESTG ] = 1500;
+          sf->thresh_mult[THR_ZEROG    ] = 1500;
+          sf->thresh_mult[THR_NEARG    ] = 1500;
+          sf->thresh_mult[THR_NEWG     ] = 2000;
+#if CONFIG_PRED_FILTER
+          sf->thresh_mult[THR_NEARESTG_FILT ] = 1500;
+          sf->thresh_mult[THR_ZEROG_FILT    ] = 1500;
+          sf->thresh_mult[THR_NEARG_FILT    ] = 1500;
+          sf->thresh_mult[THR_NEWG_FILT     ] = 2000;
+#endif
+          sf->thresh_mult[THR_SPLITG   ] = 20000;
+          sf->thresh_mult[THR_COMP_SPLITGA  ] = 20000;
+        }
+
+        if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
+          sf->thresh_mult[THR_NEARESTA ] = 1500;
+          sf->thresh_mult[THR_ZEROA    ] = 1500;
+          sf->thresh_mult[THR_NEARA    ] = 1500;
+          sf->thresh_mult[THR_NEWA     ] = 2000;
+#if CONFIG_PRED_FILTER
+          sf->thresh_mult[THR_NEARESTA_FILT ] = 1500;
+          sf->thresh_mult[THR_ZEROA_FILT    ] = 1500;
+          sf->thresh_mult[THR_NEARA_FILT    ] = 1500;
+          sf->thresh_mult[THR_NEWA_FILT     ] = 2000;
+#endif
+          sf->thresh_mult[THR_SPLITA   ] = 20000;
+          sf->thresh_mult[THR_COMP_SPLITLA  ] = 10000;
+        }
+
+        sf->thresh_mult[THR_COMP_ZEROLG   ] = 1500;
+        sf->thresh_mult[THR_COMP_NEARESTLG] = 1500;
+        sf->thresh_mult[THR_COMP_NEARLG   ] = 1500;
+        sf->thresh_mult[THR_COMP_ZEROLA   ] = 1500;
+        sf->thresh_mult[THR_COMP_NEARESTLA] = 1500;
+        sf->thresh_mult[THR_COMP_NEARLA   ] = 1500;
+        sf->thresh_mult[THR_COMP_ZEROGA   ] = 1500;
+        sf->thresh_mult[THR_COMP_NEARESTGA] = 1500;
+        sf->thresh_mult[THR_COMP_NEARGA   ] = 1500;
+
+        sf->thresh_mult[THR_COMP_NEWLG    ] = 2000;
+        sf->thresh_mult[THR_COMP_NEWLA    ] = 2000;
+        sf->thresh_mult[THR_COMP_NEWGA    ] = 2000;
+      }
+
+      if (Speed > 2) {
+        cpi->mode_check_freq[THR_SPLITG] = 15;
+        cpi->mode_check_freq[THR_SPLITA] = 15;
+        cpi->mode_check_freq[THR_SPLITMV] = 7;
+
+        cpi->mode_check_freq[THR_COMP_SPLITGA] = 15;
+        cpi->mode_check_freq[THR_COMP_SPLITLG] = 15;
+        cpi->mode_check_freq[THR_COMP_SPLITLA] = 7;
+
+        sf->thresh_mult[THR_TM       ] = 2000;
+        sf->thresh_mult[THR_V_PRED   ] = 2000;
+        sf->thresh_mult[THR_H_PRED   ] = 2000;
+        sf->thresh_mult[THR_D45_PRED ] = 2000;
+        sf->thresh_mult[THR_D135_PRED] = 2000;
+        sf->thresh_mult[THR_D117_PRED] = 2000;
+        sf->thresh_mult[THR_D153_PRED] = 2000;
+        sf->thresh_mult[THR_D27_PRED ] = 2000;
+        sf->thresh_mult[THR_D63_PRED ] = 2000;
+        sf->thresh_mult[THR_B_PRED   ] = 7500;
+        sf->thresh_mult[THR_I8X8_PRED] = 7500;
+
+        if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+          sf->thresh_mult[THR_NEWMV    ] = 2000;
+#if CONFIG_PRED_FILTER
+          sf->thresh_mult[THR_NEWMV_FILT    ] = 2000;
+#endif
+          sf->thresh_mult[THR_SPLITMV  ] = 25000;
+          sf->thresh_mult[THR_COMP_SPLITLG  ] = 50000;
+        }
+
+        if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+          sf->thresh_mult[THR_NEARESTG ] = 2000;
+          sf->thresh_mult[THR_ZEROG    ] = 2000;
+          sf->thresh_mult[THR_NEARG    ] = 2000;
+          sf->thresh_mult[THR_NEWG     ] = 2500;
+#if CONFIG_PRED_FILTER
+          sf->thresh_mult[THR_NEARESTG_FILT ] = 2000;
+          sf->thresh_mult[THR_ZEROG_FILT    ] = 2000;
+          sf->thresh_mult[THR_NEARG_FILT    ] = 2000;
+          sf->thresh_mult[THR_NEWG_FILT     ] = 2500;
+#endif
+          sf->thresh_mult[THR_SPLITG   ] = 50000;
+          sf->thresh_mult[THR_COMP_SPLITGA  ] = 50000;
+        }
+
+        if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
+          sf->thresh_mult[THR_NEARESTA ] = 2000;
+          sf->thresh_mult[THR_ZEROA    ] = 2000;
+          sf->thresh_mult[THR_NEARA    ] = 2000;
+          sf->thresh_mult[THR_NEWA     ] = 2500;
+#if CONFIG_PRED_FILTER
+          sf->thresh_mult[THR_NEARESTA_FILT ] = 2000;
+          sf->thresh_mult[THR_ZEROA_FILT    ] = 2000;
+          sf->thresh_mult[THR_NEARA_FILT    ] = 2000;
+          sf->thresh_mult[THR_NEWA_FILT     ] = 2500;
+#endif
+          sf->thresh_mult[THR_SPLITA   ] = 50000;
+          sf->thresh_mult[THR_COMP_SPLITLA  ] = 25000;
+        }
+
+        sf->thresh_mult[THR_COMP_ZEROLG   ] = 2000;
+        sf->thresh_mult[THR_COMP_NEARESTLG] = 2000;
+        sf->thresh_mult[THR_COMP_NEARLG   ] = 2000;
+        sf->thresh_mult[THR_COMP_ZEROLA   ] = 2000;
+        sf->thresh_mult[THR_COMP_NEARESTLA] = 2000;
+        sf->thresh_mult[THR_COMP_NEARLA   ] = 2000;
+        sf->thresh_mult[THR_COMP_ZEROGA   ] = 2000;
+        sf->thresh_mult[THR_COMP_NEARESTGA] = 2000;
+        sf->thresh_mult[THR_COMP_NEARGA   ] = 2000;
+
+        sf->thresh_mult[THR_COMP_NEWLG    ] = 2500;
+        sf->thresh_mult[THR_COMP_NEWLA    ] = 2500;
+        sf->thresh_mult[THR_COMP_NEWGA    ] = 2500;
+
+        sf->improved_dct = 0;
+
+        // Only do recode loop on key frames, golden frames and
+        // alt ref frames
+        sf->recode_loop = 2;
+
+      }
+
+      break;
+
+  } /* switch */
+
+  /* disable frame modes if flags not set */
+  if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
+    sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
+    sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
+    sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
+    sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
+#if CONFIG_PRED_FILTER
+    sf->thresh_mult[THR_NEWMV_FILT    ] = INT_MAX;
+    sf->thresh_mult[THR_NEARESTMV_FILT] = INT_MAX;
+    sf->thresh_mult[THR_ZEROMV_FILT   ] = INT_MAX;
+    sf->thresh_mult[THR_NEARMV_FILT   ] = INT_MAX;
+#endif
+    sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
+  }
+
+  if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
+    sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
+    sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
+    sf->thresh_mult[THR_NEARG    ] = INT_MAX;
+    sf->thresh_mult[THR_NEWG     ] = INT_MAX;
+#if CONFIG_PRED_FILTER
+    sf->thresh_mult[THR_NEARESTG_FILT ] = INT_MAX;
+    sf->thresh_mult[THR_ZEROG_FILT    ] = INT_MAX;
+    sf->thresh_mult[THR_NEARG_FILT    ] = INT_MAX;
+    sf->thresh_mult[THR_NEWG_FILT     ] = INT_MAX;
+#endif
+    sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
+  }
+
+  if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
+    sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
+    sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
+    sf->thresh_mult[THR_NEARA    ] = INT_MAX;
+    sf->thresh_mult[THR_NEWA     ] = INT_MAX;
+#if CONFIG_PRED_FILTER
+    sf->thresh_mult[THR_NEARESTA_FILT ] = INT_MAX;
+    sf->thresh_mult[THR_ZEROA_FILT    ] = INT_MAX;
+    sf->thresh_mult[THR_NEARA_FILT    ] = INT_MAX;
+    sf->thresh_mult[THR_NEWA_FILT     ] = INT_MAX;
+#endif
+    sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
+  }
+
+  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) != (VP9_LAST_FLAG | VP9_GOLD_FLAG)) {
+    sf->thresh_mult[THR_COMP_ZEROLG   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARESTLG] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARLG   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEWLG    ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_SPLITLG  ] = INT_MAX;
+  }
+
+  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != (VP9_LAST_FLAG | VP9_ALT_FLAG)) {
+    sf->thresh_mult[THR_COMP_ZEROLA   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARLA   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEWLA    ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_SPLITLA  ] = INT_MAX;
+  }
+
+  if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != (VP9_GOLD_FLAG | VP9_ALT_FLAG)) {
+    sf->thresh_mult[THR_COMP_ZEROGA   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEARGA   ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_NEWGA    ] = INT_MAX;
+    sf->thresh_mult[THR_COMP_SPLITGA  ] = INT_MAX;
+  }
+
+  // Slow quant, dct and trellis not worthwhile for first pass
+  // so make sure they are always turned off.
+  if (cpi->pass == 1) {
+    sf->optimize_coefficients = 0;
+    sf->improved_dct = 0;
+  }
+
+  if (cpi->sf.search_method == NSTEP) {
+    vp9_init3smotion_compensation(&cpi->mb,
+                                  cm->yv12_fb[cm->lst_fb_idx].y_stride);
+  } else if (cpi->sf.search_method == DIAMOND) {
+    vp9_init_dsmotion_compensation(&cpi->mb,
+                                   cm->yv12_fb[cm->lst_fb_idx].y_stride);
+  }
+
+  cpi->mb.vp9_short_fdct16x16 = vp9_short_fdct16x16;
+  cpi->mb.vp9_short_fdct8x8 = vp9_short_fdct8x8;
+  cpi->mb.vp9_short_fdct8x4 = vp9_short_fdct8x4;
+  cpi->mb.vp9_short_fdct4x4 = vp9_short_fdct4x4;
+  cpi->mb.short_walsh4x4 = vp9_short_walsh4x4;
+  cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2;
+
+#if CONFIG_LOSSLESS
+  if (cpi->oxcf.lossless) {
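+    // Lossless coding swaps the DCTs for Walsh-Hadamard transforms, which
+    // are exactly invertible in integer arithmetic.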
+    cpi->mb.vp9_short_fdct8x4 = vp9_short_walsh8x4_x8;
+    cpi->mb.vp9_short_fdct4x4 = vp9_short_walsh4x4_x8;
+    cpi->mb.short_walsh4x4 = vp9_short_walsh4x4_lossless;
+  }
+#endif
+
+  cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;
+  cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair;
+  cpi->mb.quantize_b_8x8      = vp9_regular_quantize_b_8x8;
+  cpi->mb.quantize_b_16x16    = vp9_regular_quantize_b_16x16;
+  cpi->mb.quantize_b_2x2      = vp9_regular_quantize_b_2x2;
+
+  vp9_init_quantizer(cpi);
+
+#if CONFIG_RUNTIME_CPU_DETECT
+  cpi->mb.e_mbd.rtcd = &cpi->common.rtcd;
+#endif
+
+  if (cpi->sf.iterative_sub_pixel == 1) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step_iteratively;
+  } else if (cpi->sf.quarter_pixel_search) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step;
+  } else if (cpi->sf.half_pixel_search) {
+    cpi->find_fractional_mv_step = vp9_find_best_half_pixel_step;
+  }
+
+  if (cpi->sf.optimize_coefficients == 1 && cpi->pass != 1)
+    cpi->mb.optimize = 1;
+  else
+    cpi->mb.optimize = 0;
+
+#ifdef SPEEDSTATS
+  frames_at_speed[cpi->Speed]++;
+#endif
+}
+
+static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
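+  // Dimensions rounded up to a multiple of 16 (one macroblock); used for
+  // the temporal-filter alt-ref buffer below.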
+  int width = (cpi->oxcf.Width + 15) & ~15;
+  int height = (cpi->oxcf.Height + 15) & ~15;
+
+  cpi->lookahead = vp9_lookahead_init(cpi->oxcf.Width, cpi->oxcf.Height,
+                                      cpi->oxcf.lag_in_frames);
+  if (!cpi->lookahead)
+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate lag buffers");
+
+#if VP9_TEMPORAL_ALT_REF
+
+  if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer,
+                                  width, height, VP8BORDERINPIXELS))
+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate altref buffer");
+
+#endif
+}
+
+static int alloc_partition_data(VP9_COMP *cpi) {
+  vpx_free(cpi->mb.pip);
+
+  cpi->mb.pip = vpx_calloc((cpi->common.mb_cols + 1) *
+                           (cpi->common.mb_rows + 1),
+                           sizeof(PARTITION_INFO));
+  if (!cpi->mb.pip)
+    return 1;
+
+  cpi->mb.pi = cpi->mb.pip + cpi->common.mode_info_stride + 1;
+
+  return 0;
+}
+
+void vp9_alloc_compressor_data(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  int width = cm->Width;
+  int height = cm->Height;
+
+  if (vp9_alloc_frame_buffers(cm, width, height))
+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate frame buffers");
+
+  if (alloc_partition_data(cpi))
+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate partition data");
+
+
+  if ((width & 0xf) != 0)
+    width += 16 - (width & 0xf);
+
+  if ((height & 0xf) != 0)
+    height += 16 - (height & 0xf);
+
+
+  if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf,
+                                  width, height, VP8BORDERINPIXELS))
+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate last frame buffer");
+
+  if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source,
+                                  width, height, VP8BORDERINPIXELS))
+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate scaled source buffer");
+
+
+  vpx_free(cpi->tok);
+
+  {
+    unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16;
+
+    CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
+  }
+
+  // Data used for real time vc mode to see if gf needs refreshing
+  cpi->inter_zz_count = 0;
+  cpi->gf_bad_count = 0;
+  cpi->gf_update_recommended = 0;
+
+
+  // Structures used to monitor GF usage
+  vpx_free(cpi->gf_active_flags);
+  CHECK_MEM_ERROR(cpi->gf_active_flags,
+                  vpx_calloc(1, cm->mb_rows * cm->mb_cols));
+  cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+  vpx_free(cpi->mb_activity_map);
+  CHECK_MEM_ERROR(cpi->mb_activity_map,
+                  vpx_calloc(sizeof(unsigned int),
+                             cm->mb_rows * cm->mb_cols));
+
+  vpx_free(cpi->mb_norm_activity_map);
+  CHECK_MEM_ERROR(cpi->mb_norm_activity_map,
+                  vpx_calloc(sizeof(unsigned int),
+                             cm->mb_rows * cm->mb_cols));
+
+  vpx_free(cpi->twopass.total_stats);
+
+  cpi->twopass.total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
+
+  vpx_free(cpi->twopass.total_left_stats);
+  cpi->twopass.total_left_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
+
+  vpx_free(cpi->twopass.this_frame_stats);
+
+  cpi->twopass.this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
+
+  if (!cpi->twopass.total_stats ||
+      !cpi->twopass.total_left_stats ||
+      !cpi->twopass.this_frame_stats)
+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate firstpass stats");
+
+  vpx_free(cpi->tplist);
+
+  CHECK_MEM_ERROR(cpi->tplist,
+                  vpx_malloc(sizeof(TOKENLIST) * (cpi->common.mb_rows)));
+}
+
+
+// TODO: perhaps change the number of steps exposed to the outside world when
+// setting max and min limits. This will also likely want refining for the
+// extended Q range.
+//
+// Table that converts 0-63 Q range values passed in outside to the Qindex
+// range used internally.
+static const int q_trans[] = {
+  0,    4,   8,  12,  16,  20,  24,  28,
+  32,   36,  40,  44,  48,  52,  56,  60,
+  64,   68,  72,  76,  80,  84,  88,  92,
+  96,  100, 104, 108, 112, 116, 120, 124,
+  128, 132, 136, 140, 144, 148, 152, 156,
+  160, 164, 168, 172, 176, 180, 184, 188,
+  192, 196, 200, 204, 208, 212, 216, 220,
+  224, 228, 232, 236, 240, 244, 249, 255,
+};
+
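+// For example, q_trans[10] == 40: an application-level Q of 10 selects
+// internal Qindex 40. vp9_reverse_trans() below is the inverse lookup,
+// so vp9_reverse_trans(40) == 10.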
+int vp9_reverse_trans(int x) {
+  int i;
+
+  for (i = 0; i < 64; i++)
+    if (q_trans[i] >= x)
+      return i;
+
+  return 63;
+}
+
+void vp9_new_frame_rate(VP9_COMP *cpi, double framerate) {
+  if (framerate < .1)
+    framerate = 30;
+
+  cpi->oxcf.frame_rate        = framerate;
+  cpi->output_frame_rate      = cpi->oxcf.frame_rate;
+  cpi->per_frame_bandwidth    = (int)(cpi->oxcf.target_bandwidth /
+                                      cpi->output_frame_rate);
+  cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth /
+                                      cpi->output_frame_rate);
+  cpi->min_frame_bandwidth    = (int)(cpi->av_per_frame_bandwidth *
+                                      cpi->oxcf.two_pass_vbrmin_section / 100);
+
+  if (cpi->min_frame_bandwidth < FRAME_OVERHEAD_BITS)
+    cpi->min_frame_bandwidth = FRAME_OVERHEAD_BITS;
+
+  // Set Maximum gf/arf interval
+  cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2);
+
+  if (cpi->max_gf_interval < 12)
+    cpi->max_gf_interval = 12;
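+  // e.g. at 30 fps this gives (int)(30 / 2.0) + 2 = 17 frames between
+  // GF/ARF updates, comfortably above the floor of 12.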
+
+  // Extended interval for genuinely static scenes
+  cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1;
+
+  // Special conditions when the alt ref frame is enabled in lagged compress mode
+  if (cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames) {
+    if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1)
+      cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1;
+
+    if (cpi->twopass.static_scene_max_gf_interval > cpi->oxcf.lag_in_frames - 1)
+      cpi->twopass.static_scene_max_gf_interval = cpi->oxcf.lag_in_frames - 1;
+  }
+
+  if (cpi->max_gf_interval > cpi->twopass.static_scene_max_gf_interval)
+    cpi->max_gf_interval = cpi->twopass.static_scene_max_gf_interval;
+}
+
+
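+// Linear rescale using 64-bit intermediates so that val * num cannot
+// overflow; used below to convert buffer levels, supplied in milliseconds,
+// into bits (level_ms * target_bandwidth / 1000).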
+static int rescale(int val, int num, int denom) {
+  int64_t llnum = num;
+  int64_t llden = denom;
+  int64_t llval = val;
+
+  return (int)(llval * llnum / llden);
+}
+
+
+static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+  VP9_COMMON *cm = &cpi->common;
+
+  cpi->oxcf = *oxcf;
+
+  cpi->goldfreq = 7;
+
+  cm->version = oxcf->Version;
+  vp9_setup_version(cm);
+
+  // change includes all joint functionality
+  vp9_change_config(ptr, oxcf);
+
+  // Initialize active best and worst q and average q values.
+  cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;
+  cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
+  cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
+
+  // Initialise the starting buffer levels
+  cpi->buffer_level                 = cpi->oxcf.starting_buffer_level;
+  cpi->bits_off_target              = cpi->oxcf.starting_buffer_level;
+
+  cpi->rolling_target_bits          = cpi->av_per_frame_bandwidth;
+  cpi->rolling_actual_bits          = cpi->av_per_frame_bandwidth;
+  cpi->long_rolling_target_bits     = cpi->av_per_frame_bandwidth;
+  cpi->long_rolling_actual_bits     = cpi->av_per_frame_bandwidth;
+
+  cpi->total_actual_bits            = 0;
+  cpi->total_target_vs_actual       = 0;
+
+  cpi->static_mb_pct = 0;
+
+#if VP9_TEMPORAL_ALT_REF
+  {
+    int i;
+
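+    // Precompute 2^19 / i so the temporal (ARNR) filter can replace its
+    // divisions with a multiply and shift; index 0 stays unused.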
+    cpi->fixed_divide[0] = 0;
+
+    for (i = 1; i < 512; i++)
+      cpi->fixed_divide[i] = 0x80000 / i;
+  }
+#endif
+}
+
+
+void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+  VP9_COMMON *cm = &cpi->common;
+
+  if (!cpi)
+    return;
+
+  if (!oxcf)
+    return;
+
+  if (cm->version != oxcf->Version) {
+    cm->version = oxcf->Version;
+    vp9_setup_version(cm);
+  }
+
+  cpi->oxcf = *oxcf;
+
+  switch (cpi->oxcf.Mode) {
+      // Real-time and one-pass modes are deprecated in this test code base
+    case MODE_FIRSTPASS:
+      cpi->pass = 1;
+      cpi->compressor_speed = 1;
+      break;
+
+    case MODE_SECONDPASS:
+      cpi->pass = 2;
+      cpi->compressor_speed = 1;
+
+      if (cpi->oxcf.cpu_used < -5) {
+        cpi->oxcf.cpu_used = -5;
+      }
+
+      if (cpi->oxcf.cpu_used > 5)
+        cpi->oxcf.cpu_used = 5;
+
+      break;
+
+    case MODE_SECONDPASS_BEST:
+      cpi->pass = 2;
+      cpi->compressor_speed = 0;
+      break;
+  }
+
+  cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
+  cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
+  cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
+
+#if CONFIG_LOSSLESS
+  cpi->oxcf.lossless = oxcf->lossless;
+  if (cpi->oxcf.lossless) {
+    cpi->common.rtcd.idct.idct1        = vp9_short_inv_walsh4x4_1_x8_c;
+    cpi->common.rtcd.idct.idct16       = vp9_short_inv_walsh4x4_x8_c;
+    cpi->common.rtcd.idct.idct1_scalar_add  = vp9_dc_only_inv_walsh_add_c;
+    cpi->common.rtcd.idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
+    cpi->common.rtcd.idct.iwalsh16     = vp9_short_inv_walsh4x4_lossless_c;
+  }
+#endif
+
+  cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+
+  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
+
+  // cpi->use_golden_frame_only = 0;
+  // cpi->use_last_frame_only = 0;
+  cm->refresh_golden_frame = 0;
+  cm->refresh_last_frame = 1;
+  cm->refresh_entropy_probs = 1;
+
+  setup_features(cpi);
+  cpi->mb.e_mbd.allow_high_precision_mv = 0;   // Default mv precision adaptation
+
+  {
+    int i;
+
+    for (i = 0; i < MAX_MB_SEGMENTS; i++)
+      cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
+  }
+
+  // At the moment the first order values may not be > MAXQ
+  if (cpi->oxcf.fixed_q > MAXQ)
+    cpi->oxcf.fixed_q = MAXQ;
+
+  // local file playback mode == really big buffer
+  if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) {
+    cpi->oxcf.starting_buffer_level   = 60000;
+    cpi->oxcf.optimal_buffer_level    = 60000;
+    cpi->oxcf.maximum_buffer_size     = 240000;
+  }
+
+  // Convert target bandwidth from Kbit/s to Bit/s
+  cpi->oxcf.target_bandwidth       *= 1000;
+
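+  // e.g. a 60000 ms starting level at 1,000,000 bit/s becomes
+  // 60000 * 1000000 / 1000 = 60,000,000 bits.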
+  cpi->oxcf.starting_buffer_level =
+    rescale(cpi->oxcf.starting_buffer_level,
+            cpi->oxcf.target_bandwidth, 1000);
+
+  // Set or reset optimal and maximum buffer levels.
+  if (cpi->oxcf.optimal_buffer_level == 0)
+    cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
+  else
+    cpi->oxcf.optimal_buffer_level =
+      rescale(cpi->oxcf.optimal_buffer_level,
+              cpi->oxcf.target_bandwidth, 1000);
+
+  if (cpi->oxcf.maximum_buffer_size == 0)
+    cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
+  else
+    cpi->oxcf.maximum_buffer_size =
+      rescale(cpi->oxcf.maximum_buffer_size,
+              cpi->oxcf.target_bandwidth, 1000);
+
+  // Set up frame rate and related rate control parameters.
+  vp9_new_frame_rate(cpi, cpi->oxcf.frame_rate);
+
+  // Set absolute upper and lower quality limits
+  cpi->worst_quality               = cpi->oxcf.worst_allowed_q;
+  cpi->best_quality                = cpi->oxcf.best_allowed_q;
+
+  // active values should only be modified if out of new range
+  if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q) {
+    cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+  }
+  // less likely
+  else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q) {
+    cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
+  }
+  if (cpi->active_best_quality < cpi->oxcf.best_allowed_q) {
+    cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+  }
+  // less likely
+  else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q) {
+    cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
+  }
+
+  cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
+
+  cpi->cq_target_quality = cpi->oxcf.cq_level;
+
+  if (!cm->use_bilinear_mc_filter)
+    cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
+  else
+    cm->mcomp_filter_type = BILINEAR;
+
+  cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
+
+  cm->Width       = cpi->oxcf.Width;
+  cm->Height      = cpi->oxcf.Height;
+
+  cm->horiz_scale  = cpi->horiz_scale;
+  cm->vert_scale   = cpi->vert_scale;
+
+  // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
+  if (cpi->oxcf.Sharpness > 7)
+    cpi->oxcf.Sharpness = 7;
+
+  cm->sharpness_level = cpi->oxcf.Sharpness;
+
+  if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) {
+    int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+    int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+
+    Scale2Ratio(cm->horiz_scale, &hr, &hs);
+    Scale2Ratio(cm->vert_scale, &vr, &vs);
+
+    // always go to the next whole number
+    cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
+    cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
+  }
+
+  if (((cm->Width + 15) & 0xfffffff0) !=
+      cm->yv12_fb[cm->lst_fb_idx].y_width ||
+      ((cm->Height + 15) & 0xfffffff0) !=
+      cm->yv12_fb[cm->lst_fb_idx].y_height ||
+      cm->yv12_fb[cm->lst_fb_idx].y_width == 0) {
+    alloc_raw_frame_buffers(cpi);
+    vp9_alloc_compressor_data(cpi);
+  }
+
+  if (cpi->oxcf.fixed_q >= 0) {
+    cpi->last_q[0] = cpi->oxcf.fixed_q;
+    cpi->last_q[1] = cpi->oxcf.fixed_q;
+    cpi->last_boosted_qindex = cpi->oxcf.fixed_q;
+  }
+
+  cpi->Speed = cpi->oxcf.cpu_used;
+
+  // Force allow_lag to 0 if lag_in_frames is 0.
+  if (cpi->oxcf.lag_in_frames == 0) {
+    cpi->oxcf.allow_lag = 0;
+  }
+  // Limit on lag buffers as these are not currently dynamically allocated
+  else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
+    cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
+
+  // YX Temp
+  cpi->alt_ref_source = NULL;
+  cpi->is_src_frame_alt_ref = 0;
+
+#if 0
+  // Experimental RD Code
+  cpi->frame_distortion = 0;
+  cpi->last_frame_distortion = 0;
+#endif
+
+}
+
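+// Note: despite the name, the constant below is ln(2); log(x) / ln(2) is
+// log2(x). The macro also shadows C99's log2f(), presumably to cover
+// pre-C99 toolchains.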
+#define M_LOG2_E 0.693147180559945309417
+#define log2f(x) (log(x) / (float) M_LOG2_E)
+
+static void cal_nmvjointsadcost(int *mvjointsadcost) {
+  mvjointsadcost[0] = 600;
+  mvjointsadcost[1] = 300;
+  mvjointsadcost[2] = 300;
+  mvjointsadcost[3] = 300;
+}
+
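+// SAD costs grow roughly logarithmically with motion vector magnitude and
+// are symmetric around zero; cal_nmvsadcosts_hp() below is currently an
+// identical copy used for the high-precision MV tables.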
+static void cal_nmvsadcosts(int *mvsadcost[2]) {
+  int i = 1;
+
+  mvsadcost[0][0] = 0;
+  mvsadcost[1][0] = 0;
+
+  do {
+    double z = 256 * (2 * (log2f(8 * i) + .6));
+    mvsadcost[0][i] = (int)z;
+    mvsadcost[1][i] = (int)z;
+    mvsadcost[0][-i] = (int)z;
+    mvsadcost[1][-i] = (int)z;
+  } while (++i <= MV_MAX);
+}
+
+static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
+  int i = 1;
+
+  mvsadcost[0][0] = 0;
+  mvsadcost[1][0] = 0;
+
+  do {
+    double z = 256 * (2 * (log2f(8 * i) + .6));
+    mvsadcost[0][i] = (int)z;
+    mvsadcost[1][i] = (int)z;
+    mvsadcost[0][-i] = (int)z;
+    mvsadcost[1][-i] = (int)z;
+  } while (++i <= MV_MAX);
+}
+
+VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
+  int i;
+  volatile union {
+    VP9_COMP *cpi;
+    VP9_PTR   ptr;
+  } ctx;
+
+  VP9_COMP *cpi;
+  VP9_COMMON *cm;
+
+  cpi = ctx.cpi = vpx_memalign(32, sizeof(VP9_COMP));
+  // Check that the CPI instance is valid
+  if (!cpi)
+    return 0;
+
+  cm = &cpi->common;
+
+  vpx_memset(cpi, 0, sizeof(VP9_COMP));
+
+  if (setjmp(cm->error.jmp)) {
+    VP9_PTR ptr = ctx.ptr;
+
+    ctx.cpi->common.error.setjmp = 0;
+    vp9_remove_compressor(&ptr);
+    return 0;
+  }
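+  // Any later vpx_internal_error() call longjmp()s back to the handler
+  // above; ctx is a volatile union so the compressor pointer stays valid
+  // across the jump.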
+
+  cpi->common.error.setjmp = 1;
+
+  CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
+
+  vp9_create_common(&cpi->common);
+  vp9_cmachine_specific_config(cpi);
+
+  init_config((VP9_PTR)cpi, oxcf);
+
+  memcpy(cpi->base_skip_false_prob, base_skip_false_prob, sizeof(base_skip_false_prob));
+  cpi->common.current_video_frame   = 0;
+  cpi->kf_overspend_bits            = 0;
+  cpi->kf_bitrate_adjustment        = 0;
+  cpi->frames_till_gf_update_due    = 0;
+  cpi->gf_overspend_bits            = 0;
+  cpi->non_gf_bitrate_adjustment    = 0;
+  cm->prob_last_coded               = 128;
+  cm->prob_gf_coded                 = 128;
+  cm->prob_intra_coded              = 63;
+#if CONFIG_SUPERBLOCKS
+  cm->sb_coded                      = 200;
+#endif
+  for (i = 0; i < COMP_PRED_CONTEXTS; i++)
+    cm->prob_comppred[i]         = 128;
+  for (i = 0; i < TX_SIZE_MAX - 1; i++)
+    cm->prob_tx[i]               = 128;
+
+  // Prime the recent reference frame usage counters.
+  // Hereafter they will be maintained as a sort of moving average
+  cpi->recent_ref_frame_usage[INTRA_FRAME]  = 1;
+  cpi->recent_ref_frame_usage[LAST_FRAME]   = 1;
+  cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
+  cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
+
+  // Set reference frame sign bias for ALTREF frame to 1 (for now)
+  cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+
+  cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+
+  cpi->gold_is_last = 0;
+  cpi->alt_is_last  = 0;
+  cpi->gold_is_alt  = 0;
+
+  // allocate memory for storing last frame's MVs for MV prediction.
+  CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int_mv)));
+  CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int)));
+  CHECK_MEM_ERROR(cpi->lf_ref_frame, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int)));
+
+  // Create the encoder segmentation map and set all entries to 0
+  CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+
+  // And a copy in common for temporal coding
+  CHECK_MEM_ERROR(cm->last_frame_seg_map,
+                  vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+
+  // And a placeholder structure in the coding context
+  // for use if we want to save and restore it
+  CHECK_MEM_ERROR(cpi->coding_context.last_frame_seg_map_copy,
+                  vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+
+  CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+  vpx_memset(cpi->active_map, 1, (cpi->common.mb_rows * cpi->common.mb_cols));
+  cpi->active_map_enabled = 0;
+
+  for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
+                   sizeof(cpi->mbgraph_stats[0])); i++) {
+    CHECK_MEM_ERROR(cpi->mbgraph_stats[i].mb_stats,
+                    vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols *
+                               sizeof(*cpi->mbgraph_stats[i].mb_stats),
+                               1));
+  }
+
+#ifdef ENTROPY_STATS
+  if (cpi->pass != 1)
+    init_context_counters();
+#endif
+#ifdef MODE_STATS
+  vp9_zero(y_modes);
+  vp9_zero(i8x8_modes);
+  vp9_zero(uv_modes);
+  vp9_zero(uv_modes_y);
+  vp9_zero(b_modes);
+  vp9_zero(inter_y_modes);
+  vp9_zero(inter_uv_modes);
+  vp9_zero(inter_b_modes);
+#endif
+#ifdef NMV_STATS
+  init_nmvstats();
+#endif
+
+  /* Initialize the feed-forward activity masking. */
+  cpi->activity_avg = 90 << 12;
+
+  cpi->frames_since_key = 8;        // Give a sensible default for the first frame.
+  cpi->key_frame_frequency = cpi->oxcf.key_freq;
+  cpi->this_key_frame_forced = FALSE;
+  cpi->next_key_frame_forced = FALSE;
+
+  cpi->source_alt_ref_pending = FALSE;
+  cpi->source_alt_ref_active = FALSE;
+  cpi->common.refresh_alt_ref_frame = 0;
+
+  cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+#if CONFIG_INTERNAL_STATS
+  cpi->b_calculate_ssimg = 0;
+
+  cpi->count = 0;
+  cpi->bytes = 0;
+
+  if (cpi->b_calculate_psnr) {
+    cpi->total_sq_error = 0.0;
+    cpi->total_sq_error2 = 0.0;
+    cpi->total_y = 0.0;
+    cpi->total_u = 0.0;
+    cpi->total_v = 0.0;
+    cpi->total = 0.0;
+    cpi->totalp_y = 0.0;
+    cpi->totalp_u = 0.0;
+    cpi->totalp_v = 0.0;
+    cpi->totalp = 0.0;
+    cpi->tot_recode_hits = 0;
+    cpi->summed_quality = 0;
+    cpi->summed_weights = 0;
+  }
+
+  if (cpi->b_calculate_ssimg) {
+    cpi->total_ssimg_y = 0;
+    cpi->total_ssimg_u = 0;
+    cpi->total_ssimg_v = 0;
+    cpi->total_ssimg_all = 0;
+  }
+
+#endif
+
+#ifndef LLONG_MAX
+#define LLONG_MAX  9223372036854775807LL
+#endif
+  cpi->first_time_stamp_ever = LLONG_MAX;
+
+  cpi->frames_till_gf_update_due          = 0;
+  cpi->key_frame_count                    = 1;
+
+  cpi->ni_av_qi                           = cpi->oxcf.worst_allowed_q;
+  cpi->ni_tot_qi                          = 0;
+  cpi->ni_frames                          = 0;
+  cpi->tot_q                              = 0.0;
+  cpi->avg_q = vp9_convert_qindex_to_q(cpi->oxcf.worst_allowed_q);
+  cpi->total_byte_count                   = 0;
+
+  cpi->rate_correction_factor             = 1.0;
+  cpi->key_frame_rate_correction_factor   = 1.0;
+  cpi->gf_rate_correction_factor          = 1.0;
+  cpi->twopass.est_max_qcorrection_factor = 1.0;
+
+  cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
+  cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX];
+  cpi->mb.nmvcost[1] = &cpi->mb.nmvcosts[1][MV_MAX];
+  cpi->mb.nmvsadcost[0] = &cpi->mb.nmvsadcosts[0][MV_MAX];
+  cpi->mb.nmvsadcost[1] = &cpi->mb.nmvsadcosts[1][MV_MAX];
+  cal_nmvsadcosts(cpi->mb.nmvsadcost);
+
+  cpi->mb.nmvcost_hp[0] = &cpi->mb.nmvcosts_hp[0][MV_MAX];
+  cpi->mb.nmvcost_hp[1] = &cpi->mb.nmvcosts_hp[1][MV_MAX];
+  cpi->mb.nmvsadcost_hp[0] = &cpi->mb.nmvsadcosts_hp[0][MV_MAX];
+  cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
+  cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
+
+  for (i = 0; i < KEY_FRAME_CONTEXT; i++) {
+    cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
+  }
+
+#ifdef OUTPUT_YUV_SRC
+  yuv_file = fopen("bd.yuv", "ab");
+#endif
+#ifdef OUTPUT_YUV_REC
+  yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
+
+#if 0
+  framepsnr = fopen("framepsnr.stt", "a");
+  kf_list = fopen("kf_list.stt", "w");
+#endif
+
+  cpi->output_pkt_list = oxcf->output_pkt_list;
+
+  if (cpi->pass == 1) {
+    vp9_init_first_pass(cpi);
+  } else if (cpi->pass == 2) {
+    size_t packet_sz = sizeof(FIRSTPASS_STATS);
+    int packets = oxcf->two_pass_stats_in.sz / packet_sz;
+
+    cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
+    cpi->twopass.stats_in = cpi->twopass.stats_in_start;
+    cpi->twopass.stats_in_end = (void *)((char *)cpi->twopass.stats_in
+                                         + (packets - 1) * packet_sz);
+    vp9_init_second_pass(cpi);
+  }
+
+  vp9_set_speed_features(cpi);
+
+  // Set starting values of RD threshold multipliers (128 = *1)
+  for (i = 0; i < MAX_MODES; i++) {
+    cpi->rd_thresh_mult[i] = 128;
+  }
+
+#ifdef ENTROPY_STATS
+  init_mv_ref_counts();
+#endif
+
+#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \
+    cpi->fn_ptr[BT].sdf            = SDF; \
+    cpi->fn_ptr[BT].vf             = VF; \
+    cpi->fn_ptr[BT].svf            = SVF; \
+    cpi->fn_ptr[BT].svf_halfpix_h  = SVFHH; \
+    cpi->fn_ptr[BT].svf_halfpix_v  = SVFHV; \
+    cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
+    cpi->fn_ptr[BT].sdx3f          = SDX3F; \
+    cpi->fn_ptr[BT].sdx8f          = SDX8F; \
+    cpi->fn_ptr[BT].sdx4df         = SDX4DF;
+
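+// BFP wires the SAD, variance, sub-pixel variance, half-pel variance and
+// multi-candidate SAD kernels for one block size into cpi->fn_ptr[].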
+
+#if CONFIG_SUPERBLOCKS
+  BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
+      vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v,
+      vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
+      vp9_sad32x32x4d)
+#endif
+
+  BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
+       vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v,
+       vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
+       vp9_sad16x16x4d)
+
+  BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
+      NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
+
+  BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
+      NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
+
+  BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
+      NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
+
+  BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
+      NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
+
+#if ARCH_X86 || ARCH_X86_64
+  cpi->fn_ptr[BLOCK_16X16].copymem  = vp9_copy32xn;
+  cpi->fn_ptr[BLOCK_16X8].copymem   = vp9_copy32xn;
+  cpi->fn_ptr[BLOCK_8X16].copymem   = vp9_copy32xn;
+  cpi->fn_ptr[BLOCK_8X8].copymem    = vp9_copy32xn;
+  cpi->fn_ptr[BLOCK_4X4].copymem    = vp9_copy32xn;
+#endif
+
+  cpi->full_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, full_search);
+  cpi->diamond_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, diamond_search);
+  cpi->refining_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, refining_search);
+
+  // make sure frame 1 is okay
+  cpi->error_bins[0] = cpi->common.MBs;
+
+  /* vp9_init_quantizer() is first called here. Add check in
+   * vp9_frame_init_quantizer() so that vp9_init_quantizer is only
+   * called later when needed. This will avoid unnecessary calls of
+   * vp9_init_quantizer() for every frame.
+   */
+  vp9_init_quantizer(cpi);
+
+  vp9_loop_filter_init(cm);
+
+  cpi->common.error.setjmp = 0;
+
+  vp9_zero(cpi->y_uv_mode_count)
+
+  return (VP9_PTR) cpi;
+}
+
+void vp9_remove_compressor(VP9_PTR *ptr) {
+  VP9_COMP *cpi = (VP9_COMP *)(*ptr);
+  int i;
+
+  if (!cpi)
+    return;
+
+  if (cpi->common.current_video_frame > 0) {
+    if (cpi->pass == 2) {
+      vp9_end_second_pass(cpi);
+    }
+
+#ifdef ENTROPY_STATS
+    if (cpi->pass != 1) {
+      print_context_counters();
+      print_tree_update_probs();
+      print_mode_context();
+    }
+#endif
+#ifdef NMV_STATS
+    if (cpi->pass != 1)
+      print_nmvstats();
+#endif
+
+#if CONFIG_INTERNAL_STATS
+
+    vp9_clear_system_state();
+
+    // printf("\n8x8-4x4:%d-%d\n", cpi->t8x8_count, cpi->t4x4_count);
+    if (cpi->pass != 1) {
+      FILE *f = fopen("opsnr.stt", "a");
+      double time_encoded = (cpi->last_end_time_stamp_seen
+                             - cpi->first_time_stamp_ever) / 10000000.000;
+      double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data)   / 1000.000;
+      double dr = (double)cpi->bytes * (double) 8 / (double)1000  / time_encoded;
+#if defined(MODE_STATS)
+      print_mode_contexts(&cpi->common);
+#endif
+      if (cpi->b_calculate_psnr) {
+        YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
+        double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height;
+        double total_psnr = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error);
+        double total_psnr2 = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error2);
+        double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
+
+        fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\tVPXSSIM\t  Time(ms)\n");
+        fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
+                dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
+                total_encode_time);
+//                fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f %10ld\n",
+//                        dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
+//                        total_encode_time, cpi->tot_recode_hits);
+      }
+
+      if (cpi->b_calculate_ssimg) {
+        fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t  Time(ms)\n");
+        fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
+                cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
+                cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time);
+//                fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f  %10ld\n", dr,
+//                        cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
+//                        cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time, cpi->tot_recode_hits);
+      }
+
+      fclose(f);
+    }
+
+#endif
+
+
+#ifdef MODE_STATS
+    {
+      extern int count_mb_seg[4];
+      char modes_stats_file[250];
+      FILE *f;
+      double dr = (double)cpi->oxcf.frame_rate * (double)cpi->bytes * (double)8 / (double)cpi->count / (double)1000;
+      sprintf(modes_stats_file, "modes_q%03d.stt", cpi->common.base_qindex);
+      f = fopen(modes_stats_file, "w");
+      fprintf(f, "intra_mode in Intra Frames:\n");
+      {
+        int i;
+        fprintf(f, "Y: ");
+        for (i = 0; i < VP9_YMODES; i++) fprintf(f, " %8d,", y_modes[i]);
+        fprintf(f, "\n");
+      }
+      {
+        int i;
+        fprintf(f, "I8: ");
+        for (i = 0; i < VP9_I8X8_MODES; i++) fprintf(f, " %8d,", i8x8_modes[i]);
+        fprintf(f, "\n");
+      }
+      {
+        int i;
+        fprintf(f, "UV: ");
+        for (i = 0; i < VP9_UV_MODES; i++) fprintf(f, " %8d,", uv_modes[i]);
+        fprintf(f, "\n");
+      }
+      {
+        int i, j;
+        fprintf(f, "KeyFrame Y-UV:\n");
+        for (i = 0; i < VP9_YMODES; i++) {
+          fprintf(f, "%2d:", i);
+          for (j = 0; j < VP9_UV_MODES; j++) fprintf(f, "%8d, ", uv_modes_y[i][j]);
+          fprintf(f, "\n");
+        }
+      }
+      {
+        int i, j;
+        fprintf(f, "Inter Y-UV:\n");
+        for (i = 0; i < VP9_YMODES; i++) {
+          fprintf(f, "%2d:", i);
+          for (j = 0; j < VP9_UV_MODES; j++) fprintf(f, "%8d, ", cpi->y_uv_mode_count[i][j]);
+          fprintf(f, "\n");
+        }
+      }
+      {
+        int i;
+
+        fprintf(f, "B: ");
+        for (i = 0; i < VP9_BINTRAMODES; i++)
+          fprintf(f, "%8d, ", b_modes[i]);
+
+        fprintf(f, "\n");
+
+      }
+
+      fprintf(f, "Modes in Inter Frames:\n");
+      {
+        int i;
+        fprintf(f, "Y: ");
+        for (i = 0; i < MB_MODE_COUNT; i++) fprintf(f, " %8d,", inter_y_modes[i]);
+        fprintf(f, "\n");
+      }
+      {
+        int i;
+        fprintf(f, "UV: ");
+        for (i = 0; i < VP9_UV_MODES; i++) fprintf(f, " %8d,", inter_uv_modes[i]);
+        fprintf(f, "\n");
+      }
+      {
+        int i;
+        fprintf(f, "B: ");
+        for (i = 0; i < B_MODE_COUNT; i++) fprintf(f, "%8d, ", inter_b_modes[i]);
+        fprintf(f, "\n");
+      }
+      fprintf(f, "P:%8d, %8d, %8d, %8d\n", count_mb_seg[0], count_mb_seg[1], count_mb_seg[2], count_mb_seg[3]);
+      fprintf(f, "PB:%8d, %8d, %8d, %8d\n", inter_b_modes[LEFT4X4], inter_b_modes[ABOVE4X4], inter_b_modes[ZERO4X4], inter_b_modes[NEW4X4]);
+      fclose(f);
+    }
+#endif
+
+#ifdef ENTROPY_STATS
+    {
+      int i, j, k;
+      FILE *fmode = fopen("modecontext.c", "w");
+
+      fprintf(fmode, "\n#include \"entropymode.h\"\n\n");
+      fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts ");
+      fprintf(fmode, "[VP9_BINTRAMODES] [VP9_BINTRAMODES] [VP9_BINTRAMODES] =\n{\n");
+
+      for (i = 0; i < VP9_BINTRAMODES; i++) {
+        fprintf(fmode, "    { // Above Mode :  %d\n", i);
+
+        for (j = 0; j < VP9_BINTRAMODES; j++) {
+          fprintf(fmode, "        {");
+
+          for (k = 0; k < VP9_BINTRAMODES; k++) {
+            if (!intra_mode_stats[i][j][k])
+              fprintf(fmode, " %5d, ", 1);
+            else
+              fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]);
+          }
+
+          fprintf(fmode, "}, // left_mode %d\n", j);
+        }
+
+        fprintf(fmode, "    },\n");
+      }
+
+      fprintf(fmode, "};\n");
+      fclose(fmode);
+    }
+#endif
+
+
+#if defined(SECTIONBITS_OUTPUT)
+
+    if (0) {
+      int i;
+      FILE *f = fopen("tokenbits.stt", "a");
+
+      for (i = 0; i < 28; i++)
+        fprintf(f, "%8d", (int)(Sectionbits[i] / 256));
+
+      fprintf(f, "\n");
+      fclose(f);
+    }
+
+#endif
+
+#if 0
+    {
+      printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
+      printf("\n_frames recive_data encod_mb_row compress_frame  Total\n");
+      printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000);
+    }
+#endif
+
+  }
+
+  dealloc_compressor_data(cpi);
+  vpx_free(cpi->mb.ss);
+  vpx_free(cpi->tok);
+
+  for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]); i++) {
+    vpx_free(cpi->mbgraph_stats[i].mb_stats);
+  }
+
+  vp9_remove_common(&cpi->common);
+  vpx_free(cpi);
+  *ptr = 0;
+
+#ifdef OUTPUT_YUV_SRC
+  fclose(yuv_file);
+#endif
+#ifdef OUTPUT_YUV_REC
+  fclose(yuv_rec_file);
+#endif
+
+#if 0
+
+  if (keyfile)
+    fclose(keyfile);
+
+  if (framepsnr)
+    fclose(framepsnr);
+
+  if (kf_list)
+    fclose(kf_list);
+
+#endif
+
+}
+
+
+static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
+                                 unsigned char *recon, int recon_stride,
+                                 unsigned int cols, unsigned int rows) {
+  unsigned int row, col;
+  uint64_t total_sse = 0;
+  int diff;
+
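+  /* Sum the SSE over 16x16 tiles with the optimized vp9_mse16x16() kernel;
+   * the odd-sized right and bottom edges are accumulated in plain C below. */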
+  for (row = 0; row + 16 <= rows; row += 16) {
+    for (col = 0; col + 16 <= cols; col += 16) {
+      unsigned int sse;
+
+      vp9_mse16x16(orig + col, orig_stride, recon + col, recon_stride, &sse);
+      total_sse += sse;
+    }
+
+    /* Handle odd-sized width */
+    if (col < cols) {
+      unsigned int   border_row, border_col;
+      unsigned char *border_orig = orig;
+      unsigned char *border_recon = recon;
+
+      for (border_row = 0; border_row < 16; border_row++) {
+        for (border_col = col; border_col < cols; border_col++) {
+          diff = border_orig[border_col] - border_recon[border_col];
+          total_sse += diff * diff;
+        }
+
+        border_orig += orig_stride;
+        border_recon += recon_stride;
+      }
+    }
+
+    orig += orig_stride * 16;
+    recon += recon_stride * 16;
+  }
+
+  /* Handle odd-sized height */
+  for (; row < rows; row++) {
+    for (col = 0; col < cols; col++) {
+      diff = orig[col] - recon[col];
+      total_sse += diff * diff;
+    }
+
+    orig += orig_stride;
+    recon += recon_stride;
+  }
+
+  return total_sse;
+}
+
+
+static void generate_psnr_packet(VP9_COMP *cpi) {
+  YV12_BUFFER_CONFIG      *orig = cpi->Source;
+  YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
+  struct vpx_codec_cx_pkt  pkt;
+  uint64_t                 sse;
+  int                      i;
+  unsigned int             width = cpi->common.Width;
+  unsigned int             height = cpi->common.Height;
+
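+  // sse[0]/samples[0] accumulate totals for the whole frame; indices 1..3
+  // hold the Y, U and V planes. Chroma planes are (width + 1) / 2 by
+  // (height + 1) / 2 samples.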
+  pkt.kind = VPX_CODEC_PSNR_PKT;
+  sse = calc_plane_error(orig->y_buffer, orig->y_stride,
+                         recon->y_buffer, recon->y_stride,
+                         width, height);
+  pkt.data.psnr.sse[0] = sse;
+  pkt.data.psnr.sse[1] = sse;
+  pkt.data.psnr.samples[0] = width * height;
+  pkt.data.psnr.samples[1] = width * height;
+
+  width = (width + 1) / 2;
+  height = (height + 1) / 2;
+
+  sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
+                         recon->u_buffer, recon->uv_stride,
+                         width, height);
+  pkt.data.psnr.sse[0] += sse;
+  pkt.data.psnr.sse[2] = sse;
+  pkt.data.psnr.samples[0] += width * height;
+  pkt.data.psnr.samples[2] = width * height;
+
+  sse = calc_plane_error(orig->v_buffer, orig->uv_stride,
+                         recon->v_buffer, recon->uv_stride,
+                         width, height);
+  pkt.data.psnr.sse[0] += sse;
+  pkt.data.psnr.sse[3] = sse;
+  pkt.data.psnr.samples[0] += width * height;
+  pkt.data.psnr.samples[3] = width * height;
+
+  for (i = 0; i < 4; i++)
+    pkt.data.psnr.psnr[i] = vp9_mse2psnr(pkt.data.psnr.samples[i], 255.0,
+                                         pkt.data.psnr.sse[i]);
+
+  vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+}
+
+
+int vp9_use_as_reference(VP9_PTR ptr, int ref_frame_flags) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+  if (ref_frame_flags > 7)
+    return -1;
+
+  cpi->ref_frame_flags = ref_frame_flags;
+  return 0;
+}
+
+int vp9_update_reference(VP9_PTR ptr, int ref_frame_flags) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+  if (ref_frame_flags > 7)
+    return -1;
+
+  cpi->common.refresh_golden_frame = 0;
+  cpi->common.refresh_alt_ref_frame = 0;
+  cpi->common.refresh_last_frame   = 0;
+
+  if (ref_frame_flags & VP9_LAST_FLAG)
+    cpi->common.refresh_last_frame = 1;
+
+  if (ref_frame_flags & VP9_GOLD_FLAG)
+    cpi->common.refresh_golden_frame = 1;
+
+  if (ref_frame_flags & VP9_ALT_FLAG)
+    cpi->common.refresh_alt_ref_frame = 1;
+
+  return 0;
+}
+
+int vp9_get_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
+                          YV12_BUFFER_CONFIG *sd) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+  VP9_COMMON *cm = &cpi->common;
+  int ref_fb_idx;
+
+  if (ref_frame_flag == VP9_LAST_FLAG)
+    ref_fb_idx = cm->lst_fb_idx;
+  else if (ref_frame_flag == VP9_GOLD_FLAG)
+    ref_fb_idx = cm->gld_fb_idx;
+  else if (ref_frame_flag == VP9_ALT_FLAG)
+    ref_fb_idx = cm->alt_fb_idx;
+  else
+    return -1;
+
+  vp8_yv12_copy_frame_ptr(&cm->yv12_fb[ref_fb_idx], sd);
+
+  return 0;
+}
+
+int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
+                          YV12_BUFFER_CONFIG *sd) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+  VP9_COMMON *cm = &cpi->common;
+
+  int ref_fb_idx;
+
+  if (ref_frame_flag == VP9_LAST_FLAG)
+    ref_fb_idx = cm->lst_fb_idx;
+  else if (ref_frame_flag == VP9_GOLD_FLAG)
+    ref_fb_idx = cm->gld_fb_idx;
+  else if (ref_frame_flag == VP9_ALT_FLAG)
+    ref_fb_idx = cm->alt_fb_idx;
+  else
+    return -1;
+
+  vp8_yv12_copy_frame_ptr(sd, &cm->yv12_fb[ref_fb_idx]);
+
+  return 0;
+}
+int vp9_update_entropy(VP9_PTR comp, int update) {
+  VP9_COMP *cpi = (VP9_COMP *) comp;
+  VP9_COMMON *cm = &cpi->common;
+  cm->refresh_entropy_probs = update;
+
+  return 0;
+}
+
+
+#ifdef OUTPUT_YUV_SRC
+void vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s) {
+  unsigned char *src = s->y_buffer;
+  int h = s->y_height;
+
+  do {
+    fwrite(src, s->y_width, 1,  yuv_file);
+    src += s->y_stride;
+  } while (--h);
+
+  src = s->u_buffer;
+  h = s->uv_height;
+
+  do {
+    fwrite(src, s->uv_width, 1,  yuv_file);
+    src += s->uv_stride;
+  } while (--h);
+
+  src = s->v_buffer;
+  h = s->uv_height;
+
+  do {
+    fwrite(src, s->uv_width, 1, yuv_file);
+    src += s->uv_stride;
+  } while (--h);
+}
+#endif
+
+#ifdef OUTPUT_YUV_REC
+void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
+  YV12_BUFFER_CONFIG *s = cm->frame_to_show;
+  unsigned char *src = s->y_buffer;
+  int h = cm->Height;
+
+  do {
+    fwrite(src, s->y_width, 1,  yuv_rec_file);
+    src += s->y_stride;
+  } while (--h);
+
+  src = s->u_buffer;
+  h = (cm->Height + 1) / 2;
+
+  do {
+    fwrite(src, s->uv_width, 1,  yuv_rec_file);
+    src += s->uv_stride;
+  } while (--h);
+
+  src = s->v_buffer;
+  h = (cm->Height + 1) / 2;
+
+  do {
+    fwrite(src, s->uv_width, 1, yuv_rec_file);
+    src += s->uv_stride;
+  } while (--h);
+}
+#endif
+
+static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  // Update data structure that monitors level of reference to last GF
+  vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+  cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+  // This frame's refresh means subsequent frames don't refresh unless the user requests it
+  cpi->common.frames_since_golden = 0;
+
+  // Clear the alternate reference update pending flag.
+  cpi->source_alt_ref_pending = FALSE;
+
+  // Set the alternate reference frame active flag
+  cpi->source_alt_ref_active = TRUE;
+}
+
+static void update_golden_frame_stats(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  // Update the Golden frame usage counts.
+  if (cm->refresh_golden_frame) {
+    // Update data structure that monitors level of reference to last GF
+    vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+    cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+    // This frame's refresh means subsequent frames don't refresh unless the user requests it
+    cm->refresh_golden_frame = 0;
+    cpi->common.frames_since_golden = 0;
+
+    // if ( cm->frame_type == KEY_FRAME )
+    // {
+    cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
+    cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
+    cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
+    cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
+    // }
+    // else
+    // {
+    //  // Carry a portion of the count over to the beginning of the next gf sequence
+    //  cpi->recent_ref_frame_usage[INTRA_FRAME] >>= 5;
+    //  cpi->recent_ref_frame_usage[LAST_FRAME] >>= 5;
+    //  cpi->recent_ref_frame_usage[GOLDEN_FRAME] >>= 5;
+    //  cpi->recent_ref_frame_usage[ALTREF_FRAME] >>= 5;
+    // }
+
+    // ******** Fixed Q test code only ************
+    // If we are going to use the ALT reference for the next group of
+    // frames, set a flag to say so.
+    if (cpi->oxcf.fixed_q >= 0 &&
+        cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame) {
+      cpi->source_alt_ref_pending = TRUE;
+      cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+    }
+
+    if (!cpi->source_alt_ref_pending)
+      cpi->source_alt_ref_active = FALSE;
+
+    // Decrement count down till next gf
+    if (cpi->frames_till_gf_update_due > 0)
+      cpi->frames_till_gf_update_due--;
+
+  } else if (!cpi->common.refresh_alt_ref_frame) {
+    // Decrement count down till next gf
+    if (cpi->frames_till_gf_update_due > 0)
+      cpi->frames_till_gf_update_due--;
+
+    if (cpi->common.frames_till_alt_ref_frame)
+      cpi->common.frames_till_alt_ref_frame--;
+
+    cpi->common.frames_since_golden++;
+
+    if (cpi->common.frames_since_golden > 1) {
+      cpi->recent_ref_frame_usage[INTRA_FRAME] += cpi->count_mb_ref_frame_usage[INTRA_FRAME];
+      cpi->recent_ref_frame_usage[LAST_FRAME] += cpi->count_mb_ref_frame_usage[LAST_FRAME];
+      cpi->recent_ref_frame_usage[GOLDEN_FRAME] += cpi->count_mb_ref_frame_usage[GOLDEN_FRAME];
+      cpi->recent_ref_frame_usage[ALTREF_FRAME] += cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
+    }
+  }
+}
+
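+// Return the lowest qindex whose real Q value reaches 30.0 (clamped to the
+// valid range); used to fix the quantizer for the first pass.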
+static int find_fp_qindex(void) {
+  int i;
+
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    if (vp9_convert_qindex_to_q(i) >= 30.0) {
+      break;
+    }
+  }
+
+  if (i == QINDEX_RANGE)
+    i--;
+
+  return i;
+}
+
+static void Pass1Encode(VP9_COMP *cpi, unsigned long *size,
+                        unsigned char *dest, unsigned int *frame_flags) {
+  (void) size;
+  (void) dest;
+  (void) frame_flags;
+
+  vp9_set_quantizer(cpi, find_fp_qindex());
+  vp9_first_pass(cpi);
+}
+
+#define WRITE_RECON_BUFFER 0
+#if WRITE_RECON_BUFFER
+void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
+  // Write the frame out as separate raw Y, U and V plane files
+  FILE *yframe;
+  int i;
+  char filename[255];
+
+  sprintf(filename, "cx\\y%04d.raw", this_frame);
+  yframe = fopen(filename, "wb");
+
+  for (i = 0; i < frame->y_height; i++)
+    fwrite(frame->y_buffer + i * frame->y_stride,
+           frame->y_width, 1, yframe);
+
+  fclose(yframe);
+  sprintf(filename, "cx\\u%04d.raw", this_frame);
+  yframe = fopen(filename, "wb");
+
+  for (i = 0; i < frame->uv_height; i++)
+    fwrite(frame->u_buffer + i * frame->uv_stride,
+           frame->uv_width, 1, yframe);
+
+  fclose(yframe);
+  sprintf(filename, "cx\\v%04d.raw", this_frame);
+  yframe = fopen(filename, "wb");
+
+  for (i = 0; i < frame->uv_height; i++)
+    fwrite(frame->v_buffer + i * frame->uv_stride,
+           frame->uv_width, 1, yframe);
+
+  fclose(yframe);
+}
+#endif
+
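+// Return the proportion of interior pixels whose Sobel gradient magnitude
+// exceeds EDGE_THRESH in either direction.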
+static double compute_edge_pixel_proportion(YV12_BUFFER_CONFIG *frame) {
+#define EDGE_THRESH 128
+  int i, j;
+  int num_edge_pels = 0;
+  int num_pels = (frame->y_height - 2) * (frame->y_width - 2);
+  unsigned char *prev = frame->y_buffer + 1;
+  unsigned char *curr = frame->y_buffer + 1 + frame->y_stride;
+  unsigned char *next = frame->y_buffer + 1 + 2 * frame->y_stride;
+  for (i = 1; i < frame->y_height - 1; i++) {
+    for (j = 1; j < frame->y_width - 1; j++) {
+      /* Sobel hor and ver gradients */
+      int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) + (next[1] - next[-1]);
+      int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) + (prev[-1] - next[-1]);
+      h = (h < 0 ? -h : h);
+      v = (v < 0 ? -v : v);
+      if (h > EDGE_THRESH || v > EDGE_THRESH) num_edge_pels++;
+      curr++;
+      prev++;
+      next++;
+    }
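+    /* Advance to the next row; the +2 accounts for the left/right border
+     * pixels skipped by the inner loop. */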
+    curr += frame->y_stride - frame->y_width + 2;
+    prev += frame->y_stride - frame->y_width + 2;
+    next += frame->y_stride - frame->y_width + 2;
+  }
+  return (double)num_edge_pels / (double)num_pels;
+}
+
+// Function to test for conditions that indicate we should loop
+// back and recode a frame.
+static BOOL recode_loop_test(VP9_COMP *cpi,
+                             int high_limit, int low_limit,
+                             int q, int maxq, int minq) {
+  BOOL    force_recode = FALSE;
+  VP9_COMMON *cm = &cpi->common;
+
+  // Is frame recode allowed at all?
+  // Yes if either recode mode 1 is selected, or mode 2 is selected
+  // and the frame is a key frame, golden frame or alt_ref_frame.
+  if ((cpi->sf.recode_loop == 1) ||
+      ((cpi->sf.recode_loop == 2) &&
+       ((cm->frame_type == KEY_FRAME) ||
+        cm->refresh_golden_frame ||
+        cm->refresh_alt_ref_frame))) {
+    // General over and under shoot tests
+    if (((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
+        ((cpi->projected_frame_size < low_limit) && (q > minq))) {
+      force_recode = TRUE;
+    }
+    // Special Constrained quality tests
+    else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+      // Undershoot and below auto cq level
+      if ((q > cpi->cq_target_quality) &&
+          (cpi->projected_frame_size <
+           ((cpi->this_frame_target * 7) >> 3))) {
+        force_recode = TRUE;
+      }
+      // Severe undershoot and between auto and user cq level
+      else if ((q > cpi->oxcf.cq_level) &&
+               (cpi->projected_frame_size < cpi->min_frame_bandwidth) &&
+               (cpi->active_best_quality > cpi->oxcf.cq_level)) {
+        force_recode = TRUE;
+        cpi->active_best_quality = cpi->oxcf.cq_level;
+      }
+    }
+  }
+
+  return force_recode;
+}
+
+static void update_reference_frames(VP9_COMMON *cm) {
+  YV12_BUFFER_CONFIG *yv12_fb = cm->yv12_fb;
+
+  // At this point the new frame has been encoded.
+  // If any buffer copy / swapping is signaled it should be done here.
+
+  if (cm->frame_type == KEY_FRAME) {
+    yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG | VP9_ALT_FLAG;
+
+    yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
+    yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
+
+    cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx;
+  } else { /* For non key frames */
+    if (cm->refresh_alt_ref_frame) {
+      assert(!cm->copy_buffer_to_arf);
+
+      cm->yv12_fb[cm->new_fb_idx].flags |= VP9_ALT_FLAG;
+      cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
+      cm->alt_fb_idx = cm->new_fb_idx;
+    } else if (cm->copy_buffer_to_arf) {
+      assert(!(cm->copy_buffer_to_arf & ~0x3));
+
+      if (cm->copy_buffer_to_arf == 1) {
+        if (cm->alt_fb_idx != cm->lst_fb_idx) {
+          yv12_fb[cm->lst_fb_idx].flags |= VP9_ALT_FLAG;
+          yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
+          cm->alt_fb_idx = cm->lst_fb_idx;
+        }
+      } else { /* if (cm->copy_buffer_to_arf == 2) */
+        if (cm->alt_fb_idx != cm->gld_fb_idx) {
+          yv12_fb[cm->gld_fb_idx].flags |= VP9_ALT_FLAG;
+          yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;
+          cm->alt_fb_idx = cm->gld_fb_idx;
+        }
+      }
+    }
+
+    if (cm->refresh_golden_frame) {
+      assert(!cm->copy_buffer_to_gf);
+
+      cm->yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG;
+      cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
+      cm->gld_fb_idx = cm->new_fb_idx;
+    } else if (cm->copy_buffer_to_gf) {
+      assert(!(cm->copy_buffer_to_gf & ~0x3));
+
+      if (cm->copy_buffer_to_gf == 1) {
+        if (cm->gld_fb_idx != cm->lst_fb_idx) {
+          yv12_fb[cm->lst_fb_idx].flags |= VP9_GOLD_FLAG;
+          yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
+          cm->gld_fb_idx = cm->lst_fb_idx;
+        }
+      } else { /* if (cm->copy_buffer_to_gf == 2) */
+        if (cm->alt_fb_idx != cm->gld_fb_idx) {
+          yv12_fb[cm->alt_fb_idx].flags |= VP9_GOLD_FLAG;
+          yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;
+          cm->gld_fb_idx = cm->alt_fb_idx;
+        }
+      }
+    }
+  }
+
+  if (cm->refresh_last_frame) {
+    cm->yv12_fb[cm->new_fb_idx].flags |= VP9_LAST_FLAG;
+    cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP9_LAST_FLAG;
+    cm->lst_fb_idx = cm->new_fb_idx;
+  }
+}
+
+static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
+  if (cm->no_lpf) {
+    cm->filter_level = 0;
+  }
+#if CONFIG_LOSSLESS
+  else if (cpi->oxcf.lossless) {
+    cm->filter_level = 0;
+  }
+#endif
+  else {
+    struct vpx_usec_timer timer;
+
+    vp9_clear_system_state();
+
+    vpx_usec_timer_start(&timer);
+    if (cpi->sf.auto_filter == 0)
+      vp9_pick_filter_level_fast(cpi->Source, cpi);
+    else
+      vp9_pick_filter_level(cpi->Source, cpi);
+
+    vpx_usec_timer_mark(&timer);
+    cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
+  }
+
+  if (cm->filter_level > 0) {
+    vp9_set_alt_lf_level(cpi, cm->filter_level);
+    vp9_loop_filter_frame(cm, &cpi->mb.e_mbd);
+  }
+
+  vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
+}
+
+#if CONFIG_PRED_FILTER
+void select_pred_filter_mode(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  int prob_pred_filter_off = cm->prob_pred_filter_off;
+
+  // Force filter on/off if probability is extreme
+  if (prob_pred_filter_off >= 255 * 0.95)
+    cm->pred_filter_mode = 0;   // Off at the frame level
+  else if (prob_pred_filter_off <= 255 * 0.05)
+    cm->pred_filter_mode = 1;   // On at the frame level
+  else
+    cm->pred_filter_mode = 2;   // Selectable at the MB level
+}
+
+void update_pred_filt_prob(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  int prob_pred_filter_off;
+
+  // Based on the selection in the previous frame determine what mode
+  // to use for the current frame and work out the signaling probability
+  if (cpi->pred_filter_on_count + cpi->pred_filter_off_count) {
+    prob_pred_filter_off = cpi->pred_filter_off_count * 256 /
+                           (cpi->pred_filter_on_count + cpi->pred_filter_off_count);
+
+    if (prob_pred_filter_off < 1)
+      prob_pred_filter_off = 1;
+
+    if (prob_pred_filter_off > 255)
+      prob_pred_filter_off = 255;
+
+    cm->prob_pred_filter_off = prob_pred_filter_off;
+  } else
+    cm->prob_pred_filter_off = 128;
+  /*
+      {
+        FILE *fp = fopen("filt_use.txt", "a");
+        fprintf (fp, "%d %d prob=%d\n", cpi->pred_filter_off_count,
+                 cpi->pred_filter_on_count, cm->prob_pred_filter_off);
+        fclose(fp);
+      }
+  */
+}
+#endif
+
+static void encode_frame_to_data_rate(VP9_COMP *cpi,
+                                      unsigned long *size,
+                                      unsigned char *dest,
+                                      unsigned int *frame_flags) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+  int Q;
+  int frame_over_shoot_limit;
+  int frame_under_shoot_limit;
+
+  int Loop = FALSE;
+  int loop_count;
+  int this_q;
+  int last_zbin_oq;
+
+  int q_low;
+  int q_high;
+  int zbin_oq_high;
+  int zbin_oq_low = 0;
+
+  int top_index;
+  int bottom_index;
+  int active_worst_qchanged = FALSE;
+
+  int overshoot_seen = FALSE;
+  int undershoot_seen = FALSE;
+
+  int loop_size_estimate = 0;
+
+  SPEED_FEATURES *sf = &cpi->sf;
+#if RESET_FOREACH_FILTER
+  int q_low0;
+  int q_high0;
+  int zbin_oq_high0;
+  int zbin_oq_low0 = 0;
+  int Q0;
+  int last_zbin_oq0;
+  int active_best_quality0;
+  int active_worst_quality0;
+  double rate_correction_factor0;
+  double gf_rate_correction_factor0;
+#endif
+
+  /* list of filters to search over */
+  int mcomp_filters_to_search[] = {
+    EIGHTTAP, EIGHTTAP_SHARP, SIXTAP, SWITCHABLE
+  };
+  int mcomp_filters = sizeof(mcomp_filters_to_search) /
+      sizeof(*mcomp_filters_to_search);
+  int mcomp_filter_index = 0;
+  INT64 mcomp_filter_cost[4];  // one RD cost per candidate filter above
+
+  // Clear down mmx registers to allow floating point in what follows
+  vp9_clear_system_state();
+
+
+  // For an alt ref frame in 2 pass we skip the call to the second
+  // pass function that sets the target bandwidth so must set it here
+  if (cpi->common.refresh_alt_ref_frame) {
+    // Per frame bit target for the alt ref frame
+    cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
+    // Per second target bitrate
+    cpi->target_bandwidth = cpi->twopass.gf_bits * cpi->output_frame_rate;
+  }
+
+  // Default turn off buffer to buffer copying
+  cm->copy_buffer_to_gf = 0;
+  cm->copy_buffer_to_arf = 0;
+
+  // Clear zbin over-quant value and mode boost values.
+  cpi->zbin_over_quant = 0;
+  cpi->zbin_mode_boost = 0;
+
+  // Enable or disable mode based tweaking of the zbin
+  // For 2 Pass Only used where GF/ARF prediction quality
+  // is above a threshold
+  cpi->zbin_mode_boost = 0;
+#if CONFIG_LOSSLESS
+  cpi->zbin_mode_boost_enabled = FALSE;
+#else
+  cpi->zbin_mode_boost_enabled = TRUE;
+#endif
+  if (cpi->gfu_boost <= 400) {
+    cpi->zbin_mode_boost_enabled = FALSE;
+  }
+
+  // Current default encoder behaviour for the altref sign bias
+  if (cpi->source_alt_ref_active)
+    cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+  else
+    cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0;
+
+  // Check to see if a key frame is signalled
+  // For two pass with auto key frame enabled, cm->frame_type may already
+  // be set, but not for one pass.
+  if ((cm->current_video_frame == 0) ||
+      (cm->frame_flags & FRAMEFLAGS_KEY) ||
+      (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0))) {
+    // Key frame from VFW/auto-keyframe/first frame
+    cm->frame_type = KEY_FRAME;
+  }
+
+  // Set default state for segment based loop filter update flags
+  xd->mode_ref_lf_delta_update = 0;
+
+  // Set various flags etc to special state if it is a key frame
+  if (cm->frame_type == KEY_FRAME) {
+    int i;
+
+    // Reset the loop filter deltas and segmentation map
+    setup_features(cpi);
+
+    // If segmentation is enabled force a map update for key frames
+    if (xd->segmentation_enabled) {
+      xd->update_mb_segmentation_map = 1;
+      xd->update_mb_segmentation_data = 1;
+    }
+
+    // The alternate reference frame cannot be active for a key frame
+    cpi->source_alt_ref_active = FALSE;
+
+    // Reset the RD threshold multipliers to default of * 1 (128)
+    for (i = 0; i < MAX_MODES; i++) {
+      cpi->rd_thresh_mult[i] = 128;
+    }
+  }
+
+  // Test code for new segment features
+  init_seg_features(cpi);
+
+  // Decide how big to make the frame
+  vp9_pick_frame_size(cpi);
+
+  vp9_clear_system_state();
+
+  // Set an active best quality and if necessary active worst quality
+  Q = cpi->active_worst_quality;
+
+  if (cm->frame_type == KEY_FRAME) {
+    int high = 2000;
+    int low = 400;
+
+    if (cpi->kf_boost > high)
+      cpi->active_best_quality = kf_low_motion_minq[Q];
+    else if (cpi->kf_boost < low)
+      cpi->active_best_quality = kf_high_motion_minq[Q];
+    else {
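+      // Linearly interpolate between the low- and high-motion minq tables
+      // according to where kf_boost falls in the [low, high] range.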
+      int gap = high - low;
+      int offset = high - cpi->kf_boost;
+      int qdiff = kf_high_motion_minq[Q] - kf_low_motion_minq[Q];
+      int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+
+      cpi->active_best_quality = kf_low_motion_minq[Q] + adjustment;
+    }
+
+    // Make an adjustment based on the % of the frame that is static.
+    // The main impact of this is at lower Q to prevent overly large key
+    // frames unless a lot of the image is static.
+    if (cpi->kf_zeromotion_pct < 64)
+      cpi->active_best_quality += 4 - (cpi->kf_zeromotion_pct >> 4);
+
+    // Special case for key frames forced because we have reached
+    // the maximum key frame interval. Here force the Q to a range
+    // based on the ambient Q to reduce the risk of popping
+    if (cpi->this_key_frame_forced) {
+      int delta_qindex;
+      int qindex = cpi->last_boosted_qindex;
+
+      delta_qindex = compute_qdelta(cpi, qindex,
+                                    (qindex * 0.75));
+
+      cpi->active_best_quality = qindex + delta_qindex;
+      if (cpi->active_best_quality < cpi->best_quality)
+        cpi->active_best_quality = cpi->best_quality;
+    }
+  }
+
+  else if (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame) {
+    int high = 2000;
+    int low = 400;
+
+    // Use the lower of cpi->active_worst_quality and recent
+    // average Q as basis for GF/ARF Q limit unless last frame was
+    // a key frame.
+    if ((cpi->frames_since_key > 1) &&
+        (cpi->avg_frame_qindex < cpi->active_worst_quality)) {
+      Q = cpi->avg_frame_qindex;
+    }
+
+    // For constrained quality don't allow Q less than the cq level
+    if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+        (Q < cpi->cq_target_quality)) {
+      Q = cpi->cq_target_quality;
+    }
+
+    if (cpi->gfu_boost > high)
+      cpi->active_best_quality = gf_low_motion_minq[Q];
+    else if (cpi->gfu_boost < low)
+      cpi->active_best_quality = gf_high_motion_minq[Q];
+    else {
+      int gap = high - low;
+      int offset = high - cpi->gfu_boost;
+      int qdiff = gf_high_motion_minq[Q] - gf_low_motion_minq[Q];
+      int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+
+      cpi->active_best_quality = gf_low_motion_minq[Q] + adjustment;
+    }
+
+    // Constrained quality use slightly lower active best.
+    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+      cpi->active_best_quality =
+        cpi->active_best_quality * 15 / 16;
+    }
+  } else {
+    cpi->active_best_quality = inter_minq[Q];
+
+    // For the constant/constrained quality mode we don't want
+    // q to fall below the cq level.
+    if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+        (cpi->active_best_quality < cpi->cq_target_quality)) {
+      // If we are strongly undershooting the target rate in the last
+      // frames then use the user passed in cq value not the auto
+      // cq value.
+      if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth)
+        cpi->active_best_quality = cpi->oxcf.cq_level;
+      else
+        cpi->active_best_quality = cpi->cq_target_quality;
+    }
+  }
+
+  // Clip the active best and worst quality values to limits
+  if (cpi->active_worst_quality > cpi->worst_quality)
+    cpi->active_worst_quality = cpi->worst_quality;
+
+  if (cpi->active_best_quality < cpi->best_quality)
+    cpi->active_best_quality = cpi->best_quality;
+
+  if (cpi->active_best_quality > cpi->worst_quality)
+    cpi->active_best_quality = cpi->worst_quality;
+
+  if (cpi->active_worst_quality < cpi->active_best_quality)
+    cpi->active_worst_quality = cpi->active_best_quality;
+
+  // Special case code to try to match quality with forced key frames
+  if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
+    Q = cpi->last_boosted_qindex;
+  } else {
+    // Determine initial Q to try
+    Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+  }
+  last_zbin_oq = cpi->zbin_over_quant;
+
+  // Set highest allowed value for Zbin over quant
+  if (cm->frame_type == KEY_FRAME)
+    zbin_oq_high = 0; // ZBIN_OQ_MAX/16
+  else if (cm->refresh_alt_ref_frame || (cm->refresh_golden_frame && !cpi->source_alt_ref_active))
+    zbin_oq_high = 16;
+  else
+    zbin_oq_high = ZBIN_OQ_MAX;
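+  // (zbin_over_quant widens the quantizer zero bin, giving extra rate
+  // control headroom once Q has already reached MAXQ)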
+
+  vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
+                                &frame_over_shoot_limit);
+
+  // Limit Q range for the adaptive loop.
+  bottom_index = cpi->active_best_quality;
+  top_index    = cpi->active_worst_quality;
+  q_low  = cpi->active_best_quality;
+  q_high = cpi->active_worst_quality;
+
+  loop_count = 0;
+
+  if (cm->frame_type != KEY_FRAME) {
+    /* TODO: Decide this more intelligently */
+    if (sf->search_best_filter) {
+      cm->mcomp_filter_type = mcomp_filters_to_search[0];
+      mcomp_filter_index = 0;
+    } else {
+      cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
+    }
+    /* TODO: Decide this more intelligently */
+    xd->allow_high_precision_mv = (Q < HIGH_PRECISION_MV_QTHRESH);
+  }
+
+#if CONFIG_POSTPROC
+
+  if (cpi->oxcf.noise_sensitivity > 0) {
+    unsigned char *src;
+    int l = 0;
+
+    switch (cpi->oxcf.noise_sensitivity) {
+      case 1:
+        l = 20;
+        break;
+      case 2:
+        l = 40;
+        break;
+      case 3:
+        l = 60;
+        break;
+      case 4:
+        // fall through: sensitivity 4 uses the same strength as 5
+      case 5:
+        l = 100;
+        break;
+      case 6:
+        l = 150;
+        break;
+    }
+
+
+    if (cm->frame_type == KEY_FRAME) {
+      vp9_de_noise(cpi->Source, cpi->Source, l, 1,  0, RTCD(postproc));
+    } else {
+      vp9_de_noise(cpi->Source, cpi->Source, l, 1,  0, RTCD(postproc));
+
+      src = cpi->Source->y_buffer;
+
+      if (cpi->Source->y_stride < 0) {
+        src += cpi->Source->y_stride * (cpi->Source->y_height - 1);
+      }
+    }
+  }
+
+#endif
+
+#ifdef OUTPUT_YUV_SRC
+  vp9_write_yuv_frame(cpi->Source);
+#endif
+
+#if RESET_FOREACH_FILTER
+  if (sf->search_best_filter) {
+    q_low0 = q_low;
+    q_high0 = q_high;
+    Q0 = Q;
+    zbin_oq_low0 = zbin_oq_low;
+    zbin_oq_high0 = zbin_oq_high;
+    last_zbin_oq0 = last_zbin_oq;
+    rate_correction_factor0 = cpi->rate_correction_factor;
+    gf_rate_correction_factor0 = cpi->gf_rate_correction_factor;
+    active_best_quality0 = cpi->active_best_quality;
+    active_worst_quality0 = cpi->active_worst_quality;
+  }
+#endif
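+  // Recode loop: encode at the current Q, estimate the resulting frame
+  // size with a dummy bitstream pack, then adjust Q (and zbin_over_quant)
+  // and re-encode until the size is acceptable or recoding is disallowed.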
+  do {
+    vp9_clear_system_state();  // __asm emms;
+
+    vp9_set_quantizer(cpi, Q);
+    this_q = Q;
+
+    if (loop_count == 0) {
+
+      // setup skip prob for costing in mode/mv decision
+      if (cpi->common.mb_no_coeff_skip) {
+        int k;
+        for (k = 0; k < MBSKIP_CONTEXTS; k++)
+          cm->mbskip_pred_probs[k] = cpi->base_skip_false_prob[Q][k];
+
+        if (cm->frame_type != KEY_FRAME) {
+          if (cpi->common.refresh_alt_ref_frame) {
+            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
+              if (cpi->last_skip_false_probs[2][k] != 0)
+                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[2][k];
+            }
+          } else if (cpi->common.refresh_golden_frame) {
+            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
+              if (cpi->last_skip_false_probs[1][k] != 0)
+                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[1][k];
+            }
+          } else {
+            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
+              if (cpi->last_skip_false_probs[0][k] != 0)
+                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[0][k];
+            }
+          }
+
+          // as this is for cost estimate, let's make sure it does not
+          // get extreme either way
+          {
+            int k;
+            for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
+              if (cm->mbskip_pred_probs[k] < 5)
+                cm->mbskip_pred_probs[k] = 5;
+
+              if (cm->mbskip_pred_probs[k] > 250)
+                cm->mbskip_pred_probs[k] = 250;
+
+              if (cpi->is_src_frame_alt_ref)
+                cm->mbskip_pred_probs[k] = 1;
+            }
+          }
+        }
+      }
+
+      // Set up entropy depending on frame type.
+      if (cm->frame_type == KEY_FRAME)
+        vp9_setup_key_frame(cpi);
+      else
+        vp9_setup_inter_frame(cpi);
+    }
+
+    // transform / motion compensation build reconstruction frame
+
+    vp9_encode_frame(cpi);
+
+    // Update the skip mb flag probabilities based on the distribution
+    // seen in the last encoder iteration.
+    update_base_skip_probs(cpi);
+
+    vp9_clear_system_state();  // __asm emms;
+
+#if CONFIG_PRED_FILTER
+    // Update prediction filter on/off probability based on
+    // selection made for the current frame
+    if (cm->frame_type != KEY_FRAME)
+      update_pred_filt_prob(cpi);
+#endif
+
+    // Dummy pack of the bitstream using up to date stats to get an
+    // accurate estimate of output frame size to determine if we need
+    // to recode.
+    vp9_save_coding_context(cpi);
+    cpi->dummy_packing = 1;
+    vp9_pack_bitstream(cpi, dest, size);
+    cpi->projected_frame_size = (*size) << 3;
+    vp9_restore_coding_context(cpi);
+
+    if (frame_over_shoot_limit == 0)
+      frame_over_shoot_limit = 1;
+    active_worst_qchanged = FALSE;
+
+    // Special case handling for forced key frames
+    if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
+      int last_q = Q;
+      int kf_err = vp9_calc_ss_err(cpi->Source,
+                                   &cm->yv12_fb[cm->new_fb_idx]);
+
+      int high_err_target = cpi->ambient_err;
+      int low_err_target = (cpi->ambient_err >> 1);
+
+      // Prevent possible divide by zero error below for perfect KF
+      kf_err += (!kf_err);
+
+      // The key frame is not good enough or we can afford
+      // to make it better without undue risk of popping.
+      if (((kf_err > high_err_target) &&
+           (cpi->projected_frame_size <= frame_over_shoot_limit)) ||
+          ((kf_err > low_err_target) &&
+           (cpi->projected_frame_size <= frame_under_shoot_limit))) {
+        // Lower q_high
+        q_high = (Q > q_low) ? (Q - 1) : q_low;
+
+        // Adjust Q
+        Q = (Q * high_err_target) / kf_err;
+        if (Q < ((q_high + q_low) >> 1))
+          Q = (q_high + q_low) >> 1;
+      }
+      // The key frame is much better than the previous frame
+      else if ((kf_err < low_err_target) &&
+               (cpi->projected_frame_size >= frame_under_shoot_limit)) {
+        // Raise q_low
+        q_low = (Q < q_high) ? (Q + 1) : q_high;
+
+        // Adjust Q
+        Q = (Q * low_err_target) / kf_err;
+        if (Q > ((q_high + q_low + 1) >> 1))
+          Q = (q_high + q_low + 1) >> 1;
+      }
+
+      // Clamp Q to upper and lower limits:
+      if (Q > q_high)
+        Q = q_high;
+      else if (Q < q_low)
+        Q = q_low;
+
+      Loop = (Q != last_q) ? TRUE : FALSE;
+    }
+
+    // Is the projected frame size out of range, and are we allowed to
+    // attempt a recode?
+    else if (recode_loop_test(cpi,
+                              frame_over_shoot_limit, frame_under_shoot_limit,
+                              Q, top_index, bottom_index)) {
+      int last_q = Q;
+      int Retries = 0;
+
+      // Frame size out of permitted range:
+      // Update correction factor & compute new Q to try...
+
+      // Frame is too large
+      if (cpi->projected_frame_size > cpi->this_frame_target) {
+        q_low = (Q < q_high) ? (Q + 1) : q_high; // Raise q_low to just above the current Q
+
+        if (cpi->zbin_over_quant > 0)            // If we are using over quant do the same for zbin_oq_low
+          zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
+
+        if (undershoot_seen || (loop_count > 1)) {
+          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+          if (!active_worst_qchanged)
+            vp9_update_rate_correction_factors(cpi, 1);
+
+          Q = (q_high + q_low + 1) / 2;
+
+          // Adjust cpi->zbin_over_quant (only allowed when Q is max)
+          if (Q < MAXQ)
+            cpi->zbin_over_quant = 0;
+          else {
+            zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
+            cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+          }
+        } else {
+          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+          if (!active_worst_qchanged)
+            vp9_update_rate_correction_factors(cpi, 0);
+
+          Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+
+          while (((Q < q_low) || (cpi->zbin_over_quant < zbin_oq_low)) && (Retries < 10)) {
+            vp9_update_rate_correction_factors(cpi, 0);
+            Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+            Retries++;
+          }
+        }
+
+        overshoot_seen = TRUE;
+      }
+      // Frame is too small
+      else {
+        if (cpi->zbin_over_quant == 0)
+          q_high = (Q > q_low) ? (Q - 1) : q_low; // Lower q_high if not using over quant
+        else                                    // else lower zbin_oq_high
+          zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ? (cpi->zbin_over_quant - 1) : zbin_oq_low;
+
+        if (overshoot_seen || (loop_count > 1)) {
+          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+          if (!active_worst_qchanged)
+            vp9_update_rate_correction_factors(cpi, 1);
+
+          Q = (q_high + q_low) / 2;
+
+          // Adjust cpi->zbin_over_quant (only allowed when Q is max)
+          if (Q < MAXQ)
+            cpi->zbin_over_quant = 0;
+          else
+            cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+        } else {
+          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+          if (!active_worst_qchanged)
+            vp9_update_rate_correction_factors(cpi, 0);
+
+          Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+
+          // Special case reset for qlow for constrained quality.
+          // This should only trigger where there is very substantial
+          // undershoot on a frame and the auto cq level is above
+          // the user passed in value.
+          if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+              (Q < q_low)) {
+            q_low = Q;
+          }
+
+          while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10)) {
+            vp9_update_rate_correction_factors(cpi, 0);
+            Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+            Retries++;
+          }
+        }
+
+        undershoot_seen = TRUE;
+      }
+
+      // Clamp Q to upper and lower limits:
+      if (Q > q_high)
+        Q = q_high;
+      else if (Q < q_low)
+        Q = q_low;
+
+      // Clamp cpi->zbin_over_quant
+      cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ?
+          zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ?
+          zbin_oq_high : cpi->zbin_over_quant;
+
+      // Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE;
+      Loop = (Q != last_q) ? TRUE : FALSE;
+      last_zbin_oq = cpi->zbin_over_quant;
+    } else
+      Loop = FALSE;
+
+    if (cpi->is_src_frame_alt_ref)
+      Loop = FALSE;
+
+    if (cm->frame_type != KEY_FRAME &&
+        !sf->search_best_filter &&
+        cm->mcomp_filter_type == SWITCHABLE) {
+      int interp_factor = Q / 3;  /* denominator is 256 */
+      int count[VP9_SWITCHABLE_FILTERS];
+      int tot_count = 0, c = 0, thr;
+      int i, j;
+      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+        count[i] = 0;
+        for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+          count[i] += cpi->switchable_interp_count[j][i];
+        }
+        tot_count += count[i];
+      }
+
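+      // A filter counts as "in use" if it received at least interp_factor
+      // out of 256 of all switchable filter selections; if exactly one
+      // filter qualifies, signal it at the frame level and re-encode.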
+      thr = ((tot_count * interp_factor + 128) >> 8);
+      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+        c += (count[i] >= thr);
+      }
+      if (c == 1) {
+        /* Mostly one filter is used. So set the filter at frame level */
+        for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+          if (count[i]) {
+            cm->mcomp_filter_type = vp9_switchable_interp[i];
+            Loop = TRUE;  /* Make sure to loop since the filter changed */
+            break;
+          }
+        }
+      }
+    }
+
+    if (Loop == FALSE && cm->frame_type != KEY_FRAME && sf->search_best_filter) {
+      if (mcomp_filter_index < mcomp_filters) {
+        INT64 err = vp9_calc_ss_err(cpi->Source,
+                                    &cm->yv12_fb[cm->new_fb_idx]);
+        INT64 rate = cpi->projected_frame_size << 8;
+        mcomp_filter_cost[mcomp_filter_index] =
+          (RDCOST(cpi->RDMULT, cpi->RDDIV, rate, err));
+        mcomp_filter_index++;
+        if (mcomp_filter_index < mcomp_filters) {
+          cm->mcomp_filter_type = mcomp_filters_to_search[mcomp_filter_index];
+          loop_count = -1;
+          Loop = TRUE;
+        } else {
+          int f;
+          INT64 best_cost = mcomp_filter_cost[0];
+          int mcomp_best_filter = mcomp_filters_to_search[0];
+          for (f = 1; f < mcomp_filters; f++) {
+            if (mcomp_filter_cost[f] < best_cost) {
+              mcomp_best_filter = mcomp_filters_to_search[f];
+              best_cost = mcomp_filter_cost[f];
+            }
+          }
+          if (mcomp_best_filter != mcomp_filters_to_search[mcomp_filters - 1]) {
+            loop_count = -1;
+            Loop = TRUE;
+            cm->mcomp_filter_type = mcomp_best_filter;
+          }
+          /*
+          printf("  best filter = %d, ( ", mcomp_best_filter);
+          for (f=0;f<mcomp_filters; f++) printf("%d ",  mcomp_filter_cost[f]);
+          printf(")\n");
+          */
+        }
+#if RESET_FOREACH_FILTER
+        if (Loop == TRUE) {
+          overshoot_seen = FALSE;
+          undershoot_seen = FALSE;
+          zbin_oq_low = zbin_oq_low0;
+          zbin_oq_high = zbin_oq_high0;
+          q_low = q_low0;
+          q_high = q_high0;
+          Q = Q0;
+          cpi->zbin_over_quant = last_zbin_oq = last_zbin_oq0;
+          cpi->rate_correction_factor = rate_correction_factor0;
+          cpi->gf_rate_correction_factor = gf_rate_correction_factor0;
+          cpi->active_best_quality = active_best_quality0;
+          cpi->active_worst_quality = active_worst_quality0;
+        }
+#endif
+      }
+    }
+
+    if (Loop == TRUE) {
+      loop_count++;
+#if CONFIG_INTERNAL_STATS
+      cpi->tot_recode_hits++;
+#endif
+    }
+  } while (Loop == TRUE);
+
+  // Special case code to reduce pulsing when key frames are forced at a
+  // fixed interval. Note the reconstruction error if this is the frame
+  // before the forced key frame.
+  if (cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0)) {
+    cpi->ambient_err = vp9_calc_ss_err(cpi->Source,
+                                       &cm->yv12_fb[cm->new_fb_idx]);
+  }
+
+  // This frame's MVs are saved and will be used in the next frame's MV
+  // prediction. Last frame has one more line (added at the bottom) and one
+  // more column (added at the right) than cm->mip. The edge elements are
+  // initialized to 0.
+  if (cm->show_frame) { // do not save for altref frame
+    int mb_row;
+    int mb_col;
+    MODE_INFO *tmp = cm->mip;
+
+    if (cm->frame_type != KEY_FRAME) {
+      for (mb_row = 0; mb_row < cm->mb_rows + 1; mb_row ++) {
+        for (mb_col = 0; mb_col < cm->mb_cols + 1; mb_col ++) {
+          if (tmp->mbmi.ref_frame != INTRA_FRAME)
+            cpi->lfmv[mb_col + mb_row * (cm->mode_info_stride + 1)].as_int = tmp->mbmi.mv[0].as_int;
+
+          cpi->lf_ref_frame_sign_bias[mb_col + mb_row * (cm->mode_info_stride + 1)] = cm->ref_frame_sign_bias[tmp->mbmi.ref_frame];
+          cpi->lf_ref_frame[mb_col + mb_row * (cm->mode_info_stride + 1)] = tmp->mbmi.ref_frame;
+          tmp++;
+        }
+      }
+    }
+  }
+
+  // Update the GF usage maps.
+  // This is done after completing the compression of a frame when all
+  // modes etc. are finalized, but before the loop filter.
+  vp9_update_gf_useage_maps(cpi, cm, &cpi->mb);
+
+  if (cm->frame_type == KEY_FRAME)
+    cm->refresh_last_frame = 1;
+
+#if 0
+  {
+    FILE *f = fopen("gfactive.stt", "a");
+    fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame, (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols), cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame);
+    fclose(f);
+  }
+#endif
+
+  cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+
+#if WRITE_RECON_BUFFER
+  if (cm->show_frame)
+    write_cx_frame_to_file(cm->frame_to_show,
+                           cm->current_video_frame);
+  else
+    write_cx_frame_to_file(cm->frame_to_show,
+                           cm->current_video_frame + 1000);
+#endif
+
+  // Pick the loop filter level for the frame.
+  loopfilter_frame(cpi, cm);
+
+  // build the bitstream
+  cpi->dummy_packing = 0;
+  vp9_pack_bitstream(cpi, dest, size);
+
+  if (cpi->mb.e_mbd.update_mb_segmentation_map) {
+    update_reference_segmentation_map(cpi);
+  }
+
+#if CONFIG_PRED_FILTER
+  // Select the prediction filtering mode to use for the
+  // next frame based on the current frame selections
+  if (cm->frame_type != KEY_FRAME)
+    select_pred_filter_mode(cpi);
+#endif
+
+  update_reference_frames(cm);
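+  // Fold this frame's symbol counts into the common frame context and
+  // adapt the entropy probabilities toward the observed statistics.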
+  vp9_copy(cpi->common.fc.coef_counts, cpi->coef_counts);
+  vp9_copy(cpi->common.fc.hybrid_coef_counts, cpi->hybrid_coef_counts);
+  vp9_copy(cpi->common.fc.coef_counts_8x8, cpi->coef_counts_8x8);
+  vp9_copy(cpi->common.fc.hybrid_coef_counts_8x8, cpi->hybrid_coef_counts_8x8);
+  vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16);
+  vp9_copy(cpi->common.fc.hybrid_coef_counts_16x16,
+           cpi->hybrid_coef_counts_16x16);
+  vp9_adapt_coef_probs(&cpi->common);
+  if (cpi->common.frame_type != KEY_FRAME) {
+    vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
+    vp9_copy(cpi->common.fc.uv_mode_counts, cpi->y_uv_mode_count);
+    vp9_copy(cpi->common.fc.bmode_counts, cpi->bmode_count);
+    vp9_copy(cpi->common.fc.i8x8_mode_counts, cpi->i8x8_mode_count);
+    vp9_copy(cpi->common.fc.sub_mv_ref_counts, cpi->sub_mv_ref_count);
+    vp9_copy(cpi->common.fc.mbsplit_counts, cpi->mbsplit_count);
+    vp9_adapt_mode_probs(&cpi->common);
+
+    cpi->common.fc.NMVcount = cpi->NMVcount;
+    vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
+    vp9_update_mode_context(&cpi->common);
+  }
+
+  /* Move storing frame_type out of the above loop since it is also
+   * needed in motion search besides loopfilter */
+  cm->last_frame_type = cm->frame_type;
+
+  // Keep a copy of the size estimate used in the loop
+  loop_size_estimate = cpi->projected_frame_size;
+
+  // Update rate control heuristics
+  cpi->total_byte_count += (*size);
+  cpi->projected_frame_size = (*size) << 3;
+
+  if (!active_worst_qchanged)
+    vp9_update_rate_correction_factors(cpi, 2);
+
+  cpi->last_q[cm->frame_type] = cm->base_qindex;
+
+  // Keep a record of the last boosted (KF/GF/ARF) Q value.
+  // If the current frame is coded at a lower Q then we also update it.
+  // If all mbs in this group are skipped only update if the Q value is
+  // better than that already stored.
+  // This is used to help set quality in forced key frames to reduce popping
+  if ((cm->base_qindex < cpi->last_boosted_qindex) ||
+      ((cpi->static_mb_pct < 100) &&
+       ((cm->frame_type == KEY_FRAME) ||
+        cm->refresh_alt_ref_frame ||
+        (cm->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {
+    cpi->last_boosted_qindex = cm->base_qindex;
+  }
+
+  if (cm->frame_type == KEY_FRAME) {
+    vp9_adjust_key_frame_context(cpi);
+  }
+
+  // Keep a record of ambient average Q.
+  if (cm->frame_type != KEY_FRAME)
+    cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
+
+  // Keep a record from which we can calculate the average Q excluding GF updates and key frames
+  if ((cm->frame_type != KEY_FRAME) && !cm->refresh_golden_frame && !cm->refresh_alt_ref_frame) {
+    cpi->ni_frames++;
+    cpi->tot_q += vp9_convert_qindex_to_q(Q);
+    cpi->avg_q = cpi->tot_q / (double)cpi->ni_frames;
+
+    // Calculate the average Q for normal inter frames (not key or GFU
+    // frames).
+    cpi->ni_tot_qi += Q;
+    cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
+  }
+
+  // Update the buffer level variable.
+  // Non-viewable frames are a special case and are treated as pure overhead.
+  if (!cm->show_frame)
+    cpi->bits_off_target -= cpi->projected_frame_size;
+  else
+    cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
+
+  // Clip the buffer level at the maximum buffer size
+  if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
+    cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
+
+  // Rolling monitors of whether we are over- or under-spending, used to
+  // help regulate min and max Q in two pass.
+  cpi->rolling_target_bits = ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
+  cpi->rolling_actual_bits = ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;
+  cpi->long_rolling_target_bits = ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;
+  cpi->long_rolling_actual_bits = ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32;
+
+  // Actual bits spent
+  cpi->total_actual_bits    += cpi->projected_frame_size;
+
+  // Debug stats
+  cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
+
+  cpi->buffer_level = cpi->bits_off_target;
+
+  // Update bits left to the kf and gf groups to account for overshoot or undershoot on these frames
+  if (cm->frame_type == KEY_FRAME) {
+    cpi->twopass.kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+
+    if (cpi->twopass.kf_group_bits < 0)
+      cpi->twopass.kf_group_bits = 0;
+  } else if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame) {
+    cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+
+    if (cpi->twopass.gf_group_bits < 0)
+      cpi->twopass.gf_group_bits = 0;
+  }
+
+  // Update the skip mb flag probabilities based on the distribution seen
+  // in this frame.
+  update_base_skip_probs(cpi);
+
+#if 0  // CONFIG_NEW_MVREF && CONFIG_INTERNAL_STATS
+  {
+    FILE *f = fopen("mv_ref_dist.stt", "a");
+    unsigned int i;
+    for (i = 0; i < MAX_MV_REFS; ++i) {
+      fprintf(f, "%10d", cpi->best_ref_index_counts[0][i]);
+    }
+    fprintf(f, "\n" );
+
+    fclose(f);
+  }
+#endif
+
+#if 0  // 1 && CONFIG_INTERNAL_STATS
+  {
+    FILE *f = fopen("tmp.stt", "a");
+    int recon_err;
+
+    vp9_clear_system_state();  // __asm emms;
+
+    recon_err = vp9_calc_ss_err(cpi->Source,
+                                &cm->yv12_fb[cm->new_fb_idx]);
+
+    if (cpi->twopass.total_left_stats->coded_error != 0.0)
+      fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
+              "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
+              "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"
+              "%10.3f %8d %10d %10d %10d\n",
+              cpi->common.current_video_frame, cpi->this_frame_target,
+              cpi->projected_frame_size, loop_size_estimate,
+              (cpi->projected_frame_size - cpi->this_frame_target),
+              (int)cpi->total_target_vs_actual,
+              (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
+              (int)cpi->total_actual_bits,
+              vp9_convert_qindex_to_q(cm->base_qindex),
+              (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
+              vp9_convert_qindex_to_q(cpi->active_best_quality),
+              vp9_convert_qindex_to_q(cpi->active_worst_quality),
+              cpi->avg_q,
+              vp9_convert_qindex_to_q(cpi->ni_av_qi),
+              vp9_convert_qindex_to_q(cpi->cq_target_quality),
+              cpi->zbin_over_quant,
+              // cpi->avg_frame_qindex, cpi->zbin_over_quant,
+              cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
+              cm->frame_type, cpi->gfu_boost,
+              cpi->twopass.est_max_qcorrection_factor,
+              (int)cpi->twopass.bits_left,
+              cpi->twopass.total_left_stats->coded_error,
+              (double)cpi->twopass.bits_left /
+              cpi->twopass.total_left_stats->coded_error,
+              cpi->tot_recode_hits, recon_err, cpi->kf_boost,
+              cpi->kf_zeromotion_pct);
+    else
+      fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
+              "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
+              "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"
+              "%8d %10d %10d %10d\n",
+              cpi->common.current_video_frame,
+              cpi->this_frame_target, cpi->projected_frame_size,
+              loop_size_estimate,
+              (cpi->projected_frame_size - cpi->this_frame_target),
+              (int)cpi->total_target_vs_actual,
+              (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
+              (int)cpi->total_actual_bits,
+              vp9_convert_qindex_to_q(cm->base_qindex),
+              (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
+              vp9_convert_qindex_to_q(cpi->active_best_quality),
+              vp9_convert_qindex_to_q(cpi->active_worst_quality),
+              cpi->avg_q,
+              vp9_convert_qindex_to_q(cpi->ni_av_qi),
+              vp9_convert_qindex_to_q(cpi->cq_target_quality),
+              cpi->zbin_over_quant,
+              // cpi->avg_frame_qindex, cpi->zbin_over_quant,
+              cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
+              cm->frame_type, cpi->gfu_boost,
+              cpi->twopass.est_max_qcorrection_factor,
+              (int)cpi->twopass.bits_left,
+              cpi->twopass.total_left_stats->coded_error,
+              cpi->tot_recode_hits, recon_err, cpi->kf_boost,
+              cpi->kf_zeromotion_pct);
+
+    fclose(f);
+
+    if (0) {
+      FILE *fmodes = fopen("Modes.stt", "a");
+      int i;
+
+      fprintf(fmodes, "%6d:%1d:%1d:%1d ",
+              cpi->common.current_video_frame,
+              cm->frame_type, cm->refresh_golden_frame,
+              cm->refresh_alt_ref_frame);
+
+      for (i = 0; i < MAX_MODES; i++)
+        fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+
+      fprintf(fmodes, "\n");
+
+      fclose(fmodes);
+    }
+  }
+
+#endif
+
+#if 0
+  // Debug stats for segment feature experiments.
+  print_seg_map(cpi);
+#endif
+
+  // If this was a kf or gf, note the Q
+  if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
+    cm->last_kf_gf_q = cm->base_qindex;
+
+  if (cm->refresh_golden_frame == 1)
+    cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
+  else
+    cm->frame_flags = cm->frame_flags & ~FRAMEFLAGS_GOLDEN;
+
+  if (cm->refresh_alt_ref_frame == 1)
+    cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF;
+  else
+    cm->frame_flags = cm->frame_flags & ~FRAMEFLAGS_ALTREF;
+
+
+  if (cm->refresh_last_frame & cm->refresh_golden_frame) // both refreshed
+    cpi->gold_is_last = 1;
+  else if (cm->refresh_last_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
+    cpi->gold_is_last = 0;
+
+  if (cm->refresh_last_frame & cm->refresh_alt_ref_frame) // both refreshed
+    cpi->alt_is_last = 1;
+  else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame) // 1 refreshed but not the other
+    cpi->alt_is_last = 0;
+
+  if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame) // both refreshed
+    cpi->gold_is_alt = 1;
+  else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
+    cpi->gold_is_alt = 0;
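+  // When neither buffer was refreshed, the previous relationship still
+  // holds, so the corresponding flag is deliberately left unchanged.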
+
+  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
+
+  if (cpi->gold_is_last)
+    cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
+
+  if (cpi->alt_is_last)
+    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
+
+  if (cpi->gold_is_alt)
+    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
+
+  if (cpi->oxcf.play_alternate && cm->refresh_alt_ref_frame && (cm->frame_type != KEY_FRAME))
+    // Update the alternate reference frame stats as appropriate.
+    update_alt_ref_frame_stats(cpi);
+  else
+    // Update the Golden frame stats as appropriate.
+    update_golden_frame_stats(cpi);
+
+  if (cm->frame_type == KEY_FRAME) {
+    // Tell the caller that the frame was coded as a key frame
+    *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY;
+
+    // As this frame is a key frame, the next defaults to an inter frame.
+    cm->frame_type = INTER_FRAME;
+  } else {
+    *frame_flags = cm->frame_flags & ~FRAMEFLAGS_KEY;
+  }
+
+  // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas.
+  xd->update_mb_segmentation_map = 0;
+  xd->update_mb_segmentation_data = 0;
+  xd->mode_ref_lf_delta_update = 0;
+
+  // Don't increment frame counters if this was an altref buffer update,
+  // not a real frame
+  if (cm->show_frame) {
+    cm->current_video_frame++;
+    cpi->frames_since_key++;
+  }
+
+  // Reset to normal state now that we are done.
+
+#if 0
+  {
+    char filename[512];
+    FILE *recon_file;
+    sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+    recon_file = fopen(filename, "wb");
+    fwrite(cm->yv12_fb[cm->lst_fb_idx].buffer_alloc,
+           cm->yv12_fb[cm->lst_fb_idx].frame_size, 1, recon_file);
+    fclose(recon_file);
+  }
+#endif
+#ifdef OUTPUT_YUV_REC
+  vp9_write_yuv_rec_frame(cm);
+#endif
+
+  if (cm->show_frame) {
+    vpx_memcpy(cm->prev_mip, cm->mip,
+               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+  } else {
+    vpx_memset(cm->prev_mip, 0,
+               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+  }
+}
+
+static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
+                        unsigned char *dest, unsigned int *frame_flags) {
+
+  if (!cpi->common.refresh_alt_ref_frame)
+    vp9_second_pass(cpi);
+
+  encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+  cpi->twopass.bits_left -= 8 * *size;
+
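+  // Credit back the per-frame share of the guaranteed minimum rate so that
+  // bits_left reflects the two pass VBR minimum section setting.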
+  if (!cpi->common.refresh_alt_ref_frame) {
+    double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate;
+    double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
+                                        * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+    if (two_pass_min_rate < lower_bounds_min_rate)
+      two_pass_min_rate = lower_bounds_min_rate;
+
+    cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.frame_rate);
+  }
+}
+
+// For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.
+#if HAVE_ARMV7
+extern void vp9_push_neon(int64_t *store);
+extern void vp9_pop_neon(int64_t *store);
+#endif
+
+
+int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags,
+                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                          int64_t end_time) {
+#if HAVE_ARMV7
+  int64_t store_reg[8];
+#endif
+  VP9_COMP              *cpi = (VP9_COMP *) ptr;
+  VP9_COMMON            *cm = &cpi->common;
+  struct vpx_usec_timer  timer;
+  int                    res = 0;
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (cm->rtcd.flags & HAS_NEON)
+#endif
+  {
+    vp9_push_neon(store_reg);
+  }
+#endif
+
+  vpx_usec_timer_start(&timer);
+  if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
+                         cpi->active_map_enabled ? cpi->active_map : NULL))
+    res = -1;
+  cm->clr_type = sd->clrtype;
+  vpx_usec_timer_mark(&timer);
+  cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (cm->rtcd.flags & HAS_NEON)
+#endif
+  {
+    vp9_pop_neon(store_reg);
+  }
+#endif
+
+  return res;
+}
+
+
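+// A frame is a "reference" if it updates any reference buffer or any
+// persistent state a decoder must track (entropy probs, segmentation,
+// loop filter deltas).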
+static int frame_is_reference(const VP9_COMP *cpi) {
+  const VP9_COMMON *cm = &cpi->common;
+  const MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+  return cm->frame_type == KEY_FRAME || cm->refresh_last_frame
+         || cm->refresh_golden_frame || cm->refresh_alt_ref_frame
+         || cm->copy_buffer_to_gf || cm->copy_buffer_to_arf
+         || cm->refresh_entropy_probs
+         || xd->mode_ref_lf_delta_update
+         || xd->update_mb_segmentation_map || xd->update_mb_segmentation_data;
+}
+
+
+int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
+                            unsigned long *size, unsigned char *dest,
+                            int64_t *time_stamp, int64_t *time_end, int flush) {
+#if HAVE_ARMV7
+  int64_t store_reg[8];
+#endif
+  VP9_COMP *cpi = (VP9_COMP *) ptr;
+  VP9_COMMON *cm = &cpi->common;
+  struct vpx_usec_timer  cmptimer;
+  YV12_BUFFER_CONFIG    *force_src_buffer = NULL;
+
+  if (!cpi)
+    return -1;
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (cm->rtcd.flags & HAS_NEON)
+#endif
+  {
+    vp9_push_neon(store_reg);
+  }
+#endif
+
+  vpx_usec_timer_start(&cmptimer);
+
+  cpi->source = NULL;
+
+  cpi->mb.e_mbd.allow_high_precision_mv = ALTREF_HIGH_PRECISION_MV;
+  // Should we code an alternate reference frame
+  if (cpi->oxcf.play_alternate &&
+      cpi->source_alt_ref_pending) {
+    if ((cpi->source = vp9_lookahead_peek(cpi->lookahead,
+                                          cpi->frames_till_gf_update_due))) {
+      cpi->alt_ref_source = cpi->source;
+      if (cpi->oxcf.arnr_max_frames > 0) {
+        vp9_temporal_filter_prepare_c(cpi,
+                                      cpi->frames_till_gf_update_due);
+        force_src_buffer = &cpi->alt_ref_buffer;
+      }
+      cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
+      cm->refresh_alt_ref_frame = 1;
+      cm->refresh_golden_frame = 0;
+      cm->refresh_last_frame = 0;
+      cm->show_frame = 0;
+      cpi->source_alt_ref_pending = FALSE;   // Clear pending alt ref flag.
+      cpi->is_src_frame_alt_ref = 0;
+    }
+  }
+
+  if (!cpi->source) {
+    if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) {
+      cm->show_frame = 1;
+
+      cpi->is_src_frame_alt_ref = cpi->alt_ref_source
+                                  && (cpi->source == cpi->alt_ref_source);
+
+      if (cpi->is_src_frame_alt_ref)
+        cpi->alt_ref_source = NULL;
+    }
+  }
+
+  if (cpi->source) {
+    cpi->un_scaled_source =
+      cpi->Source = force_src_buffer ? force_src_buffer : &cpi->source->img;
+    *time_stamp = cpi->source->ts_start;
+    *time_end = cpi->source->ts_end;
+    *frame_flags = cpi->source->flags;
+  } else {
+    *size = 0;
+    if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done) {
+      vp9_end_first_pass(cpi);    /* get last stats packet */
+      cpi->twopass.first_pass_done = 1;
+    }
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+    if (cm->rtcd.flags & HAS_NEON)
+#endif
+    {
+      vp9_pop_neon(store_reg);
+    }
+#endif
+    return -1;
+  }
+
+  if (cpi->source->ts_start < cpi->first_time_stamp_ever) {
+    cpi->first_time_stamp_ever = cpi->source->ts_start;
+    cpi->last_end_time_stamp_seen = cpi->source->ts_start;
+  }
+
+  // Adjust the frame rate based on the timestamps given.
+  if (!cm->refresh_alt_ref_frame) {
+    int64_t this_duration;
+    int step = 0;
+
+    if (cpi->source->ts_start == cpi->first_time_stamp_ever) {
+      this_duration = cpi->source->ts_end - cpi->source->ts_start;
+      step = 1;
+    } else {
+      int64_t last_duration;
+
+      this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
+      last_duration = cpi->last_end_time_stamp_seen
+                      - cpi->last_time_stamp_seen;
+      // Do a step update if the duration changes by 10% or more.
+      if (last_duration)
+        step = ((this_duration - last_duration) * 10 / last_duration);
+    }
+
+    if (this_duration) {
+      if (step)
+        vp9_new_frame_rate(cpi, 10000000.0 / this_duration);
+      else {
+        double avg_duration, interval;
+
+        /* Average this frame's rate into the last second's average
+         * frame rate. If we haven't seen 1 second yet, then average
+         * over the whole interval seen.
+         */
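+        // Note: timestamps are assumed to be in 10 MHz ticks here, so
+        // 10,000,000 ticks correspond to one second of input.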
+        interval = cpi->source->ts_end - cpi->first_time_stamp_ever;
+        if (interval > 10000000.0)
+          interval = 10000000;
+
+        avg_duration = 10000000.0 / cpi->oxcf.frame_rate;
+        avg_duration *= (interval - avg_duration + this_duration);
+        avg_duration /= interval;
+
+        vp9_new_frame_rate(cpi, 10000000.0 / avg_duration);
+      }
+    }
+
+    cpi->last_time_stamp_seen = cpi->source->ts_start;
+    cpi->last_end_time_stamp_seen = cpi->source->ts_end;
+  }
+
+  // start with a 0 size frame
+  *size = 0;
+
+  // Clear down mmx registers
+  vp9_clear_system_state();  // __asm emms;
+
+  cm->frame_type = INTER_FRAME;
+  cm->frame_flags = *frame_flags;
+
+#if 0
+
+  if (cm->refresh_alt_ref_frame) {
+    // cm->refresh_golden_frame = 1;
+    cm->refresh_golden_frame = 0;
+    cm->refresh_last_frame = 0;
+  } else {
+    cm->refresh_golden_frame = 0;
+    cm->refresh_last_frame = 1;
+  }
+
+#endif
+  /* find a free buffer for the new frame */
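+  /* All NUM_YV12_BUFFERS buffers should never be in use at once; the
+     assert below guards that invariant. */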
+  {
+    int i = 0;
+    for (; i < NUM_YV12_BUFFERS; i++) {
+      if (!cm->yv12_fb[i].flags) {
+        cm->new_fb_idx = i;
+        break;
+      }
+    }
+
+    assert(i < NUM_YV12_BUFFERS);
+  }
+  if (cpi->pass == 1) {
+    Pass1Encode(cpi, size, dest, frame_flags);
+  } else if (cpi->pass == 2) {
+    Pass2Encode(cpi, size, dest, frame_flags);
+  } else {
+    encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+  }
+
+  if (cm->refresh_entropy_probs) {
+    if (cm->refresh_alt_ref_frame)
+      vpx_memcpy(&cm->lfc_a, &cm->fc, sizeof(cm->fc));
+    else
+      vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
+  }
+
+  // If it's a dropped frame, honor the refresh requests on subsequent frames.
+  if (*size > 0) {
+    cpi->droppable = !frame_is_reference(cpi);
+
+    // return to normal state
+    cm->refresh_entropy_probs = 1;
+    cm->refresh_alt_ref_frame = 0;
+    cm->refresh_golden_frame = 0;
+    cm->refresh_last_frame = 1;
+    cm->frame_type = INTER_FRAME;
+
+  }
+
+  vpx_usec_timer_mark(&cmptimer);
+  cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
+
+  if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) {
+    generate_psnr_packet(cpi);
+  }
+
+#if CONFIG_INTERNAL_STATS
+
+  if (cpi->pass != 1) {
+    cpi->bytes += *size;
+
+    if (cm->show_frame) {
+
+      cpi->count++;
+
+      if (cpi->b_calculate_psnr) {
+        double ye, ue, ve;
+        double frame_psnr;
+        YV12_BUFFER_CONFIG      *orig = cpi->Source;
+        YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
+        YV12_BUFFER_CONFIG      *pp = &cm->post_proc_buffer;
+        int y_samples = orig->y_height * orig->y_width;
+        int uv_samples = orig->uv_height * orig->uv_width;
+        int t_samples = y_samples + 2 * uv_samples;
+        int64_t sq_error;
+
+        ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+                              recon->y_buffer, recon->y_stride, orig->y_width,
+                              orig->y_height);
+
+        ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+                              recon->u_buffer, recon->uv_stride, orig->uv_width,
+                              orig->uv_height);
+
+        ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+                              recon->v_buffer, recon->uv_stride, orig->uv_width,
+                              orig->uv_height);
+
+        sq_error = ye + ue + ve;
+
+        frame_psnr = vp9_mse2psnr(t_samples, 255.0, sq_error);
+
+        cpi->total_y += vp9_mse2psnr(y_samples, 255.0, ye);
+        cpi->total_u += vp9_mse2psnr(uv_samples, 255.0, ue);
+        cpi->total_v += vp9_mse2psnr(uv_samples, 255.0, ve);
+        cpi->total_sq_error += sq_error;
+        cpi->total  += frame_psnr;
+        {
+          double frame_psnr2, frame_ssim2 = 0;
+          double weight = 0;
+#if CONFIG_POSTPROC
+          vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc));
+#endif
+          vp9_clear_system_state();
+
+          ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+                                pp->y_buffer, pp->y_stride, orig->y_width,
+                                orig->y_height);
+
+          ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+                                pp->u_buffer, pp->uv_stride, orig->uv_width,
+                                orig->uv_height);
+
+          ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+                                pp->v_buffer, pp->uv_stride, orig->uv_width,
+                                orig->uv_height);
+
+          sq_error = ye + ue + ve;
+
+          frame_psnr2 = vp9_mse2psnr(t_samples, 255.0, sq_error);
+
+          cpi->totalp_y += vp9_mse2psnr(y_samples, 255.0, ye);
+          cpi->totalp_u += vp9_mse2psnr(uv_samples, 255.0, ue);
+          cpi->totalp_v += vp9_mse2psnr(uv_samples, 255.0, ve);
+          cpi->total_sq_error2 += sq_error;
+          cpi->totalp  += frame_psnr2;
+
+          frame_ssim2 = vp9_calc_ssim(cpi->Source,
+                                      &cm->post_proc_buffer, 1, &weight);
+
+          cpi->summed_quality += frame_ssim2 * weight;
+          cpi->summed_weights += weight;
+#if 0
+          {
+            FILE *f = fopen("q_used.stt", "a");
+            fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
+                    cpi->common.current_video_frame, y2, u2, v2,
+                    frame_psnr2, frame_ssim2);
+            fclose(f);
+          }
+#endif
+        }
+      }
+
+      if (cpi->b_calculate_ssimg) {
+        double y, u, v, frame_all;
+        frame_all =  vp9_calc_ssimg(cpi->Source, cm->frame_to_show,
+                                    &y, &u, &v);
+        cpi->total_ssimg_y += y;
+        cpi->total_ssimg_u += u;
+        cpi->total_ssimg_v += v;
+        cpi->total_ssimg_all += frame_all;
+      }
+
+    }
+  }
+
+#endif
+
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (cm->rtcd.flags & HAS_NEON)
+#endif
+  {
+    vp9_pop_neon(store_reg);
+  }
+#endif
+
+  return 0;
+}
+
+int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
+                              vp9_ppflags_t *flags) {
+  VP9_COMP *cpi = (VP9_COMP *) comp;
+
+  if (cpi->common.refresh_alt_ref_frame)
+    return -1;
+  else {
+    int ret;
+#if CONFIG_POSTPROC
+    ret = vp9_post_proc_frame(&cpi->common, dest, flags);
+#else
+
+    if (cpi->common.frame_to_show) {
+      *dest = *cpi->common.frame_to_show;
+      dest->y_width = cpi->common.Width;
+      dest->y_height = cpi->common.Height;
+      dest->uv_height = cpi->common.Height / 2;
+      ret = 0;
+    } else {
+      ret = -1;
+    }
+
+#endif // !CONFIG_POSTPROC
+    vp9_clear_system_state();
+    return ret;
+  }
+}
+
+int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
+                   unsigned int cols, int delta_q[4], int delta_lf[4],
+                   unsigned int threshold[4]) {
+  VP9_COMP *cpi = (VP9_COMP *) comp;
+  signed char feature_data[SEG_LVL_MAX][MAX_MB_SEGMENTS];
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  int i;
+
+  if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
+    return -1;
+
+  if (!map) {
+    vp9_disable_segmentation((VP9_PTR)cpi);
+    return 0;
+  }
+
+  // Set the segmentation map.
+  vp9_set_segmentation_map((VP9_PTR)cpi, map);
+
+  // Activate segmentation.
+  vp9_enable_segmentation((VP9_PTR)cpi);
+
+  // Set up the quant segment data
+  feature_data[SEG_LVL_ALT_Q][0] = delta_q[0];
+  feature_data[SEG_LVL_ALT_Q][1] = delta_q[1];
+  feature_data[SEG_LVL_ALT_Q][2] = delta_q[2];
+  feature_data[SEG_LVL_ALT_Q][3] = delta_q[3];
+
+  // Set up the loop filter segment data.
+  feature_data[SEG_LVL_ALT_LF][0] = delta_lf[0];
+  feature_data[SEG_LVL_ALT_LF][1] = delta_lf[1];
+  feature_data[SEG_LVL_ALT_LF][2] = delta_lf[2];
+  feature_data[SEG_LVL_ALT_LF][3] = delta_lf[3];
+
+  cpi->segment_encode_breakout[0] = threshold[0];
+  cpi->segment_encode_breakout[1] = threshold[1];
+  cpi->segment_encode_breakout[2] = threshold[2];
+  cpi->segment_encode_breakout[3] = threshold[3];
+
+  // Enable the loop and quant changes in the feature mask
+  for (i = 0; i < 4; i++) {
+    if (delta_q[i])
+      vp9_enable_segfeature(xd, i, SEG_LVL_ALT_Q);
+    else
+      vp9_disable_segfeature(xd, i, SEG_LVL_ALT_Q);
+
+    if (delta_lf[i])
+      vp9_enable_segfeature(xd, i, SEG_LVL_ALT_LF);
+    else
+      vp9_disable_segfeature(xd, i, SEG_LVL_ALT_LF);
+  }
+
+  // Initialise the feature data structure with delta data
+  // (SEGMENT_DELTADATA = 0, SEGMENT_ABSDATA = 1).
+  vp9_set_segment_data((VP9_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+
+  return 0;
+}
+
+int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
+                       unsigned int rows, unsigned int cols) {
+  VP9_COMP *cpi = (VP9_COMP *) comp;
+
+  if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
+    if (map) {
+      vpx_memcpy(cpi->active_map, map, rows * cols);
+      cpi->active_map_enabled = 1;
+    } else
+      cpi->active_map_enabled = 0;
+
+    return 0;
+  } else {
+    // cpi->active_map_enabled = 0;
+    return -1;
+  }
+}
+
+int vp9_set_internal_size(VP9_PTR comp,
+                          VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
+  VP9_COMP *cpi = (VP9_COMP *) comp;
+
+  if (horiz_mode <= ONETWO)
+    cpi->common.horiz_scale = horiz_mode;
+  else
+    return -1;
+
+  if (vert_mode <= ONETWO)
+    cpi->common.vert_scale  = vert_mode;
+  else
+    return -1;
+
+  return 0;
+}
+
+
+
+int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) {
+  int i, j;
+  int Total = 0;
+
+  unsigned char *src = source->y_buffer;
+  unsigned char *dst = dest->y_buffer;
+
+  // Loop through the raw and reconstructed Y-plane data, summing the
+  // squared differences.
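+  // Note: operates on whole 16x16 blocks, so plane dimensions are assumed
+  // to be padded to multiples of 16.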
+  for (i = 0; i < source->y_height; i += 16) {
+    for (j = 0; j < source->y_width; j += 16) {
+      unsigned int sse;
+      Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,
+                            &sse);
+    }
+
+    src += 16 * source->y_stride;
+    dst += 16 * dest->y_stride;
+  }
+
+  return Total;
+}
+
+
+int vp9_get_quantizer(VP9_PTR c) {
+  VP9_COMP   *cpi = (VP9_COMP *) c;
+  return cpi->common.base_qindex;
+}
--- /dev/null
+++ b/vp9/encoder/onyx_int.h
@@ -1,0 +1,788 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ONYX_INT_H
+#define __INC_ONYX_INT_H
+
+#include <stdio.h>
+#include "vpx_ports/config.h"
+#include "vp9/common/onyx.h"
+#include "treewriter.h"
+#include "tokenize.h"
+#include "vp9/common/onyxc_int.h"
+#include "variance.h"
+#include "encodemb.h"
+#include "quantize.h"
+#include "vp9/common/entropy.h"
+#include "vp9/common/entropymode.h"
+#include "vpx_ports/mem.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "mcomp.h"
+#include "temporal_filter.h"
+#include "vp9/common/findnearmv.h"
+#include "lookahead.h"
+
+// #define SPEEDSTATS 1
+#define MIN_GF_INTERVAL             4
+#define DEFAULT_GF_INTERVAL         7
+
+#define KEY_FRAME_CONTEXT 5
+
+#define MAX_LAG_BUFFERS 25
+
+#define AF_THRESH   25
+#define AF_THRESH2  100
+#define ARF_DECAY_THRESH 12
+
+#if CONFIG_PRED_FILTER
+#define MAX_MODES 54
+#else  // CONFIG_PRED_FILTER
+#define MAX_MODES 42
+#endif  // CONFIG_PRED_FILTER
+
+#define MIN_THRESHMULT  32
+#define MAX_THRESHMULT  512
+
+#define GF_ZEROMV_ZBIN_BOOST 12
+#define LF_ZEROMV_ZBIN_BOOST 6
+#define MV_ZBIN_BOOST        4
+#define ZBIN_OQ_MAX 192
+
+#define VP9_TEMPORAL_ALT_REF 1
+
+typedef struct {
+  nmv_context nmvc;
+  int nmvjointcost[MV_JOINTS];
+  int nmvcosts[2][MV_VALS];
+  int nmvcosts_hp[2][MV_VALS];
+
+#ifdef MODE_STATS
+  // Stats
+  int y_modes[VP9_YMODES];
+  int uv_modes[VP9_UV_MODES];
+  int i8x8_modes[VP9_I8X8_MODES];
+  int b_modes[B_MODE_COUNT];
+  int inter_y_modes[MB_MODE_COUNT];
+  int inter_uv_modes[VP9_UV_MODES];
+  int inter_b_modes[B_MODE_COUNT];
+#endif
+
+  vp9_prob segment_pred_probs[PREDICTION_PROBS];
+  unsigned char ref_pred_probs_update[PREDICTION_PROBS];
+  vp9_prob ref_pred_probs[PREDICTION_PROBS];
+  vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
+
+  unsigned char *last_frame_seg_map_copy;
+
+  // 0 = Intra, Last, GF, ARF
+  signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
+  // 0 = BPRED, ZERO_MV, MV, SPLIT
+  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
+
+  vp9_prob coef_probs[BLOCK_TYPES]
+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+  vp9_prob hybrid_coef_probs[BLOCK_TYPES]
+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+
+  vp9_prob coef_probs_8x8[BLOCK_TYPES_8X8]
+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+  vp9_prob hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]
+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+
+  vp9_prob coef_probs_16x16[BLOCK_TYPES_16X16]
+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+  vp9_prob hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]
+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+
+  vp9_prob ymode_prob [VP9_YMODES - 1]; /* interframe intra mode probs */
+  vp9_prob uv_mode_prob [VP9_YMODES][VP9_UV_MODES - 1];
+  vp9_prob bmode_prob [VP9_BINTRAMODES - 1];
+  vp9_prob i8x8_mode_prob [VP9_I8X8_MODES - 1];
+  vp9_prob sub_mv_ref_prob [SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
+  vp9_prob mbsplit_prob [VP9_NUMMBSPLITS - 1];
+
+  vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
+                                 [VP9_SWITCHABLE_FILTERS - 1];
+
+  int mv_ref_ct[6][4][2];
+  int mode_context[6][4];
+  int mv_ref_ct_a[6][4][2];
+  int mode_context_a[6][4];
+
+} CODING_CONTEXT;
+
+typedef struct {
+  double frame;
+  double intra_error;
+  double coded_error;
+  double sr_coded_error;
+  double ssim_weighted_pred_err;
+  double pcnt_inter;
+  double pcnt_motion;
+  double pcnt_second_ref;
+  double pcnt_neutral;
+  double MVr;
+  double mvr_abs;
+  double MVc;
+  double mvc_abs;
+  double MVrv;
+  double MVcv;
+  double mv_in_out_count;
+  double new_mv_count;
+  double duration;
+  double count;
+}
+FIRSTPASS_STATS;
+
+typedef struct {
+  int frames_so_far;
+  double frame_intra_error;
+  double frame_coded_error;
+  double frame_pcnt_inter;
+  double frame_pcnt_motion;
+  double frame_mvr;
+  double frame_mvr_abs;
+  double frame_mvc;
+  double frame_mvc_abs;
+
+} ONEPASS_FRAMESTATS;
+
+typedef struct {
+  struct {
+    int err;
+    union {
+      int_mv mv;
+      MB_PREDICTION_MODE mode;
+    } m;
+  } ref[MAX_REF_FRAMES];
+} MBGRAPH_MB_STATS;
+
+typedef struct {
+  MBGRAPH_MB_STATS *mb_stats;
+} MBGRAPH_FRAME_STATS;
+
+#if CONFIG_PRED_FILTER
+typedef enum {
+  THR_ZEROMV,
+  THR_ZEROMV_FILT,
+  THR_DC,
+
+  THR_NEARESTMV,
+  THR_NEARESTMV_FILT,
+  THR_NEARMV,
+  THR_NEARMV_FILT,
+
+  THR_ZEROG,
+  THR_ZEROG_FILT,
+  THR_NEARESTG,
+  THR_NEARESTG_FILT,
+
+  THR_ZEROA,
+  THR_ZEROA_FILT,
+  THR_NEARESTA,
+  THR_NEARESTA_FILT,
+
+  THR_NEARG,
+  THR_NEARG_FILT,
+  THR_NEARA,
+  THR_NEARA_FILT,
+
+  THR_V_PRED,
+  THR_H_PRED,
+  THR_D45_PRED,
+  THR_D135_PRED,
+  THR_D117_PRED,
+  THR_D153_PRED,
+  THR_D27_PRED,
+  THR_D63_PRED,
+  THR_TM,
+
+  THR_NEWMV,
+  THR_NEWMV_FILT,
+  THR_NEWG,
+  THR_NEWG_FILT,
+  THR_NEWA,
+  THR_NEWA_FILT,
+
+  THR_SPLITMV,
+  THR_SPLITG,
+  THR_SPLITA,
+
+  THR_B_PRED,
+  THR_I8X8_PRED,
+
+  THR_COMP_ZEROLG,
+  THR_COMP_NEARESTLG,
+  THR_COMP_NEARLG,
+
+  THR_COMP_ZEROLA,
+  THR_COMP_NEARESTLA,
+  THR_COMP_NEARLA,
+
+  THR_COMP_ZEROGA,
+  THR_COMP_NEARESTGA,
+  THR_COMP_NEARGA,
+
+  THR_COMP_NEWLG,
+  THR_COMP_NEWLA,
+  THR_COMP_NEWGA,
+
+  THR_COMP_SPLITLG,
+  THR_COMP_SPLITLA,
+  THR_COMP_SPLITGA,
+}
+THR_MODES;
+#else
+typedef enum {
+  THR_ZEROMV,
+  THR_DC,
+
+  THR_NEARESTMV,
+  THR_NEARMV,
+
+  THR_ZEROG,
+  THR_NEARESTG,
+
+  THR_ZEROA,
+  THR_NEARESTA,
+
+  THR_NEARG,
+  THR_NEARA,
+
+  THR_V_PRED,
+  THR_H_PRED,
+  THR_D45_PRED,
+  THR_D135_PRED,
+  THR_D117_PRED,
+  THR_D153_PRED,
+  THR_D27_PRED,
+  THR_D63_PRED,
+  THR_TM,
+
+  THR_NEWMV,
+  THR_NEWG,
+  THR_NEWA,
+
+  THR_SPLITMV,
+  THR_SPLITG,
+  THR_SPLITA,
+
+  THR_B_PRED,
+  THR_I8X8_PRED,
+
+  THR_COMP_ZEROLG,
+  THR_COMP_NEARESTLG,
+  THR_COMP_NEARLG,
+
+  THR_COMP_ZEROLA,
+  THR_COMP_NEARESTLA,
+  THR_COMP_NEARLA,
+
+  THR_COMP_ZEROGA,
+  THR_COMP_NEARESTGA,
+  THR_COMP_NEARGA,
+
+  THR_COMP_NEWLG,
+  THR_COMP_NEWLA,
+  THR_COMP_NEWGA,
+
+  THR_COMP_SPLITLG,
+  THR_COMP_SPLITLA,
+  THR_COMP_SPLITGA
+}
+THR_MODES;
+#endif
+
+typedef enum {
+  DIAMOND = 0,
+  NSTEP = 1,
+  HEX = 2
+} SEARCH_METHODS;
+
+typedef struct {
+  int RD;
+  SEARCH_METHODS search_method;
+  int improved_dct;
+  int auto_filter;
+  int recode_loop;
+  int iterative_sub_pixel;
+  int half_pixel_search;
+  int quarter_pixel_search;
+  int thresh_mult[MAX_MODES];
+  int max_step_search_steps;
+  int first_step;
+  int optimize_coefficients;
+  int no_skip_block4x4_search;
+  int improved_mv_pred;
+  int search_best_filter;
+
+} SPEED_FEATURES;
+
+typedef struct {
+  MACROBLOCK  mb;
+  int totalrate;
+} MB_ROW_COMP;
+
+typedef struct {
+  TOKENEXTRA *start;
+  TOKENEXTRA *stop;
+} TOKENLIST;
+
+typedef struct {
+  int ithread;
+  void *ptr1;
+  void *ptr2;
+} ENCODETHREAD_DATA;
+typedef struct {
+  int ithread;
+  void *ptr1;
+} LPFTHREAD_DATA;
+
+
+typedef struct VP9_ENCODER_RTCD {
+  VP9_COMMON_RTCD            *common;
+  vp9_search_rtcd_vtable_t    search;
+  vp9_temporal_rtcd_vtable_t  temporal;
+} VP9_ENCODER_RTCD;
+
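+// The first four entries alias the PARTITIONING_* values, so partition-sized
+// blocks and the larger 16x16/32x32 sizes share one index space (used, for
+// example, to index fn_ptr[] below).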
+enum BlockSize {
+  BLOCK_16X8 = PARTITIONING_16X8,
+  BLOCK_8X16 = PARTITIONING_8X16,
+  BLOCK_8X8 = PARTITIONING_8X8,
+  BLOCK_4X4 = PARTITIONING_4X4,
+  BLOCK_16X16,
+  BLOCK_MAX_SEGMENTS,
+  BLOCK_32X32 = BLOCK_MAX_SEGMENTS,
+  BLOCK_MAX_SB_SEGMENTS,
+};
+
+typedef struct VP9_COMP {
+
+  DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
+
+  DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);
+
+  DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
+
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
+
+  DECLARE_ALIGNED(64, short, Y1zbin_8x8[QINDEX_RANGE][64]);
+  DECLARE_ALIGNED(64, short, Y2zbin_8x8[QINDEX_RANGE][64]);
+  DECLARE_ALIGNED(64, short, UVzbin_8x8[QINDEX_RANGE][64]);
+  DECLARE_ALIGNED(64, short, zrun_zbin_boost_y1_8x8[QINDEX_RANGE][64]);
+  DECLARE_ALIGNED(64, short, zrun_zbin_boost_y2_8x8[QINDEX_RANGE][64]);
+  DECLARE_ALIGNED(64, short, zrun_zbin_boost_uv_8x8[QINDEX_RANGE][64]);
+
+  DECLARE_ALIGNED(16, short, Y1zbin_16x16[QINDEX_RANGE][256]);
+  DECLARE_ALIGNED(16, short, Y2zbin_16x16[QINDEX_RANGE][256]);
+  DECLARE_ALIGNED(16, short, UVzbin_16x16[QINDEX_RANGE][256]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_16x16[QINDEX_RANGE][256]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_16x16[QINDEX_RANGE][256]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_16x16[QINDEX_RANGE][256]);
+
+  MACROBLOCK mb;
+  VP9_COMMON common;
+  VP9_CONFIG oxcf;
+
+  struct lookahead_ctx    *lookahead;
+  struct lookahead_entry  *source;
+  struct lookahead_entry  *alt_ref_source;
+
+  YV12_BUFFER_CONFIG *Source;
+  YV12_BUFFER_CONFIG *un_scaled_source;
+  YV12_BUFFER_CONFIG scaled_source;
+
+  int source_alt_ref_pending; // a frame in src_buffers has been identified to be encoded as an alt ref
+  int source_alt_ref_active;  // an alt ref frame has been encoded and is usable
+
+  int is_src_frame_alt_ref;   // source of the frame to encode is an exact copy of an alt ref frame
+
+  int gold_is_last; // golden frame same as last frame (short-circuit gold searches)
+  int alt_is_last;  // alt ref frame same as last frame (short-circuit alt ref searches)
+  int gold_is_alt;  // don't do both alt and gold searches (just do gold)
+
+  // int refresh_alt_ref_frame;
+  YV12_BUFFER_CONFIG last_frame_uf;
+
+  TOKENEXTRA *tok;
+  unsigned int tok_count;
+
+
+  unsigned int frames_since_key;
+  unsigned int key_frame_frequency;
+  unsigned int this_key_frame_forced;
+  unsigned int next_key_frame_forced;
+
+  // Ambient reconstruction error target for forced key frames
+  int ambient_err;
+
+  unsigned int mode_check_freq[MAX_MODES];
+  unsigned int mode_test_hit_counts[MAX_MODES];
+  unsigned int mode_chosen_counts[MAX_MODES];
+
+  int rd_thresh_mult[MAX_MODES];
+  int rd_baseline_thresh[MAX_MODES];
+  int rd_threshes[MAX_MODES];
+  int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
+  int rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
+  int comp_pred_count[COMP_PRED_CONTEXTS];
+  int single_pred_count[COMP_PRED_CONTEXTS];
+  // FIXME contextualize
+  int txfm_count[TX_SIZE_MAX];
+  int txfm_count_8x8p[TX_SIZE_MAX - 1];
+  int64_t rd_tx_select_diff[NB_TXFM_MODES];
+  int rd_tx_select_threshes[4][NB_TXFM_MODES];
+
+  int RDMULT;
+  int RDDIV;
+
+  CODING_CONTEXT coding_context;
+
+  // Rate targeting variables
+  int64_t prediction_error;
+  int64_t last_prediction_error;
+  int64_t intra_error;
+  int64_t last_intra_error;
+
+  int this_frame_target;
+  int projected_frame_size;
+  int last_q[2];                   // Separate values for Intra/Inter
+  int last_boosted_qindex;         // Last boosted GF/KF/ARF q
+
+  double rate_correction_factor;
+  double key_frame_rate_correction_factor;
+  double gf_rate_correction_factor;
+
+  int frames_till_gf_update_due;      // Count down till next GF
+  int current_gf_interval;          // GF interval chosen when we coded the last GF
+
+  int gf_overspend_bits;            // Total bits overspent because of GF boost (cumulative)
+
+  int non_gf_bitrate_adjustment;     // Used in the few frames following a GF to recover the extra bits spent in that GF
+
+  int kf_overspend_bits;            // Extra bits spent on key frames that need to be recovered on inter frames
+  int kf_bitrate_adjustment;        // Current number of bits to try and recover on each inter frame.
+  int max_gf_interval;
+  int baseline_gf_interval;
+  int active_arnr_frames;           // <= cpi->oxcf.arnr_max_frames
+
+  int64_t key_frame_count;
+  int prior_key_frame_distance[KEY_FRAME_CONTEXT];
+  int per_frame_bandwidth;          // Current section per frame bandwidth target
+  int av_per_frame_bandwidth;        // Average frame size target for clip
+  int min_frame_bandwidth;          // Minimum allocation that should be used for any frame
+  int inter_frame_target;
+  double output_frame_rate;
+  int64_t last_time_stamp_seen;
+  int64_t last_end_time_stamp_seen;
+  int64_t first_time_stamp_ever;
+
+  int ni_av_qi;
+  int ni_tot_qi;
+  int ni_frames;
+  int avg_frame_qindex;
+  double tot_q;
+  double avg_q;
+
+  int zbin_over_quant;
+  int zbin_mode_boost;
+  int zbin_mode_boost_enabled;
+
+  int64_t total_byte_count;
+
+  int buffered_mode;
+
+  int buffer_level;
+  int bits_off_target;
+
+  int rolling_target_bits;
+  int rolling_actual_bits;
+
+  int long_rolling_target_bits;
+  int long_rolling_actual_bits;
+
+  int64_t total_actual_bits;
+  int total_target_vs_actual;        // debug stats
+
+  int worst_quality;
+  int active_worst_quality;
+  int best_quality;
+  int active_best_quality;
+
+  int cq_target_quality;
+
+#if CONFIG_SUPERBLOCKS
+  int sb_count;
+  int sb_ymode_count [VP9_I32X32_MODES];
+#endif
+  int ymode_count [VP9_YMODES];        /* intra MB type cts this frame */
+  int bmode_count [VP9_BINTRAMODES];
+  int i8x8_mode_count [VP9_I8X8_MODES];
+  int sub_mv_ref_count [SUBMVREF_COUNT][VP9_SUBMVREFS];
+  int mbsplit_count [VP9_NUMMBSPLITS];
+  // int uv_mode_count[VP9_UV_MODES];       /* intra MB type cts this frame */
+  int y_uv_mode_count[VP9_YMODES][VP9_UV_MODES];
+
+  nmv_context_counts NMVcount;
+
+  unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
+  vp9_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  unsigned int frame_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+  unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
+  vp9_prob frame_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  unsigned int frame_hybrid_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+
+  unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
+  vp9_prob frame_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  unsigned int frame_branch_ct_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+  unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
+  vp9_prob frame_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  unsigned int frame_hybrid_branch_ct_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+
+  unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
+  vp9_prob frame_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  unsigned int frame_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+  unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
+  vp9_prob frame_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  unsigned int frame_hybrid_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+
+  int gfu_boost;
+  int last_boost;
+  int kf_boost;
+  int kf_zeromotion_pct;
+
+  int target_bandwidth;
+  struct vpx_codec_pkt_list  *output_pkt_list;
+
+#if 0
+  // Experimental code for lagged and one pass
+  ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
+  int one_pass_frame_index;
+#endif
+  MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
+  int mbgraph_n_frames;             // number of frames filled in the above
+  int static_mb_pct;                // % forced skip mbs by segmentation
+  int seg0_progress, seg0_idx, seg0_cnt;
+  int ref_pred_count[3][2];
+
+  int decimation_factor;
+  int decimation_count;
+
+  // for real time encoding
+  int avg_encode_time;              // microseconds
+  int avg_pick_mode_time;           // microseconds
+  int Speed;
+  unsigned int cpu_freq;            // MHz
+  int compressor_speed;
+
+  int interquantizer;
+  int goldfreq;
+  int auto_worst_q;
+  int cpu_used;
+  int horiz_scale;
+  int vert_scale;
+  int pass;
+
+  vp9_prob last_skip_false_probs[3][MBSKIP_CONTEXTS];
+  int last_skip_probs_q[3];
+
+  int recent_ref_frame_usage[MAX_REF_FRAMES];
+  int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+  int ref_frame_flags;
+
+  unsigned char ref_pred_probs_update[PREDICTION_PROBS];
+
+  SPEED_FEATURES sf;
+  int error_bins[1024];
+
+  // Data used in real-time conferencing mode to help decide whether to
+  // update the golden frame
+  int inter_zz_count;
+  int gf_bad_count;
+  int gf_update_recommended;
+  int skip_true_count[3];
+  int skip_false_count[3];
+
+  unsigned char *segmentation_map;
+
+  // segment thresholds for encode breakout
+  int  segment_encode_breakout[MAX_MB_SEGMENTS];
+
+  unsigned char *active_map;
+  unsigned int active_map_enabled;
+
+  TOKENLIST *tplist;
+
+  fractional_mv_step_fp *find_fractional_mv_step;
+  vp9_full_search_fn_t full_search_sad;
+  vp9_refining_search_fn_t refining_search_sad;
+  vp9_diamond_search_fn_t diamond_search_sad;
+  vp9_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SB_SEGMENTS];
+  uint64_t time_receive_data;
+  uint64_t time_compress_data;
+  uint64_t time_pick_lpf;
+  uint64_t time_encode_mb_row;
+
+  int base_skip_false_prob[QINDEX_RANGE][3];
+
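+  // Two-pass rate-control state, derived from the first-pass stats packets.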
+  struct twopass_rc {
+    unsigned int section_intra_rating;
+    unsigned int next_iiratio;
+    unsigned int this_iiratio;
+    FIRSTPASS_STATS *total_stats;
+    FIRSTPASS_STATS *this_frame_stats;
+    FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
+    FIRSTPASS_STATS *total_left_stats;
+    int first_pass_done;
+    int64_t bits_left;
+    int64_t clip_bits_total;
+    double avg_iiratio;
+    double modified_error_total;
+    double modified_error_used;
+    double modified_error_left;
+    double kf_intra_err_min;
+    double gf_intra_err_min;
+    int frames_to_key;
+    int maxq_max_limit;
+    int maxq_min_limit;
+    int static_scene_max_gf_interval;
+    int kf_bits;
+    int gf_group_error_left;           // Remaining error from uncoded frames in a gf group. Two pass use only
+
+    // Projected total bits available for a key frame group of frames
+    int64_t kf_group_bits;
+
+    // Error score of frames still to be coded in kf group
+    int64_t kf_group_error_left;
+
+    int gf_group_bits;                // Projected Bits available for a group of frames including 1 GF or ARF
+    int gf_bits;                     // Bits for the golden frame or ARF - 2 pass only
+    int alt_extra_bits;
+
+    int sr_update_lag;
+    double est_max_qcorrection_factor;
+  } twopass;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+  VP9_ENCODER_RTCD            rtcd;
+#endif
+#if VP9_TEMPORAL_ALT_REF
+  YV12_BUFFER_CONFIG alt_ref_buffer;
+  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+  int fixed_divide[512];
+#endif
+
+#if CONFIG_INTERNAL_STATS
+  int    count;
+  double total_y;
+  double total_u;
+  double total_v;
+  double total;
+  double total_sq_error;
+  double totalp_y;
+  double totalp_u;
+  double totalp_v;
+  double totalp;
+  double total_sq_error2;
+  int    bytes;
+  double summed_quality;
+  double summed_weights;
+  unsigned int tot_recode_hits;
+
+
+  double total_ssimg_y;
+  double total_ssimg_u;
+  double total_ssimg_v;
+  double total_ssimg_all;
+
+  int b_calculate_ssimg;
+#endif
+  int b_calculate_psnr;
+
+  // Per MB activity measurement
+  unsigned int activity_avg;
+  unsigned int *mb_activity_map;
+  int *mb_norm_activity_map;
+
+  // Record of which MBs still refer to last golden frame either
+  // directly or through 0,0
+  unsigned char *gf_active_flags;
+  int gf_active_count;
+
+  int output_partition;
+
+  // Store last frame's MV info for next frame MV prediction
+  int_mv *lfmv;
+  int *lf_ref_frame_sign_bias;
+  int *lf_ref_frame;
+
+  /* force next frame to intra when kf_auto says so */
+  int force_next_frame_intra;
+
+  int droppable;
+
+  // TODO Do we still need this??
+  int update_context;
+
+  int dummy_packing;    /* flag to indicate if packing is dummy */
+
+#if CONFIG_PRED_FILTER
+  int pred_filter_on_count;
+  int pred_filter_off_count;
+#endif
+  unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1]
+                                      [VP9_SWITCHABLE_FILTERS];
+
+#if CONFIG_NEW_MVREF
+  unsigned int best_ref_index_counts[MAX_REF_FRAMES][MAX_MV_REFS];
+#endif
+
+} VP9_COMP;
+
+void vp9_encode_frame(VP9_COMP *cpi);
+
+void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
+                        unsigned long *size);
+
+void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x);
+
+void vp9_tokenize_mb(VP9_COMP *, MACROBLOCKD *, TOKENEXTRA **, int dry_run);
+void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run);
+
+void vp9_set_speed_features(VP9_COMP *cpi);
+
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(lval,expr) do {\
+    lval = (expr); \
+    if(!lval) \
+      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
+                         "Failed to allocate "#lval" at %s:%d", \
+                         __FILE__,__LINE__);\
+  } while(0)
+#else
+#define CHECK_MEM_ERROR(lval,expr) do {\
+    lval = (expr); \
+    if(!lval) \
+      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
+                         "Failed to allocate "#lval);\
+  } while(0)
+#endif
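+
+// Illustrative use (hypothetical allocation site):
+//   CHECK_MEM_ERROR(cpi->tplist, vpx_calloc(cm->mb_rows, sizeof(TOKENLIST)));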
+#endif  // __INC_ONYX_INT_H
--- /dev/null
+++ b/vp9/encoder/picklpf.c
@@ -1,0 +1,420 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/onyxc_int.h"
+#include "onyx_int.h"
+#include "quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp9/common/alloccommon.h"
+#include "vp9/common/loopfilter.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+
+extern int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source,
+                           YV12_BUFFER_CONFIG *dest);
+#if HAVE_ARMV7
+extern void vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+#endif
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+extern void(*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc,
+                                              YV12_BUFFER_CONFIG *dst_ybc,
+                                              int fraction);
+
+void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
+                                 YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
+  unsigned char *src_y, *dst_y;
+  int yheight;
+  int ystride;
+  int border;
+  int yoffset;
+  int linestocopy;
+
+  border   = src_ybc->border;
+  yheight  = src_ybc->y_height;
+  ystride  = src_ybc->y_stride;
+
+  linestocopy = (yheight >> (Fraction + 4));
+
+  if (linestocopy < 1)
+    linestocopy = 1;
+
+  linestocopy <<= 4;
+
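+  // Copy a 16-row-aligned band from roughly the vertical centre of the frame
+  // (starting 8 rows above the midpoint) rather than the whole plane.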
+  yoffset  = ystride * ((yheight >> 5) * 16 - 8);
+  src_y = src_ybc->y_buffer + yoffset;
+  dst_y = dst_ybc->y_buffer + yoffset;
+
+  vpx_memcpy(dst_y, src_y, ystride * (linestocopy + 16));
+}
+
+static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
+                                YV12_BUFFER_CONFIG *dest, int Fraction) {
+  int i, j;
+  int Total = 0;
+  int srcoffset, dstoffset;
+  unsigned char *src = source->y_buffer;
+  unsigned char *dst = dest->y_buffer;
+
+  int linestocopy = (source->y_height >> (Fraction + 4));
+
+  if (linestocopy < 1)
+    linestocopy = 1;
+
+  linestocopy <<= 4;
+
+
+  srcoffset = source->y_stride   * (dest->y_height >> 5) * 16;
+  dstoffset = dest->y_stride     * (dest->y_height >> 5) * 16;
+
+  src += srcoffset;
+  dst += dstoffset;
+
+  // Loop through the raw and reconstructed Y-plane data, summing the
+  // squared differences.
+  for (i = 0; i < linestocopy; i += 16) {
+    for (j = 0; j < source->y_width; j += 16) {
+      unsigned int sse;
+      Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,
+                            &sse);
+    }
+
+    src += 16 * source->y_stride;
+    dst += 16 * dest->y_stride;
+  }
+
+  return Total;
+}
+
+// Enforce a minimum filter level based upon baseline Q
+static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
+  int min_filter_level;
+  /*int q = (int) vp9_convert_qindex_to_q(base_qindex);
+
+  if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && !cpi->common.refresh_alt_ref_frame)
+      min_filter_level = 0;
+  else
+  {
+      if (q <= 10)
+          min_filter_level = 0;
+      else if (q <= 64)
+          min_filter_level = 1;
+      else
+          min_filter_level = (q >> 6);
+  }
+  */
+  min_filter_level = 0;
+
+  return min_filter_level;
+}
+
+// Enforce a maximum filter level based upon baseline Q
+static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) {
+  // PGW August 2006: Highest filter values are almost always a bad idea.
+
+  // jbb chg: 20100118 - not so any more with this overquant stuff; allow
+  // high values when lots of intra is coming in.
+  int max_filter_level = MAX_LOOP_FILTER;// * 3 / 4;
+  (void)base_qindex;
+
+  if (cpi->twopass.section_intra_rating > 8)
+    max_filter_level = MAX_LOOP_FILTER * 3 / 4;
+
+  return max_filter_level;
+}
+
+void vp9_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  int best_err = 0;
+  int filt_err = 0;
+  int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
+  int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
+  int filt_val;
+  int best_filt_val = cm->filter_level;
+
+  //  Make a copy of the unfiltered / processed recon buffer
+  vp9_yv12_copy_partial_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf, 3);
+
+  if (cm->frame_type == KEY_FRAME)
+    cm->sharpness_level = 0;
+  else
+    cm->sharpness_level = cpi->oxcf.Sharpness;
+
+  if (cm->sharpness_level != cm->last_sharpness_level) {
+    vp9_loop_filter_update_sharpness(&cm->lf_info, cm->sharpness_level);
+    cm->last_sharpness_level = cm->sharpness_level;
+  }
+
+  // Start the search at the previous frame filter level unless it is now out of range.
+  if (cm->filter_level < min_filter_level)
+    cm->filter_level = min_filter_level;
+  else if (cm->filter_level > max_filter_level)
+    cm->filter_level = max_filter_level;
+
+  filt_val = cm->filter_level;
+  best_filt_val = filt_val;
+
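+  // Fast search: walk down from the starting level while the error keeps
+  // improving; only if no lower level wins, walk up instead.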
+  // Get the err using the previous frame's filter value.
+  vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+  best_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
+
+  //  Re-instate the unfiltered frame
+  vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
+
+  filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
+
+  // Search lower filter levels
+  while (filt_val >= min_filter_level) {
+    // Apply the loop filter
+    vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+    // Get the err for filtered frame
+    filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
+
+    //  Re-instate the unfiltered frame
+    vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
+
+
+    // Update the best case record or exit loop.
+    if (filt_err < best_err) {
+      best_err = filt_err;
+      best_filt_val = filt_val;
+    } else
+      break;
+
+    // Adjust filter level
+    filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
+  }
+
+  // Search up (note that we have already done filt_val = cm->filter_level)
+  filt_val = cm->filter_level + (1 + ((filt_val > 10) ? 1 : 0));
+
+  if (best_filt_val == cm->filter_level) {
+    // Resist raising filter level for very small gains
+    best_err -= (best_err >> 10);
+
+    while (filt_val < max_filter_level) {
+      // Apply the loop filter
+      vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+      // Get the err for filtered frame
+      filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
+
+      //  Re-instate the unfiltered frame
+      vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf,
+                                      cm->frame_to_show, 3);
+
+      // Update the best case record or exit loop.
+      if (filt_err < best_err) {
+        // Do not raise the filter level if the improvement is < 1 part in 1024
+        best_err = filt_err - (filt_err >> 10);
+
+        best_filt_val = filt_val;
+      } else
+        break;
+
+      // Adjust filter level
+      filt_val += (1 + ((filt_val > 10) ? 1 : 0));
+    }
+  }
+
+  cm->filter_level = best_filt_val;
+
+  if (cm->filter_level < min_filter_level)
+    cm->filter_level = min_filter_level;
+
+  if (cm->filter_level > max_filter_level)
+    cm->filter_level = max_filter_level;
+}
+
+// Stub function for now; the alternate loop filter level is not used.
+void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) {
+}
+
+void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  int best_err = 0;
+  int filt_err = 0;
+  int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
+  int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
+
+  int filter_step;
+  int filt_high = 0;
+  int filt_mid = cm->filter_level;      // Start search at previous frame filter level
+  int filt_low = 0;
+  int filt_best;
+  int filt_direction = 0;
+
+  int Bias = 0;                       // Bias against raising loop filter and in favour of lowering it
+
+  //  Make a copy of the unfiltered / processed recon buffer
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (cm->rtcd.flags & HAS_NEON)
+#endif
+  {
+    vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(cm->frame_to_show, &cpi->last_frame_uf);
+  }
+#if CONFIG_RUNTIME_CPU_DETECT
+  else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+  {
+    vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf);
+  }
+#endif
+
+  if (cm->frame_type == KEY_FRAME)
+    cm->sharpness_level = 0;
+  else
+    cm->sharpness_level = cpi->oxcf.Sharpness;
+
+  // Start the search at the previous frame filter level unless it is now out of range.
+  filt_mid = cm->filter_level;
+
+  if (filt_mid < min_filter_level)
+    filt_mid = min_filter_level;
+  else if (filt_mid > max_filter_level)
+    filt_mid = max_filter_level;
+
+  // Define the initial step size
+  filter_step = (filt_mid < 16) ? 4 : filt_mid / 4;
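+  // The loop below is a coarse-to-fine search: probe one step above and below
+  // filt_mid, recentre on any improvement, and halve the step when neither
+  // direction helps.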
+
+  // Get baseline error score
+  vp9_set_alt_lf_level(cpi, filt_mid);
+  vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid);
+
+  best_err = vp9_calc_ss_err(sd, cm->frame_to_show);
+  filt_best = filt_mid;
+
+  //  Re-instate the unfiltered frame
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+  if (cm->rtcd.flags & HAS_NEON)
+#endif
+  {
+    vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+  }
+#if CONFIG_RUNTIME_CPU_DETECT
+  else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+  {
+    vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+  }
+#endif
+
+  while (filter_step > 0) {
+    Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; // PGW change 12/12/06 for small images
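+    // Bias is applied in favour of lower filter levels; it grows with the
+    // current best error and the step size.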
+
+    // jbb chg: 20100118 - in sections with lots of new material coming in,
+    // don't bias as much toward a low filter value
+    if (cpi->twopass.section_intra_rating < 20)
+      Bias = Bias * cpi->twopass.section_intra_rating / 20;
+
+    // yx: bias less for large block sizes
+    if (cpi->common.txfm_mode != ONLY_4X4)
+      Bias >>= 1;
+
+    filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
+    filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step);
+
+    if ((filt_direction <= 0) && (filt_low != filt_mid)) {
+      // Get Low filter error score
+      vp9_set_alt_lf_level(cpi, filt_low);
+      vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);
+
+      filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
+
+      //  Re-instate the unfiltered frame
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+      if (cm->rtcd.flags & HAS_NEON)
+#endif
+      {
+        vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+      }
+#if CONFIG_RUNTIME_CPU_DETECT
+      else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+      {
+        vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+      }
+#endif
+
+      // If value is close to the best so far then bias towards a lower loop filter value.
+      if ((filt_err - Bias) < best_err) {
+        // Was it actually better than the previous best?
+        if (filt_err < best_err)
+          best_err = filt_err;
+
+        filt_best = filt_low;
+      }
+    }
+
+    // Now look at filt_high
+    if ((filt_direction >= 0) && (filt_high != filt_mid)) {
+      vp9_set_alt_lf_level(cpi, filt_high);
+      vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);
+
+      filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
+
+      //  Re-instate the unfiltered frame
+#if HAVE_ARMV7
+#if CONFIG_RUNTIME_CPU_DETECT
+      if (cm->rtcd.flags & HAS_NEON)
+#endif
+      {
+        vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+      }
+#if CONFIG_RUNTIME_CPU_DETECT
+      else
+#endif
+#endif
+#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT
+      {
+        vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+      }
+#endif
+
+      // Was it better than the previous best?
+      if (filt_err < (best_err - Bias)) {
+        best_err = filt_err;
+        filt_best = filt_high;
+      }
+    }
+
+    // Halve the step distance if the best filter value was the same as last time
+    if (filt_best == filt_mid) {
+      filter_step = filter_step / 2;
+      filt_direction = 0;
+    } else {
+      filt_direction = (filt_best < filt_mid) ? -1 : 1;
+      filt_mid = filt_best;
+    }
+  }
+
+  cm->filter_level = filt_best;
+}
+
--- /dev/null
+++ b/vp9/encoder/ppc/csystemdependent.c
@@ -1,0 +1,155 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/encoder/variance.h"
+#include "vp9/encoder/onyx_int.h"
+
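+// Function-pointer dispatch table for PPC: each pointer below is bound to
+// either a generic C routine or an AltiVec implementation in
+// vp9_cmachine_specific_config().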
+SADFunction *vp9_sad16x16;
+SADFunction *vp9_sad16x8;
+SADFunction *vp9_sad8x16;
+SADFunction *vp9_sad8x8;
+SADFunction *vp9_sad4x4;
+
+variance_function *vp9_variance4x4;
+variance_function *vp9_variance8x8;
+variance_function *vp9_variance8x16;
+variance_function *vp9_variance16x8;
+variance_function *vp9_variance16x16;
+
+variance_function *vp9_mse16x16;
+
+sub_pixel_variance_function *vp9_sub_pixel_variance4x4;
+sub_pixel_variance_function *vp9_sub_pixel_variance8x8;
+sub_pixel_variance_function *vp9_sub_pixel_variance8x16;
+sub_pixel_variance_function *vp9_sub_pixel_variance16x8;
+sub_pixel_variance_function *vp9_sub_pixel_variance16x16;
+
+int (*vp9_block_error)(short *coeff, short *dqcoeff);
+int (*vp9_mbblock_error)(MACROBLOCK *mb, int dc);
+
+int (*vp9_mbuverror)(MACROBLOCK *mb);
+unsigned int (*vp9_get_mb_ss)(short *);
+void (*vp9_short_fdct4x4)(short *input, short *output, int pitch);
+void (*vp9_short_fdct8x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
+void (*short_walsh4x4)(short *input, short *output, int pitch);
+
+void (*vp9_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
+void (*vp9_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
+void (*vp9_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
+
+// c imports
+extern int block_error_c(short *coeff, short *dqcoeff);
+extern int vp9_mbblock_error_c(MACROBLOCK *mb, int dc);
+
+extern int vp9_mbuverror_c(MACROBLOCK *mb);
+extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+extern void short_fdct4x4_c(short *input, short *output, int pitch);
+extern void short_fdct8x4_c(short *input, short *output, int pitch);
+extern void vp9_short_walsh4x4_c(short *input, short *output, int pitch);
+
+extern void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
+extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
+
+extern SADFunction sad16x16_c;
+extern SADFunction sad16x8_c;
+extern SADFunction sad8x16_c;
+extern SADFunction sad8x8_c;
+extern SADFunction sad4x4_c;
+
+extern variance_function variance16x16_c;
+extern variance_function variance8x16_c;
+extern variance_function variance16x8_c;
+extern variance_function variance8x8_c;
+extern variance_function variance4x4_c;
+extern variance_function mse16x16_c;
+
+extern sub_pixel_variance_function sub_pixel_variance4x4_c;
+extern sub_pixel_variance_function sub_pixel_variance8x8_c;
+extern sub_pixel_variance_function sub_pixel_variance8x16_c;
+extern sub_pixel_variance_function sub_pixel_variance16x8_c;
+extern sub_pixel_variance_function sub_pixel_variance16x16_c;
+
+extern unsigned int vp9_get_mb_ss_c(short *);
+
+// ppc
+extern int vp9_block_error_ppc(short *coeff, short *dqcoeff);
+
+extern void vp9_short_fdct4x4_ppc(short *input, short *output, int pitch);
+extern void vp9_short_fdct8x4_ppc(short *input, short *output, int pitch);
+
+extern void vp9_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void vp9_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+
+extern SADFunction vp9_sad16x16_ppc;
+extern SADFunction vp9_sad16x8_ppc;
+extern SADFunction vp9_sad8x16_ppc;
+extern SADFunction vp9_sad8x8_ppc;
+extern SADFunction vp9_sad4x4_ppc;
+
+extern variance_function vp9_variance16x16_ppc;
+extern variance_function vp9_variance8x16_ppc;
+extern variance_function vp9_variance16x8_ppc;
+extern variance_function vp9_variance8x8_ppc;
+extern variance_function vp9_variance4x4_ppc;
+extern variance_function vp9_mse16x16_ppc;
+
+extern sub_pixel_variance_function vp9_sub_pixel_variance4x4_ppc;
+extern sub_pixel_variance_function vp9_sub_pixel_variance8x8_ppc;
+extern sub_pixel_variance_function vp9_sub_pixel_variance8x16_ppc;
+extern sub_pixel_variance_function vp9_sub_pixel_variance16x8_ppc;
+extern sub_pixel_variance_function vp9_sub_pixel_variance16x16_ppc;
+
+extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+
+void vp9_cmachine_specific_config(void) {
+  // Mix of generic C and AltiVec implementations:
+  vp9_mbuverror               = vp9_mbuverror_c;
+  vp8_fast_quantize_b           = vp8_fast_quantize_b_c;
+  vp9_short_fdct4x4            = vp9_short_fdct4x4_ppc;
+  vp9_short_fdct8x4            = vp9_short_fdct8x4_ppc;
+  vp8_fast_fdct4x4             = vp9_short_fdct4x4_ppc;
+  vp8_fast_fdct8x4             = vp9_short_fdct8x4_ppc;
+  short_walsh4x4               = vp9_short_walsh4x4_c;
+
+  vp9_variance4x4             = vp9_variance4x4_ppc;
+  vp9_variance8x8             = vp9_variance8x8_ppc;
+  vp9_variance8x16            = vp9_variance8x16_ppc;
+  vp9_variance16x8            = vp9_variance16x8_ppc;
+  vp9_variance16x16           = vp9_variance16x16_ppc;
+  vp9_mse16x16                = vp9_mse16x16_ppc;
+
+  vp9_sub_pixel_variance4x4     = vp9_sub_pixel_variance4x4_ppc;
+  vp9_sub_pixel_variance8x8     = vp9_sub_pixel_variance8x8_ppc;
+  vp9_sub_pixel_variance8x16    = vp9_sub_pixel_variance8x16_ppc;
+  vp9_sub_pixel_variance16x8    = vp9_sub_pixel_variance16x8_ppc;
+  vp9_sub_pixel_variance16x16   = vp9_sub_pixel_variance16x16_ppc;
+
+  vp9_get_mb_ss                 = vp9_get_mb_ss_c;
+
+  vp9_sad16x16                = vp9_sad16x16_ppc;
+  vp9_sad16x8                 = vp9_sad16x8_ppc;
+  vp9_sad8x16                 = vp9_sad8x16_ppc;
+  vp9_sad8x8                  = vp9_sad8x8_ppc;
+  vp9_sad4x4                  = vp9_sad4x4_ppc;
+
+  vp9_block_error              = vp9_block_error_ppc;
+  vp9_mbblock_error            = vp9_mbblock_error_c;
+
+  vp9_subtract_b               = vp9_subtract_b_c;
+  vp9_subtract_mby             = vp9_subtract_mby_ppc;
+  vp9_subtract_mbuv            = vp9_subtract_mbuv_ppc;
+}
--- /dev/null
+++ b/vp9/encoder/ppc/encodemb_altivec.asm
@@ -1,0 +1,153 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl vp8_subtract_mbuv_ppc
+    .globl vp8_subtract_mby_ppc
+
+;# r3 short *diff
+;# r4 unsigned char *usrc
+;# r5 unsigned char *vsrc
+;# r6 unsigned char *pred
+;# r7 int stride
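+;# Computes diff = src - pred for the two 8x8 chroma planes (U, then V);
+;# each loop iteration below handles two rows of eight pixels.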
+vp8_subtract_mbuv_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xf000
+    mtspr   256, r12            ;# set VRSAVE
+
+    li      r9, 256
+    add     r3, r3, r9
+    add     r3, r3, r9
+    add     r6, r6, r9
+
+    li      r10, 16
+    li      r9,  4
+    mtctr   r9
+
+    vspltisw v0, 0
+
+mbu_loop:
+    lvsl    v5, 0, r4           ;# permute value for alignment
+    lvx     v1, 0, r4           ;# src
+    lvx     v2, 0, r6           ;# pred
+
+    add     r4, r4, r7
+    addi    r6, r6, 16
+
+    vperm   v1, v1, v0, v5
+
+    vmrghb  v3, v0, v1          ;# unpack high src  to short
+    vmrghb  v4, v0, v2          ;# unpack high pred to short
+
+    lvsl    v5, 0, r4           ;# permute value for alignment
+    lvx     v1, 0, r4           ;# src
+
+    add     r4, r4, r7
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, 0, r3           ;# store out diff
+
+    vperm   v1, v1, v0, v5
+
+    vmrghb  v3, v0, v1          ;# unpack high src  to short
+    vmrglb  v4, v0, v2          ;# unpack low pred to short
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, r10, r3         ;# store out diff
+
+    addi    r3, r3, 32
+
+    bdnz    mbu_loop
+
+    mtctr   r9
+
+mbv_loop:
+    lvsl    v5, 0, r5           ;# permute value for alignment
+    lvx     v1, 0, r5           ;# src
+    lvx     v2, 0, r6           ;# pred
+
+    add     r5, r5, r7
+    addi    r6, r6, 16
+
+    vperm   v1, v1, v0, v5
+
+    vmrghb  v3, v0, v1          ;# unpack high src  to short
+    vmrghb  v4, v0, v2          ;# unpack high pred to short
+
+    lvsl    v5, 0, r5           ;# permutate value for alignment
+    lvx     v1, 0, r5           ;# src
+
+    add     r5, r5, r7
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, 0, r3           ;# store out diff
+
+    vperm   v1, v1, v0, v5
+
+    vmrghb  v3, v0, v1          ;# unpack high src  to short
+    vmrglb  v4, v0, v2          ;# unpack low pred to short
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, r10, r3         ;# store out diff
+
+    addi    r3, r3, 32
+
+    bdnz    mbv_loop
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+;# r3 short *diff
+;# r4 unsigned char *src
+;# r5 unsigned char *pred
+;# r6 int stride
+vp8_subtract_mby_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xf800
+    mtspr   256, r12            ;# set VRSAVE
+
+    li      r10, 16
+    mtctr   r10
+
+    vspltisw v0, 0
+
+mby_loop:
+    lvx     v1, 0, r4           ;# src
+    lvx     v2, 0, r5           ;# pred
+
+    add     r4, r4, r6
+    addi    r5, r5, 16
+
+    vmrghb  v3, v0, v1          ;# unpack high src  to short
+    vmrghb  v4, v0, v2          ;# unpack high pred to short
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, 0, r3           ;# store out diff
+
+    vmrglb  v3, v0, v1          ;# unpack low src  to short
+    vmrglb  v4, v0, v2          ;# unpack low pred to short
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, r10, r3         ;# store out diff
+
+    addi    r3, r3, 32
+
+    bdnz    mby_loop
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
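
Both subtract routines above compute the per-pixel difference diff = src - pred into a 16-bit buffer. As a reference point, here is a minimal scalar sketch of the luma case; the helper name is illustrative, and the packed 16-byte prediction rows are inferred from the addi updates in the code:

    /* Scalar sketch of the luma subtract: 16x16 block, prediction rows
     * packed 16 bytes apart as the register updates above imply. */
    void subtract_mby_ref(short *diff, const unsigned char *src,
                          const unsigned char *pred, int stride) {
      int r, c;
      for (r = 0; r < 16; r++) {
        for (c = 0; c < 16; c++)
          diff[c] = (short)(src[c] - pred[c]);   /* fits in 16 bits */
        diff += 16;
        src += stride;
        pred += 16;
      }
    }
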
--- /dev/null
+++ b/vp9/encoder/ppc/fdct_altivec.asm
@@ -1,0 +1,205 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl vp8_short_fdct4x4_ppc
+    .globl vp8_short_fdct8x4_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+    lis     \R0, \LABEL@ha
+    la      \R1, \LABEL@l(\R0)
+    lvx     \V, \OFF, \R1
+.endm
+
+;# Forward and inverse DCTs are nearly identical; the only differences
+;#   are in normalization (fwd is twice unitary, inv is half unitary)
+;#   and that they are of course transposes of each other.
+;#
+;#   The following three macros accomplish most of the implementation
+;#   and are used only by ppc_idct.c and ppc_fdct.c.
+.macro prologue
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xfffc
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    li      r6, 16
+
+    load_c v0, dct_tab, 0, r9, r10
+    lvx     v1,   r6, r10
+    addi    r10, r10, 32
+    lvx     v2,    0, r10
+    lvx     v3,   r6, r10
+
+    load_c v4, ppc_dctperm_tab,  0, r9, r10
+    load_c v5, ppc_dctperm_tab, r6, r9, r10
+
+    load_c v6, round_tab, 0, r10, r9
+.endm
+
+.macro epilogue
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+.endm
+
+;# Do horiz xf on two rows of coeffs  v8 = a0 a1 a2 a3  b0 b1 b2 b3.
+;#   a/A are the even rows 0,2   b/B are the odd rows 1,3
+;#   For fwd transform, indices are horizontal positions, then frequencies.
+;#   For inverse transform, frequencies then positions.
+;#   The two resulting  A0..A3  B0..B3  are later combined
+;#   and vertically transformed.
+
+.macro two_rows_horiz Dst
+    vperm   v9, v8, v8, v4      ;# v9 = a2 a3 a0 a1  b2 b3 b0 b1
+
+    vmsumshm v10, v0, v8, v6
+    vmsumshm v10, v1, v9, v10
+    vsraw   v10, v10, v7        ;# v10 = A0 A1  B0 B1
+
+    vmsumshm v11, v2, v8, v6
+    vmsumshm v11, v3, v9, v11
+    vsraw   v11, v11, v7        ;# v11 = A2 A3  B2 B3
+
+    vpkuwum v10, v10, v11       ;# v10  = A0 A1  B0 B1  A2 A3  B2 B3
+    vperm   \Dst, v10, v10, v5  ;# Dest = A0 B0  A1 B1  A2 B2  A3 B3
+.endm
+
+;# Vertical xf on two rows. DCT values in comments are for inverse transform;
+;#   forward transform uses transpose.
+
+.macro two_rows_vert Ceven, Codd
+    vspltw  v8, \Ceven, 0       ;# v8 = c00 c10  or  c02 c12 four times
+    vspltw  v9, \Codd,  0       ;# v9 = c20 c30  or  c22 c32 ""
+    vmsumshm v8, v8, v12, v6
+    vmsumshm v8, v9, v13, v8
+    vsraw   v10, v8, v7
+
+    vspltw  v8, \Codd,  1       ;# v8 = c01 c11  or  c03 c13
+    vspltw  v9, \Ceven, 1       ;# v9 = c21 c31  or  c23 c33
+    vmsumshm v8, v8, v12, v6
+    vmsumshm v8, v9, v13, v8
+    vsraw   v8, v8, v7
+
+    vpkuwum v8, v10, v8         ;# v8 = rows 0,1  or 2,3
+.endm
+
+.macro two_rows_h Dest
+    stw     r0,  0(r8)
+    lwz     r0,  4(r3)
+    stw     r0,  4(r8)
+    lwzux   r0, r3,r5
+    stw     r0,  8(r8)
+    lwz     r0,  4(r3)
+    stw     r0, 12(r8)
+    lvx     v8,  0,r8
+    two_rows_horiz \Dest
+.endm
+
+    .align 2
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+vp8_short_fdct4x4_ppc:
+
+    prologue
+
+    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
+    addi    r8, r1, 0
+
+
+    lwz     r0, 0(r3)
+    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
+
+    lwzux   r0, r3, r5
+    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
+
+    lvx     v6, r6, r9          ;# v6 = Vround
+    vspltisw v7, -16            ;# shift of 16 (-16 == 16 in the low 5 bits)
+
+    two_rows_vert v0, v1
+    stvx    v8, 0, r4
+    two_rows_vert v2, v3
+    stvx    v8, r6, r4
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+vp8_short_fdct8x4_ppc:
+    prologue
+
+    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
+    addi    r8,  r1, 0
+    addi    r10, r3, 0
+
+    lwz     r0, 0(r3)
+    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
+
+    lwzux   r0, r3, r5
+    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
+
+    lvx     v6, r6, r9          ;# v6 = Vround
+    vspltisw v7, -16            ;# shift of 16 (-16 == 16 in the low 5 bits)
+
+    two_rows_vert v0, v1
+    stvx    v8, 0, r4
+    two_rows_vert v2, v3
+    stvx    v8, r6, r4
+
+    ;# Next block
+    addi    r3, r10, 8
+    addi    r4, r4, 32
+    lvx     v6, 0, r9           ;# v6 = Hround
+
+    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
+    addi    r8, r1, 0
+
+    lwz     r0, 0(r3)
+    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
+
+    lwzux   r0, r3, r5
+    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
+
+    lvx     v6, r6, r9          ;# v6 = Vround
+    vspltisw v7, -16            ;# shift of 16 (-16 == 16 in the low 5 bits)
+
+    two_rows_vert v0, v1
+    stvx    v8, 0, r4
+    two_rows_vert v2, v3
+    stvx    v8, r6, r4
+
+    epilogue
+
+    blr
+
+    .data
+    .align 4
+ppc_dctperm_tab:
+    .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
+    .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
+
+    .align 4
+dct_tab:
+    .short  23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
+    .short  23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
+
+    .short  23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
+    .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
+
+    .align 4
+round_tab:
+    .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
+    .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
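
The Q15 constants in dct_tab are the standard 4-point DCT factors, and the two round_tab rows are the round-to-nearest biases that pair with the shift-by-14 horizontal pass and the shift-by-16 vertical pass above. A small sketch showing where the numbers come from (values computed here, not quoted from the patch):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      /* Q15 DCT factors appearing in dct_tab above */
      printf("%.0f\n", round(cos(M_PI / 4) * 32768));  /* 23170 */
      printf("%.0f\n", round(cos(M_PI / 8) * 32768));  /* 30274 */
      printf("%.0f\n", round(sin(M_PI / 8) * 32768));  /* 12540 */
      /* round_tab entries: 1 << (shift - 1) for shifts 14 and 16 */
      printf("%d %d\n", 1 << 13, 1 << 15);             /* 8192 32768 */
      return 0;
    }
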
--- /dev/null
+++ b/vp9/encoder/ppc/rdopt_altivec.asm
@@ -1,0 +1,51 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl vp8_block_error_ppc
+
+    .align 2
+;# r3 short *Coeff
+;# r4 short *dqcoeff
+vp8_block_error_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xf800
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    stw     r5, 12(r1)          ;# transfer dc to vector register
+
+    lvx     v0, 0, r3           ;# Coeff
+    lvx     v1, 0, r4           ;# dqcoeff
+
+    li      r10, 16
+
+    vspltisw v3, 0
+
+    vsubshs v0, v0, v1
+
+    vmsumshm v2, v0, v0, v3     ;# multiply differences
+
+    lvx     v0, r10, r3         ;# Coeff
+    lvx     v1, r10, r4         ;# dqcoeff
+
+    vsubshs v0, v0, v1
+
+    vmsumshm v1, v0, v0, v2     ;# multiply differences
+    vsumsws v1, v1, v3          ;# sum up
+
+    stvx    v1, 0, r1
+    lwz     r3, 12(r1)          ;# return value
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
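
The routine above sums squared differences between the original and dequantized coefficients over one 4x4 block (two 8-short vectors). A scalar sketch; like this loop, the vector code accumulates the products in 32 bits:

    /* Scalar sketch of the block error: SSE over 16 coefficients. */
    int block_error_ref(const short *coeff, const short *dqcoeff) {
      int i, err = 0;
      for (i = 0; i < 16; i++) {
        int d = coeff[i] - dqcoeff[i];
        err += d * d;
      }
      return err;
    }
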
--- /dev/null
+++ b/vp9/encoder/ppc/sad_altivec.asm
@@ -1,0 +1,277 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl vp8_sad16x16_ppc
+    .globl vp8_sad16x8_ppc
+    .globl vp8_sad8x16_ppc
+    .globl vp8_sad8x8_ppc
+    .globl vp8_sad4x4_ppc
+
+.macro load_aligned_16 V R O
+    lvsl    v3,  0, \R          ;# permutate value for alignment
+
+    lvx     v1,  0, \R
+    lvx     v2, \O, \R
+
+    vperm   \V, v1, v2, v3
+.endm
+
+.macro prologue
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffc0
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1, -32(r1)         ;# create space on the stack
+
+    li      r10, 16             ;# load offset and loop counter
+
+    vspltisw v8, 0              ;# zero out total to start
+.endm
+
+.macro epilogue
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+.endm
+
+.macro SAD_16
+    ;# v6 = abs (v4 - v5)
+    vsububs v6, v4, v5
+    vsububs v7, v5, v4
+    vor     v6, v6, v7
+
+    ;# v8 += abs (v4 - v5)
+    vsum4ubs v8, v6, v8
+.endm
+
+.macro sad_16_loop loop_label
+    lvsl    v3,  0, r5          ;# only needs to be done once per block
+
+    ;# preload a line of data before getting into the loop
+    lvx     v4, 0, r3
+    lvx     v1,  0, r5
+    lvx     v2, r10, r5
+
+    add     r5, r5, r6
+    add     r3, r3, r4
+
+    vperm   v5, v1, v2, v3
+
+    .align 4
+\loop_label:
+    ;# compute difference on first row
+    vsububs v6, v4, v5
+    vsububs v7, v5, v4
+
+    ;# load up next set of data
+    lvx     v9, 0, r3
+    lvx     v1,  0, r5
+    lvx     v2, r10, r5
+
+    ;# perform abs() of difference
+    vor     v6, v6, v7
+    add     r3, r3, r4
+
+    ;# add to the running tally
+    vsum4ubs v8, v6, v8
+
+    ;# now onto the next line
+    vperm   v5, v1, v2, v3
+    add     r5, r5, r6
+    lvx     v4, 0, r3
+
+    ;# compute difference on second row
+    vsububs v6, v9, v5
+    lvx     v1,  0, r5
+    vsububs v7, v5, v9
+    lvx     v2, r10, r5
+    vor     v6, v6, v7
+    add     r3, r3, r4
+    vsum4ubs v8, v6, v8
+    vperm   v5, v1, v2, v3
+    add     r5, r5, r6
+
+    bdnz    \loop_label
+
+    vspltisw v7, 0
+
+    vsumsws v8, v8, v7
+
+    stvx    v8, 0, r1
+    lwz     r3, 12(r1)
+.endm
+
+.macro sad_8_loop loop_label
+    .align 4
+\loop_label:
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v4, r3, r10
+    load_aligned_16 v5, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v6, r3, r10
+    load_aligned_16 v7, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    vmrghb  v4, v4, v6
+    vmrghb  v5, v5, v7
+
+    SAD_16
+
+    bdnz    \loop_label
+
+    vspltisw v7, 0
+
+    vsumsws v8, v8, v7
+
+    stvx    v8, 0, r1
+    lwz     r3, 12(r1)
+.endm
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  ref_stride
+;#
+;# r3 return value
+vp8_sad16x16_ppc:
+
+    prologue
+
+    li      r9, 8
+    mtctr   r9
+
+    sad_16_loop sad16x16_loop
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  ref_stride
+;#
+;# r3 return value
+vp8_sad16x8_ppc:
+
+    prologue
+
+    li      r9, 4
+    mtctr   r9
+
+    sad_16_loop sad16x8_loop
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  ref_stride
+;#
+;# r3 return value
+vp8_sad8x16_ppc:
+
+    prologue
+
+    li      r9, 8
+    mtctr   r9
+
+    sad_8_loop sad8x16_loop
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  ref_stride
+;#
+;# r3 return value
+vp8_sad8x8_ppc:
+
+    prologue
+
+    li      r9, 4
+    mtctr   r9
+
+    sad_8_loop sad8x8_loop
+
+    epilogue
+
+    blr
+
+.macro transfer_4x4 I P
+    lwz     r0, 0(\I)
+    add     \I, \I, \P
+
+    lwz     r7, 0(\I)
+    add     \I, \I, \P
+
+    lwz     r8, 0(\I)
+    add     \I, \I, \P
+
+    lwz     r9, 0(\I)
+
+    stw     r0,  0(r1)
+    stw     r7,  4(r1)
+    stw     r8,  8(r1)
+    stw     r9, 12(r1)
+.endm
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  ref_stride
+;#
+;# r3 return value
+vp8_sad4x4_ppc:
+
+    prologue
+
+    transfer_4x4 r3, r4
+    lvx     v4, 0, r1
+
+    transfer_4x4 r5, r6
+    lvx     v5, 0, r1
+
+    vspltisw v8, 0              ;# zero out total to start
+
+    ;# v6 = abs (v4 - v5)
+    vsububs v6, v4, v5
+    vsububs v7, v5, v4
+    vor     v6, v6, v7
+
+    ;# v8 += abs (v4 - v5)
+    vsum4ubs v7, v6, v8
+    vsumsws v7, v7, v8
+
+    stvx    v7, 0, r1
+    lwz     r3, 12(r1)
+
+    epilogue
+
+    blr
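
All five SAD entry points above compute the same quantity at different block sizes; only the loop counts and the 8- versus 16-wide inner step differ. A scalar sketch, with the block dimensions passed explicitly for illustration:

    /* Scalar sketch of the SAD routines: sum of absolute differences. */
    unsigned int sad_ref(const unsigned char *src, int src_stride,
                         const unsigned char *ref, int ref_stride,
                         int w, int h) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < h; r++) {
        for (c = 0; c < w; c++) {
          int d = src[c] - ref[c];
          sad += (d < 0) ? -d : d;
        }
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }
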
--- /dev/null
+++ b/vp9/encoder/ppc/variance_altivec.asm
@@ -1,0 +1,375 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl vp8_get8x8var_ppc
+    .globl vp8_get16x16var_ppc
+    .globl vp8_mse16x16_ppc
+    .globl vp9_variance16x16_ppc
+    .globl vp9_variance16x8_ppc
+    .globl vp9_variance8x16_ppc
+    .globl vp9_variance8x8_ppc
+    .globl vp9_variance4x4_ppc
+
+.macro load_aligned_16 V R O
+    lvsl    v3,  0, \R          ;# permutate value for alignment
+
+    lvx     v1,  0, \R
+    lvx     v2, \O, \R
+
+    vperm   \V, v1, v2, v3
+.endm
+
+.macro prologue
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffc0
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1, -32(r1)         ;# create space on the stack
+
+    li      r10, 16             ;# load offset and loop counter
+
+    vspltisw v7, 0              ;# zero for merging
+    vspltisw v8, 0              ;# zero out total to start
+    vspltisw v9, 0              ;# zero out total for dif^2
+.endm
+
+.macro epilogue
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+.endm
+
+.macro compute_sum_sse
+    ;# Compute sum first.  Unpack so a signed subtract
+    ;#  can be used; only a halfword signed subtract
+    ;#  is available.  Do high, then low.
+    vmrghb  v2, v7, v4
+    vmrghb  v3, v7, v5
+    vsubshs v2, v2, v3
+    vsum4shs v8, v2, v8
+
+    vmrglb  v2, v7, v4
+    vmrglb  v3, v7, v5
+    vsubshs v2, v2, v3
+    vsum4shs v8, v2, v8
+
+    ;# Now compute sse.
+    vsububs v2, v4, v5
+    vsububs v3, v5, v4
+    vor     v2, v2, v3
+
+    vmsumubm v9, v2, v2, v9
+.endm
+
+.macro variance_16 DS loop_label store_sum
+\loop_label:
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v4, r3, r10
+    load_aligned_16 v5, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    compute_sum_sse
+
+    bdnz    \loop_label
+
+    vsumsws v8, v8, v7
+    vsumsws v9, v9, v7
+
+    stvx    v8, 0, r1
+    lwz     r3, 12(r1)
+
+    stvx    v9, 0, r1
+    lwz     r4, 12(r1)
+
+.if \store_sum
+    stw     r3, 0(r8)           ;# sum
+.endif
+    stw     r4, 0(r7)           ;# sse
+
+    mullw   r3, r3, r3          ;# sum*sum
+    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
+    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
+.endm
+
+.macro variance_8 DS loop_label store_sum
+\loop_label:
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v4, r3, r10
+    load_aligned_16 v5, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v6, r3, r10
+    load_aligned_16 v0, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    vmrghb  v4, v4, v6
+    vmrghb  v5, v5, v0
+
+    compute_sum_sse
+
+    bdnz    \loop_label
+
+    vsumsws v8, v8, v7
+    vsumsws v9, v9, v7
+
+    stvx    v8, 0, r1
+    lwz     r3, 12(r1)
+
+    stvx    v9, 0, r1
+    lwz     r4, 12(r1)
+
+.if \store_sum
+    stw     r3, 0(r8)           ;# sum
+.endif
+    stw     r4, 0(r7)           ;# sse
+
+    mullw   r3, r3, r3          ;# sum*sum
+    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
+    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
+.endm
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get8x8var_ppc:
+
+    prologue
+
+    li      r9, 4
+    mtctr   r9
+
+    variance_8 6, get8x8var_loop, 1
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get16x16var_ppc:
+
+    prologue
+
+    mtctr   r10
+
+    variance_16 8, get16x16var_loop, 1
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_mse16x16_ppc:
+    prologue
+
+    mtctr   r10
+
+mse16x16_loop:
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v4, r3, r10
+    load_aligned_16 v5, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    ;# Now compute sse.
+    vsububs v2, v4, v5
+    vsububs v3, v5, v4
+    vor     v2, v2, v3
+
+    vmsumubm v9, v2, v2, v9
+
+    bdnz    mse16x16_loop
+
+    vsumsws v9, v9, v7
+
+    stvx    v9, 0, r1
+    lwz     r3, 12(r1)
+
+    stw     r3, 0(r7)           ;# sse
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance16x16_ppc:
+
+    prologue
+
+    mtctr   r10
+
+    variance_16 8, variance16x16_loop, 0
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance16x8_ppc:
+
+    prologue
+
+    li      r9, 8
+    mtctr   r9
+
+    variance_16 7, variance16x8_loop, 0
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance8x16_ppc:
+
+    prologue
+
+    li      r9, 8
+    mtctr   r9
+
+    variance_8 7, variance8x16_loop, 0
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance8x8_ppc:
+
+    prologue
+
+    li      r9, 4
+    mtctr   r9
+
+    variance_8 6, variance8x8_loop, 0
+
+    epilogue
+
+    blr
+
+.macro transfer_4x4 I P
+    lwz     r0, 0(\I)
+    add     \I, \I, \P
+
+    lwz     r10,0(\I)
+    add     \I, \I, \P
+
+    lwz     r8, 0(\I)
+    add     \I, \I, \P
+
+    lwz     r9, 0(\I)
+
+    stw     r0,  0(r1)
+    stw     r10, 4(r1)
+    stw     r8,  8(r1)
+    stw     r9, 12(r1)
+.endm
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp9_variance4x4_ppc:
+
+    prologue
+
+    transfer_4x4 r3, r4
+    lvx     v4, 0, r1
+
+    transfer_4x4 r5, r6
+    lvx     v5, 0, r1
+
+    compute_sum_sse
+
+    vsumsws v8, v8, v7
+    vsumsws v9, v9, v7
+
+    stvx    v8, 0, r1
+    lwz     r3, 12(r1)
+
+    stvx    v9, 0, r1
+    lwz     r4, 12(r1)
+
+    stw     r4, 0(r7)           ;# sse
+
+    mullw   r3, r3, r3          ;# sum*sum
+    srawi   r3, r3, 4           ;# (sum*sum) >> 4
+    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)
+
+    epilogue
+
+    blr
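
The DS parameter threaded through the macros above is log2 of the block's pixel count (4 for 4x4, 6 for 8x8, 7 for 8x16 and 16x8, 8 for 16x16), so each routine returns sse - sum*sum/N. A scalar sketch of the same arithmetic:

    #include <stdint.h>

    /* Scalar sketch of the variance_* macros: returns SSE - sum^2/N,
     * with N = 1 << ds pixels and ds = log2(w * h).  (The AltiVec code
     * does the sum*sum multiply in 32 bits via mullw.) */
    unsigned int variance_ref(const unsigned char *src, int src_stride,
                              const unsigned char *ref, int ref_stride,
                              int w, int h, int ds, unsigned int *sse) {
      int sum = 0, r, c;
      unsigned int s2 = 0;
      for (r = 0; r < h; r++) {
        for (c = 0; c < w; c++) {
          int d = src[c] - ref[c];
          sum += d;
          s2 += d * d;
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = s2;
      return s2 - (unsigned int)(((int64_t)sum * sum) >> ds);
    }
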
--- /dev/null
+++ b/vp9/encoder/ppc/variance_subpixel_altivec.asm
@@ -1,0 +1,865 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    .globl vp9_sub_pixel_variance4x4_ppc
+    .globl vp9_sub_pixel_variance8x8_ppc
+    .globl vp9_sub_pixel_variance8x16_ppc
+    .globl vp9_sub_pixel_variance16x8_ppc
+    .globl vp9_sub_pixel_variance16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+    lis     \R0, \LABEL@ha
+    la      \R1, \LABEL@l(\R0)
+    lvx     \V, \OFF, \R1
+.endm
+
+.macro load_vfilter V0, V1
+    load_c \V0, vfilter_b, r6, r12, r10
+
+    addi    r6,  r6, 16
+    lvx     \V1, r6, r10
+.endm
+
+.macro HProlog jump_label
+    ;# load up horizontal filter
+    slwi.   r5, r5, 4           ;# index into horizontal filter array
+
+    ;# index to the next set of vectors in the row.
+    li      r10, 16
+
+    ;# downshift by 7 (divide by 128) at the end
+    vspltish v19, 7
+
+    ;# If there isn't any filtering to be done for the horizontal, then
+    ;#  just skip to the second pass.
+    beq     \jump_label
+
+    load_c v20, hfilter_b, r5, r12, r0
+
+    ;# setup constants
+    ;# v28 permutation value for output ordering
+    load_c v28, b_hperm_b, 0, r12, r0
+
+    ;# index to the next set of vectors in the row.
+    li      r12, 32
+
+    ;# rounding added in on the multiply
+    vspltisw v21, 8
+    vspltisw v18, 3
+    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040
+
+    slwi.   r6, r6, 5           ;# index into vertical filter array
+.endm
+
+;# Filters a horizontal line
+;# expects:
+;#  r3  src_ptr
+;#  r4  pitch
+;#  r10 16
+;#  r12 32
+;#  v17 perm input
+;#  v18 rounding
+;#  v19 shift
+;#  v20 filter taps
+;#  v21 tmp
+;#  v22 tmp
+;#  v23 tmp
+;#  v24 tmp
+;#  v25 tmp
+;#  v26 tmp
+;#  v27 tmp
+;#  v28 perm output
+;#
+
+.macro hfilter_8 V, hp, lp, increment_counter
+    lvsl    v17,  0, r3         ;# permutate value for alignment
+
+    ;# input to filter is 9 bytes wide, output is 8 bytes.
+    lvx     v21,   0, r3
+    lvx     v22, r10, r3
+
+.if \increment_counter
+    add     r3, r3, r4
+.endif
+    vperm   v21, v21, v22, v17
+
+    vperm   v24, v21, v21, \hp  ;# v24 = 0123 1234 2345 3456
+    vperm   v25, v21, v21, \lp  ;# v25 = 4567 5678 6789 789A
+
+    vmsummbm v24, v20, v24, v18
+    vmsummbm v25, v20, v25, v18
+
+    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+
+    vsrh    v24, v24, v19       ;# divide v24 by 128
+
+    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
+.endm
+
+.macro vfilter_16 P0 P1
+    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
+    vadduhm v22, v18, v22
+    vmuloub v23, \P0, v20
+    vadduhm v23, v18, v23
+
+    vmuleub v24, \P1, v21
+    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
+    vmuloub v25, \P1, v21
+    vadduhm v23, v23, v25       ;# Ro = odds
+
+    vsrh    v22, v22, v19       ;# divide by 128
+    vsrh    v23, v23, v19       ;# v22 v23 = evens, odds
+    vmrghh  \P0, v22, v23       ;# 16-bit result in order
+    vmrglh  v23, v22, v23
+    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
+.endm
+
+.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
+    ;# Compute sum first.  Unpack so a signed subtract
+    ;#  can be used; only a halfword signed subtract
+    ;#  is available.  Do high, then low.
+    vmrghb  \t1, \z0, \src
+    vmrghb  \t2, \z0, \ref
+    vsubshs \t1, \t1, \t2
+    vsum4shs \sum, \t1, \sum
+
+    vmrglb  \t1, \z0, \src
+    vmrglb  \t2, \z0, \ref
+    vsubshs \t1, \t1, \t2
+    vsum4shs \sum, \t1, \sum
+
+    ;# Now compute sse.
+    vsububs \t1, \src, \ref
+    vsububs \t2, \ref, \src
+    vor     \t1, \t1, \t2
+
+    vmsumubm \sse, \t1, \t1, \sse
+.endm
+
+.macro variance_final sum, sse, z0, DS
+    vsumsws \sum, \sum, \z0
+    vsumsws \sse, \sse, \z0
+
+    stvx    \sum, 0, r1
+    lwz     r3, 12(r1)
+
+    stvx    \sse, 0, r1
+    lwz     r4, 12(r1)
+
+    stw     r4, 0(r9)           ;# sse
+
+    mullw   r3, r3, r3          ;# sum*sum
+    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
+    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
+.endm
+
+.macro compute_sum_sse_16 V, increment_counter
+    load_and_align_16  v16, r7, r8, \increment_counter
+    compute_sum_sse \V, v16, v18, v19, v20, v21, v23
+.endm
+
+.macro load_and_align_16 V, R, P, increment_counter
+    lvsl    v17,  0, \R         ;# permutate value for alignment
+
+    ;# loads a 16-byte row; the input can span two vectors
+    ;#  if not aligned correctly.
+    lvx     v21,   0, \R
+    lvx     v22, r10, \R
+
+.if \increment_counter
+    add     \R, \R, \P
+.endif
+
+    vperm   \V, v21, v22, v17
+.endm
+
+    .align 2
+;# r3 unsigned char  *src_ptr
+;# r4 int  src_pixels_per_line
+;# r5 int  xoffset
+;# r6 int  yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp9_sub_pixel_variance4x4_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xf830
+    ori     r12, r12, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    HProlog second_pass_4x4_pre_copy_b
+
+    ;# Load up permutation constants
+    load_c v10, b_0123_b, 0, r12, r0
+    load_c v11, b_4567_b, 0, r12, r0
+
+    hfilter_8 v0, v10, v11, 1
+    hfilter_8 v1, v10, v11, 1
+    hfilter_8 v2, v10, v11, 1
+    hfilter_8 v3, v10, v11, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     compute_sum_sse_4x4_b
+
+    hfilter_8 v4, v10, v11, 0
+
+    b   second_pass_4x4_b
+
+second_pass_4x4_pre_copy_b:
+    slwi    r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_16 v0, r3, r4, 1
+    load_and_align_16 v1, r3, r4, 1
+    load_and_align_16 v2, r3, r4, 1
+    load_and_align_16 v3, r3, r4, 1
+    load_and_align_16 v4, r3, r4, 0
+
+second_pass_4x4_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+
+compute_sum_sse_4x4_b:
+    vspltish v18, 0             ;# sum
+    vspltish v19, 0             ;# sse
+    vspltish v23, 0             ;# unpack
+    li      r10, 16
+
+    load_and_align_16 v4, r7, r8, 1
+    load_and_align_16 v5, r7, r8, 1
+    load_and_align_16 v6, r7, r8, 1
+    load_and_align_16 v7, r7, r8, 1
+
+    vmrghb  v0, v0, v1
+    vmrghb  v1, v2, v3
+
+    vmrghb  v2, v4, v5
+    vmrghb  v3, v6, v7
+
+    load_c v10, b_hilo_b, 0, r12, r0
+
+    vperm   v0, v0, v1, v10
+    vperm   v1, v2, v3, v10
+
+    compute_sum_sse v0, v1, v18, v19, v20, v21, v23
+
+    variance_final v18, v19, v23, 4
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;# r3 unsigned char  *src_ptr
+;# r4 int  src_pixels_per_line
+;# r5 int  xoffset
+;# r6 int  yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp9_sub_pixel_variance8x8_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xfff0
+    ori     r12, r12, 0xffff
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    HProlog second_pass_8x8_pre_copy_b
+
+    ;# Load up permutation constants
+    load_c v10, b_0123_b, 0, r12, r0
+    load_c v11, b_4567_b, 0, r12, r0
+
+    hfilter_8 v0, v10, v11, 1
+    hfilter_8 v1, v10, v11, 1
+    hfilter_8 v2, v10, v11, 1
+    hfilter_8 v3, v10, v11, 1
+    hfilter_8 v4, v10, v11, 1
+    hfilter_8 v5, v10, v11, 1
+    hfilter_8 v6, v10, v11, 1
+    hfilter_8 v7, v10, v11, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     compute_sum_sse_8x8_b
+
+    hfilter_8 v8, v10, v11, 0
+
+    b   second_pass_8x8_b
+
+second_pass_8x8_pre_copy_b:
+    slwi.   r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_16 v0, r3, r4, 1
+    load_and_align_16 v1, r3, r4, 1
+    load_and_align_16 v2, r3, r4, 1
+    load_and_align_16 v3, r3, r4, 1
+    load_and_align_16 v4, r3, r4, 1
+    load_and_align_16 v5, r3, r4, 1
+    load_and_align_16 v6, r3, r4, 1
+    load_and_align_16 v7, r3, r4, 1
+    load_and_align_16 v8, r3, r4, 0
+
+    beq     compute_sum_sse_8x8_b
+
+second_pass_8x8_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0, v1
+    vfilter_16 v1, v2
+    vfilter_16 v2, v3
+    vfilter_16 v3, v4
+    vfilter_16 v4, v5
+    vfilter_16 v5, v6
+    vfilter_16 v6, v7
+    vfilter_16 v7, v8
+
+compute_sum_sse_8x8_b:
+    vspltish v18, 0             ;# sum
+    vspltish v19, 0             ;# sse
+    vspltish v23, 0             ;# unpack
+    li      r10, 16
+
+    vmrghb  v0, v0, v1
+    vmrghb  v1, v2, v3
+    vmrghb  v2, v4, v5
+    vmrghb  v3, v6, v7
+
+    load_and_align_16 v4,  r7, r8, 1
+    load_and_align_16 v5,  r7, r8, 1
+    load_and_align_16 v6,  r7, r8, 1
+    load_and_align_16 v7,  r7, r8, 1
+    load_and_align_16 v8,  r7, r8, 1
+    load_and_align_16 v9,  r7, r8, 1
+    load_and_align_16 v10, r7, r8, 1
+    load_and_align_16 v11, r7, r8, 0
+
+    vmrghb  v4, v4,  v5
+    vmrghb  v5, v6,  v7
+    vmrghb  v6, v8,  v9
+    vmrghb  v7, v10, v11
+
+    compute_sum_sse v0, v4, v18, v19, v20, v21, v23
+    compute_sum_sse v1, v5, v18, v19, v20, v21, v23
+    compute_sum_sse v2, v6, v18, v19, v20, v21, v23
+    compute_sum_sse v3, v7, v18, v19, v20, v21, v23
+
+    variance_final v18, v19, v23, 6
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+    blr
+
+    .align 2
+;# r3 unsigned char  *src_ptr
+;# r4 int  src_pixels_per_line
+;# r5 int  xoffset
+;# r6 int  yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp9_sub_pixel_variance8x16_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xfffc
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    HProlog second_pass_8x16_pre_copy_b
+
+    ;# Load up permutation constants
+    load_c v29, b_0123_b, 0, r12, r0
+    load_c v30, b_4567_b, 0, r12, r0
+
+    hfilter_8 v0,  v29, v30, 1
+    hfilter_8 v1,  v29, v30, 1
+    hfilter_8 v2,  v29, v30, 1
+    hfilter_8 v3,  v29, v30, 1
+    hfilter_8 v4,  v29, v30, 1
+    hfilter_8 v5,  v29, v30, 1
+    hfilter_8 v6,  v29, v30, 1
+    hfilter_8 v7,  v29, v30, 1
+    hfilter_8 v8,  v29, v30, 1
+    hfilter_8 v9,  v29, v30, 1
+    hfilter_8 v10, v29, v30, 1
+    hfilter_8 v11, v29, v30, 1
+    hfilter_8 v12, v29, v30, 1
+    hfilter_8 v13, v29, v30, 1
+    hfilter_8 v14, v29, v30, 1
+    hfilter_8 v15, v29, v30, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     compute_sum_sse_8x16_b
+
+    hfilter_8 v16, v29, v30, 0
+
+    b   second_pass_8x16_b
+
+second_pass_8x16_pre_copy_b:
+    slwi.   r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_16 v0,  r3, r4, 1
+    load_and_align_16 v1,  r3, r4, 1
+    load_and_align_16 v2,  r3, r4, 1
+    load_and_align_16 v3,  r3, r4, 1
+    load_and_align_16 v4,  r3, r4, 1
+    load_and_align_16 v5,  r3, r4, 1
+    load_and_align_16 v6,  r3, r4, 1
+    load_and_align_16 v7,  r3, r4, 1
+    load_and_align_16 v8,  r3, r4, 1
+    load_and_align_16 v9,  r3, r4, 1
+    load_and_align_16 v10, r3, r4, 1
+    load_and_align_16 v11, r3, r4, 1
+    load_and_align_16 v12, r3, r4, 1
+    load_and_align_16 v13, r3, r4, 1
+    load_and_align_16 v14, r3, r4, 1
+    load_and_align_16 v15, r3, r4, 1
+    load_and_align_16 v16, r3, r4, 0
+
+    beq     compute_sum_sse_8x16_b
+
+second_pass_8x16_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+    vfilter_16 v4,  v5
+    vfilter_16 v5,  v6
+    vfilter_16 v6,  v7
+    vfilter_16 v7,  v8
+    vfilter_16 v8,  v9
+    vfilter_16 v9,  v10
+    vfilter_16 v10, v11
+    vfilter_16 v11, v12
+    vfilter_16 v12, v13
+    vfilter_16 v13, v14
+    vfilter_16 v14, v15
+    vfilter_16 v15, v16
+
+compute_sum_sse_8x16_b:
+    vspltish v18, 0             ;# sum
+    vspltish v19, 0             ;# sse
+    vspltish v23, 0             ;# unpack
+    li      r10, 16
+
+    vmrghb  v0, v0,  v1
+    vmrghb  v1, v2,  v3
+    vmrghb  v2, v4,  v5
+    vmrghb  v3, v6,  v7
+    vmrghb  v4, v8,  v9
+    vmrghb  v5, v10, v11
+    vmrghb  v6, v12, v13
+    vmrghb  v7, v14, v15
+
+    load_and_align_16 v8,  r7, r8, 1
+    load_and_align_16 v9,  r7, r8, 1
+    load_and_align_16 v10, r7, r8, 1
+    load_and_align_16 v11, r7, r8, 1
+    load_and_align_16 v12, r7, r8, 1
+    load_and_align_16 v13, r7, r8, 1
+    load_and_align_16 v14, r7, r8, 1
+    load_and_align_16 v15, r7, r8, 1
+
+    vmrghb  v8,  v8,  v9
+    vmrghb  v9,  v10, v11
+    vmrghb  v10, v12, v13
+    vmrghb  v11, v14, v15
+
+    compute_sum_sse v0, v8,  v18, v19, v20, v21, v23
+    compute_sum_sse v1, v9,  v18, v19, v20, v21, v23
+    compute_sum_sse v2, v10, v18, v19, v20, v21, v23
+    compute_sum_sse v3, v11, v18, v19, v20, v21, v23
+
+    load_and_align_16 v8,  r7, r8, 1
+    load_and_align_16 v9,  r7, r8, 1
+    load_and_align_16 v10, r7, r8, 1
+    load_and_align_16 v11, r7, r8, 1
+    load_and_align_16 v12, r7, r8, 1
+    load_and_align_16 v13, r7, r8, 1
+    load_and_align_16 v14, r7, r8, 1
+    load_and_align_16 v15, r7, r8, 0
+
+    vmrghb  v8,  v8,  v9
+    vmrghb  v9,  v10, v11
+    vmrghb  v10, v12, v13
+    vmrghb  v11, v14, v15
+
+    compute_sum_sse v4, v8,  v18, v19, v20, v21, v23
+    compute_sum_sse v5, v9,  v18, v19, v20, v21, v23
+    compute_sum_sse v6, v10, v18, v19, v20, v21, v23
+    compute_sum_sse v7, v11, v18, v19, v20, v21, v23
+
+    variance_final v18, v19, v23, 7
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+    blr
+
+;# Filters a horizontal line
+;# expects:
+;#  r3  src_ptr
+;#  r4  pitch
+;#  r10 16
+;#  r12 32
+;#  v17 perm input
+;#  v18 rounding
+;#  v19 shift
+;#  v20 filter taps
+;#  v21 tmp
+;#  v22 tmp
+;#  v23 tmp
+;#  v24 tmp
+;#  v25 tmp
+;#  v26 tmp
+;#  v27 tmp
+;#  v28 perm output
+;#
+.macro hfilter_16 V, increment_counter
+
+    lvsl    v17,  0, r3         ;# permutate value for alignment
+
+    ;# input to filter is 21 bytes wide, output is 16 bytes.
+    ;#  input can span three vectors if not aligned correctly.
+    lvx     v21,   0, r3
+    lvx     v22, r10, r3
+    lvx     v23, r12, r3
+
+.if \increment_counter
+    add     r3, r3, r4
+.endif
+    vperm   v21, v21, v22, v17
+    vperm   v22, v22, v23, v17  ;# v21 v22 = 21 input pixels left-justified
+
+    ;# set 0
+    vmsummbm v24, v20, v21, v18 ;# taps times elements
+
+    ;# set 1
+    vsldoi  v23, v21, v22, 1
+    vmsummbm v25, v20, v23, v18
+
+    ;# set 2
+    vsldoi  v23, v21, v22, 2
+    vmsummbm v26, v20, v23, v18
+
+    ;# set 3
+    vsldoi  v23, v21, v22, 3
+    vmsummbm v27, v20, v23, v18
+
+    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F
+
+    vsrh    v24, v24, v19       ;# divide v24, v25 by 128
+    vsrh    v25, v25, v19
+
+    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
+    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
+.endm
+
+    .align 2
+;# r3 unsigned char  *src_ptr
+;# r4 int  src_pixels_per_line
+;# r5 int  xoffset
+;# r6 int  yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp9_sub_pixel_variance16x8_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1, -32(r1)         ;# create space on the stack
+
+    HProlog second_pass_16x8_pre_copy_b
+
+    hfilter_16 v0, 1
+    hfilter_16 v1, 1
+    hfilter_16 v2, 1
+    hfilter_16 v3, 1
+    hfilter_16 v4, 1
+    hfilter_16 v5, 1
+    hfilter_16 v6, 1
+    hfilter_16 v7, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     compute_sum_sse_16x8_b
+
+    hfilter_16 v8, 0
+
+    b   second_pass_16x8_b
+
+second_pass_16x8_pre_copy_b:
+    slwi.   r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_16  v0,  r3, r4, 1
+    load_and_align_16  v1,  r3, r4, 1
+    load_and_align_16  v2,  r3, r4, 1
+    load_and_align_16  v3,  r3, r4, 1
+    load_and_align_16  v4,  r3, r4, 1
+    load_and_align_16  v5,  r3, r4, 1
+    load_and_align_16  v6,  r3, r4, 1
+    load_and_align_16  v7,  r3, r4, 1
+    load_and_align_16  v8,  r3, r4, 1
+
+    beq     compute_sum_sse_16x8_b
+
+second_pass_16x8_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+    vfilter_16 v4,  v5
+    vfilter_16 v5,  v6
+    vfilter_16 v6,  v7
+    vfilter_16 v7,  v8
+
+compute_sum_sse_16x8_b:
+    vspltish v18, 0             ;# sum
+    vspltish v19, 0             ;# sse
+    vspltish v23, 0             ;# unpack
+    li      r10, 16
+
+    compute_sum_sse_16 v0, 1
+    compute_sum_sse_16 v1, 1
+    compute_sum_sse_16 v2, 1
+    compute_sum_sse_16 v3, 1
+    compute_sum_sse_16 v4, 1
+    compute_sum_sse_16 v5, 1
+    compute_sum_sse_16 v6, 1
+    compute_sum_sse_16 v7, 0
+
+    variance_final v18, v19, v23, 7
+
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;# r3 unsigned char  *src_ptr
+;# r4 int  src_pixels_per_line
+;# r5 int  xoffset
+;# r6 int  yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp9_sub_pixel_variance16x16_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1, -32(r1)         ;# create space on the stack
+
+    HProlog second_pass_16x16_pre_copy_b
+
+    hfilter_16 v0,  1
+    hfilter_16 v1,  1
+    hfilter_16 v2,  1
+    hfilter_16 v3,  1
+    hfilter_16 v4,  1
+    hfilter_16 v5,  1
+    hfilter_16 v6,  1
+    hfilter_16 v7,  1
+    hfilter_16 v8,  1
+    hfilter_16 v9,  1
+    hfilter_16 v10, 1
+    hfilter_16 v11, 1
+    hfilter_16 v12, 1
+    hfilter_16 v13, 1
+    hfilter_16 v14, 1
+    hfilter_16 v15, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to storing the data.  Otherwise
+    ;#  load up and filter the additional line that is needed
+    ;#  for the vertical filter.
+    beq     compute_sum_sse_16x16_b
+
+    hfilter_16 v16, 0
+
+    b   second_pass_16x16_b
+
+second_pass_16x16_pre_copy_b:
+    slwi.   r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_16  v0,  r3, r4, 1
+    load_and_align_16  v1,  r3, r4, 1
+    load_and_align_16  v2,  r3, r4, 1
+    load_and_align_16  v3,  r3, r4, 1
+    load_and_align_16  v4,  r3, r4, 1
+    load_and_align_16  v5,  r3, r4, 1
+    load_and_align_16  v6,  r3, r4, 1
+    load_and_align_16  v7,  r3, r4, 1
+    load_and_align_16  v8,  r3, r4, 1
+    load_and_align_16  v9,  r3, r4, 1
+    load_and_align_16  v10, r3, r4, 1
+    load_and_align_16  v11, r3, r4, 1
+    load_and_align_16  v12, r3, r4, 1
+    load_and_align_16  v13, r3, r4, 1
+    load_and_align_16  v14, r3, r4, 1
+    load_and_align_16  v15, r3, r4, 1
+    load_and_align_16  v16, r3, r4, 0
+
+    beq     compute_sum_sse_16x16_b
+
+second_pass_16x16_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+    vfilter_16 v4,  v5
+    vfilter_16 v5,  v6
+    vfilter_16 v6,  v7
+    vfilter_16 v7,  v8
+    vfilter_16 v8,  v9
+    vfilter_16 v9,  v10
+    vfilter_16 v10, v11
+    vfilter_16 v11, v12
+    vfilter_16 v12, v13
+    vfilter_16 v13, v14
+    vfilter_16 v14, v15
+    vfilter_16 v15, v16
+
+compute_sum_sse_16x16_b:
+    vspltish v18, 0             ;# sum
+    vspltish v19, 0             ;# sse
+    vspltish v23, 0             ;# unpack
+    li      r10, 16
+
+    compute_sum_sse_16 v0,  1
+    compute_sum_sse_16 v1,  1
+    compute_sum_sse_16 v2,  1
+    compute_sum_sse_16 v3,  1
+    compute_sum_sse_16 v4,  1
+    compute_sum_sse_16 v5,  1
+    compute_sum_sse_16 v6,  1
+    compute_sum_sse_16 v7,  1
+    compute_sum_sse_16 v8,  1
+    compute_sum_sse_16 v9,  1
+    compute_sum_sse_16 v10, 1
+    compute_sum_sse_16 v11, 1
+    compute_sum_sse_16 v12, 1
+    compute_sum_sse_16 v13, 1
+    compute_sum_sse_16 v14, 1
+    compute_sum_sse_16 v15, 0
+
+    variance_final v18, v19, v23, 8
+
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .data
+
+    .align 4
+hfilter_b:
+    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
+    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
+    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
+    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
+    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
+    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
+    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
+    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
+
+    .align 4
+vfilter_b:
+    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+
+    .align 4
+b_hperm_b:
+    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
+
+    .align 4
+b_0123_b:
+    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
+
+    .align 4
+b_4567_b:
+    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
+
+b_hilo_b:
+    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
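
The tables above drive two-tap bilinear filtering: a 1/8-pel offset k in 0..7 selects taps {128 - 16k, 16k}, which sum to 128; the 0x40 rounding constant built in the code and the shift by 7 then divide by that sum. A one-pixel sketch of the same step (the helper name is illustrative):

    /* Sketch of the bilinear step encoded by hfilter_b/vfilter_b:
     * offset k selects taps {128 - 16k, 16k}; +64 rounds, >> 7
     * divides by the tap sum of 128. */
    unsigned char bilinear_ref(unsigned char a, unsigned char b, int k) {
      return (unsigned char)((a * (128 - 16 * k) + b * 16 * k + 64) >> 7);
    }
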
--- /dev/null
+++ b/vp9/encoder/psnr.c
@@ -1,0 +1,30 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+#include "math.h"
+#include "vp9/common/systemdependent.h" /* for vp9_clear_system_state() */
+
+#define MAX_PSNR 100
+
+double vp9_mse2psnr(double Samples, double Peak, double Mse) {
+  double psnr;
+
+  if (Mse > 0.0)
+    psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
+  else
+    psnr = MAX_PSNR;      // limit to prevent division by zero
+
+  if (psnr > MAX_PSNR)
+    psnr = MAX_PSNR;
+
+  return psnr;
+}
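
Despite its name, the Mse argument here is a summed squared error; the Samples factor inside the log turns it into a true mean. A usage sketch (values computed for illustration, not taken from the patch):

    /* An 8-bit 16x16 block (256 samples) with total squared error 2560: */
    double psnr = vp9_mse2psnr(256.0, 255.0, 2560.0);
    /* = 10 * log10(255 * 255 * 256 / 2560) ~= 38.13 dB */
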
--- /dev/null
+++ b/vp9/encoder/psnr.h
@@ -1,0 +1,17 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PSNR_H
+#define __INC_PSNR_H
+
+extern double vp9_mse2psnr(double Samples, double Peak, double Mse);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/quantize.c
@@ -1,0 +1,716 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+
+#include "onyx_int.h"
+#include "quantize.h"
+#include "vp9/common/quant_common.h"
+
+#include "vp9/common/seg_common.h"
+
+#ifdef ENC_DEBUG
+extern int enc_debug;
+#endif
+
+void vp9_ht_quantize_b_4x4(BLOCK *b, BLOCKD *d, TX_TYPE tx_type) {
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *zbin_boost_ptr  = b->zrun_zbin_boost;
+  short *coeff_ptr       = b->coeff;
+  short *zbin_ptr        = b->zbin;
+  short *round_ptr       = b->round;
+  short *quant_ptr       = b->quant;
+  unsigned char *quant_shift_ptr = b->quant_shift;
+  short *qcoeff_ptr      = d->qcoeff;
+  short *dqcoeff_ptr     = d->dqcoeff;
+  short *dequant_ptr     = d->dequant;
+  short zbin_oq_value    = b->zbin_extra;
+
+  const int *pt_scan;
+
+  switch (tx_type) {
+    case ADST_DCT:
+      pt_scan = vp9_row_scan;
+      break;
+
+    case DCT_ADST:
+      pt_scan = vp9_col_scan;
+      break;
+
+    default:
+      pt_scan = vp9_default_zig_zag1d;
+      break;
+  }
+
+  vpx_memset(qcoeff_ptr, 0, 32);
+  vpx_memset(dqcoeff_ptr, 0, 32);
+
+  eob = -1;
+
+  for (i = 0; i < b->eob_max_offset; i++) {
+    rc   = pt_scan[i];
+    z    = coeff_ptr[rc];
+
+    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
+    zbin_boost_ptr++;
+
+    sz = (z >> 31);                                 // sign of z
+    x  = (z ^ sz) - sz;                             // x = abs(z)
+
+    if (x >= zbin) {
+      x += round_ptr[rc];
+      y  = (((x * quant_ptr[rc]) >> 16) + x)
+           >> quant_shift_ptr[rc];                // quantize (x)
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc]  = x;                        // write to destination
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
+
+      if (y) {
+        eob = i;                                // last nonzero coeffs
+        zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
+      }
+    }
+  }
+
+  d->eob = eob + 1;
+}
+
+void vp9_regular_quantize_b_4x4(BLOCK *b, BLOCKD *d) {
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *zbin_boost_ptr  = b->zrun_zbin_boost;
+  short *coeff_ptr       = b->coeff;
+  short *zbin_ptr        = b->zbin;
+  short *round_ptr       = b->round;
+  short *quant_ptr       = b->quant;
+  unsigned char *quant_shift_ptr = b->quant_shift;
+  short *qcoeff_ptr      = d->qcoeff;
+  short *dqcoeff_ptr     = d->dqcoeff;
+  short *dequant_ptr     = d->dequant;
+  short zbin_oq_value    = b->zbin_extra;
+
+  vpx_memset(qcoeff_ptr, 0, 32);
+  vpx_memset(dqcoeff_ptr, 0, 32);
+
+  eob = -1;
+
+  for (i = 0; i < b->eob_max_offset; i++) {
+    rc   = vp9_default_zig_zag1d[i];
+    z    = coeff_ptr[rc];
+
+    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
+    zbin_boost_ptr++;
+
+    sz = (z >> 31);                                 // sign of z
+    x  = (z ^ sz) - sz;                             // x = abs(z)
+
+    if (x >= zbin) {
+      x += round_ptr[rc];
+
+      y  = (((x * quant_ptr[rc]) >> 16) + x)
+           >> quant_shift_ptr[rc];                // quantize (x)
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc]  = x;                        // write to destination
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
+
+      if (y) {
+        eob = i;                                // last nonzero coeffs
+        zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
+      }
+    }
+  }
+
+  d->eob = eob + 1;
+}
+
+void vp9_quantize_mby_4x4_c(MACROBLOCK *x) {
+  int i;
+  int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
+
+  for (i = 0; i < 16; i++)
+    x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]);
+
+  if (has_2nd_order)
+    x->quantize_b_4x4(&x->block[24], &x->e_mbd.block[24]);
+}
+
+void vp9_quantize_mbuv_4x4_c(MACROBLOCK *x) {
+  int i;
+
+  for (i = 16; i < 24; i++)
+    x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]);
+}
+
+void vp9_quantize_mb_4x4_c(MACROBLOCK *x) {
+  vp9_quantize_mby_4x4_c(x);
+  vp9_quantize_mbuv_4x4_c(x);
+}
+
+void vp9_regular_quantize_b_2x2(BLOCK *b, BLOCKD *d) {
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *zbin_boost_ptr = b->zrun_zbin_boost;
+  int zbin_zrun_index = 0;
+  short *coeff_ptr  = b->coeff;
+  short *zbin_ptr   = b->zbin;
+  short *round_ptr  = b->round;
+  short *quant_ptr  = b->quant;
+  unsigned char *quant_shift_ptr = b->quant_shift;
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  short *dequant_ptr = d->dequant;
+  short zbin_oq_value = b->zbin_extra;
+  // double q2nd = 4;
+  vpx_memset(qcoeff_ptr, 0, 32);
+  vpx_memset(dqcoeff_ptr, 0, 32);
+
+  eob = -1;
+
+  for (i = 0; i < b->eob_max_offset_8x8; i++) {
+    rc   = vp9_default_zig_zag1d[i];
+    z    = coeff_ptr[rc];
+
+    zbin_boost_ptr = &b->zrun_zbin_boost[zbin_zrun_index];
+    zbin_zrun_index += 4;
+    zbin = (zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value);
+
+    sz = (z >> 31);                               // sign of z
+    x  = (z ^ sz) - sz;                           // x = abs(z)
+
+    if (x >= zbin) {
+      x += (round_ptr[rc]);
+      y  = ((int)((int)(x * quant_ptr[rc]) >> 16) + x)
+           >> quant_shift_ptr[rc];                // quantize (x)
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc]  = x;                        // write to destination
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
+
+      if (y) {
+        eob = i;                                  // last nonzero coeffs
+        zbin_zrun_index = 0;
+      }
+    }
+  }
+
+  d->eob = eob + 1;
+}
+
+void vp9_regular_quantize_b_8x8(BLOCK *b, BLOCKD *d) {
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *zbin_boost_ptr = b->zrun_zbin_boost_8x8;
+  short *coeff_ptr  = b->coeff;
+  short *zbin_ptr   = b->zbin_8x8;
+  short *round_ptr  = b->round;
+  short *quant_ptr  = b->quant;
+  unsigned char *quant_shift_ptr = b->quant_shift;
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  short *dequant_ptr = d->dequant;
+  short zbin_oq_value = b->zbin_extra;
+
+  vpx_memset(qcoeff_ptr, 0, 64 * sizeof(short));
+  vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(short));
+
+  eob = -1;
+
+  for (i = 0; i < b->eob_max_offset_8x8; i++) {
+    rc   = vp9_default_zig_zag1d_8x8[i];
+    z    = coeff_ptr[rc];
+
+    zbin = (zbin_ptr[rc != 0] + *zbin_boost_ptr + zbin_oq_value);
+    zbin_boost_ptr++;
+
+    sz = (z >> 31);                               // sign of z
+    x  = (z ^ sz) - sz;                           // x = abs(z)
+
+    if (x >= zbin) {
+      x += (round_ptr[rc != 0]);
+      y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
+           >> quant_shift_ptr[rc != 0];            // quantize (x)
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc]  = x;                        // write to destination
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value
+
+      if (y) {
+        eob = i;                                  // last nonzero coeffs
+        zbin_boost_ptr = b->zrun_zbin_boost_8x8;
+      }
+    }
+  }
+
+  d->eob = eob + 1;
+}
+
+void vp9_quantize_mby_8x8(MACROBLOCK *x) {
+  int i;
+  int has_2nd_order = x->e_mbd.mode_info_context->mbmi.mode != SPLITMV;
+
+  for (i = 0; i < 16; i++) {
+    x->e_mbd.block[i].eob = 0;
+  }
+  x->e_mbd.block[24].eob = 0;
+  for (i = 0; i < 16; i += 4)
+    x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);
+
+  if (has_2nd_order)
+    x->quantize_b_2x2(&x->block[24], &x->e_mbd.block[24]);
+}
+
+void vp9_quantize_mbuv_8x8(MACROBLOCK *x) {
+  int i;
+
+  for (i = 16; i < 24; i++)
+    x->e_mbd.block[i].eob = 0;
+  for (i = 16; i < 24; i += 4)
+    x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);
+}
+
+void vp9_quantize_mb_8x8(MACROBLOCK *x) {
+  vp9_quantize_mby_8x8(x);
+  vp9_quantize_mbuv_8x8(x);
+}
+
+void vp9_quantize_mby_16x16(MACROBLOCK *x) {
+  int i;
+
+  for (i = 0; i < 16; i++)
+    x->e_mbd.block[i].eob = 0;
+  x->e_mbd.block[24].eob = 0;
+  x->quantize_b_16x16(&x->block[0], &x->e_mbd.block[0]);
+}
+
+void vp9_quantize_mb_16x16(MACROBLOCK *x) {
+  vp9_quantize_mby_16x16(x);
+  vp9_quantize_mbuv_8x8(x);
+}
+
+void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {
+  int i, rc, eob;
+  int zbin;
+  int x, y, z, sz;
+  short *zbin_boost_ptr = b->zrun_zbin_boost_16x16;
+  short *coeff_ptr  = b->coeff;
+  short *zbin_ptr   = b->zbin_16x16;
+  short *round_ptr  = b->round;
+  short *quant_ptr  = b->quant;
+  unsigned char *quant_shift_ptr = b->quant_shift;
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  short *dequant_ptr = d->dequant;
+  short zbin_oq_value = b->zbin_extra;
+
+  vpx_memset(qcoeff_ptr, 0, 256 * sizeof(short));
+  vpx_memset(dqcoeff_ptr, 0, 256 * sizeof(short));
+
+  eob = -1;
+  for (i = 0; i < b->eob_max_offset_16x16; i++) {
+    rc   = vp9_default_zig_zag1d_16x16[i];
+    z    = coeff_ptr[rc];
+
+    zbin = (zbin_ptr[rc != 0] + *zbin_boost_ptr + zbin_oq_value);
+    zbin_boost_ptr++;
+
+    sz = (z >> 31);                               // sign of z
+    x  = (z ^ sz) - sz;                           // x = abs(z)
+
+    if (x >= zbin) {
+      x += (round_ptr[rc != 0]);
+      y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
+           >> quant_shift_ptr[rc != 0];           // quantize (x)
+      x  = (y ^ sz) - sz;                         // get the sign back
+      qcoeff_ptr[rc]  = x;                        // write to destination
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value
+
+      if (y) {
+        eob = i;                                  // last nonzero coeffs
+        zbin_boost_ptr = b->zrun_zbin_boost_16x16;
+      }
+    }
+  }
+
+  d->eob = eob + 1;
+}
+
+/* The quantize_b_pair function pointer in the MACROBLOCK structure is set to
+ * this C function if a corresponding optimized routine is not available. The
+ * NEON optimized version currently implements the fast quantization for a
+ * pair of blocks. */
+void vp9_regular_quantize_b_4x4_pair(BLOCK *b1, BLOCK *b2,
+                                     BLOCKD *d1, BLOCKD *d2) {
+  vp9_regular_quantize_b_4x4(b1, d1);
+  vp9_regular_quantize_b_4x4(b2, d2);
+}
+
+static void invert_quant(short *quant,
+                         unsigned char *shift, short d) {
+  unsigned t;
+  int l;
+  t = d;
+  for (l = 0; t > 1; l++)
+    t >>= 1;
+  t = 1 + (1 << (16 + l)) / d;
+  *quant = (short)(t - (1 << 16));
+  *shift = l;
+}
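+
+/* Illustration of invert_quant(), using d = 4 as an example: the loop leaves
+ * l = 2 (floor(log2(4))) and t = 1 + (1 << 18) / 4 = 65537, so *quant = 1
+ * and *shift = 2. The quantize loops above then compute
+ *   y = (((x * quant) >> 16) + x) >> shift  ~=  x / d,
+ * replacing a division by the dequant step with a multiply and two shifts.
+ */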
+
+void vp9_init_quantizer(VP9_COMP *cpi) {
+  int i;
+  int quant_val;
+  int Q;
+  static const int zbin_boost[16] = {  0,  0,  8, 10, 12, 14, 16, 20,
+                                      24, 28, 32, 36, 40, 44, 44, 44
+                                    };
+
+  static const int zbin_boost_8x8[64] = {  0,  0,  0,  8,  8,  8, 10, 12,
+                                          14, 16, 18, 20, 22, 24, 26, 28,
+                                          30, 32, 34, 36, 38, 40, 42, 44,
+                                          46, 48, 48, 48, 48, 48, 48, 48,
+                                          48, 48, 48, 48, 48, 48, 48, 48,
+                                          48, 48, 48, 48, 48, 48, 48, 48,
+                                          48, 48, 48, 48, 48, 48, 48, 48,
+                                          48, 48, 48, 48, 48, 48, 48, 48
+                                        };
+  static const int zbin_boost_16x16[256] = {
+     0,  0,  0,  8,  8,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28,
+    30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+  };
+  int qrounding_factor = 48;
+
+  for (Q = 0; Q < QINDEX_RANGE; Q++) {
+    int qzbin_factor = (vp9_dc_quant(Q, 0) < 148) ? 84 : 80;
+
+#if CONFIG_LOSSLESS
+    if (cpi->oxcf.lossless) {
+      if (Q == 0) {
+        qzbin_factor = 64;
+        qrounding_factor = 64;
+      }
+    }
+#endif
+
+    // dc values
+    quant_val = vp9_dc_quant(Q, cpi->common.y1dc_delta_q);
+    invert_quant(cpi->Y1quant[Q] + 0,
+                 cpi->Y1quant_shift[Q] + 0, quant_val);
+    cpi->Y1zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->Y1zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->Y1zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->Y1round[Q][0] = (qrounding_factor * quant_val) >> 7;
+    cpi->common.Y1dequant[Q][0] = quant_val;
+    cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+    cpi->zrun_zbin_boost_y1_8x8[Q][0] =
+      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
+    cpi->zrun_zbin_boost_y1_16x16[Q][0] =
+      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+
+    quant_val = vp9_dc2quant(Q, cpi->common.y2dc_delta_q);
+    invert_quant(cpi->Y2quant[Q] + 0,
+                 cpi->Y2quant_shift[Q] + 0, quant_val);
+    cpi->Y2zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->Y2zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->Y2zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->Y2round[Q][0] = (qrounding_factor * quant_val) >> 7;
+    cpi->common.Y2dequant[Q][0] = quant_val;
+    cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+    cpi->zrun_zbin_boost_y2_8x8[Q][0] =
+      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
+    cpi->zrun_zbin_boost_y2_16x16[Q][0] =
+      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+
+    quant_val = vp9_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
+    invert_quant(cpi->UVquant[Q] + 0,
+                 cpi->UVquant_shift[Q] + 0, quant_val);
+    cpi->UVzbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->UVzbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->UVzbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->UVround[Q][0] = (qrounding_factor * quant_val) >> 7;
+    cpi->common.UVdequant[Q][0] = quant_val;
+    cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+    cpi->zrun_zbin_boost_uv_8x8[Q][0] =
+      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
+    cpi->zrun_zbin_boost_uv_16x16[Q][0] =
+      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+
+    // all the 4x4 ac values
+    for (i = 1; i < 16; i++) {
+      int rc = vp9_default_zig_zag1d[i];
+
+      quant_val = vp9_ac_yquant(Q);
+      invert_quant(cpi->Y1quant[Q] + rc,
+                   cpi->Y1quant_shift[Q] + rc, quant_val);
+      cpi->Y1zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->Y1round[Q][rc] = (qrounding_factor * quant_val) >> 7;
+      cpi->common.Y1dequant[Q][rc] = quant_val;
+      cpi->zrun_zbin_boost_y1[Q][i] =
+        ((quant_val * zbin_boost[i]) + 64) >> 7;
+
+      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
+      invert_quant(cpi->Y2quant[Q] + rc,
+                   cpi->Y2quant_shift[Q] + rc, quant_val);
+      cpi->Y2zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->Y2round[Q][rc] = (qrounding_factor * quant_val) >> 7;
+      cpi->common.Y2dequant[Q][rc] = quant_val;
+      cpi->zrun_zbin_boost_y2[Q][i] =
+        ((quant_val * zbin_boost[i]) + 64) >> 7;
+
+      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+      invert_quant(cpi->UVquant[Q] + rc,
+                   cpi->UVquant_shift[Q] + rc, quant_val);
+      cpi->UVzbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->UVround[Q][rc] = (qrounding_factor * quant_val) >> 7;
+      cpi->common.UVdequant[Q][rc] = quant_val;
+      cpi->zrun_zbin_boost_uv[Q][i] =
+        ((quant_val * zbin_boost[i]) + 64) >> 7;
+    }
+
+    // 8x8 structures... only zbin separated out for now.
+    // This needs cleaning up for 8x8, especially if we are to add
+    // support for non-flat Q matrices.
+    for (i = 1; i < 64; i++) {
+      int rc = vp9_default_zig_zag1d_8x8[i];
+
+      quant_val = vp9_ac_yquant(Q);
+      cpi->Y1zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->zrun_zbin_boost_y1_8x8[Q][i] =
+        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
+
+      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
+      cpi->Y2zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->zrun_zbin_boost_y2_8x8[Q][i] =
+        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
+
+      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+      cpi->UVzbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->zrun_zbin_boost_uv_8x8[Q][i] =
+        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
+    }
+
+    // 16x16 structures. The same comment as above applies.
+    for (i = 1; i < 256; i++) {
+      int rc = vp9_default_zig_zag1d_16x16[i];
+
+      quant_val = vp9_ac_yquant(Q);
+      cpi->Y1zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->zrun_zbin_boost_y1_16x16[Q][i] =
+        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+
+      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
+      cpi->Y2zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->zrun_zbin_boost_y2_16x16[Q][i] =
+        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+
+      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+      cpi->UVzbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->zrun_zbin_boost_uv_16x16[Q][i] =
+        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+    }
+  }
+}
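+
+/* The recurring ((factor * quant_val) + 64) >> 7 pattern above is a rounded
+ * fixed-point multiply by factor / 128. For example, with qzbin_factor = 84
+ * and quant_val = 40, (84 * 40 + 64) >> 7 = 3424 >> 7 = 26, i.e. a zero bin
+ * of roughly 0.66 of the quantizer step size at that Q.
+ */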
+
+void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
+  int i;
+  int QIndex;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int zbin_extra;
+  int segment_id = xd->mode_info_context->mbmi.segment_id;
+
+  // Select the baseline MB Q index allowing for any segment level change.
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
+    // Abs Value
+    if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA)
+      QIndex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
+
+    // Delta Value
+    else {
+      QIndex = cpi->common.base_qindex +
+               vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
+
+      // Clamp to valid range
+      QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;
+    }
+  } else
+    QIndex = cpi->common.base_qindex;
+
+  // Y
+  zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
+                (cpi->zbin_over_quant +
+                 cpi->zbin_mode_boost +
+                 x->act_zbin_adj)) >> 7;
+
+  for (i = 0; i < 16; i++) {
+    x->block[i].quant = cpi->Y1quant[QIndex];
+    x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
+    x->block[i].zbin = cpi->Y1zbin[QIndex];
+    x->block[i].zbin_8x8 = cpi->Y1zbin_8x8[QIndex];
+    x->block[i].zbin_16x16 = cpi->Y1zbin_16x16[QIndex];
+    x->block[i].round = cpi->Y1round[QIndex];
+    x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
+    x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
+    x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y1_8x8[QIndex];
+    x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y1_16x16[QIndex];
+    x->block[i].zbin_extra = (short)zbin_extra;
+
+    // Segment max eob offset feature.
+    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
+      x->block[i].eob_max_offset =
+        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+      x->block[i].eob_max_offset_8x8 =
+        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+      x->block[i].eob_max_offset_16x16 =
+        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+    } else {
+      x->block[i].eob_max_offset = 16;
+      x->block[i].eob_max_offset_8x8 = 64;
+      x->block[i].eob_max_offset_16x16 = 256;
+    }
+  }
+
+  // UV
+  zbin_extra = (cpi->common.UVdequant[QIndex][1] *
+                (cpi->zbin_over_quant +
+                 cpi->zbin_mode_boost +
+                 x->act_zbin_adj)) >> 7;
+
+  for (i = 16; i < 24; i++) {
+    x->block[i].quant = cpi->UVquant[QIndex];
+    x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
+    x->block[i].zbin = cpi->UVzbin[QIndex];
+    x->block[i].zbin_8x8 = cpi->UVzbin_8x8[QIndex];
+    x->block[i].zbin_16x16 = cpi->UVzbin_16x16[QIndex];
+    x->block[i].round = cpi->UVround[QIndex];
+    x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex];
+    x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];
+    x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_uv_8x8[QIndex];
+    x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_uv_16x16[QIndex];
+
+    x->block[i].zbin_extra = (short)zbin_extra;
+
+    // Segment max eob offset feature.
+    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
+      x->block[i].eob_max_offset =
+        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+      x->block[i].eob_max_offset_8x8 =
+        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+    } else {
+      x->block[i].eob_max_offset = 16;
+      x->block[i].eob_max_offset_8x8 = 64;
+    }
+  }
+
+  // Y2
+  zbin_extra = (cpi->common.Y2dequant[QIndex][1] *
+                ((cpi->zbin_over_quant / 2) +
+                 cpi->zbin_mode_boost +
+                 x->act_zbin_adj)) >> 7;
+
+  x->block[24].quant = cpi->Y2quant[QIndex];
+  x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
+  x->block[24].zbin = cpi->Y2zbin[QIndex];
+  x->block[24].zbin_8x8 = cpi->Y2zbin_8x8[QIndex];
+  x->block[24].zbin_16x16 = cpi->Y2zbin_16x16[QIndex];
+  x->block[24].round = cpi->Y2round[QIndex];
+  x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];
+  x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
+  x->block[24].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y2_8x8[QIndex];
+  x->block[24].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y2_16x16[QIndex];
+  x->block[24].zbin_extra = (short)zbin_extra;
+
+  // TBD perhaps not use for Y2
+  // Segment max eob offset feature.
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
+    x->block[24].eob_max_offset =
+      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+    x->block[24].eob_max_offset_8x8 =
+      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+  } else {
+    x->block[24].eob_max_offset = 16;
+    x->block[24].eob_max_offset_8x8 = 4;
+  }
+
+  /* save this macroblock QIndex for vp9_update_zbin_extra() */
+  x->e_mbd.q_index = QIndex;
+}
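+
+/* Example of the segment EOB feature set up above: if SEG_LVL_EOB is active
+ * with a value of 1, the quantize loops stop after the first zig-zag
+ * position, so at most the DC coefficient is coded for that segment's
+ * blocks; a value of 0 forces all coefficients in the block to zero.
+ */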
+
+void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
+  int i;
+  int QIndex = x->e_mbd.q_index;
+  int zbin_extra;
+
+  // Y
+  zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
+                (cpi->zbin_over_quant +
+                 cpi->zbin_mode_boost +
+                 x->act_zbin_adj)) >> 7;
+  for (i = 0; i < 16; i++) {
+    x->block[i].zbin_extra = (short)zbin_extra;
+  }
+
+  // UV
+  zbin_extra = (cpi->common.UVdequant[QIndex][1] *
+                (cpi->zbin_over_quant +
+                 cpi->zbin_mode_boost +
+                 x->act_zbin_adj)) >> 7;
+
+  for (i = 16; i < 24; i++) {
+    x->block[i].zbin_extra = (short)zbin_extra;
+  }
+
+  // Y2
+  zbin_extra = (cpi->common.Y2dequant[QIndex][1] *
+                ((cpi->zbin_over_quant / 2) +
+                 cpi->zbin_mode_boost +
+                 x->act_zbin_adj)) >> 7;
+
+  x->block[24].zbin_extra = (short)zbin_extra;
+}
+
+void vp9_frame_init_quantizer(VP9_COMP *cpi) {
+  // Clear Zbin mode boost for default case
+  cpi->zbin_mode_boost = 0;
+
+  // MB level quantizer setup
+  vp9_mb_init_quantizer(cpi, &cpi->mb);
+}
+
+void vp9_set_quantizer(struct VP9_COMP *cpi, int Q) {
+  VP9_COMMON *cm = &cpi->common;
+
+  cm->base_qindex = Q;
+
+  // if any of the delta_q values are changing update flag will
+  // have to be set.
+  cm->y1dc_delta_q = 0;
+  cm->y2ac_delta_q = 0;
+  cm->uvdc_delta_q = 0;
+  cm->uvac_delta_q = 0;
+  cm->y2dc_delta_q = 0;
+
+  // quantizer has to be reinitialized if any delta_q changes.
+  // As there are not any here for now this is inactive code.
+  // if(update)
+  //    vp9_init_quantizer(cpi);
+}
--- /dev/null
+++ b/vp9/encoder/quantize.h
@@ -1,0 +1,97 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_QUANTIZE_H
+#define __INC_QUANTIZE_H
+
+#include "block.h"
+
+#define prototype_quantize_block(sym) \
+  void (sym)(BLOCK *b,BLOCKD *d)
+
+#define prototype_quantize_block_pair(sym) \
+  void (sym)(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+
+#define prototype_quantize_mb(sym) \
+  void (sym)(MACROBLOCK *x)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/quantize_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/quantize_arm.h"
+#endif
+
+#define prototype_quantize_block_type(sym) \
+  void (sym)(BLOCK *b, BLOCKD *d, TX_TYPE type)
+extern prototype_quantize_block_type(vp9_ht_quantize_b_4x4);
+
+#ifndef vp9_quantize_quantb_4x4
+#define vp9_quantize_quantb_4x4 vp9_regular_quantize_b_4x4
+#endif
+extern prototype_quantize_block(vp9_quantize_quantb_4x4);
+
+#ifndef vp9_quantize_quantb_4x4_pair
+#define vp9_quantize_quantb_4x4_pair vp9_regular_quantize_b_4x4_pair
+#endif
+extern prototype_quantize_block_pair(vp9_quantize_quantb_4x4_pair);
+
+#ifndef vp9_quantize_quantb_8x8
+#define vp9_quantize_quantb_8x8 vp9_regular_quantize_b_8x8
+#endif
+extern prototype_quantize_block(vp9_quantize_quantb_8x8);
+
+#ifndef vp9_quantize_quantb_16x16
+#define vp9_quantize_quantb_16x16 vp9_regular_quantize_b_16x16
+#endif
+extern prototype_quantize_block(vp9_quantize_quantb_16x16);
+
+#ifndef vp9_quantize_quantb_2x2
+#define vp9_quantize_quantb_2x2 vp9_regular_quantize_b_2x2
+#endif
+extern prototype_quantize_block(vp9_quantize_quantb_2x2);
+
+#ifndef vp9_quantize_mb_4x4
+#define vp9_quantize_mb_4x4 vp9_quantize_mb_4x4_c
+#endif
+extern prototype_quantize_mb(vp9_quantize_mb_4x4);
+void vp9_quantize_mb_8x8(MACROBLOCK *x);
+
+#ifndef vp9_quantize_mbuv_4x4
+#define vp9_quantize_mbuv_4x4 vp9_quantize_mbuv_4x4_c
+#endif
+extern prototype_quantize_mb(vp9_quantize_mbuv_4x4);
+
+#ifndef vp9_quantize_mby_4x4
+#define vp9_quantize_mby_4x4 vp9_quantize_mby_4x4_c
+#endif
+extern prototype_quantize_mb(vp9_quantize_mby_4x4);
+
+extern prototype_quantize_mb(vp9_quantize_mby_8x8);
+extern prototype_quantize_mb(vp9_quantize_mbuv_8x8);
+
+void vp9_quantize_mb_16x16(MACROBLOCK *x);
+extern prototype_quantize_block(vp9_quantize_quantb_16x16);
+extern prototype_quantize_mb(vp9_quantize_mby_16x16);
+
+struct VP9_COMP;
+
+extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q);
+
+extern void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
+
+extern void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
+
+extern void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x);
+
+extern void vp9_init_quantizer(struct VP9_COMP *cpi);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/ratectrl.c
@@ -1,0 +1,698 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "math.h"
+#include "vp9/common/alloccommon.h"
+#include "vp9/common/common.h"
+#include "ratectrl.h"
+#include "vp9/common/entropymode.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/systemdependent.h"
+#include "encodemv.h"
+#include "vp9/common/quant_common.h"
+
+#define MIN_BPB_FACTOR          0.005
+#define MAX_BPB_FACTOR          50
+
+#ifdef MODE_STATS
+extern unsigned int y_modes[VP9_YMODES];
+extern unsigned int uv_modes[VP9_UV_MODES];
+extern unsigned int b_modes[B_MODE_COUNT];
+
+extern unsigned int inter_y_modes[MB_MODE_COUNT];
+extern unsigned int inter_uv_modes[VP9_UV_MODES];
+extern unsigned int inter_b_modes[B_MODE_COUNT];
+#endif
+
+// Bits Per MB at different Q (Multiplied by 512)
+#define BPER_MB_NORMBITS    9
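+
+/* With 9 normalization bits, per-MB rate values carry 1/512-bit precision;
+ * e.g. a scaled value of 100000 corresponds to 100000 / 512 ~= 195 actual
+ * bits per macroblock.
+ */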
+
+// % adjustment to target kf size based on separation from previous frame
+static const int kf_boost_seperation_adjustment[16] = {
+  30,   40,   50,   55,   60,   65,   70,   75,
+  80,   85,   90,   95,  100,  100,  100,  100,
+};
+
+static const int gf_adjust_table[101] = {
+  100,
+  115, 130, 145, 160, 175, 190, 200, 210, 220, 230,
+  240, 260, 270, 280, 290, 300, 310, 320, 330, 340,
+  350, 360, 370, 380, 390, 400, 400, 400, 400, 400,
+  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+  400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+};
+
+static const int gf_intra_usage_adjustment[20] = {
+  125, 120, 115, 110, 105, 100,  95,  85,  80,  75,
+  70,  65,  60,  55,  50,  50,  50,  50,  50,  50,
+};
+
+static const int gf_interval_table[101] = {
+  7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+};
+
+static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, 4, 5 };
+
+// These functions use formulaic calculations to make playing with the
+// quantizer tables easier. If necessary they can be replaced by lookup
+// tables if and when things settle down in the experimental bitstream.
+double vp9_convert_qindex_to_q(int qindex) {
+  // Convert the index to a real Q value (scaled down to match old Q values)
+  return (double)vp9_ac_yquant(qindex) / 4.0;
+}
+
+int vp9_gfboost_qadjust(int qindex) {
+  int retval;
+  double q;
+
+  q = vp9_convert_qindex_to_q(qindex);
+  retval = (int)((0.00000828 * q * q * q) +
+                 (-0.0055 * q * q) +
+                 (1.32 * q) + 79.3);
+  return retval;
+}
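+
+/* Worked example: for q = 32 the polynomial evaluates to
+ *   0.00000828 * 32768 - 0.0055 * 1024 + 1.32 * 32 + 79.3 ~= 116.2,
+ * so vp9_gfboost_qadjust() returns 116.
+ */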
+
+static int kfboost_qadjust(int qindex) {
+  int retval;
+  double q;
+
+  q = vp9_convert_qindex_to_q(qindex);
+  retval = (int)((0.00000973 * q * q * q) +
+                 (-0.00613 * q * q) +
+                 (1.316 * q) + 121.2);
+  return retval;
+}
+
+int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex) {
+  if (frame_type == KEY_FRAME)
+    return (int)(4500000 / vp9_convert_qindex_to_q(qindex));
+  else
+    return (int)(2850000 / vp9_convert_qindex_to_q(qindex));
+}
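+
+/* Example: at a qindex where vp9_convert_qindex_to_q() returns 45.0, a key
+ * frame is budgeted 4500000 / 45 = 100000 (in the 1/512-bit units noted
+ * above, ~195 bits/MB) and an inter frame 2850000 / 45 ~= 63333
+ * (~123 bits/MB).
+ */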
+
+
+void vp9_save_coding_context(VP9_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+  // Stores a snapshot of key state variables which can subsequently be
+  // restored with a call to vp9_restore_coding_context. These functions are
+  // intended for use in a re-code loop in vp9_compress_frame where the
+  // quantizer value is adjusted between loop iterations.
+
+  cc->nmvc = cm->fc.nmvc;
+  vp9_copy(cc->nmvjointcost,  cpi->mb.nmvjointcost);
+  vp9_copy(cc->nmvcosts,  cpi->mb.nmvcosts);
+  vp9_copy(cc->nmvcosts_hp,  cpi->mb.nmvcosts_hp);
+
+  vp9_copy(cc->mv_ref_ct, cm->fc.mv_ref_ct);
+  vp9_copy(cc->mode_context, cm->fc.mode_context);
+  vp9_copy(cc->mv_ref_ct_a, cm->fc.mv_ref_ct_a);
+  vp9_copy(cc->mode_context_a, cm->fc.mode_context_a);
+
+  vp9_copy(cc->ymode_prob, cm->fc.ymode_prob);
+  vp9_copy(cc->bmode_prob, cm->fc.bmode_prob);
+  vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
+  vp9_copy(cc->i8x8_mode_prob, cm->fc.i8x8_mode_prob);
+  vp9_copy(cc->sub_mv_ref_prob, cm->fc.sub_mv_ref_prob);
+  vp9_copy(cc->mbsplit_prob, cm->fc.mbsplit_prob);
+
+  // Stats
+#ifdef MODE_STATS
+  vp9_copy(cc->y_modes,       y_modes);
+  vp9_copy(cc->uv_modes,      uv_modes);
+  vp9_copy(cc->b_modes,       b_modes);
+  vp9_copy(cc->inter_y_modes,  inter_y_modes);
+  vp9_copy(cc->inter_uv_modes, inter_uv_modes);
+  vp9_copy(cc->inter_b_modes,  inter_b_modes);
+#endif
+
+  vp9_copy(cc->segment_pred_probs, cm->segment_pred_probs);
+  vp9_copy(cc->ref_pred_probs_update, cpi->ref_pred_probs_update);
+  vp9_copy(cc->ref_pred_probs, cm->ref_pred_probs);
+  vp9_copy(cc->prob_comppred, cm->prob_comppred);
+
+  vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
+             cm->last_frame_seg_map, (cm->mb_rows * cm->mb_cols));
+
+  vp9_copy(cc->last_ref_lf_deltas, xd->last_ref_lf_deltas);
+  vp9_copy(cc->last_mode_lf_deltas, xd->last_mode_lf_deltas);
+
+  vp9_copy(cc->coef_probs, cm->fc.coef_probs);
+  vp9_copy(cc->hybrid_coef_probs, cm->fc.hybrid_coef_probs);
+  vp9_copy(cc->coef_probs_8x8, cm->fc.coef_probs_8x8);
+  vp9_copy(cc->hybrid_coef_probs_8x8, cm->fc.hybrid_coef_probs_8x8);
+  vp9_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16);
+  vp9_copy(cc->hybrid_coef_probs_16x16, cm->fc.hybrid_coef_probs_16x16);
+  vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
+}
+
+void vp9_restore_coding_context(VP9_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+  // Restore key state variables to the snapshot state stored in the
+  // previous call to vp9_save_coding_context.
+
+  cm->fc.nmvc = cc->nmvc;
+  vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
+  vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
+  vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
+
+  vp9_copy(cm->fc.mv_ref_ct, cc->mv_ref_ct);
+  vp9_copy(cm->fc.mode_context, cc->mode_context);
+  vp9_copy(cm->fc.mv_ref_ct_a, cc->mv_ref_ct_a);
+  vp9_copy(cm->fc.mode_context_a, cc->mode_context_a);
+
+  vp9_copy(cm->fc.ymode_prob, cc->ymode_prob);
+  vp9_copy(cm->fc.bmode_prob, cc->bmode_prob);
+  vp9_copy(cm->fc.i8x8_mode_prob, cc->i8x8_mode_prob);
+  vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
+  vp9_copy(cm->fc.sub_mv_ref_prob, cc->sub_mv_ref_prob);
+  vp9_copy(cm->fc.mbsplit_prob, cc->mbsplit_prob);
+
+  // Stats
+#ifdef MODE_STATS
+  vp9_copy(y_modes, cc->y_modes);
+  vp9_copy(uv_modes, cc->uv_modes);
+  vp9_copy(b_modes, cc->b_modes);
+  vp9_copy(inter_y_modes, cc->inter_y_modes);
+  vp9_copy(inter_uv_modes, cc->inter_uv_modes);
+  vp9_copy(inter_b_modes, cc->inter_b_modes);
+#endif
+
+  vp9_copy(cm->segment_pred_probs, cc->segment_pred_probs);
+  vp9_copy(cpi->ref_pred_probs_update, cc->ref_pred_probs_update);
+  vp9_copy(cm->ref_pred_probs, cc->ref_pred_probs);
+  vp9_copy(cm->prob_comppred, cc->prob_comppred);
+
+  vpx_memcpy(cm->last_frame_seg_map,
+             cpi->coding_context.last_frame_seg_map_copy,
+             (cm->mb_rows * cm->mb_cols));
+
+  vp9_copy(xd->last_ref_lf_deltas, cc->last_ref_lf_deltas);
+  vp9_copy(xd->last_mode_lf_deltas, cc->last_mode_lf_deltas);
+
+  vp9_copy(cm->fc.coef_probs, cc->coef_probs);
+  vp9_copy(cm->fc.hybrid_coef_probs, cc->hybrid_coef_probs);
+  vp9_copy(cm->fc.coef_probs_8x8, cc->coef_probs_8x8);
+  vp9_copy(cm->fc.hybrid_coef_probs_8x8, cc->hybrid_coef_probs_8x8);
+  vp9_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16);
+  vp9_copy(cm->fc.hybrid_coef_probs_16x16, cc->hybrid_coef_probs_16x16);
+  vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
+}
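+
+/* A minimal sketch of the intended save/restore usage (the actual recode
+ * loop lives in the frame encoding path; this outline is illustrative
+ * only):
+ *
+ *   vp9_save_coding_context(cpi);
+ *   for (;;) {
+ *     // ... encode the frame at the current quantizer ...
+ *     if (projected size is acceptably close to the target)
+ *       break;
+ *     vp9_restore_coding_context(cpi);   // rewind the entropy state
+ *     // pick a new quantizer, e.g. via vp9_regulate_q()
+ *   }
+ */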
+
+
+void vp9_setup_key_frame(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  // Setup for Key frame:
+  vp9_default_coef_probs(& cpi->common);
+  vp9_kf_default_bmode_probs(cpi->common.kf_bmode_prob);
+  vp9_init_mbmode_probs(& cpi->common);
+  vp9_default_bmode_probs(cm->fc.bmode_prob);
+
+  vp9_init_mv_probs(& cpi->common);
+
+  // cpi->common.filter_level = 0;      // Reset every key frame.
+  cpi->common.filter_level = cpi->common.base_qindex * 3 / 8;
+
+  // interval before next GF
+  cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+
+  cpi->common.refresh_golden_frame = TRUE;
+  cpi->common.refresh_alt_ref_frame = TRUE;
+
+  vp9_init_mode_contexts(&cpi->common);
+  vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
+  vpx_memcpy(&cpi->common.lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
+
+  vpx_memset(cm->prev_mip, 0,
+    (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+  vpx_memset(cm->mip, 0,
+    (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+
+  vp9_update_mode_info_border(cm, cm->mip);
+  vp9_update_mode_info_in_image(cm, cm->mi);
+}
+
+void vp9_setup_inter_frame(VP9_COMP *cpi) {
+  if (cpi->common.refresh_alt_ref_frame) {
+    vpx_memcpy(&cpi->common.fc,
+               &cpi->common.lfc_a,
+               sizeof(cpi->common.fc));
+    vpx_memcpy(cpi->common.fc.vp8_mode_contexts,
+               cpi->common.fc.mode_context_a,
+               sizeof(cpi->common.fc.vp8_mode_contexts));
+  } else {
+    vpx_memcpy(&cpi->common.fc,
+               &cpi->common.lfc,
+               sizeof(cpi->common.fc));
+    vpx_memcpy(cpi->common.fc.vp8_mode_contexts,
+               cpi->common.fc.mode_context,
+               sizeof(cpi->common.fc.vp8_mode_contexts));
+  }
+}
+
+
+static int estimate_bits_at_q(int frame_kind, int Q, int MBs,
+                              double correction_factor) {
+  int Bpm = (int)(.5 + correction_factor * vp9_bits_per_mb(frame_kind, Q));
+
+  /* Attempt to retain reasonable accuracy without overflow. The cutoff is
+   * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
+   * largest Bpm takes 20 bits.
+   */
+  if (MBs > (1 << 11))
+    return (Bpm >> BPER_MB_NORMBITS) * MBs;
+  else
+    return (Bpm * MBs) >> BPER_MB_NORMBITS;
+}
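+
+/* Overflow sketch: Bpm can take up to 20 bits (see the comment above), so
+ * once MBs exceeds 2^11 the product Bpm * MBs could pass 2^31; pre-shifting
+ * Bpm trades the 1/512-bit precision for the needed headroom.
+ */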
+
+
+static void calc_iframe_target_size(VP9_COMP *cpi) {
+  // boost defaults to half second
+  int target;
+
+  // Clear down mmx registers to allow floating point in what follows
+  vp9_clear_system_state();  // __asm emms;
+
+  // New Two pass RC
+  target = cpi->per_frame_bandwidth;
+
+  if (cpi->oxcf.rc_max_intra_bitrate_pct) {
+    unsigned int max_rate = cpi->per_frame_bandwidth
+                            * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
+
+    if (target > max_rate)
+      target = max_rate;
+  }
+
+  cpi->this_frame_target = target;
+}
+
+
+//  Do the best we can to define the parameters for the next GF based
+//  on what information we have available.
+//
+//  In this experimental code only two-pass encoding is supported,
+//  so we just use the interval determined in the two-pass code.
+static void calc_gf_params(VP9_COMP *cpi) {
+  // Set the gf interval
+  cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+}
+
+
+static void calc_pframe_target_size(VP9_COMP *cpi) {
+  int min_frame_target = cpi->min_frame_bandwidth;
+
+  if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5))
+    min_frame_target = cpi->av_per_frame_bandwidth >> 5;
+
+  // Special alt reference frame case
+  if (cpi->common.refresh_alt_ref_frame) {
+    // Per frame bit target for the alt ref frame
+    cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
+    cpi->this_frame_target = cpi->per_frame_bandwidth;
+  }
+
+  // Normal frames (gf,and inter)
+  else {
+    cpi->this_frame_target = cpi->per_frame_bandwidth;
+  }
+
+  // Sanity check that the total sum of adjustments is not above the maximum
+  // allowed. That is, having allowed for the KF and GF penalties, we have
+  // not pushed the current interframe target too low. If the adjustment we
+  // apply here cannot recover all the extra bits spent in the KF or GF, the
+  // remainder will have to be recovered over a longer time span via other
+  // buffer / rate control mechanisms.
+  if (cpi->this_frame_target < min_frame_target)
+    cpi->this_frame_target = min_frame_target;
+
+  if (!cpi->common.refresh_alt_ref_frame)
+    // Note the baseline target data rate for this inter frame.
+    cpi->inter_frame_target = cpi->this_frame_target;
+
+  // Adjust target frame size for Golden Frames:
+  if (cpi->frames_till_gf_update_due == 0) {
+    // int Boost = 0;
+    int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+
+    cpi->common.refresh_golden_frame = TRUE;
+
+    calc_gf_params(cpi);
+
+    // If we are using an alternate ref instead of the gf, do not apply the
+    // boost here; it will instead be applied to the altref update.
+    // Jim's modified boost.
+    if (!cpi->source_alt_ref_active) {
+      if (cpi->oxcf.fixed_q < 0) {
+        // The spend on the GF is defined in the two pass code
+        // for two pass encodes
+        cpi->this_frame_target = cpi->per_frame_bandwidth;
+      } else
+        cpi->this_frame_target =
+          (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0)
+           * cpi->last_boost) / 100;
+
+    }
+    // If there is an active ARF at this location use the minimum
+    // bits on this frame even if it is a constructed ARF.
+    // The active maximum quantizer ensures that an appropriate
+    // number of bits will be spent if needed for constructed ARFs.
+    else {
+      cpi->this_frame_target = 0;
+    }
+
+    cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+  }
+}
+
+
+void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
+  int    Q = cpi->common.base_qindex;
+  int    correction_factor = 100;
+  double rate_correction_factor;
+  double adjustment_limit;
+
+  int    projected_size_based_on_q = 0;
+
+  // Clear down mmx registers to allow floating point in what follows
+  vp9_clear_system_state();  // __asm emms;
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    rate_correction_factor = cpi->key_frame_rate_correction_factor;
+  } else {
+    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+      rate_correction_factor = cpi->gf_rate_correction_factor;
+    else
+      rate_correction_factor = cpi->rate_correction_factor;
+  }
+
+  // Work out how big we would have expected the frame to be at this Q given
+  // the current correction factor. Stay in double to avoid int overflow when
+  // values are large.
+  projected_size_based_on_q =
+    (int)(((.5 + rate_correction_factor *
+            vp9_bits_per_mb(cpi->common.frame_type, Q)) *
+           cpi->common.MBs) / (1 << BPER_MB_NORMBITS));
+
+  // Make some allowance for cpi->zbin_over_quant
+  if (cpi->zbin_over_quant > 0) {
+    int Z = cpi->zbin_over_quant;
+    double Factor = 0.99;
+    double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX;
+
+    while (Z > 0) {
+      Z--;
+      projected_size_based_on_q =
+        (int)(Factor * projected_size_based_on_q);
+      Factor += factor_adjustment;
+
+      if (Factor  >= 0.999)
+        Factor = 0.999;
+    }
+  }
+
+  // Work out a size correction factor.
+  // if ( cpi->this_frame_target > 0 )
+  //  correction_factor = (100 * cpi->projected_frame_size) / cpi->this_frame_target;
+  if (projected_size_based_on_q > 0)
+    correction_factor = (100 * cpi->projected_frame_size) / projected_size_based_on_q;
+
+  // A more heavily damped adjustment is used if we have been oscillating
+  // either side of the target.
+  switch (damp_var) {
+    case 0:
+      adjustment_limit = 0.75;
+      break;
+    case 1:
+      adjustment_limit = 0.375;
+      break;
+    case 2:
+    default:
+      adjustment_limit = 0.25;
+      break;
+  }
+
+  // if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) )
+  if (correction_factor > 102) {
+    // We are not already at the worst allowable quality
+    correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
+    rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+
+    // Keep rate_correction_factor within limits
+    if (rate_correction_factor > MAX_BPB_FACTOR)
+      rate_correction_factor = MAX_BPB_FACTOR;
+  }
+  // else if ( (correction_factor < 99) && (Q > cpi->active_best_quality) )
+  else if (correction_factor < 99) {
+    // We are not already at the best allowable quality
+    correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
+    rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+
+    // Keep rate_correction_factor within limits
+    if (rate_correction_factor < MIN_BPB_FACTOR)
+      rate_correction_factor = MIN_BPB_FACTOR;
+  }
+
+  if (cpi->common.frame_type == KEY_FRAME)
+    cpi->key_frame_rate_correction_factor = rate_correction_factor;
+  else {
+    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+      cpi->gf_rate_correction_factor = rate_correction_factor;
+    else
+      cpi->rate_correction_factor = rate_correction_factor;
+  }
+}
+
+
+int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
+  int Q = cpi->active_worst_quality;
+
+  int i;
+  int last_error = INT_MAX;
+  int target_bits_per_mb;
+  int bits_per_mb_at_this_q;
+  double correction_factor;
+
+  // Reset Zbin OQ value
+  cpi->zbin_over_quant = 0;
+
+  // Select the appropriate correction factor based upon type of frame.
+  if (cpi->common.frame_type == KEY_FRAME)
+    correction_factor = cpi->key_frame_rate_correction_factor;
+  else {
+    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+      correction_factor = cpi->gf_rate_correction_factor;
+    else
+      correction_factor = cpi->rate_correction_factor;
+  }
+
+  // Calculate the required scaling factor based on the target frame size
+  // and the size of the frame produced using the previous Q.
+  if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
+    // Case where we would overflow int
+    target_bits_per_mb =
+      (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS;
+  else
+    target_bits_per_mb =
+      (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
+
+  i = cpi->active_best_quality;
+
+  do {
+    bits_per_mb_at_this_q =
+      (int)(.5 + correction_factor *
+            vp9_bits_per_mb(cpi->common.frame_type, i));
+
+    if (bits_per_mb_at_this_q <= target_bits_per_mb) {
+      if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
+        Q = i;
+      else
+        Q = i - 1;
+
+      break;
+    } else
+      last_error = bits_per_mb_at_this_q - target_bits_per_mb;
+  } while (++i <= cpi->active_worst_quality);
+
+
+  // If we are at MAXQ then enable Q over-run, which seeks to claw back
+  // additional bits through things like the RD multiplier and zero bin size.
+  if (Q >= MAXQ) {
+    int zbin_oqmax;
+
+    double Factor = 0.99;
+    double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX;
+
+    if (cpi->common.frame_type == KEY_FRAME)
+      zbin_oqmax = 0; // ZBIN_OQ_MAX/16
+    else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))
+      zbin_oqmax = 16;
+    else
+      zbin_oqmax = ZBIN_OQ_MAX;
+
+    // Each increment in the zbin is assumed to have a fixed effect on
+    // bitrate. This is of course not true: the effect will be highly clip
+    // dependent and may well have sudden steps. The idea here is to achieve
+    // higher effective quantizers than the normal maximum by expanding the
+    // zero bin and hence decreasing the number of low magnitude nonzero
+    // coefficients.
+    while (cpi->zbin_over_quant < zbin_oqmax) {
+      cpi->zbin_over_quant++;
+
+      if (cpi->zbin_over_quant > zbin_oqmax)
+        cpi->zbin_over_quant = zbin_oqmax;
+
+      // Adjust bits_per_mb_at_this_q estimate
+      bits_per_mb_at_this_q = (int)(Factor * bits_per_mb_at_this_q);
+      Factor += factor_adjustment;
+
+      if (Factor  >= 0.999)
+        Factor = 0.999;
+
+      if (bits_per_mb_at_this_q <= target_bits_per_mb)    // Break out if we get down to the target rate
+        break;
+    }
+
+  }
+
+  return Q;
+}
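+
+/* Example of the zbin over-run loop above: each step scales the per-MB rate
+ * estimate by Factor (0.99, creeping towards the 0.999 cap), so e.g. 16
+ * steps shrink the estimate to roughly 0.99^16 ~= 0.85 of its starting
+ * value, giving a gentle path to rates below what MAXQ alone can reach.
+ */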
+
+
+static int estimate_keyframe_frequency(VP9_COMP *cpi) {
+  int i;
+
+  // Average key frame frequency
+  int av_key_frame_frequency = 0;
+
+  /* First key frame at start of sequence is a special case. We have no
+   * frequency data.
+   */
+  if (cpi->key_frame_count == 1) {
+    /* Assume a default of 1 kf every 2 seconds, or the max kf interval,
+     * whichever is smaller.
+     */
+    int key_freq = cpi->oxcf.key_freq > 0 ? cpi->oxcf.key_freq : 1;
+    av_key_frame_frequency = (int)cpi->output_frame_rate * 2;
+
+    if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
+      av_key_frame_frequency = cpi->oxcf.key_freq;
+
+    cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1]
+      = av_key_frame_frequency;
+  } else {
+    unsigned int total_weight = 0;
+    int last_kf_interval =
+      (cpi->frames_since_key > 0) ? cpi->frames_since_key : 1;
+
+    /* reset keyframe context and calculate weighted average of last
+     * KEY_FRAME_CONTEXT keyframes
+     */
+    for (i = 0; i < KEY_FRAME_CONTEXT; i++) {
+      if (i < KEY_FRAME_CONTEXT - 1)
+        cpi->prior_key_frame_distance[i]
+          = cpi->prior_key_frame_distance[i + 1];
+      else
+        cpi->prior_key_frame_distance[i] = last_kf_interval;
+
+      av_key_frame_frequency += prior_key_frame_weight[i]
+                                * cpi->prior_key_frame_distance[i];
+      total_weight += prior_key_frame_weight[i];
+    }
+
+    av_key_frame_frequency /= total_weight;
+  }
+  return av_key_frame_frequency;
+}
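+
+/* Worked example of the weighted average: with prior_key_frame_weight =
+ * { 1, 2, 3, 4, 5 } and prior distances { 30, 30, 30, 30, 120 }, the
+ * estimate is (30 + 60 + 90 + 120 + 600) / 15 = 60 frames, so the most
+ * recent interval dominates.
+ */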
+
+
+void vp9_adjust_key_frame_context(VP9_COMP *cpi) {
+  // Clear down mmx registers to allow floating point in what follows
+  vp9_clear_system_state();
+
+  cpi->frames_since_key = 0;
+  cpi->key_frame_count++;
+}
+
+
+void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit,
+                                   int *frame_over_shoot_limit) {
+  // Set-up bounds on acceptable frame size:
+  if (cpi->oxcf.fixed_q >= 0) {
+    // Fixed Q scenario: the frame size is unconstrained (there is no rate
+    // target).
+    *frame_under_shoot_limit = 0;
+    *frame_over_shoot_limit  = INT_MAX;
+  } else {
+    if (cpi->common.frame_type == KEY_FRAME) {
+      *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;
+      *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+    } else {
+      if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) {
+        *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;
+        *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+      } else {
+        // Strong overshoot limit for constrained quality
+        if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+          *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
+          *frame_under_shoot_limit = cpi->this_frame_target * 2 / 8;
+        } else {
+          *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
+          *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+        }
+      }
+    }
+
+    // For very small rate targets where the fractional adjustment
+    // (e.g. * 7/8) may be tiny, make sure there is at least a minimum
+    // range.
+    *frame_over_shoot_limit += 200;
+    *frame_under_shoot_limit -= 200;
+    if (*frame_under_shoot_limit < 0)
+      *frame_under_shoot_limit = 0;
+  }
+}
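+
+/* Example: a key frame with this_frame_target = 8000 bits gets an overshoot
+ * limit of 8000 * 9 / 8 + 200 = 9200 and an undershoot limit of
+ * 8000 * 7 / 8 - 200 = 6800.
+ */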
+
+
+// return of 0 means drop frame
+int vp9_pick_frame_size(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  if (cm->frame_type == KEY_FRAME)
+    calc_iframe_target_size(cpi);
+  else
+    calc_pframe_target_size(cpi);
+
+  return 1;
+}
--- /dev/null
+++ b/vp9/encoder/ratectrl.h
@@ -1,0 +1,37 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#if !defined __INC_RATECTRL_H
+#define __INC_RATECTRL_H
+
+#include "onyx_int.h"
+
+#define FRAME_OVERHEAD_BITS 200
+
+extern void vp9_save_coding_context(VP9_COMP *cpi);
+extern void vp9_restore_coding_context(VP9_COMP *cpi);
+
+extern void vp9_setup_key_frame(VP9_COMP *cpi);
+extern void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
+extern int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame);
+extern void vp9_adjust_key_frame_context(VP9_COMP *cpi);
+extern void vp9_compute_frame_size_bounds(VP9_COMP *cpi,
+                                          int *frame_under_shoot_limit,
+                                          int *frame_over_shoot_limit);
+
+// return of 0 means drop frame
+extern int vp9_pick_frame_size(VP9_COMP *cpi);
+
+extern double vp9_convert_qindex_to_q(int qindex);
+extern int vp9_gfboost_qadjust(int qindex);
+extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex);
+void vp9_setup_inter_frame(VP9_COMP *cpi);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/rdopt.c
@@ -1,0 +1,4854 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+#include <math.h>
+#include <limits.h>
+#include <assert.h>
+#include "vp9/common/pragmas.h"
+
+#include "tokenize.h"
+#include "treewriter.h"
+#include "onyx_int.h"
+#include "modecosts.h"
+#include "encodeintra.h"
+#include "vp9/common/entropymode.h"
+#include "vp9/common/reconinter.h"
+#include "vp9/common/reconintra.h"
+#include "vp9/common/reconintra4x4.h"
+#include "vp9/common/findnearmv.h"
+#include "vp9/common/quant_common.h"
+#include "encodemb.h"
+#include "quantize.h"
+#include "vp9/common/idct.h"
+#include "variance.h"
+#include "mcomp.h"
+#include "rdopt.h"
+#include "ratectrl.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/systemdependent.h"
+#include "vp9/encoder/encodemv.h"
+
+#include "vp9/common/seg_common.h"
+#include "vp9/common/pred_common.h"
+#include "vp9/common/entropy.h"
+#include "vpx_rtcd.h"
+#if CONFIG_NEWBESTREFMV
+#include "vp9/common/mvref_common.h"
+#endif
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x)  (x)
+#else
+#define IF_RTCD(x)  NULL
+#endif
+
+extern void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x);
+extern void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x);
+
+#define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
+
+#define INVALID_MV 0x80008000
+
+/* Factor to weigh the rate for switchable interp filters */
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
+static const int auto_speed_thresh[17] = {
+  1000,
+  200,
+  150,
+  130,
+  150,
+  125,
+  120,
+  115,
+  115,
+  115,
+  115,
+  115,
+  115,
+  115,
+  115,
+  115,
+  105
+};
+
+#if CONFIG_PRED_FILTER
+const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
+  {ZEROMV,    LAST_FRAME,   0,  0},
+  {ZEROMV,    LAST_FRAME,   0,  1},
+  {DC_PRED,   INTRA_FRAME,  0,  0},
+
+  {NEARESTMV, LAST_FRAME,   0,  0},
+  {NEARESTMV, LAST_FRAME,   0,  1},
+  {NEARMV,    LAST_FRAME,   0,  0},
+  {NEARMV,    LAST_FRAME,   0,  1},
+
+  {ZEROMV,    GOLDEN_FRAME, 0,  0},
+  {ZEROMV,    GOLDEN_FRAME, 0,  1},
+  {NEARESTMV, GOLDEN_FRAME, 0,  0},
+  {NEARESTMV, GOLDEN_FRAME, 0,  1},
+
+  {ZEROMV,    ALTREF_FRAME, 0,  0},
+  {ZEROMV,    ALTREF_FRAME, 0,  1},
+  {NEARESTMV, ALTREF_FRAME, 0,  0},
+  {NEARESTMV, ALTREF_FRAME, 0,  1},
+
+  {NEARMV,    GOLDEN_FRAME, 0,  0},
+  {NEARMV,    GOLDEN_FRAME, 0,  1},
+  {NEARMV,    ALTREF_FRAME, 0,  0},
+  {NEARMV,    ALTREF_FRAME, 0,  1},
+
+  {V_PRED,    INTRA_FRAME,  0,  0},
+  {H_PRED,    INTRA_FRAME,  0,  0},
+  {D45_PRED,  INTRA_FRAME,  0,  0},
+  {D135_PRED, INTRA_FRAME,  0,  0},
+  {D117_PRED, INTRA_FRAME,  0,  0},
+  {D153_PRED, INTRA_FRAME,  0,  0},
+  {D27_PRED,  INTRA_FRAME,  0,  0},
+  {D63_PRED,  INTRA_FRAME,  0,  0},
+
+  {TM_PRED,   INTRA_FRAME,  0,  0},
+
+  {NEWMV,     LAST_FRAME,   0,  0},
+  {NEWMV,     LAST_FRAME,   0,  1},
+  {NEWMV,     GOLDEN_FRAME, 0,  0},
+  {NEWMV,     GOLDEN_FRAME, 0,  1},
+  {NEWMV,     ALTREF_FRAME, 0,  0},
+  {NEWMV,     ALTREF_FRAME, 0,  1},
+
+  {SPLITMV,   LAST_FRAME,   0,  0},
+  {SPLITMV,   GOLDEN_FRAME, 0,  0},
+  {SPLITMV,   ALTREF_FRAME, 0,  0},
+
+  {B_PRED,    INTRA_FRAME,  0,  0},
+  {I8X8_PRED, INTRA_FRAME,  0,  0},
+
+  /* compound prediction modes */
+  {ZEROMV,    LAST_FRAME,   GOLDEN_FRAME, 0},
+  {NEARESTMV, LAST_FRAME,   GOLDEN_FRAME, 0},
+  {NEARMV,    LAST_FRAME,   GOLDEN_FRAME, 0},
+
+  {ZEROMV,    ALTREF_FRAME, LAST_FRAME,   0},
+  {NEARESTMV, ALTREF_FRAME, LAST_FRAME,   0},
+  {NEARMV,    ALTREF_FRAME, LAST_FRAME,   0},
+
+  {ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME, 0},
+  {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME, 0},
+  {NEARMV,    GOLDEN_FRAME, ALTREF_FRAME, 0},
+
+  {NEWMV,     LAST_FRAME,   GOLDEN_FRAME, 0},
+  {NEWMV,     ALTREF_FRAME, LAST_FRAME,   0},
+  {NEWMV,     GOLDEN_FRAME, ALTREF_FRAME, 0},
+
+  {SPLITMV,   LAST_FRAME,   GOLDEN_FRAME, 0},
+  {SPLITMV,   ALTREF_FRAME, LAST_FRAME,   0},
+  {SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME, 0}
+};
+#else
+const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
+  {ZEROMV,    LAST_FRAME,   0},
+  {DC_PRED,   INTRA_FRAME,  0},
+
+  {NEARESTMV, LAST_FRAME,   0},
+  {NEARMV,    LAST_FRAME,   0},
+
+  {ZEROMV,    GOLDEN_FRAME, 0},
+  {NEARESTMV, GOLDEN_FRAME, 0},
+
+  {ZEROMV,    ALTREF_FRAME, 0},
+  {NEARESTMV, ALTREF_FRAME, 0},
+
+  {NEARMV,    GOLDEN_FRAME, 0},
+  {NEARMV,    ALTREF_FRAME, 0},
+
+  {V_PRED,    INTRA_FRAME,  0},
+  {H_PRED,    INTRA_FRAME,  0},
+  {D45_PRED,  INTRA_FRAME,  0},
+  {D135_PRED, INTRA_FRAME,  0},
+  {D117_PRED, INTRA_FRAME,  0},
+  {D153_PRED, INTRA_FRAME,  0},
+  {D27_PRED,  INTRA_FRAME,  0},
+  {D63_PRED,  INTRA_FRAME,  0},
+
+  {TM_PRED,   INTRA_FRAME,  0},
+
+  {NEWMV,     LAST_FRAME,   0},
+  {NEWMV,     GOLDEN_FRAME, 0},
+  {NEWMV,     ALTREF_FRAME, 0},
+
+  {SPLITMV,   LAST_FRAME,   0},
+  {SPLITMV,   GOLDEN_FRAME, 0},
+  {SPLITMV,   ALTREF_FRAME, 0},
+
+  {B_PRED,    INTRA_FRAME,  0},
+  {I8X8_PRED, INTRA_FRAME,  0},
+
+  /* compound prediction modes */
+  {ZEROMV,    LAST_FRAME,   GOLDEN_FRAME},
+  {NEARESTMV, LAST_FRAME,   GOLDEN_FRAME},
+  {NEARMV,    LAST_FRAME,   GOLDEN_FRAME},
+
+  {ZEROMV,    ALTREF_FRAME, LAST_FRAME},
+  {NEARESTMV, ALTREF_FRAME, LAST_FRAME},
+  {NEARMV,    ALTREF_FRAME, LAST_FRAME},
+
+  {ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME},
+  {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
+  {NEARMV,    GOLDEN_FRAME, ALTREF_FRAME},
+
+  {NEWMV,     LAST_FRAME,   GOLDEN_FRAME},
+  {NEWMV,     ALTREF_FRAME, LAST_FRAME  },
+  {NEWMV,     GOLDEN_FRAME, ALTREF_FRAME},
+
+  {SPLITMV,   LAST_FRAME,   GOLDEN_FRAME},
+  {SPLITMV,   ALTREF_FRAME, LAST_FRAME  },
+  {SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME}
+};
+#endif
+
+static void fill_token_costs(
+  unsigned int (*c)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS],
+  const vp9_prob(*p)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES],
+  int block_type_counts) {
+  int i, j, k;
+
+  for (i = 0; i < block_type_counts; i++)
+    for (j = 0; j < COEF_BANDS; j++)
+      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+        if (k == 0 && ((j > 0 && i > 0) || (j > 1 && i == 0)))
+          vp9_cost_tokens_skip((int *)(c[i][j][k]),
+                               p[i][j][k],
+                               vp9_coef_tree);
+        else
+          vp9_cost_tokens((int *)(c[i][j][k]),
+                          p[i][j][k],
+                          vp9_coef_tree);
+      }
+}
+
+
+static int rd_iifactor[32] =  { 4, 4, 3, 2, 1, 0, 0, 0,
+                                0, 0, 0, 0, 0, 0, 0, 0,
+                                0, 0, 0, 0, 0, 0, 0, 0,
+                                0, 0, 0, 0, 0, 0, 0, 0, };
+
+// 3* dc_qlookup[Q]*dc_qlookup[Q];
+
+/* values are now correlated to quantizer */
+static int sad_per_bit16lut[QINDEX_RANGE];
+static int sad_per_bit4lut[QINDEX_RANGE];
+
+void vp9_init_me_luts() {
+  int i;
+
+  // Initialize the sad lut tables using a formulaic calculation for now
+  // This is to make it easier to resolve the impact of experimental changes
+  // to the quantizer tables.
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    sad_per_bit16lut[i] =
+      (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
+    sad_per_bit4lut[i] = (int)((0.063 * vp9_convert_qindex_to_q(i)) + 2.742);
+  }
+}
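+
+/* Example: at a qindex where vp9_convert_qindex_to_q() returns 20.0, the
+ * tables come out as sadperbit16 = (int)(0.0418 * 20 + 2.4107) = 3 and
+ * sadperbit4 = (int)(0.063 * 20 + 2.742) = 4.
+ */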
+
+static int compute_rd_mult(int qindex) {
+  int q;
+
+  q = vp9_dc_quant(qindex, 0);
+  return (11 * q * q) >> 6;
+}
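+
+/* Example: with a DC quantizer step of, say, 40, compute_rd_mult() returns
+ * (11 * 40 * 40) >> 6 = 17600 >> 6 = 275; the RD lambda thus grows roughly
+ * with the square of the quantizer step.
+ */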
+
+void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex) {
+  cpi->mb.sadperbit16 =  sad_per_bit16lut[QIndex];
+  cpi->mb.sadperbit4  =  sad_per_bit4lut[QIndex];
+}
+
+
+void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) {
+  int q, i;
+
+  vp9_clear_system_state();  // __asm emms;
+
+  // Further tests required to see if optimum is different
+  // for key frames, golden frames and arf frames.
+  // if (cpi->common.refresh_golden_frame ||
+  //     cpi->common.refresh_alt_ref_frame)
+  QIndex = (QIndex < 0) ? 0 : ((QIndex > MAXQ) ? MAXQ : QIndex);
+
+  cpi->RDMULT = compute_rd_mult(QIndex);
+
+  // Extend rate multiplier along side quantizer zbin increases
+  if (cpi->zbin_over_quant  > 0) {
+    double oq_factor;
+
+    // Experimental code using the same basic equation as used for Q above
+    // The units of cpi->zbin_over_quant are 1/128 of Q bin size
+    oq_factor = 1.0 + ((double)0.0015625 * cpi->zbin_over_quant);
+    cpi->RDMULT = (int)((double)cpi->RDMULT * oq_factor * oq_factor);
+  }
+
+  if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
+    if (cpi->twopass.next_iiratio > 31)
+      cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
+    else
+      cpi->RDMULT +=
+        (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
+  }
+
+  if (cpi->RDMULT < 7)
+    cpi->RDMULT = 7;
+
+  cpi->mb.errorperbit = (cpi->RDMULT / 110);
+  cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
+
+  vp9_set_speed_features(cpi);
+
+  q = (int)pow(vp9_dc_quant(QIndex, 0) >> 2, 1.25);
+  q = q << 2;
+  cpi->RDMULT = cpi->RDMULT << 4;
+
+  if (q < 8)
+    q = 8;
+
+  if (cpi->RDMULT > 1000) {
+    cpi->RDDIV = 1;
+    cpi->RDMULT /= 100;
+
+    for (i = 0; i < MAX_MODES; i++) {
+      if (cpi->sf.thresh_mult[i] < INT_MAX) {
+        cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100;
+      } else {
+        cpi->rd_threshes[i] = INT_MAX;
+      }
+
+      cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+    }
+  } else {
+    cpi->RDDIV = 100;
+
+    for (i = 0; i < MAX_MODES; i++) {
+      if (cpi->sf.thresh_mult[i] < (INT_MAX / q)) {
+        cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q;
+      } else {
+        cpi->rd_threshes[i] = INT_MAX;
+      }
+
+      cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+    }
+  }
+
+  fill_token_costs(
+    cpi->mb.token_costs[TX_4X4],
+    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs,
+    BLOCK_TYPES);
+  fill_token_costs(
+    cpi->mb.hybrid_token_costs[TX_4X4],
+    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11])
+    cpi->common.fc.hybrid_coef_probs,
+    BLOCK_TYPES);
+
+  fill_token_costs(
+    cpi->mb.token_costs[TX_8X8],
+    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_8x8,
+    BLOCK_TYPES_8X8);
+  fill_token_costs(
+    cpi->mb.hybrid_token_costs[TX_8X8],
+    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11])
+    cpi->common.fc.hybrid_coef_probs_8x8,
+    BLOCK_TYPES_8X8);
+
+  fill_token_costs(
+    cpi->mb.token_costs[TX_16X16],
+    (const vp9_prob(*)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_16x16,
+    BLOCK_TYPES_16X16);
+  fill_token_costs(
+    cpi->mb.hybrid_token_costs[TX_16X16],
+    (const vp9_prob(*)[8][PREV_COEF_CONTEXTS][11])
+    cpi->common.fc.hybrid_coef_probs_16x16,
+    BLOCK_TYPES_16X16);
+
+  /*rough estimate for costing*/
+  cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
+  vp9_init_mode_costs(cpi);
+
+  if (cpi->common.frame_type != KEY_FRAME) {
+    vp9_build_nmv_cost_table(
+        cpi->mb.nmvjointcost,
+        cpi->mb.e_mbd.allow_high_precision_mv ?
+        cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
+        &cpi->common.fc.nmvc,
+        cpi->mb.e_mbd.allow_high_precision_mv, 1, 1);
+  }
+}
+
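+// Adapt the speed setting so that the measured per-frame encode and mode
+// pick times stay within the time budget derived from the frame rate and
+// the cpu_used setting.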
+void vp9_auto_select_speed(VP9_COMP *cpi) {
+  int milliseconds_for_compress = (int)(1000000 / cpi->oxcf.frame_rate);
+
+  milliseconds_for_compress =
+      milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;
+
+  /*
+  // this is done during parameter valid check
+  if( cpi->oxcf.cpu_used > 16)
+      cpi->oxcf.cpu_used = 16;
+  if( cpi->oxcf.cpu_used < -16)
+      cpi->oxcf.cpu_used = -16;
+  */
+
+  if (cpi->avg_pick_mode_time < milliseconds_for_compress &&
+      (cpi->avg_encode_time - cpi->avg_pick_mode_time) <
+      milliseconds_for_compress) {
+    if (cpi->avg_pick_mode_time == 0) {
+      cpi->Speed = 4;
+    } else {
+      if (milliseconds_for_compress * 100 < cpi->avg_encode_time * 95) {
+        cpi->Speed          += 2;
+        cpi->avg_pick_mode_time = 0;
+        cpi->avg_encode_time = 0;
+
+        if (cpi->Speed > 16) {
+          cpi->Speed = 16;
+        }
+      }
+
+      if (milliseconds_for_compress * 100 >
+          cpi->avg_encode_time * auto_speed_thresh[cpi->Speed]) {
+        cpi->Speed          -= 1;
+        cpi->avg_pick_mode_time = 0;
+        cpi->avg_encode_time = 0;
+
+        // In real-time mode, cpi->Speed is in [4, 16].
+        if (cpi->Speed < 4) {
+          cpi->Speed = 4;
+        }
+      }
+    }
+  } else {
+    cpi->Speed += 4;
+
+    if (cpi->Speed > 16)
+      cpi->Speed = 16;
+
+    cpi->avg_pick_mode_time = 0;
+    cpi->avg_encode_time = 0;
+  }
+}
+
+int vp9_block_error_c(short *coeff, short *dqcoeff, int block_size) {
+  int i, error = 0;
+
+  for (i = 0; i < block_size; i++) {
+    int this_diff = coeff[i] - dqcoeff[i];
+    error += this_diff * this_diff;
+  }
+
+  return error;
+}
+
+int vp9_mbblock_error_c(MACROBLOCK *mb, int dc) {
+  BLOCK  *be;
+  BLOCKD *bd;
+  int i, j;
+  int berror, error = 0;
+
+  for (i = 0; i < 16; i++) {
+    be = &mb->block[i];
+    bd = &mb->e_mbd.block[i];
+
+    berror = 0;
+
+    for (j = dc; j < 16; j++) {
+      int this_diff = be->coeff[j] - bd->dqcoeff[j];
+      berror += this_diff * this_diff;
+    }
+
+    error += berror;
+  }
+
+  return error;
+}
+
+int vp9_mbuverror_c(MACROBLOCK *mb) {
+  BLOCK  *be;
+  BLOCKD *bd;
+
+  int i, error = 0;
+
+  for (i = 16; i < 24; i++) {
+    be = &mb->block[i];
+    bd = &mb->e_mbd.block[i];
+
+    error += vp9_block_error_c(be->coeff, bd->dqcoeff, 16);
+  }
+
+  return error;
+}
+
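+// Estimate the chroma (U/V) SSE for the current 16x16 motion vector: the
+// luma MV is scaled to half resolution, rounding away from zero, and the
+// two 8x8 chroma blocks are compared using (sub-pixel) variance.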
+int vp9_uvsse(MACROBLOCK *x) {
+  unsigned char *uptr, *vptr;
+  unsigned char *upred_ptr = (*(x->block[16].base_src) + x->block[16].src);
+  unsigned char *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src);
+  int uv_stride = x->block[16].src_stride;
+
+  unsigned int sse1 = 0;
+  unsigned int sse2 = 0;
+  int mv_row = x->e_mbd.mode_info_context->mbmi.mv[0].as_mv.row;
+  int mv_col = x->e_mbd.mode_info_context->mbmi.mv[0].as_mv.col;
+  int offset;
+  int pre_stride = x->e_mbd.block[16].pre_stride;
+
+  if (mv_row < 0)
+    mv_row -= 1;
+  else
+    mv_row += 1;
+
+  if (mv_col < 0)
+    mv_col -= 1;
+  else
+    mv_col += 1;
+
+  mv_row /= 2;
+  mv_col /= 2;
+
+  offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+  uptr = x->e_mbd.pre.u_buffer + offset;
+  vptr = x->e_mbd.pre.v_buffer + offset;
+
+  if ((mv_row | mv_col) & 7) {
+    vp9_sub_pixel_variance8x8(uptr, pre_stride, (mv_col & 7) << 1,
+                              (mv_row & 7) << 1, upred_ptr, uv_stride, &sse2);
+    vp9_sub_pixel_variance8x8(vptr, pre_stride, (mv_col & 7) << 1,
+                              (mv_row & 7) << 1, vpred_ptr, uv_stride, &sse1);
+    sse2 += sse1;
+  } else {
+    vp9_variance8x8(uptr, pre_stride, upred_ptr, uv_stride, &sse2);
+    vp9_variance8x8(vptr, pre_stride, vpred_ptr, uv_stride, &sse1);
+    sse2 += sse1;
+  }
+  return sse2;
+}
+
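+// Token cost of the 2x2 second-order (Y2) block; only four coefficients
+// exist, and the TX_8X8 token cost tables are reused for them.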
+static int cost_coeffs_2x2(MACROBLOCK *mb,
+                           BLOCKD *b, PLANE_TYPE type,
+                           ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
+  int c = (type == PLANE_TYPE_Y_NO_DC); /* start at coef 0, unless Y with Y2 */
+  int eob = b->eob;
+  int pt;    /* surrounding block/prev coef predictor */
+  int cost = 0;
+  short *qcoeff_ptr = b->qcoeff;
+
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  assert(eob <= 4);
+
+  for (; c < eob; c++) {
+    int v = qcoeff_ptr[vp9_default_zig_zag1d[c]];
+    int t = vp9_dct_value_tokens_ptr[v].Token;
+    cost += mb->token_costs[TX_8X8][type][vp9_coef_bands[c]][pt][t];
+    cost += vp9_dct_value_cost_ptr[v];
+    pt = vp9_prev_token_class[t];
+  }
+
+  if (c < 4)
+    cost += mb->token_costs[TX_8X8][type][vp9_coef_bands[c]]
+            [pt][DCT_EOB_TOKEN];
+
+  pt = (c != !type); // 0 if the eob is at the first coefficient
+  *a = *l = pt;
+  return cost;
+}
+
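+// Generic coefficient token cost for a single block.  The scan order and
+// band tables depend on the transform size; for luma blocks a hybrid
+// (ADST-based) transform may switch to a row or column scan, and a
+// segment-level EOB limit, when active, caps where the EOB token is
+// charged.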
+static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type,
+                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                       int tx_size) {
+  const int eob = b->eob;
+  int c = (type == PLANE_TYPE_Y_NO_DC); /* start at coef 0, unless Y with Y2 */
+  int cost = 0, default_eob, seg_eob;
+  int pt;                     /* surrounding block/prev coef predictor */
+  int const *scan, *band;
+  short *qcoeff_ptr = b->qcoeff;
+  MACROBLOCKD *xd = &mb->e_mbd;
+  MB_MODE_INFO *mbmi = &mb->e_mbd.mode_info_context->mbmi;
+  TX_TYPE tx_type = DCT_DCT;
+  int segment_id = mbmi->segment_id;
+
+  switch (tx_size) {
+    case TX_4X4:
+      scan = vp9_default_zig_zag1d;
+      band = vp9_coef_bands;
+      default_eob = 16;
+      if (type == PLANE_TYPE_Y_WITH_DC) {
+        tx_type = get_tx_type_4x4(xd, b);
+        if (tx_type != DCT_DCT) {
+          switch (tx_type) {
+            case ADST_DCT:
+              scan = vp9_row_scan;
+              break;
+
+            case DCT_ADST:
+              scan = vp9_col_scan;
+              break;
+
+            default:
+              scan = vp9_default_zig_zag1d;
+              break;
+          }
+        }
+      }
+
+      break;
+    case TX_8X8:
+      scan = vp9_default_zig_zag1d_8x8;
+      band = vp9_coef_bands_8x8;
+      default_eob = 64;
+      if (type == PLANE_TYPE_Y_WITH_DC) {
+        BLOCKD *bb;
+        int ib = (b - xd->block);
+        if (ib < 16) {
+          ib = (ib & 8) + ((ib & 4) >> 1);
+          bb = xd->block + ib;
+          tx_type = get_tx_type_8x8(xd, bb);
+        }
+      }
+      break;
+    case TX_16X16:
+      scan = vp9_default_zig_zag1d_16x16;
+      band = vp9_coef_bands_16x16;
+      default_eob = 256;
+      if (type == PLANE_TYPE_Y_WITH_DC) {
+        tx_type = get_tx_type_16x16(xd, b);
+      }
+      break;
+    default:
+      break;
+  }
+  if (vp9_segfeature_active(&mb->e_mbd, segment_id, SEG_LVL_EOB))
+    seg_eob = vp9_get_segdata(&mb->e_mbd, segment_id, SEG_LVL_EOB);
+  else
+    seg_eob = default_eob;
+
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+  if (tx_type != DCT_DCT) {
+    for (; c < eob; c++) {
+      int v = qcoeff_ptr[scan[c]];
+      int t = vp9_dct_value_tokens_ptr[v].Token;
+      cost += mb->hybrid_token_costs[tx_size][type][band[c]][pt][t];
+      cost += vp9_dct_value_cost_ptr[v];
+      pt = vp9_prev_token_class[t];
+    }
+    if (c < seg_eob)
+      cost += mb->hybrid_token_costs[tx_size][type][band[c]]
+          [pt][DCT_EOB_TOKEN];
+  } else {
+    for (; c < eob; c++) {
+      int v = qcoeff_ptr[scan[c]];
+      int t = vp9_dct_value_tokens_ptr[v].Token;
+      cost += mb->token_costs[tx_size][type][band[c]][pt][t];
+      cost += vp9_dct_value_cost_ptr[v];
+      pt = vp9_prev_token_class[t];
+    }
+    if (c < seg_eob)
+      cost += mb->token_costs[tx_size][type][band[c]]
+          [pt][DCT_EOB_TOKEN];
+  }
+
+  pt = (c != !type); // 0 if the eob is at the first coefficient
+  *a = *l = pt;
+  return cost;
+}
+
+static int rdcost_mby_4x4(MACROBLOCK *mb) {
+  int cost = 0;
+  int b;
+  MACROBLOCKD *xd = &mb->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta;
+  ENTROPY_CONTEXT *tl;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+
+  for (b = 0; b < 16; b++)
+    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC,
+                        ta + vp9_block2above[b], tl + vp9_block2left[b],
+                        TX_4X4);
+
+  cost += cost_coeffs(mb, xd->block + 24, PLANE_TYPE_Y2,
+                      ta + vp9_block2above[24], tl + vp9_block2left[24],
+                      TX_4X4);
+
+  return cost;
+}
+
+static void macro_block_yrd_4x4(MACROBLOCK *mb,
+                                int *Rate,
+                                int *Distortion,
+                                const VP9_ENCODER_RTCD *rtcd,
+                                int *skippable) {
+  int b;
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  BLOCK   *const mb_y2 = mb->block + 24;
+  BLOCKD *const x_y2  = xd->block + 24;
+  short *Y2DCPtr = mb_y2->src_diff;
+  BLOCK *beptr;
+  int d;
+
+  vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), xd->predictor,
+                   mb->block[0].src_stride);
+
+  // FDCT and build the 2nd order block
+  for (beptr = mb->block; beptr < mb->block + 16; beptr += 2) {
+    mb->vp9_short_fdct8x4(beptr->src_diff, beptr->coeff, 32);
+    *Y2DCPtr++ = beptr->coeff[0];
+    *Y2DCPtr++ = beptr->coeff[16];
+  }
+
+  // 2nd order fdct
+  mb->short_walsh4x4(mb_y2->src_diff, mb_y2->coeff, 8);
+
+  // Quantization
+  for (b = 0; b < 16; b++) {
+    mb->quantize_b_4x4(&mb->block[b], &xd->block[b]);
+  }
+
+  // DC prediction and quantization of the 2nd order block
+  mb->quantize_b_4x4(mb_y2, x_y2);
+
+  // Distortion
+  d = vp9_mbblock_error(mb, 1);
+
+  d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);
+
+  *Distortion = (d >> 2);
+  // rate
+  *Rate = rdcost_mby_4x4(mb);
+  *skippable = vp9_mby_is_skippable_4x4(&mb->e_mbd, 1);
+}
+
+static int rdcost_mby_8x8(MACROBLOCK *mb, int backup) {
+  int cost = 0;
+  int b;
+  MACROBLOCKD *xd = &mb->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta;
+  ENTROPY_CONTEXT *tl;
+
+  if (backup) {
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+  } else {
+    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
+    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
+  }
+
+  for (b = 0; b < 16; b += 4)
+    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC,
+                        ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
+                        TX_8X8);
+
+  cost += cost_coeffs_2x2(mb, xd->block + 24, PLANE_TYPE_Y2,
+                          ta + vp9_block2above[24], tl + vp9_block2left[24]);
+  return cost;
+}
+
+static void macro_block_yrd_8x8(MACROBLOCK *mb,
+                                int *Rate,
+                                int *Distortion,
+                                const VP9_ENCODER_RTCD *rtcd,
+                                int *skippable) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  BLOCK   *const mb_y2 = mb->block + 24;
+  BLOCKD *const x_y2  = xd->block + 24;
+  int d;
+
+  vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), xd->predictor,
+                   mb->block[0].src_stride);
+
+  vp9_transform_mby_8x8(mb);
+  vp9_quantize_mby_8x8(mb);
+
+  /* remove 1st order dc to properly combine 1st/2nd order distortion */
+  mb->coeff[0] = 0;
+  mb->coeff[64] = 0;
+  mb->coeff[128] = 0;
+  mb->coeff[192] = 0;
+  xd->dqcoeff[0] = 0;
+  xd->dqcoeff[64] = 0;
+  xd->dqcoeff[128] = 0;
+  xd->dqcoeff[192] = 0;
+
+  d = vp9_mbblock_error(mb, 0);
+  d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);
+
+  *Distortion = (d >> 2);
+  // rate
+  *Rate = rdcost_mby_8x8(mb, 1);
+  *skippable = vp9_mby_is_skippable_8x8(&mb->e_mbd, 1);
+}
+
+static int rdcost_mby_16x16(MACROBLOCK *mb) {
+  int cost;
+  MACROBLOCKD *xd = &mb->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta, *tl;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+
+  cost = cost_coeffs(mb, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
+  return cost;
+}
+
+static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
+                                  const VP9_ENCODER_RTCD *rtcd, int *skippable) {
+  int d;
+  MACROBLOCKD *xd = &mb->e_mbd;
+  BLOCKD *b  = &mb->e_mbd.block[0];
+  BLOCK  *be = &mb->block[0];
+  TX_TYPE tx_type;
+
+  vp9_subtract_mby(mb->src_diff, *(mb->block[0].base_src), mb->e_mbd.predictor,
+                   mb->block[0].src_stride);
+
+  tx_type = get_tx_type_16x16(xd, b);
+  if (tx_type != DCT_DCT) {
+    vp9_fht(be->src_diff, 32, be->coeff, tx_type, 16);
+  } else {
+    vp9_transform_mby_16x16(mb);
+  }
+
+  vp9_quantize_mby_16x16(mb);
+  // TODO(jingning) is it possible to quickly determine whether to force
+  //                trailing coefficients to be zero, instead of running trellis
+  //                optimization in the rate-distortion optimization loop?
+  if (mb->e_mbd.mode_info_context->mbmi.mode < I8X8_PRED)
+    vp9_optimize_mby_16x16(mb, rtcd);
+
+  d = vp9_mbblock_error(mb, 0);
+
+  *Distortion = (d >> 2);
+  // rate
+  *Rate = rdcost_mby_16x16(mb);
+  *skippable = vp9_mby_is_skippable_16x16(&mb->e_mbd);
+}
+
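+// Evaluate the luma rate/distortion at all three transform sizes, select
+// the size dictated by txfm_mode (or the best of the three under
+// TX_MODE_SELECT), and record the per-mode RD costs in txfm_cache.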
+static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                            int *distortion, int *skippable,
+                            int64_t txfm_cache[NB_TXFM_MODES]) {
+  VP9_COMMON *cm = &cpi->common;
+  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+
+  MACROBLOCKD *xd = &x->e_mbd;
+  int can_skip = cm->mb_no_coeff_skip;
+  vp9_prob skip_prob = can_skip ? vp9_get_pred_prob(cm, xd, PRED_MBSKIP) : 128;
+  int s0, s1;
+  int r4x4, r4x4s, r8x8, r8x8s, d4x4, d8x8, s4x4, s8x8;
+  int64_t rd4x4, rd8x8, rd4x4s, rd8x8s;
+  int d16x16, r16x16, r16x16s, s16x16;
+  int64_t rd16x16, rd16x16s;
+
+  // FIXME don't do sub x3
+  if (skip_prob == 0)
+    skip_prob = 1;
+  s0 = vp9_cost_bit(skip_prob, 0);
+  s1 = vp9_cost_bit(skip_prob, 1);
+  macro_block_yrd_16x16(x, &r16x16, &d16x16, IF_RTCD(&cpi->rtcd), &s16x16);
+  if (can_skip) {
+    if (s16x16) {
+      rd16x16 = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
+    } else {
+      rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16 + s0, d16x16);
+    }
+  } else {
+    rd16x16 = RDCOST(x->rdmult, x->rddiv, r16x16, d16x16);
+  }
+  r16x16s = r16x16 + vp9_cost_one(cm->prob_tx[0]) +
+            vp9_cost_one(cm->prob_tx[1]);
+  if (can_skip) {
+    if (s16x16) {
+      rd16x16s = RDCOST(x->rdmult, x->rddiv, s1, d16x16);
+    } else {
+      rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s + s0, d16x16);
+    }
+  } else {
+    rd16x16s = RDCOST(x->rdmult, x->rddiv, r16x16s, d16x16);
+  }
+  macro_block_yrd_8x8(x, &r8x8, &d8x8, IF_RTCD(&cpi->rtcd), &s8x8);
+  if (can_skip) {
+    if (s8x8) {
+      rd8x8 = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
+    } else {
+      rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8 + s0, d8x8);
+    }
+  } else {
+    rd8x8 = RDCOST(x->rdmult, x->rddiv, r8x8, d8x8);
+  }
+  r8x8s = r8x8 + vp9_cost_one(cm->prob_tx[0]);
+  r8x8s += vp9_cost_zero(cm->prob_tx[1]);
+  if (can_skip) {
+    if (s8x8) {
+      rd8x8s = RDCOST(x->rdmult, x->rddiv, s1, d8x8);
+    } else {
+      rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s + s0, d8x8);
+    }
+  } else {
+    rd8x8s = RDCOST(x->rdmult, x->rddiv, r8x8s, d8x8);
+  }
+  macro_block_yrd_4x4(x, &r4x4, &d4x4, IF_RTCD(&cpi->rtcd), &s4x4);
+  if (can_skip) {
+    if (s4x4) {
+      rd4x4 = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
+    } else {
+      rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4 + s0, d4x4);
+    }
+  } else {
+    rd4x4 = RDCOST(x->rdmult, x->rddiv, r4x4, d4x4);
+  }
+  r4x4s = r4x4 + vp9_cost_zero(cm->prob_tx[0]);
+  if (can_skip) {
+    if (s4x4) {
+      rd4x4s = RDCOST(x->rdmult, x->rddiv, s1, d4x4);
+    } else {
+      rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s + s0, d4x4);
+    }
+  } else {
+    rd4x4s = RDCOST(x->rdmult, x->rddiv, r4x4s, d4x4);
+  }
+
+  if (cpi->common.txfm_mode == ALLOW_16X16 ||
+      (cpi->common.txfm_mode == TX_MODE_SELECT &&
+       rd16x16s < rd8x8s && rd16x16s < rd4x4s)) {
+    mbmi->txfm_size = TX_16X16;
+    *skippable = s16x16;
+    *distortion = d16x16;
+    *rate = (cpi->common.txfm_mode == ALLOW_16X16) ? r16x16 : r16x16s;
+  } else if (cpi->common.txfm_mode == ALLOW_8X8 ||
+             (cpi->common.txfm_mode == TX_MODE_SELECT && rd8x8s < rd4x4s)) {
+    mbmi->txfm_size = TX_8X8;
+    *skippable = s8x8;
+    *distortion = d8x8;
+    *rate = (cpi->common.txfm_mode == ALLOW_8X8) ? r8x8 : r8x8s;
+  } else {
+    assert(cpi->common.txfm_mode == ONLY_4X4 ||
+           (cpi->common.txfm_mode == TX_MODE_SELECT && rd4x4s <= rd8x8s));
+    mbmi->txfm_size = TX_4X4;
+    *skippable = s4x4;
+    *distortion = d4x4;
+    *rate = (cpi->common.txfm_mode == ONLY_4X4) ? r4x4 : r4x4s;
+  }
+
+  txfm_cache[ONLY_4X4] = rd4x4;
+  txfm_cache[ALLOW_8X8] = rd8x8;
+  txfm_cache[ALLOW_16X16] = rd16x16;
+  if (rd16x16s < rd8x8s && rd16x16s < rd4x4s)
+    txfm_cache[TX_MODE_SELECT] = rd16x16s;
+  else
+    txfm_cache[TX_MODE_SELECT] = rd4x4s < rd8x8s ? rd4x4s : rd8x8s;
+}
+
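+// The predictor is a 2d buffer with a stride of 16; copy out the left
+// 4x4 block, one aligned 4-byte word per row.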
+static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
+  const unsigned int *p = (const unsigned int *)predictor;
+  unsigned int *d = (unsigned int *)dst;
+  d[0] = p[0];
+  d[4] = p[4];
+  d[8] = p[8];
+  d[12] = p[12];
+}
+
+#if CONFIG_SUPERBLOCKS
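+// Luma RD for a 32x32 superblock with the 8x8 transform: each of the four
+// 16x16 quadrants is subtracted, transformed, quantized and costed in
+// turn, advancing the entropy contexts per quadrant and restoring them
+// afterwards.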
+static void super_block_yrd_8x8(MACROBLOCK *x,
+                                int *rate,
+                                int *distortion,
+                                const VP9_ENCODER_RTCD *rtcd, int *skip) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *const by2 = x->block + 24;
+  BLOCKD *const bdy2  = xd->block + 24;
+  int d = 0, r = 0, n;
+  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
+  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+  ENTROPY_CONTEXT_PLANES t_above[2];
+  ENTROPY_CONTEXT_PLANES t_left[2];
+  int skippable = 1;
+
+  vpx_memcpy(t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(t_left, xd->left_context, sizeof(t_left));
+
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    vp9_subtract_mby_s_c(x->src_diff,
+                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
+                         src_y_stride,
+                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+                         dst_y_stride);
+    vp9_transform_mby_8x8(x);
+    vp9_quantize_mby_8x8(x);
+
+    /* remove 1st order dc to properly combine 1st/2nd order distortion */
+    x->coeff[  0] = 0;
+    x->coeff[ 64] = 0;
+    x->coeff[128] = 0;
+    x->coeff[192] = 0;
+    xd->dqcoeff[  0] = 0;
+    xd->dqcoeff[ 64] = 0;
+    xd->dqcoeff[128] = 0;
+    xd->dqcoeff[192] = 0;
+
+    d += vp9_mbblock_error(x, 0);
+    d += vp9_block_error(by2->coeff, bdy2->dqcoeff, 16);
+    xd->above_context = ta + x_idx;
+    xd->left_context = tl + y_idx;
+    r += rdcost_mby_8x8(x, 0);
+    skippable = skippable && vp9_mby_is_skippable_8x8(xd, 1);
+  }
+
+  *distortion = (d >> 2);
+  *rate       = r;
+  if (skip) *skip = skippable;
+  xd->above_context = ta;
+  xd->left_context = tl;
+  vpx_memcpy(xd->above_context, &t_above, sizeof(t_above));
+  vpx_memcpy(xd->left_context, &t_left, sizeof(t_left));
+}
+#endif
+
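+// As copy_predictor(), but for the left 8x8 block: two 4-byte words per
+// row over eight rows of the 16-byte-stride predictor.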
+static void copy_predictor_8x8(unsigned char *dst, const unsigned char *predictor) {
+  const unsigned int *p = (const unsigned int *)predictor;
+  unsigned int *d = (unsigned int *)dst;
+  d[0] = p[0];
+  d[1] = p[1];
+  d[4] = p[4];
+  d[5] = p[5];
+  d[8] = p[8];
+  d[9] = p[9];
+  d[12] = p[12];
+  d[13] = p[13];
+  d[16] = p[16];
+  d[17] = p[17];
+  d[20] = p[20];
+  d[21] = p[21];
+  d[24] = p[24];
+  d[25] = p[25];
+  d[28] = p[28];
+  d[29] = p[29];
+}
+
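+// Exhaustively search the 4x4 intra prediction modes for one block.  The
+// chosen mode also determines the (hybrid) transform type, so forward
+// transform, quantization and token costing are redone for every mode.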
+static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
+                                     BLOCKD *b, B_PREDICTION_MODE *best_mode,
+#if CONFIG_COMP_INTRA_PRED
+                                     B_PREDICTION_MODE *best_second_mode,
+                                     int allow_comp,
+#endif
+                                     int *bmode_costs,
+                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                                     int *bestrate, int *bestratey,
+                                     int *bestdistortion) {
+  B_PREDICTION_MODE mode;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+#if CONFIG_COMP_INTRA_PRED
+  B_PREDICTION_MODE mode2;
+#endif
+  int64_t best_rd = INT64_MAX;
+  int rate = 0;
+  int distortion;
+
+  ENTROPY_CONTEXT ta = *a, tempa = *a;
+  ENTROPY_CONTEXT tl = *l, templ = *l;
+  TX_TYPE tx_type = DCT_DCT;
+  TX_TYPE best_tx_type = DCT_DCT;
+  /*
+   * The predictor buffer is a 2d buffer with a stride of 16.  Create
+   * a temp buffer that meets the stride requirements, but we are only
+   * interested in the left 4x4 block
+   */
+  DECLARE_ALIGNED_ARRAY(16, unsigned char,  best_predictor, 16 * 4);
+  DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16);
+
+  for (mode = B_DC_PRED; mode <= B_HU_PRED; mode++) {
+#if CONFIG_COMP_INTRA_PRED
+    for (mode2 = (allow_comp ? 0 : (B_DC_PRED - 1));
+         mode2 != (allow_comp ? (mode + 1) : 0); mode2++) {
+#endif
+      int64_t this_rd;
+      int ratey;
+
+      b->bmi.as_mode.first = mode;
+      rate = bmode_costs[mode];
+
+#if CONFIG_COMP_INTRA_PRED
+      if (mode2 == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
+#endif
+        vp9_intra4x4_predict(b, mode, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+      } else {
+        vp9_comp_intra4x4_predict(b, mode, mode2, b->predictor);
+        rate += bmode_costs[mode2];
+      }
+#endif
+      vp9_subtract_b(be, b, 16);
+
+      b->bmi.as_mode.first = mode;
+      tx_type = get_tx_type_4x4(xd, b);
+      if (tx_type != DCT_DCT) {
+        vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
+        vp9_ht_quantize_b_4x4(be, b, tx_type);
+      } else {
+        x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+        x->quantize_b_4x4(be, b);
+      }
+
+      tempa = ta;
+      templ = tl;
+
+      ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
+      rate += ratey;
+      distortion = vp9_block_error(be->coeff, b->dqcoeff, 16) >> 2;
+
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+      if (this_rd < best_rd) {
+        *bestrate = rate;
+        *bestratey = ratey;
+        *bestdistortion = distortion;
+        best_rd = this_rd;
+        *best_mode = mode;
+        best_tx_type = tx_type;
+
+#if CONFIG_COMP_INTRA_PRED
+        *best_second_mode = mode2;
+#endif
+        *a = tempa;
+        *l = templ;
+        copy_predictor(best_predictor, b->predictor);
+        vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
+      }
+#if CONFIG_COMP_INTRA_PRED
+    }
+#endif
+  }
+  b->bmi.as_mode.first = (B_PREDICTION_MODE)(*best_mode);
+#if CONFIG_COMP_INTRA_PRED
+  b->bmi.as_mode.second = (B_PREDICTION_MODE)(*best_second_mode);
+#endif
+
+  // inverse transform
+  if (best_tx_type != DCT_DCT)
+    vp9_ihtllm_c(best_dqcoeff, b->diff, 32, best_tx_type, 4);
+  else
+    IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(
+        best_dqcoeff, b->diff, 32);
+
+  vp9_recon_b(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+
+  return best_rd;
+}
+
+static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
+                                     int *Rate, int *rate_y,
+                                     int *Distortion, int64_t best_rd,
+#if CONFIG_COMP_INTRA_PRED
+                                     int allow_comp,
+#endif
+                                     int update_contexts) {
+  int i;
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  int cost = mb->mbmode_cost[xd->frame_type][B_PRED];
+  int distortion = 0;
+  int tot_rate_y = 0;
+  int64_t total_rd = 0;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta, *tl;
+  int *bmode_costs;
+
+  if (update_contexts) {
+    ta = (ENTROPY_CONTEXT *)xd->above_context;
+    tl = (ENTROPY_CONTEXT *)xd->left_context;
+  } else {
+    vpx_memcpy(&t_above, xd->above_context,
+               sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context,
+               sizeof(ENTROPY_CONTEXT_PLANES));
+
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+  }
+
+  xd->mode_info_context->mbmi.mode = B_PRED;
+  bmode_costs = mb->inter_bmode_costs;
+
+  for (i = 0; i < 16; i++) {
+    MODE_INFO *const mic = xd->mode_info_context;
+    const int mis = xd->mode_info_stride;
+    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+#if CONFIG_COMP_INTRA_PRED
+    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_second_mode);
+#endif
+    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
+
+    if (xd->frame_type == KEY_FRAME) {
+      const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);
+      const B_PREDICTION_MODE L = left_block_mode(mic, i);
+
+      bmode_costs  = mb->bmode_costs[A][L];
+    }
+
+    total_rd += rd_pick_intra4x4block(
+                  cpi, mb, mb->block + i, xd->block + i, &best_mode,
+#if CONFIG_COMP_INTRA_PRED
+                  &best_second_mode, allow_comp,
+#endif
+                  bmode_costs, ta + vp9_block2above[i],
+                  tl + vp9_block2left[i], &r, &ry, &d);
+
+    cost += r;
+    distortion += d;
+    tot_rate_y += ry;
+
+    mic->bmi[i].as_mode.first = best_mode;
+#if CONFIG_COMP_INTRA_PRED
+    mic->bmi[i].as_mode.second = best_second_mode;
+#endif
+
+    if (total_rd >= best_rd)
+      break;
+  }
+
+  if (total_rd >= best_rd)
+    return INT64_MAX;
+
+#if CONFIG_COMP_INTRA_PRED
+  cost += vp9_cost_bit(128, allow_comp);
+#endif
+  *Rate = cost;
+  *rate_y += tot_rate_y;
+  *Distortion = distortion;
+
+  return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
+}
+
+#if CONFIG_SUPERBLOCKS
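+// Search the 16x16-style intra modes for a 32x32 superblock luma plane;
+// each candidate mode is evaluated with the 8x8 transform.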
+static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi,
+                                      MACROBLOCK *x,
+                                      int *rate,
+                                      int *rate_tokenonly,
+                                      int *distortion,
+                                      int *skippable) {
+  MB_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  int this_rate, this_rate_tokenonly;
+  int this_distortion, s;
+  int64_t best_rd = INT64_MAX, this_rd;
+
+  /* Y Search for 32x32 intra prediction mode */
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    x->e_mbd.mode_info_context->mbmi.mode = mode;
+    vp9_build_intra_predictors_sby_s(&x->e_mbd);
+
+    super_block_yrd_8x8(x, &this_rate_tokenonly,
+                        &this_distortion, IF_RTCD(&cpi->rtcd), &s);
+    this_rate = this_rate_tokenonly +
+                x->mbmode_cost[x->e_mbd.frame_type]
+                              [x->e_mbd.mode_info_context->mbmi.mode];
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+    if (this_rd < best_rd) {
+      mode_selected   = mode;
+      best_rd         = this_rd;
+      *rate           = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion     = this_distortion;
+      *skippable      = s;
+    }
+  }
+
+  x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
+
+  return best_rd;
+}
+#endif
+
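+// Search the full 16x16 intra prediction modes; macro_block_yrd() also
+// selects the transform size for each mode, and the per-txfm-mode RD
+// costs are folded into txfm_cache (keeping the minimum).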
+static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,
+                                          MACROBLOCK *x,
+                                          int *Rate,
+                                          int *rate_y,
+                                          int *Distortion,
+                                          int *skippable,
+                                          int64_t txfm_cache[NB_TXFM_MODES]) {
+  MB_PREDICTION_MODE mode;
+  TX_SIZE txfm_size;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+#if CONFIG_COMP_INTRA_PRED
+  MB_PREDICTION_MODE mode2;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode2_selected);
+#endif
+  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+  int rate, ratey;
+  int distortion, skip;
+  int64_t best_rd = INT64_MAX;
+  int64_t this_rd;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int i;
+  for (i = 0; i < NB_TXFM_MODES; i++)
+    txfm_cache[i] = INT64_MAX;
+
+  // Y Search for 16x16 intra prediction mode
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    int64_t local_txfm_cache[NB_TXFM_MODES];
+
+    mbmi->mode = mode;
+
+#if CONFIG_COMP_INTRA_PRED
+    for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
+      mbmi->second_mode = mode2;
+      if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
+#endif
+        vp9_build_intra_predictors_mby(&x->e_mbd);
+#if CONFIG_COMP_INTRA_PRED
+      } else {
+        continue; // i.e. disable for now
+        vp9_build_comp_intra_predictors_mby(&x->e_mbd);
+      }
+#endif
+
+      macro_block_yrd(cpi, x, &ratey, &distortion, &skip, local_txfm_cache);
+
+      // FIXME add compoundmode cost
+      // FIXME add rate for mode2
+      rate = ratey + x->mbmode_cost[x->e_mbd.frame_type][mbmi->mode];
+
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+      if (this_rd < best_rd) {
+        mode_selected = mode;
+        txfm_size = mbmi->txfm_size;
+#if CONFIG_COMP_INTRA_PRED
+        mode2_selected = mode2;
+#endif
+        best_rd = this_rd;
+        *Rate = rate;
+        *rate_y = ratey;
+        *Distortion = distortion;
+        *skippable = skip;
+      }
+
+      for (i = 0; i < NB_TXFM_MODES; i++) {
+        int64_t adj_rd = this_rd + local_txfm_cache[i] -
+                          local_txfm_cache[cpi->common.txfm_mode];
+        if (adj_rd < txfm_cache[i]) {
+          txfm_cache[i] = adj_rd;
+        }
+      }
+
+#if CONFIG_COMP_INTRA_PRED
+    }
+#endif
+  }
+
+  mbmi->txfm_size = txfm_size;
+  mbmi->mode = mode_selected;
+
+#if CONFIG_COMP_INTRA_PRED
+  mbmi->second_mode = mode2_selected;
+#endif
+  return best_rd;
+}
+
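+// Search the intra modes for one 8x8 sub-block.  Depending on the
+// macroblock transform size the block is coded either as a single 8x8
+// transform or as four 4x4 transforms.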
+static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
+                                     B_PREDICTION_MODE *best_mode,
+#if CONFIG_COMP_INTRA_PRED
+                                     B_PREDICTION_MODE *best_second_mode,
+#endif
+                                     int *mode_costs,
+                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                                     int *bestrate, int *bestratey,
+                                     int *bestdistortion) {
+  MB_PREDICTION_MODE mode;
+#if CONFIG_COMP_INTRA_PRED
+  MB_PREDICTION_MODE mode2;
+#endif
+  MACROBLOCKD *xd = &x->e_mbd;
+  int64_t best_rd = INT64_MAX;
+  int distortion, rate = 0;
+  BLOCK  *be = x->block + ib;
+  BLOCKD *b = xd->block + ib;
+  ENTROPY_CONTEXT ta0, ta1, besta0 = 0, besta1 = 0;
+  ENTROPY_CONTEXT tl0, tl1, bestl0 = 0, bestl1 = 0;
+
+  /*
+   * The predictor buffer is a 2d buffer with a stride of 16.  Create
+   * a temp buffer that meets the stride requirements, but we are only
+   * interested in the left 8x8 block
+   */
+  DECLARE_ALIGNED_ARRAY(16, unsigned char,  best_predictor, 16 * 8);
+  DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16 * 4);
+
+  // The 8x8 transform uses a different coefficient-block indexing than the
+  // prediction blocks; map this 8x8 prediction block index (0, 2, 8, 10)
+  // to the corresponding coefficient block index (0, 4, 8, 12).
+  int idx = (ib & 0x02) ? (ib + 2) : ib;
+
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+#if CONFIG_COMP_INTRA_PRED
+    for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
+#endif
+      int64_t this_rd;
+      int rate_t;
+
+      // FIXME rate for compound mode and second intrapred mode
+      rate = mode_costs[mode];
+      b->bmi.as_mode.first = mode;
+
+#if CONFIG_COMP_INTRA_PRED
+      if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
+#endif
+        vp9_intra8x8_predict(b, mode, b->predictor);
+#if CONFIG_COMP_INTRA_PRED
+      } else {
+        continue; // i.e. disable for now
+        vp9_comp_intra8x8_predict(b, mode, mode2, b->predictor);
+      }
+#endif
+
+      vp9_subtract_4b_c(be, b, 16);
+
+      if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
+        TX_TYPE tx_type = get_tx_type_8x8(xd, b);
+        if (tx_type != DCT_DCT)
+          vp9_fht(be->src_diff, 32, (x->block + idx)->coeff, tx_type, 8);
+        else
+          x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
+        x->quantize_b_8x8(x->block + idx, xd->block + idx);
+
+        // compute quantization mse of 8x8 block
+        distortion = vp9_block_error_c((x->block + idx)->coeff,
+                                       (xd->block + idx)->dqcoeff, 64);
+        ta0 = a[vp9_block2above_8x8[idx]];
+        tl0 = l[vp9_block2left_8x8[idx]];
+
+        rate_t = cost_coeffs(x, xd->block + idx, PLANE_TYPE_Y_WITH_DC,
+                             &ta0, &tl0, TX_8X8);
+
+        rate += rate_t;
+        ta1 = ta0;
+        tl1 = tl0;
+      } else {
+        x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
+        x->vp9_short_fdct8x4((be + 4)->src_diff, (be + 4)->coeff, 32);
+
+        x->quantize_b_4x4_pair(x->block + ib, x->block + ib + 1,
+                               xd->block + ib, xd->block + ib + 1);
+        x->quantize_b_4x4_pair(x->block + ib + 4, x->block + ib + 5,
+                               xd->block + ib + 4, xd->block + ib + 5);
+
+        distortion = vp9_block_error_c((x->block + ib)->coeff,
+                                       (xd->block + ib)->dqcoeff, 16);
+        distortion += vp9_block_error_c((x->block + ib + 1)->coeff,
+                                        (xd->block + ib + 1)->dqcoeff, 16);
+        distortion += vp9_block_error_c((x->block + ib + 4)->coeff,
+                                        (xd->block + ib + 4)->dqcoeff, 16);
+        distortion += vp9_block_error_c((x->block + ib + 5)->coeff,
+                                        (xd->block + ib + 5)->dqcoeff, 16);
+
+        ta0 = a[vp9_block2above[ib]];
+        ta1 = a[vp9_block2above[ib + 1]];
+        tl0 = l[vp9_block2left[ib]];
+        tl1 = l[vp9_block2left[ib + 4]];
+        rate_t = cost_coeffs(x, xd->block + ib, PLANE_TYPE_Y_WITH_DC,
+                             &ta0, &tl0, TX_4X4);
+        rate_t += cost_coeffs(x, xd->block + ib + 1, PLANE_TYPE_Y_WITH_DC,
+                              &ta1, &tl0, TX_4X4);
+        rate_t += cost_coeffs(x, xd->block + ib + 4, PLANE_TYPE_Y_WITH_DC,
+                              &ta0, &tl1, TX_4X4);
+        rate_t += cost_coeffs(x, xd->block + ib + 5, PLANE_TYPE_Y_WITH_DC,
+                              &ta1, &tl1, TX_4X4);
+        rate += rate_t;
+      }
+
+      distortion >>= 2;
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+      if (this_rd < best_rd) {
+        *bestrate = rate;
+        *bestratey = rate_t;
+        *bestdistortion = distortion;
+        besta0 = ta0;
+        besta1 = ta1;
+        bestl0 = tl0;
+        bestl1 = tl1;
+        best_rd = this_rd;
+        *best_mode = mode;
+#if CONFIG_COMP_INTRA_PRED
+        *best_second_mode = mode2;
+#endif
+        copy_predictor_8x8(best_predictor, b->predictor);
+        vpx_memcpy(best_dqcoeff, b->dqcoeff, 64);
+        vpx_memcpy(best_dqcoeff + 32, b->dqcoeff + 64, 64);
+#if CONFIG_COMP_INTRA_PRED
+      }
+#endif
+    }
+  }
+  b->bmi.as_mode.first = (*best_mode);
+#if CONFIG_COMP_INTRA_PRED
+  b->bmi.as_mode.second = (*best_second_mode);
+#endif
+  vp9_encode_intra8x8(IF_RTCD(&cpi->rtcd), x, ib);
+
+  if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
+    a[vp9_block2above_8x8[idx]]     = besta0;
+    a[vp9_block2above_8x8[idx] + 1] = besta1;
+    l[vp9_block2left_8x8[idx]]      = bestl0;
+    l[vp9_block2left_8x8[idx] + 1]  = bestl1;
+  } else {
+    a[vp9_block2above[ib]]     = besta0;
+    a[vp9_block2above[ib + 1]] = besta1;
+    l[vp9_block2left[ib]]      = bestl0;
+    l[vp9_block2left[ib + 4]]  = bestl1;
+  }
+
+  return best_rd;
+}
+
+static int64_t rd_pick_intra8x8mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
+                                         int *Rate, int *rate_y,
+                                         int *Distortion, int64_t best_rd) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  int i, ib;
+  int cost = mb->mbmode_cost[xd->frame_type][I8X8_PRED];
+  int distortion = 0;
+  int tot_rate_y = 0;
+  int64_t total_rd = 0;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta, *tl;
+  int *i8x8mode_costs;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+
+  xd->mode_info_context->mbmi.mode = I8X8_PRED;
+  i8x8mode_costs  = mb->i8x8_mode_costs;
+
+  for (i = 0; i < 4; i++) {
+    MODE_INFO *const mic = xd->mode_info_context;
+    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+#if CONFIG_COMP_INTRA_PRED
+    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_second_mode);
+#endif
+    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
+
+    ib = vp9_i8x8_block[i];
+    total_rd += rd_pick_intra8x8block(
+                  cpi, mb, ib, &best_mode,
+#if CONFIG_COMP_INTRA_PRED
+                  &best_second_mode,
+#endif
+                  i8x8mode_costs, ta, tl, &r, &ry, &d);
+    cost += r;
+    distortion += d;
+    tot_rate_y += ry;
+    mic->bmi[ib].as_mode.first = best_mode;
+#if CONFIG_COMP_INTRA_PRED
+    mic->bmi[ib].as_mode.second = best_second_mode;
+#endif
+  }
+  *Rate = cost;
+  *rate_y += tot_rate_y;
+  *Distortion = distortion;
+  return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
+}
+
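+// Token cost of the eight 4x4 chroma blocks (block indices 16..23).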
+static int rd_cost_mbuv(MACROBLOCK *mb) {
+  int b;
+  int cost = 0;
+  MACROBLOCKD *xd = &mb->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta, *tl;
+
+  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+
+  for (b = 16; b < 24; b++)
+    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
+                        ta + vp9_block2above[b], tl + vp9_block2left[b],
+                        TX_4X4);
+
+  return cost;
+}
+
+static int64_t rd_inter16x16_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                int *distortion, int fullpixel, int *skip) {
+  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                    x->e_mbd.predictor, x->src.uv_stride);
+
+  vp9_transform_mbuv_4x4(x);
+  vp9_quantize_mbuv_4x4(x);
+
+  *rate       = rd_cost_mbuv(x);
+  *distortion = vp9_mbuverror(x) / 4;
+  *skip       = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
+
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
+static int rd_cost_mbuv_8x8(MACROBLOCK *mb, int backup) {
+  int b;
+  int cost = 0;
+  MACROBLOCKD *xd = &mb->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta, *tl;
+
+  if (backup) {
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+  } else {
+    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
+    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
+  }
+
+  for (b = 16; b < 24; b += 4)
+    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
+                        ta + vp9_block2above_8x8[b],
+                        tl + vp9_block2left_8x8[b], TX_8X8);
+
+  return cost;
+}
+
+#if CONFIG_SUPERBLOCKS
+static int64_t rd_inter32x32_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                    int *distortion, int fullpixel, int *skip) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  int n, r = 0, d = 0;
+  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  int skippable = 1;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+
+  memcpy(t_above, xd->above_context, sizeof(t_above));
+  memcpy(t_left, xd->left_context, sizeof(t_left));
+
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    vp9_subtract_mbuv_s_c(x->src_diff,
+                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          src_uv_stride,
+                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          dst_uv_stride);
+
+    vp9_transform_mbuv_8x8(x);
+    vp9_quantize_mbuv_8x8(x);
+
+    xd->above_context = ta + x_idx;
+    xd->left_context = tl + y_idx;
+    r += rd_cost_mbuv_8x8(x, 0);
+    d += vp9_mbuverror(x) / 4;
+    skippable = skippable && vp9_mbuv_is_skippable_8x8(xd);
+  }
+
+  *rate = r;
+  *distortion = d;
+  if (skip) *skip = skippable;
+  xd->left_context = tl;
+  xd->above_context = ta;
+  memcpy(xd->above_context, t_above, sizeof(t_above));
+  memcpy(xd->left_context, t_left, sizeof(t_left));
+
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+#endif
+
+static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                    int *distortion, int fullpixel, int *skip) {
+  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                    x->e_mbd.predictor, x->src.uv_stride);
+
+  vp9_transform_mbuv_8x8(x);
+  vp9_quantize_mbuv_8x8(x);
+
+  *rate       = rd_cost_mbuv_8x8(x, 1);
+  *distortion = vp9_mbuverror(x) / 4;
+  *skip       = vp9_mbuv_is_skippable_8x8(&x->e_mbd);
+
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
+static int64_t rd_inter4x4_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                              int *distortion, int *skippable, int fullpixel) {
+  vp9_build_inter4x4_predictors_mbuv(&x->e_mbd);
+  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                    x->e_mbd.predictor, x->src.uv_stride);
+
+  vp9_transform_mbuv_4x4(x);
+  vp9_quantize_mbuv_4x4(x);
+
+  *rate       = rd_cost_mbuv(x);
+  *distortion = vp9_mbuverror(x) / 4;
+  *skippable  = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
+
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
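+// Search the chroma intra prediction modes with 4x4 transforms and keep
+// the best mode by RD cost.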
+static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,
+                                    MACROBLOCK *x,
+                                    int *rate,
+                                    int *rate_tokenonly,
+                                    int *distortion,
+                                    int *skippable) {
+  MB_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+#if CONFIG_COMP_INTRA_PRED
+  MB_PREDICTION_MODE mode2;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode2_selected);
+#endif
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+  int64_t best_rd = INT64_MAX;
+  int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
+  int rate_to, UNINITIALIZED_IS_SAFE(skip);
+
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+#if CONFIG_COMP_INTRA_PRED
+    for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
+#endif
+      int rate;
+      int distortion;
+      int64_t this_rd;
+
+      mbmi->uv_mode = mode;
+#if CONFIG_COMP_INTRA_PRED
+      mbmi->second_uv_mode = mode2;
+      if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
+#endif
+        vp9_build_intra_predictors_mbuv(&x->e_mbd);
+#if CONFIG_COMP_INTRA_PRED
+      } else {
+        continue;
+        vp9_build_comp_intra_predictors_mbuv(&x->e_mbd);
+      }
+#endif
+
+      vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                        x->e_mbd.predictor, x->src.uv_stride);
+      vp9_transform_mbuv_4x4(x);
+      vp9_quantize_mbuv_4x4(x);
+
+      rate_to = rd_cost_mbuv(x);
+      rate = rate_to
+             + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
+
+      distortion = vp9_mbuverror(x) / 4;
+
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+      if (this_rd < best_rd) {
+        skip = vp9_mbuv_is_skippable_4x4(xd);
+        best_rd = this_rd;
+        d = distortion;
+        r = rate;
+        *rate_tokenonly = rate_to;
+        mode_selected = mode;
+#if CONFIG_COMP_INTRA_PRED
+        mode2_selected = mode2;
+      }
+#endif
+    }
+  }
+
+  *rate = r;
+  *distortion = d;
+  *skippable = skip;
+
+  mbmi->uv_mode = mode_selected;
+#if CONFIG_COMP_INTRA_PRED
+  mbmi->second_uv_mode = mode2_selected;
+#endif
+}
+
+static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi,
+                                        MACROBLOCK *x,
+                                        int *rate,
+                                        int *rate_tokenonly,
+                                        int *distortion,
+                                        int *skippable) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+  int64_t best_rd = INT64_MAX;
+  int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
+  int rate_to, UNINITIALIZED_IS_SAFE(skip);
+
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    int rate;
+    int distortion;
+    int64_t this_rd;
+
+    mbmi->uv_mode = mode;
+    vp9_build_intra_predictors_mbuv(&x->e_mbd);
+    vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                      x->e_mbd.predictor, x->src.uv_stride);
+    vp9_transform_mbuv_8x8(x);
+
+    vp9_quantize_mbuv_8x8(x);
+
+    rate_to = rd_cost_mbuv_8x8(x, 1);
+    rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
+
+    distortion = vp9_mbuverror(x) / 4;
+    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+    if (this_rd < best_rd) {
+      skip = vp9_mbuv_is_skippable_8x8(xd);
+      best_rd = this_rd;
+      d = distortion;
+      r = rate;
+      *rate_tokenonly = rate_to;
+      mode_selected = mode;
+    }
+  }
+  *rate = r;
+  *distortion = d;
+  *skippable = skip;
+  mbmi->uv_mode = mode_selected;
+}
+
+#if CONFIG_SUPERBLOCKS
+static void super_block_uvrd_8x8(MACROBLOCK *x,
+                                 int *rate,
+                                 int *distortion,
+                                 const VP9_ENCODER_RTCD *rtcd,
+                                 int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int d = 0, r = 0, n, s = 1;
+  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+
+  memcpy(t_above, xd->above_context, sizeof(t_above));
+  memcpy(t_left,  xd->left_context,  sizeof(t_left));
+
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    vp9_subtract_mbuv_s_c(x->src_diff,
+                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          src_uv_stride,
+                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          dst_uv_stride);
+    vp9_transform_mbuv_8x8(x);
+    vp9_quantize_mbuv_8x8(x);
+    s &= vp9_mbuv_is_skippable_8x8(xd);
+
+    d += vp9_mbuverror(x) >> 2;
+    xd->above_context = ta + x_idx;
+    xd->left_context = tl + y_idx;
+    r += rd_cost_mbuv_8x8(x, 0);
+  }
+
+  xd->above_context = ta;
+  xd->left_context = tl;
+  *distortion = d;
+  *rate       = r;
+  *skippable  = s;
+
+  memcpy(xd->above_context, t_above, sizeof(t_above));
+  memcpy(xd->left_context,  t_left,  sizeof(t_left));
+}
+
+static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
+                                       MACROBLOCK *x,
+                                       int *rate,
+                                       int *rate_tokenonly,
+                                       int *distortion,
+                                       int *skippable) {
+  MB_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  int64_t best_rd = INT64_MAX, this_rd;
+  int this_rate_tokenonly, this_rate;
+  int this_distortion, s;
+
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
+    vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
+
+    super_block_uvrd_8x8(x, &this_rate_tokenonly,
+                         &this_distortion, IF_RTCD(&cpi->rtcd), &s);
+    this_rate = this_rate_tokenonly +
+                x->intra_uv_mode_cost[x->e_mbd.frame_type]
+                                     [x->e_mbd.mode_info_context->mbmi.uv_mode];
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+    if (this_rd < best_rd) {
+      mode_selected   = mode;
+      best_rd         = this_rd;
+      *rate           = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion     = this_distortion;
+      *skippable      = s;
+    }
+  }
+
+  x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
+
+  return best_rd;
+}
+#endif
+
+int vp9_cost_mv_ref(VP9_COMP *cpi,
+                    MB_PREDICTION_MODE m,
+                    const int near_mv_ref_ct[4]) {
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  int segment_id = xd->mode_info_context->mbmi.segment_id;
+
+  // If the mode coding is done entirely at the segment level
+  // we should not account for it at the per mb level in rd code.
+  // Note that if the segment level coding is expanded from single mode
+  // to multiple mode masks as per reference frame coding we will need
+  // to do something different here.
+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+    VP9_COMMON *pc = &cpi->common;
+
+    vp9_prob p[VP9_MVREFS - 1];
+    assert(NEARESTMV <= m && m <= SPLITMV);
+    vp9_mv_ref_probs(pc, p, near_mv_ref_ct);
+    return cost_token(vp9_mv_ref_tree, p,
+                      vp9_mv_ref_encoding_array - NEARESTMV + m);
+  } else
+    return 0;
+}
+
+void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
+  x->e_mbd.mode_info_context->mbmi.mode = mb;
+  x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int;
+}
+
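+// Assign the selected sub-block mode and motion vector(s) to every 4x4
+// block carrying the given label, returning the rate cost of the modes
+// plus any newly coded motion vectors.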
+static int labels2mode(
+  MACROBLOCK *x,
+  int const *labelings, int which_label,
+  B_PREDICTION_MODE this_mode,
+  int_mv *this_mv, int_mv *this_second_mv,
+  int_mv seg_mvs[MAX_REF_FRAMES - 1],
+  int_mv *best_ref_mv,
+  int_mv *second_best_ref_mv,
+  DEC_MVCOSTS) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mic = xd->mode_info_context;
+  MB_MODE_INFO * mbmi = &mic->mbmi;
+  const int mis = xd->mode_info_stride;
+
+  int i, cost = 0, thismvcost = 0;
+
+  /* We have to be careful retrieving previously-encoded motion vectors.
+     Ones from this macroblock have to be pulled from the BLOCKD array
+     as they have not yet made it to the bmi array in our MB_MODE_INFO. */
+  for (i = 0; i < 16; ++i) {
+    BLOCKD *const d = xd->block + i;
+    const int row = i >> 2,  col = i & 3;
+
+    B_PREDICTION_MODE m;
+
+    if (labelings[i] != which_label)
+      continue;
+
+    if (col && labelings[i] == labelings[i - 1])
+      m = LEFT4X4;
+    else if (row && labelings[i] == labelings[i - 4])
+      m = ABOVE4X4;
+    else {
+      // the only time we should do costing for new motion vector or mode
+      // is when we are on a new label  (jbb May 08, 2007)
+      switch (m = this_mode) {
+        case NEW4X4:
+          if (mbmi->second_ref_frame) {
+            this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
+            this_second_mv->as_int =
+              seg_mvs[mbmi->second_ref_frame - 1].as_int;
+          }
+
+          thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, MVCOSTS,
+                                        102, xd->allow_high_precision_mv);
+          if (mbmi->second_ref_frame) {
+            thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
+                                          MVCOSTS, 102,
+                                          xd->allow_high_precision_mv);
+          }
+          break;
+        case LEFT4X4:
+          this_mv->as_int = col ? d[-1].bmi.as_mv.first.as_int
+                                : left_block_mv(mic, i);
+          if (mbmi->second_ref_frame)
+            this_second_mv->as_int = col ? d[-1].bmi.as_mv.second.as_int
+                                         : left_block_second_mv(mic, i);
+          break;
+        case ABOVE4X4:
+          this_mv->as_int = row ? d[-4].bmi.as_mv.first.as_int
+                                : above_block_mv(mic, i, mis);
+          if (mbmi->second_ref_frame)
+            this_second_mv->as_int = row ? d[-4].bmi.as_mv.second.as_int
+                                         : above_block_second_mv(mic, i, mis);
+          break;
+        case ZERO4X4:
+          this_mv->as_int = 0;
+          if (mbmi->second_ref_frame)
+            this_second_mv->as_int = 0;
+          break;
+        default:
+          break;
+      }
+
+      if (m == ABOVE4X4) { // replace above with left if same
+        int_mv left_mv, left_second_mv;
+
+        left_second_mv.as_int = 0;
+        left_mv.as_int = col ? d[-1].bmi.as_mv.first.as_int :
+                         left_block_mv(mic, i);
+        if (mbmi->second_ref_frame)
+          left_second_mv.as_int = col ? d[-1].bmi.as_mv.second.as_int :
+                                  left_block_second_mv(mic, i);
+
+        if (left_mv.as_int == this_mv->as_int &&
+            (!mbmi->second_ref_frame ||
+             left_second_mv.as_int == this_second_mv->as_int))
+          m = LEFT4X4;
+      }
+
+      cost = x->inter_bmode_costs[m];
+    }
+
+    d->bmi.as_mv.first.as_int = this_mv->as_int;
+    if (mbmi->second_ref_frame)
+      d->bmi.as_mv.second.as_int = this_second_mv->as_int;
+
+    x->partition_info->bmi[i].mode = m;
+    x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
+    if (mbmi->second_ref_frame)
+      x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
+  }
+
+  cost += thismvcost;
+  return cost;
+}
+
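+// Predict, transform, quantize and cost the 4x4 blocks belonging to one
+// partition label, returning the combined RD cost with rate and
+// distortion also reported separately.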
+static int64_t encode_inter_mb_segment(MACROBLOCK *x,
+                                       int const *labels,
+                                       int which_label,
+                                       int *labelyrate,
+                                       int *distortion,
+                                       ENTROPY_CONTEXT *ta,
+                                       ENTROPY_CONTEXT *tl,
+                                       const VP9_ENCODER_RTCD *rtcd) {
+  int i;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  *labelyrate = 0;
+  *distortion = 0;
+  for (i = 0; i < 16; i++) {
+    if (labels[i] == which_label) {
+      BLOCKD *bd = &x->e_mbd.block[i];
+      BLOCK *be = &x->block[i];
+      int thisdistortion;
+
+      vp9_build_inter_predictors_b(bd, 16, xd->subpixel_predict);
+      if (xd->mode_info_context->mbmi.second_ref_frame)
+        vp9_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg);
+      vp9_subtract_b(be, bd, 16);
+      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+      x->quantize_b_4x4(be, bd);
+      thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16);
+      *distortion += thisdistortion;
+      *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
+                                 ta + vp9_block2above[i],
+                                 tl + vp9_block2left[i], TX_4X4);
+    }
+  }
+  *distortion >>= 2;
+  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
+}
+
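+// As encode_inter_mb_segment(), but for 8x8 partitions: each labeled 8x8
+// block is coded with either one 8x8 transform or four 4x4 transforms, and
+// when otherrd is given the cost of the other transform size is
+// accumulated as well so the two can be compared.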
+static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
+                                           int const *labels,
+                                           int which_label,
+                                           int *labelyrate,
+                                           int *distortion,
+                                           int64_t *otherrd,
+                                           ENTROPY_CONTEXT *ta,
+                                           ENTROPY_CONTEXT *tl,
+                                           const VP9_ENCODER_RTCD *rtcd) {
+  int i, j;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int iblock[4] = { 0, 1, 4, 5 };
+  int othercost = 0, otherdist = 0;
+  ENTROPY_CONTEXT_PLANES tac, tlc;
+  ENTROPY_CONTEXT *tacp = (ENTROPY_CONTEXT *) &tac,
+                  *tlcp = (ENTROPY_CONTEXT *) &tlc;
+
+  if (otherrd) {
+    memcpy(&tac, ta, sizeof(ENTROPY_CONTEXT_PLANES));
+    memcpy(&tlc, tl, sizeof(ENTROPY_CONTEXT_PLANES));
+  }
+
+  *distortion = 0;
+  *labelyrate = 0;
+  for (i = 0; i < 4; i++) {
+    int ib = vp9_i8x8_block[i];
+
+    if (labels[ib] == which_label) {
+      int idx = (ib & 8) + ((ib & 2) << 1);
+      BLOCKD *bd = &xd->block[ib], *bd2 = &xd->block[idx];
+      BLOCK *be = &x->block[ib], *be2 = &x->block[idx];
+      int thisdistortion;
+
+      vp9_build_inter_predictors4b(xd, bd, 16);
+      if (xd->mode_info_context->mbmi.second_ref_frame)
+        vp9_build_2nd_inter_predictors4b(xd, bd, 16);
+      vp9_subtract_4b_c(be, bd, 16);
+
+      if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
+        if (otherrd) {
+          x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);
+          x->quantize_b_8x8(be2, bd2);
+          thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
+          otherdist += thisdistortion;
+          othercost += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
+                                     tacp + vp9_block2above_8x8[idx],
+                                     tlcp + vp9_block2left_8x8[idx], TX_8X8);
+        }
+        for (j = 0; j < 4; j += 2) {
+          bd = &xd->block[ib + iblock[j]];
+          be = &x->block[ib + iblock[j]];
+          x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);
+          x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1);
+          thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
+          *distortion += thisdistortion;
+          *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
+                                     ta + vp9_block2above[ib + iblock[j]],
+                                     tl + vp9_block2left[ib + iblock[j]],
+                                     TX_4X4);
+          *labelyrate += cost_coeffs(x, bd + 1, PLANE_TYPE_Y_WITH_DC,
+                                     ta + vp9_block2above[ib + iblock[j] + 1],
+                                     tl + vp9_block2left[ib + iblock[j]],
+                                     TX_4X4);
+        }
+      } else /* 8x8 */ {
+        if (otherrd) {
+          for (j = 0; j < 4; j += 2) {
+            BLOCKD *bd3 = &xd->block[ib + iblock[j]];
+            BLOCK *be3 = &x->block[ib + iblock[j]];
+            x->vp9_short_fdct8x4(be3->src_diff, be3->coeff, 32);
+            x->quantize_b_4x4_pair(be3, be3 + 1, bd3, bd3 + 1);
+            thisdistortion = vp9_block_error_c(be3->coeff, bd3->dqcoeff, 32);
+            otherdist += thisdistortion;
+            othercost += cost_coeffs(x, bd3, PLANE_TYPE_Y_WITH_DC,
+                                     tacp + vp9_block2above[ib + iblock[j]],
+                                     tlcp + vp9_block2left[ib + iblock[j]],
+                                     TX_4X4);
+            othercost += cost_coeffs(x, bd3 + 1, PLANE_TYPE_Y_WITH_DC,
+                                     tacp + vp9_block2above[ib + iblock[j] + 1],
+                                     tlcp + vp9_block2left[ib + iblock[j]],
+                                     TX_4X4);
+          }
+        }
+        x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);
+        x->quantize_b_8x8(be2, bd2);
+        thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
+        *distortion += thisdistortion;
+        *labelyrate += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
+                                   ta + vp9_block2above_8x8[idx],
+                                   tl + vp9_block2left_8x8[idx], TX_8X8);
+      }
+    }
+  }
+  *distortion >>= 2;
+  if (otherrd) {
+    otherdist >>= 2;
+    *otherrd = RDCOST(x->rdmult, x->rddiv, othercost, otherdist);
+  }
+  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
+}
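+/* In encode_inter_mb_segment_8x8(), a non-NULL otherrd additionally
+ * evaluates each label with the complementary transform size (8x8
+ * coefficients while coding 4x4 and vice versa) against the scratch
+ * contexts tac/tlc, so the caller can compare both transform choices in a
+ * single pass. */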
+
+static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
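+/* The table above is indexed by SPLITMV_PARTITIONING_TYPE. Shifting a SAD
+ * right by these amounts roughly normalizes it to per-4x4-block units: a
+ * 16x8 or 8x16 partition spans eight 4x4 blocks (>> 3), an 8x8 partition
+ * four (>> 2) and a 4x4 partition one (>> 0). */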
+
+
+typedef struct {
+  int_mv *ref_mv, *second_ref_mv;
+  int_mv mvp;
+
+  int64_t segment_rd;
+  SPLITMV_PARTITIONING_TYPE segment_num;
+  TX_SIZE txfm_size;
+  int r;
+  int d;
+  int segment_yrate;
+  B_PREDICTION_MODE modes[16];
+  int_mv mvs[16], second_mvs[16];
+  int eobs[16];
+
+  int mvthresh;
+  int *mdcounts;
+
+  int_mv sv_mvp[4];     // save 4 mvp from 8x8
+  int sv_istep[2];  // save 2 initial step_param for 16x8/8x16
+
+} BEST_SEG_INFO;
+
+static __inline int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
+  int r = 0;
+  r |= (mv->as_mv.row >> 3) < x->mv_row_min;
+  r |= (mv->as_mv.row >> 3) > x->mv_row_max;
+  r |= (mv->as_mv.col >> 3) < x->mv_col_min;
+  r |= (mv->as_mv.col >> 3) > x->mv_col_max;
+  return r;
+}
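+/* mv_check_bounds(): motion vectors are held in 1/8-pel units, so >> 3
+ * converts to full pel before comparing against the search bounds; a
+ * nonzero return means the vector falls outside the allowed window. */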
+
+static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
+                                    BEST_SEG_INFO *bsi,
+                                    SPLITMV_PARTITIONING_TYPE segmentation,
+                                    TX_SIZE tx_size, int64_t *otherrds,
+                                    int64_t *rds, int *completed,
+                                    /* 16 = n_blocks */
+                                    int_mv seg_mvs[16 /* n_blocks */]
+                                                  [MAX_REF_FRAMES - 1]) {
+  int i, j;
+  int const *labels;
+  int br = 0, bd = 0;
+  B_PREDICTION_MODE this_mode;
+  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+
+  int label_count;
+  int64_t this_segment_rd = 0, other_segment_rd;
+  int label_mv_thresh;
+  int rate = 0;
+  int sbr = 0, sbd = 0;
+  int segmentyrate = 0;
+  int best_eobs[16] = { 0 };
+
+  vp9_variance_fn_ptr_t *v_fn_ptr;
+
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta, *tl;
+  ENTROPY_CONTEXT_PLANES t_above_b, t_left_b;
+  ENTROPY_CONTEXT *ta_b, *tl_b;
+
+  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+  ta = (ENTROPY_CONTEXT *)&t_above;
+  tl = (ENTROPY_CONTEXT *)&t_left;
+  ta_b = (ENTROPY_CONTEXT *)&t_above_b;
+  tl_b = (ENTROPY_CONTEXT *)&t_left_b;
+
+  v_fn_ptr = &cpi->fn_ptr[segmentation];
+  labels = vp9_mbsplits[segmentation];
+  label_count = vp9_mbsplit_count[segmentation];
+
+  // A multiplier of 64 here would make this threshold so large that MVs on
+  // segments would very rarely be checked; the current multiplier of 1
+  // makes the segment MV threshold roughly equal to the macroblock one.
+  label_mv_thresh = 1 * bsi->mvthresh / label_count;
+
+  // Segmentation method overheads
+  rate = cost_token(vp9_mbsplit_tree, vp9_mbsplit_probs,
+                    vp9_mbsplit_encodings + segmentation);
+  rate += vp9_cost_mv_ref(cpi, SPLITMV, bsi->mdcounts);
+  this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
+  br += rate;
+  other_segment_rd = this_segment_rd;
+
+  mbmi->txfm_size = tx_size;
+  for (i = 0; i < label_count && this_segment_rd < bsi->segment_rd; i++) {
+    int_mv mode_mv[B_MODE_COUNT], second_mode_mv[B_MODE_COUNT];
+    int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
+    B_PREDICTION_MODE mode_selected = ZERO4X4;
+    int bestlabelyrate = 0;
+
+    // search for the best motion vector on this segment
+    for (this_mode = LEFT4X4; this_mode <= NEW4X4; this_mode++) {
+      int64_t this_rd, other_rd;
+      int distortion;
+      int labelyrate;
+      ENTROPY_CONTEXT_PLANES t_above_s, t_left_s;
+      ENTROPY_CONTEXT *ta_s;
+      ENTROPY_CONTEXT *tl_s;
+
+      vpx_memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
+      vpx_memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
+
+      ta_s = (ENTROPY_CONTEXT *)&t_above_s;
+      tl_s = (ENTROPY_CONTEXT *)&t_left_s;
+
+      // motion search for newmv (single predictor case only)
+      if (!mbmi->second_ref_frame && this_mode == NEW4X4) {
+        int sseshift, n;
+        int step_param = 0;
+        int further_steps;
+        int thissme, bestsme = INT_MAX;
+        BLOCK *c;
+        BLOCKD *e;
+
+        /* Is the best so far sufficiently good that we can't justify doing
+         * a new motion search. */
+        if (best_label_rd < label_mv_thresh)
+          break;
+
+        if (cpi->compressor_speed) {
+          if (segmentation == PARTITIONING_8X16 ||
+              segmentation == PARTITIONING_16X8) {
+            bsi->mvp.as_int = bsi->sv_mvp[i].as_int;
+            if (i == 1 && segmentation == PARTITIONING_16X8)
+              bsi->mvp.as_int = bsi->sv_mvp[2].as_int;
+
+            step_param = bsi->sv_istep[i];
+          }
+
+          // use previous block's result as next block's MV predictor.
+          if (segmentation == PARTITIONING_4X4 && i > 0) {
+            bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv.first.as_int;
+            if (i == 4 || i == 8 || i == 12)
+              bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv.first.as_int;
+            step_param = 2;
+          }
+        }
+
+        further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+
+        {
+          int sadpb = x->sadperbit4;
+          int_mv mvp_full;
+
+          mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
+          mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
+
+          // find first label
+          n = vp9_mbsplit_offset[segmentation][i];
+
+          c = &x->block[n];
+          e = &x->e_mbd.block[n];
+
+          bestsme = vp9_full_pixel_diamond(cpi, x, c, e, &mvp_full, step_param,
+                                           sadpb, further_steps, 0, v_fn_ptr,
+                                           bsi->ref_mv, &mode_mv[NEW4X4]);
+
+          sseshift = segmentation_to_sseshift[segmentation];
+
+          // Should we do a full search (best quality only)
+          if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
+            /* Check if mvp_full is within the range. */
+            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
+                     x->mv_row_min, x->mv_row_max);
+
+            thissme = cpi->full_search_sad(x, c, e, &mvp_full,
+                                           sadpb, 16, v_fn_ptr,
+                                           XMVCOST, bsi->ref_mv);
+
+            if (thissme < bestsme) {
+              bestsme = thissme;
+              mode_mv[NEW4X4].as_int = e->bmi.as_mv.first.as_int;
+            } else {
+              /* The full search result is actually worse so re-instate the
+               * previous best vector */
+              e->bmi.as_mv.first.as_int = mode_mv[NEW4X4].as_int;
+            }
+          }
+        }
+
+        if (bestsme < INT_MAX) {
+          int distortion;
+          unsigned int sse;
+          cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
+                                       bsi->ref_mv, x->errorperbit, v_fn_ptr,
+                                       XMVCOST, &distortion, &sse);
+
+          // save motion search result for use in compound prediction
+          seg_mvs[i][mbmi->ref_frame - 1].as_int = mode_mv[NEW4X4].as_int;
+        }
+      } /* NEW4X4 */
+      else if (mbmi->second_ref_frame && this_mode == NEW4X4) {
+        /* If the motion search was not completed, skip NEWMV for this
+         * block under compound prediction. */
+        if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
+            seg_mvs[i][mbmi->ref_frame        - 1].as_int == INVALID_MV) {
+          continue;
+        }
+      }
+
+      rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
+                         &second_mode_mv[this_mode], seg_mvs[i],
+                         bsi->ref_mv, bsi->second_ref_mv, XMVCOST);
+
+      // Trap vectors that reach beyond the UMV borders
+      if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
+          ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+          ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
+          ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
+        continue;
+      }
+      if (mbmi->second_ref_frame &&
+          mv_check_bounds(x, &second_mode_mv[this_mode]))
+        continue;
+
+      if (segmentation == PARTITIONING_4X4) {
+        this_rd = encode_inter_mb_segment(x, labels, i, &labelyrate,
+                                          &distortion,
+                                          ta_s, tl_s, IF_RTCD(&cpi->rtcd));
+        other_rd = this_rd;
+      } else {
+        this_rd = encode_inter_mb_segment_8x8(x, labels, i, &labelyrate,
+                                              &distortion, &other_rd,
+                                              ta_s, tl_s, IF_RTCD(&cpi->rtcd));
+      }
+      this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
+      rate += labelyrate;
+
+      if (this_rd < best_label_rd) {
+        sbr = rate;
+        sbd = distortion;
+        bestlabelyrate = labelyrate;
+        mode_selected = this_mode;
+        best_label_rd = this_rd;
+        if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_4X4) {
+          for (j = 0; j < 16; j++)
+            if (labels[j] == i)
+              best_eobs[j] = x->e_mbd.block[j].eob;
+        } else {
+          for (j = 0; j < 4; j++) {
+            int ib = vp9_i8x8_block[j], idx = j * 4;
+
+            if (labels[ib] == i)
+              best_eobs[idx] = x->e_mbd.block[idx].eob;
+          }
+        }
+        if (other_rd < best_other_rd)
+          best_other_rd = other_rd;
+
+        vpx_memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
+        vpx_memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
+
+      }
+    } /*for each 4x4 mode*/
+
+    vpx_memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
+
+    labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
+                &second_mode_mv[mode_selected], seg_mvs[i],
+                bsi->ref_mv, bsi->second_ref_mv, XMVCOST);
+
+    br += sbr;
+    bd += sbd;
+    segmentyrate += bestlabelyrate;
+    this_segment_rd += best_label_rd;
+    other_segment_rd += best_other_rd;
+    if (rds)
+      rds[i] = this_segment_rd;
+    if (otherrds)
+      otherrds[i] = other_segment_rd;
+  } /* for each label */
+
+  if (this_segment_rd < bsi->segment_rd) {
+    bsi->r = br;
+    bsi->d = bd;
+    bsi->segment_yrate = segmentyrate;
+    bsi->segment_rd = this_segment_rd;
+    bsi->segment_num = segmentation;
+    bsi->txfm_size = mbmi->txfm_size;
+
+    // store everything needed to revisit this partitioning
+    for (i = 0; i < 16; i++) {
+      BLOCKD *bd = &x->e_mbd.block[i];
+
+      bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
+      if (mbmi->second_ref_frame)
+        bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
+      bsi->modes[i] = x->partition_info->bmi[i].mode;
+      bsi->eobs[i] = best_eobs[i];
+    }
+  }
+
+  if (completed) {
+    *completed = i;
+  }
+}
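+/* rd_check_segment_txsize() reports via *completed how many labels were
+ * fully searched before the running RD cost exceeded bsi->segment_rd;
+ * callers compare it against the label count to decide whether the
+ * per-label rd[] totals are usable. */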
+
+static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x,
+                             BEST_SEG_INFO *bsi,
+                             unsigned int segmentation,
+                             /* 16 = n_blocks */
+                             int_mv seg_mvs[16][MAX_REF_FRAMES - 1],
+                             int64_t txfm_cache[NB_TXFM_MODES]) {
+  int i, n, c = vp9_mbsplit_count[segmentation];
+
+  if (segmentation == PARTITIONING_4X4) {
+    int64_t rd[16];
+
+    rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, NULL,
+                            rd, &n, seg_mvs);
+    if (n == c) {
+      for (i = 0; i < NB_TXFM_MODES; i++) {
+        if (rd[c - 1] < txfm_cache[i])
+          txfm_cache[i] = rd[c - 1];
+      }
+    }
+  } else {
+    int64_t diff, base_rd;
+    int cost4x4 = vp9_cost_bit(cpi->common.prob_tx[0], 0);
+    int cost8x8 = vp9_cost_bit(cpi->common.prob_tx[0], 1);
+
+    if (cpi->common.txfm_mode == TX_MODE_SELECT) {
+      int64_t rd4x4[4], rd8x8[4];
+      int n4x4, n8x8, nmin;
+      BEST_SEG_INFO bsi4x4, bsi8x8;
+
+      /* factor in cost of cost4x4/8x8 in decision */
+      vpx_memcpy(&bsi4x4, bsi, sizeof(*bsi));
+      vpx_memcpy(&bsi8x8, bsi, sizeof(*bsi));
+      rd_check_segment_txsize(cpi, x, &bsi4x4, segmentation,
+                              TX_4X4, NULL, rd4x4, &n4x4, seg_mvs);
+      rd_check_segment_txsize(cpi, x, &bsi8x8, segmentation,
+                              TX_8X8, NULL, rd8x8, &n8x8, seg_mvs);
+      if (bsi4x4.segment_num == segmentation) {
+        bsi4x4.segment_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
+        if (bsi4x4.segment_rd < bsi->segment_rd)
+          vpx_memcpy(bsi, &bsi4x4, sizeof(*bsi));
+      }
+      if (bsi8x8.segment_num == segmentation) {
+        bsi8x8.segment_rd += RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
+        if (bsi8x8.segment_rd < bsi->segment_rd)
+          vpx_memcpy(bsi, &bsi8x8, sizeof(*bsi));
+      }
+      n = n4x4 > n8x8 ? n4x4 : n8x8;
+      if (n == c) {
+        nmin = n4x4 < n8x8 ? n4x4 : n8x8;
+        diff = rd8x8[nmin - 1] - rd4x4[nmin - 1];
+        if (n == n4x4) {
+          base_rd = rd4x4[c - 1];
+        } else {
+          base_rd = rd8x8[c - 1] - diff;
+        }
+      }
+    } else {
+      int64_t rd[4], otherrd[4];
+
+      if (cpi->common.txfm_mode == ONLY_4X4) {
+        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, otherrd,
+                                rd, &n, seg_mvs);
+        if (n == c) {
+          base_rd = rd[c - 1];
+          diff = otherrd[c - 1] - rd[c - 1];
+        }
+      } else /* use 8x8 transform */ {
+        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_8X8, otherrd,
+                                rd, &n, seg_mvs);
+        if (n == c) {
+          diff = rd[c - 1] - otherrd[c - 1];
+          base_rd = otherrd[c - 1];
+        }
+      }
+    }
+
+    if (n == c) {
+      if (base_rd < txfm_cache[ONLY_4X4]) {
+        txfm_cache[ONLY_4X4] = base_rd;
+      }
+      if (base_rd + diff < txfm_cache[ALLOW_8X8]) {
+        txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] = base_rd + diff;
+      }
+      if (diff < 0) {
+        base_rd += diff + RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
+      } else {
+        base_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
+      }
+      if (base_rd < txfm_cache[TX_MODE_SELECT]) {
+        txfm_cache[TX_MODE_SELECT] = base_rd;
+      }
+    }
+  }
+}
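+/* In the split search above, base_rd ends up tracking the total RD of the
+ * full split with the 4x4 transform and diff the 8x8-minus-4x4 delta, so
+ * each txfm_cache[] entry can be updated without re-running the segment
+ * search once per transform mode. */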
+
+static __inline void cal_step_param(int sr, int *sp) {
+  int step = 0;
+
+  if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP;
+  else if (sr < 1) sr = 1;
+
+  while (sr >>= 1)
+    step++;
+
+  *sp = MAX_MVSEARCH_STEPS - 1 - step;
+}
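+/* cal_step_param() maps a search range sr to a first-step parameter: step
+ * is floor(log2(sr)), so e.g. sr == 8 gives step == 3 and (assuming
+ * MAX_MVSEARCH_STEPS == 8) *sp == 4. A wider spread between the 8x8 MVs
+ * thus yields a smaller step parameter and a larger diamond search. */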
+
+static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
+                                       int_mv *best_ref_mv,
+                                       int_mv *second_best_ref_mv,
+                                       int64_t best_rd,
+                                       int *mdcounts,
+                                       int *returntotrate,
+                                       int *returnyrate,
+                                       int *returndistortion,
+                                       int *skippable, int mvthresh,
+                                       int_mv seg_mvs[NB_PARTITIONINGS]
+                                                     [16 /* n_blocks */]
+                                                     [MAX_REF_FRAMES - 1],
+                                       int64_t txfm_cache[NB_TXFM_MODES]) {
+  int i;
+  BEST_SEG_INFO bsi;
+  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+
+  vpx_memset(&bsi, 0, sizeof(bsi));
+  for (i = 0; i < NB_TXFM_MODES; i++)
+    txfm_cache[i] = INT64_MAX;
+
+  bsi.segment_rd = best_rd;
+  bsi.ref_mv = best_ref_mv;
+  bsi.second_ref_mv = second_best_ref_mv;
+  bsi.mvp.as_int = best_ref_mv->as_int;
+  bsi.mvthresh = mvthresh;
+  bsi.mdcounts = mdcounts;
+  bsi.txfm_size = TX_4X4;
+
+  for (i = 0; i < 16; i++)
+    bsi.modes[i] = ZERO4X4;
+
+  if (cpi->compressor_speed == 0) {
+    /* for now, we will keep the original segmentation order
+       when in best quality mode */
+    rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
+                     seg_mvs[PARTITIONING_16X8], txfm_cache);
+    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
+                     seg_mvs[PARTITIONING_8X16], txfm_cache);
+    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
+                     seg_mvs[PARTITIONING_8X8], txfm_cache);
+    rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
+                     seg_mvs[PARTITIONING_4X4], txfm_cache);
+  } else {
+    int sr;
+
+    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
+                     seg_mvs[PARTITIONING_8X8], txfm_cache);
+
+    if (bsi.segment_rd < best_rd) {
+      int tmp_col_min = x->mv_col_min;
+      int tmp_col_max = x->mv_col_max;
+      int tmp_row_min = x->mv_row_min;
+      int tmp_row_max = x->mv_row_max;
+
+      vp9_clamp_mv_min_max(x, best_ref_mv);
+
+      /* Get 8x8 result */
+      bsi.sv_mvp[0].as_int = bsi.mvs[0].as_int;
+      bsi.sv_mvp[1].as_int = bsi.mvs[2].as_int;
+      bsi.sv_mvp[2].as_int = bsi.mvs[8].as_int;
+      bsi.sv_mvp[3].as_int = bsi.mvs[10].as_int;
+
+      /* Use 8x8 result as 16x8/8x16's predictor MV. Adjust search range
+       * according to the closeness of 2 MV. */
+      /* block 8X16 */
+      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row)) >> 3,
+                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col)) >> 3);
+      cal_step_param(sr, &bsi.sv_istep[0]);
+
+      sr = MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
+                (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
+      cal_step_param(sr, &bsi.sv_istep[1]);
+
+      rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
+                       seg_mvs[PARTITIONING_8X16], txfm_cache);
+
+      /* block 16X8 */
+      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row)) >> 3,
+                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col)) >> 3);
+      cal_step_param(sr, &bsi.sv_istep[0]);
+
+      sr = MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
+                (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
+      cal_step_param(sr, &bsi.sv_istep[1]);
+
+      rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
+                       seg_mvs[PARTITIONING_16X8], txfm_cache);
+
+      /* If 8x8 is better than 16x8/8x16, then do 4x4 search */
+      /* Do not skip the 4x4 search when speed == 0 (good quality) */
+      if (cpi->sf.no_skip_block4x4_search ||
+          bsi.segment_num == PARTITIONING_8X8) {
+        /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
+        bsi.mvp.as_int = bsi.sv_mvp[0].as_int;
+        rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
+                         seg_mvs[PARTITIONING_4X4], txfm_cache);
+      }
+
+      /* restore UMV window */
+      x->mv_col_min = tmp_col_min;
+      x->mv_col_max = tmp_col_max;
+      x->mv_row_min = tmp_row_min;
+      x->mv_row_max = tmp_row_max;
+    }
+  }
+
+  /* set it to the best */
+  for (i = 0; i < 16; i++) {
+    BLOCKD *bd = &x->e_mbd.block[i];
+
+    bd->bmi.as_mv.first.as_int = bsi.mvs[i].as_int;
+    if (mbmi->second_ref_frame)
+      bd->bmi.as_mv.second.as_int = bsi.second_mvs[i].as_int;
+    bd->eob = bsi.eobs[i];
+  }
+
+  *returntotrate = bsi.r;
+  *returndistortion = bsi.d;
+  *returnyrate = bsi.segment_yrate;
+  *skippable = bsi.txfm_size == TX_4X4 ?
+                    vp9_mby_is_skippable_4x4(&x->e_mbd, 0) :
+                    vp9_mby_is_skippable_8x8(&x->e_mbd, 0);
+
+  /* save partitions */
+  mbmi->txfm_size = bsi.txfm_size;
+  mbmi->partitioning = bsi.segment_num;
+  x->partition_info->count = vp9_mbsplit_count[bsi.segment_num];
+
+  for (i = 0; i < x->partition_info->count; i++) {
+    int j;
+
+    j = vp9_mbsplit_offset[bsi.segment_num][i];
+
+    x->partition_info->bmi[i].mode = bsi.modes[j];
+    x->partition_info->bmi[i].mv.as_mv = bsi.mvs[j].as_mv;
+    if (mbmi->second_ref_frame)
+      x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[j].as_mv;
+  }
+  /*
+   * used to set mbmi->mv.as_int
+   */
+  x->partition_info->bmi[15].mv.as_int = bsi.mvs[15].as_int;
+  if (mbmi->second_ref_frame)
+    x->partition_info->bmi[15].second_mv.as_int = bsi.second_mvs[15].as_int;
+
+  return bsi.segment_rd;
+}
+
+/* Sort arr into increasing order */
+static void insertsortmv(int arr[], int len) {
+  int i, j, k;
+
+  for (i = 1; i <= len - 1; i++) {
+    for (j = 0; j < i; j++) {
+      if (arr[j] > arr[i]) {
+        int temp;
+
+        temp = arr[i];
+
+        for (k = i; k > j; k--)
+          arr[k] = arr[k - 1];
+
+        arr[j] = temp;
+      }
+    }
+  }
+}
+
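+/* As above, but keeps idx[] in step with arr[] so the caller can recover
+ * each element's original position (used to rank the near_sad candidates). */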
+static void insertsortsad(int arr[], int idx[], int len) {
+  int i, j, k;
+
+  for (i = 1; i <= len - 1; i++) {
+    for (j = 0; j < i; j++) {
+      if (arr[j] > arr[i]) {
+        int temp, tempi;
+
+        temp = arr[i];
+        tempi = idx[i];
+
+        for (k = i; k > j; k--) {
+          arr[k] = arr[k - 1];
+          idx[k] = idx[k - 1];
+        }
+
+        arr[j] = temp;
+        idx[j] = tempi;
+      }
+    }
+  }
+}
+
+// The improved MV prediction
+void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here,
+                 int_mv *mvp, int refframe, int *ref_frame_sign_bias,
+                 int *sr, int near_sadidx[]) {
+  const MODE_INFO *above = here - xd->mode_info_stride;
+  const MODE_INFO *left = here - 1;
+  const MODE_INFO *aboveleft = above - 1;
+  int_mv           near_mvs[8];
+  int              near_ref[8];
+  int_mv           mv;
+  int              vcnt = 0;
+  int              find = 0;
+  int              mb_offset;
+
+  int              mvx[8];
+  int              mvy[8];
+  int              i;
+
+  mv.as_int = 0;
+
+  if (here->mbmi.ref_frame != INTRA_FRAME) {
+    near_mvs[0].as_int = near_mvs[1].as_int = near_mvs[2].as_int =
+        near_mvs[3].as_int = near_mvs[4].as_int = near_mvs[5].as_int =
+        near_mvs[6].as_int = near_mvs[7].as_int = 0;
+    near_ref[0] = near_ref[1] = near_ref[2] = near_ref[3] =
+        near_ref[4] = near_ref[5] = near_ref[6] = near_ref[7] = 0;
+
+    // read in 3 nearby blocks' MVs from the current frame as prediction
+    // candidates.
+    if (above->mbmi.ref_frame != INTRA_FRAME) {
+      near_mvs[vcnt].as_int = above->mbmi.mv[0].as_int;
+      mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe,
+              &near_mvs[vcnt], ref_frame_sign_bias);
+      near_ref[vcnt] = above->mbmi.ref_frame;
+    }
+    vcnt++;
+    if (left->mbmi.ref_frame != INTRA_FRAME) {
+      near_mvs[vcnt].as_int = left->mbmi.mv[0].as_int;
+      mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe,
+              &near_mvs[vcnt], ref_frame_sign_bias);
+      near_ref[vcnt] = left->mbmi.ref_frame;
+    }
+    vcnt++;
+    if (aboveleft->mbmi.ref_frame != INTRA_FRAME) {
+      near_mvs[vcnt].as_int = aboveleft->mbmi.mv[0].as_int;
+      mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe,
+              &near_mvs[vcnt], ref_frame_sign_bias);
+      near_ref[vcnt] = aboveleft->mbmi.ref_frame;
+    }
+    vcnt++;
+
+    // read in 5 nearby blocks' MVs from the last frame.
+    if (cpi->common.last_frame_type != KEY_FRAME) {
+      mb_offset = (-xd->mb_to_top_edge / 128 + 1) * (xd->mode_info_stride + 1) +
+                  (-xd->mb_to_left_edge / 128 + 1);
+
+      // current in last frame
+      if (cpi->lf_ref_frame[mb_offset] != INTRA_FRAME) {
+        near_mvs[vcnt].as_int = cpi->lfmv[mb_offset].as_int;
+        mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset], refframe,
+                &near_mvs[vcnt], ref_frame_sign_bias);
+        near_ref[vcnt] = cpi->lf_ref_frame[mb_offset];
+      }
+      vcnt++;
+
+      // above in last frame
+      if (cpi->lf_ref_frame[mb_offset - xd->mode_info_stride - 1] !=
+          INTRA_FRAME) {
+        near_mvs[vcnt].as_int =
+            cpi->lfmv[mb_offset - xd->mode_info_stride - 1].as_int;
+        mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - xd->mode_info_stride - 1],
+                refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+        near_ref[vcnt] = cpi->lf_ref_frame[mb_offset - xd->mode_info_stride - 1];
+      }
+      vcnt++;
+
+      // left in last frame
+      if (cpi->lf_ref_frame[mb_offset - 1] != INTRA_FRAME) {
+        near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - 1].as_int;
+        mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - 1], refframe,
+                &near_mvs[vcnt], ref_frame_sign_bias);
+        near_ref[vcnt] = cpi->lf_ref_frame[mb_offset - 1];
+      }
+      vcnt++;
+
+      // right in last frame
+      if (cpi->lf_ref_frame[mb_offset + 1] != INTRA_FRAME) {
+        near_mvs[vcnt].as_int = cpi->lfmv[mb_offset + 1].as_int;
+        mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset + 1], refframe,
+                &near_mvs[vcnt], ref_frame_sign_bias);
+        near_ref[vcnt] = cpi->lf_ref_frame[mb_offset + 1];
+      }
+      vcnt++;
+
+      // below in last frame
+      if (cpi->lf_ref_frame[mb_offset + xd->mode_info_stride + 1] !=
+          INTRA_FRAME) {
+        near_mvs[vcnt].as_int =
+            cpi->lfmv[mb_offset + xd->mode_info_stride + 1].as_int;
+        mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset + xd->mode_info_stride + 1],
+                refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+        near_ref[vcnt] =
+            cpi->lf_ref_frame[mb_offset + xd->mode_info_stride + 1];
+      }
+      vcnt++;
+    }
+
+    for (i = 0; i < vcnt; i++) {
+      if (near_ref[near_sadidx[i]] != INTRA_FRAME) {
+        if (here->mbmi.ref_frame == near_ref[near_sadidx[i]]) {
+          mv.as_int = near_mvs[near_sadidx[i]].as_int;
+          find = 1;
+          if (i < 3)
+            *sr = 3;
+          else
+            *sr = 2;
+          break;
+        }
+      }
+    }
+
+    if (!find) {
+      for (i = 0; i < vcnt; i++) {
+        mvx[i] = near_mvs[i].as_mv.row;
+        mvy[i] = near_mvs[i].as_mv.col;
+      }
+
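+      // No candidate shares this block's reference frame, so fall back to
+      // the component-wise median of all candidates (note that mvx[] holds
+      // the row components and mvy[] the columns).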
+      insertsortmv(mvx, vcnt);
+      insertsortmv(mvy, vcnt);
+      mv.as_mv.row = mvx[vcnt / 2];
+      mv.as_mv.col = mvy[vcnt / 2];
+
+      find = 1;
+      // sr is set to 0 to allow calling function to decide the search range.
+      *sr = 0;
+    }
+  }
+
+  /* Set up return values */
+  mvp->as_int = mv.as_int;
+  clamp_mv2(mvp, xd);
+}
+
+static void cal_sad(VP9_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x,
+                    int recon_yoffset, int near_sadidx[],
+                    enum BlockSize block_size) {
+  /* 0-cf above, 1-cf left, 2-cf aboveleft, 3-lf current, 4-lf above,
+   * 5-lf left, 6-lf right, 7-lf below */
+  int near_sad[8] = {0};
+  BLOCK *b = &x->block[0];
+  unsigned char *src_y_ptr = *(b->base_src);
+  const unsigned char *dst_y_ptr = xd->dst.y_buffer;
+  const int bs = (block_size == BLOCK_16X16) ? 16 : 32;
+  const int dst_y_str = xd->dst.y_stride;
+
+  // calculate sad for current frame 3 nearby MBs.
+  if (xd->mb_to_top_edge == 0 && xd->mb_to_left_edge == 0) {
+    near_sad[0] = near_sad[1] = near_sad[2] = INT_MAX;
+  } else if (xd->mb_to_top_edge == 0) {
+    // only has left MB for sad calculation.
+    near_sad[0] = near_sad[2] = INT_MAX;
+    near_sad[1] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                              dst_y_ptr - bs,
+                                              dst_y_str, 0x7fffffff);
+  } else if (xd->mb_to_left_edge == 0) {
+    // only has above MB for sad calculation.
+    near_sad[1] = near_sad[2] = INT_MAX;
+    near_sad[0] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                              dst_y_ptr - dst_y_str * bs,
+                                              dst_y_str, 0x7fffffff);
+  } else {
+    near_sad[0] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                              dst_y_ptr - dst_y_str * bs,
+                                              dst_y_str, 0x7fffffff);
+    near_sad[1] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                              dst_y_ptr - bs,
+                                              dst_y_str, 0x7fffffff);
+    near_sad[2] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                              dst_y_ptr - dst_y_str * bs - bs,
+                                              dst_y_str, 0x7fffffff);
+  }
+
+  if (cpi->common.last_frame_type != KEY_FRAME) {
+    // calculate sad for last frame 5 nearby MBs.
+    unsigned char *pre_y_buffer =
+        cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_buffer + recon_yoffset;
+    const int pre_y_str = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_stride;
+
+    if (xd->mb_to_top_edge == 0) near_sad[4] = INT_MAX;
+    if (xd->mb_to_left_edge == 0) near_sad[5] = INT_MAX;
+    if (xd->mb_to_right_edge == 0) near_sad[6] = INT_MAX;
+    if (xd->mb_to_bottom_edge == 0) near_sad[7] = INT_MAX;
+
+    near_sad[3] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                              pre_y_buffer,
+                                              pre_y_str, 0x7fffffff);
+    if (near_sad[4] != INT_MAX)
+      near_sad[4] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                                pre_y_buffer - pre_y_str * bs,
+                                                pre_y_str, 0x7fffffff);
+    if (near_sad[5] != INT_MAX)
+      near_sad[5] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                                pre_y_buffer - bs,
+                                                pre_y_str, 0x7fffffff);
+    if (near_sad[6] != INT_MAX)
+      near_sad[6] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                                pre_y_buffer + bs,
+                                                pre_y_str, 0x7fffffff);
+    if (near_sad[7] != INT_MAX)
+      near_sad[7] = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+                                                pre_y_buffer + pre_y_str * bs,
+                                                pre_y_str, 0x7fffffff);
+  }
+
+  if (cpi->common.last_frame_type != KEY_FRAME) {
+    insertsortsad(near_sad, near_sadidx, 8);
+  } else {
+    insertsortsad(near_sad, near_sadidx, 3);
+  }
+}
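+/* After cal_sad(), near_sadidx[] ranks the neighbouring blocks from lowest
+ * to highest SAD against the current block; vp9_mv_pred() walks that
+ * ranking when picking its MV predictor. */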
+
+static void set_i8x8_block_modes(MACROBLOCK *x, int modes[2][4]) {
+  int i;
+  MACROBLOCKD *xd = &x->e_mbd;
+  for (i = 0; i < 4; i++) {
+    int ib = vp9_i8x8_block[i];
+    xd->mode_info_context->bmi[ib + 0].as_mode.first = modes[0][i];
+    xd->mode_info_context->bmi[ib + 1].as_mode.first = modes[0][i];
+    xd->mode_info_context->bmi[ib + 4].as_mode.first = modes[0][i];
+    xd->mode_info_context->bmi[ib + 5].as_mode.first = modes[0][i];
+#if CONFIG_COMP_INTRA_PRED
+    xd->mode_info_context->bmi[ib + 0].as_mode.second = modes[1][i];
+    xd->mode_info_context->bmi[ib + 1].as_mode.second = modes[1][i];
+    xd->mode_info_context->bmi[ib + 4].as_mode.second = modes[1][i];
+    xd->mode_info_context->bmi[ib + 5].as_mode.second = modes[1][i];
+#endif
+    // printf("%d,%d,%d,%d %d,%d,%d,%d\n",
+    //       modes[0][0], modes[0][1], modes[0][2], modes[0][3],
+    //       modes[1][0], modes[1][1], modes[1][2], modes[1][3]);
+  }
+
+  for (i = 0; i < 16; i++) {
+    xd->block[i].bmi = xd->mode_info_context->bmi[i];
+  }
+}
+
+extern void vp9_calc_ref_probs(int *count, vp9_prob *probs);
+static void estimate_curframe_refprobs(VP9_COMP *cpi,
+                                       vp9_prob mod_refprobs[3],
+                                       int pred_ref) {
+  int norm_cnt[MAX_REF_FRAMES];
+  const int *const rfct = cpi->count_mb_ref_frame_usage;
+  int intra_count = rfct[INTRA_FRAME];
+  int last_count  = rfct[LAST_FRAME];
+  int gf_count    = rfct[GOLDEN_FRAME];
+  int arf_count   = rfct[ALTREF_FRAME];
+
+  // Work out modified reference frame probabilities to use where prediction
+  // of the reference frame fails
+  if (pred_ref == INTRA_FRAME) {
+    norm_cnt[0] = 0;
+    norm_cnt[1] = last_count;
+    norm_cnt[2] = gf_count;
+    norm_cnt[3] = arf_count;
+    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
+    mod_refprobs[0] = 0;    // This branch implicit
+  } else if (pred_ref == LAST_FRAME) {
+    norm_cnt[0] = intra_count;
+    norm_cnt[1] = 0;
+    norm_cnt[2] = gf_count;
+    norm_cnt[3] = arf_count;
+    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
+    mod_refprobs[1] = 0;    // This branch implicit
+  } else if (pred_ref == GOLDEN_FRAME) {
+    norm_cnt[0] = intra_count;
+    norm_cnt[1] = last_count;
+    norm_cnt[2] = 0;
+    norm_cnt[3] = arf_count;
+    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
+    mod_refprobs[2] = 0;  // This branch implicit
+  } else {
+    norm_cnt[0] = intra_count;
+    norm_cnt[1] = last_count;
+    norm_cnt[2] = gf_count;
+    norm_cnt[3] = 0;
+    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
+    mod_refprobs[2] = 0;  // This branch implicit
+  }
+}
+
+static __inline unsigned weighted_cost(vp9_prob *tab0, vp9_prob *tab1,
+                                       int idx, int val, int weight) {
+  unsigned cost0 = tab0[idx] ? vp9_cost_bit(tab0[idx], val) : 0;
+  unsigned cost1 = tab1[idx] ? vp9_cost_bit(tab1[idx], val) : 0;
+  // weight is 16-bit fixed point, so this basically calculates:
+  // 0.5 + weight * cost1 + (1.0 - weight) * cost0
+  return (0x8000 + weight * cost1 + (0x10000 - weight) * cost0) >> 16;
+}
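+/* weighted_cost() example: weight == 0x8000 (0.5 in 16-bit fixed point)
+ * yields the rounded average of the two bit costs, weight == 0 selects
+ * cost0 and weight == 0x10000 selects cost1. */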
+
+static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
+                                     unsigned int *ref_costs) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  vp9_prob *mod_refprobs;
+
+  unsigned int cost;
+  int pred_ref;
+  int pred_flag;
+  int pred_ctx;
+  int i;
+  int tot_count;
+
+  vp9_prob pred_prob, new_pred_prob;
+  int seg_ref_active;
+  int seg_ref_count = 0;
+  seg_ref_active = vp9_segfeature_active(xd,
+                                         segment_id,
+                                         SEG_LVL_REF_FRAME);
+
+  if (seg_ref_active) {
+    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME)  +
+                    vp9_check_segref(xd, segment_id, LAST_FRAME)   +
+                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
+                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
+  }
+
+  // Get the predicted reference for this mb
+  pred_ref = vp9_get_pred_ref(cm, xd);
+
+  // Get the context probability for the prediction flag (based on last frame)
+  pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
+
+  // Predict probability for current frame based on stats so far
+  pred_ctx = vp9_get_pred_context(cm, xd, PRED_REF);
+  tot_count = cpi->ref_pred_count[pred_ctx][0] +
+              cpi->ref_pred_count[pred_ctx][1];
+  if (tot_count) {
+    new_pred_prob =
+      (cpi->ref_pred_count[pred_ctx][0] * 255 + (tot_count >> 1)) / tot_count;
+    new_pred_prob += !new_pred_prob;
+  } else
+    new_pred_prob = 128;
+
+  // Get the set of probabilities to use if prediction fails
+  mod_refprobs = cm->mod_refprobs[pred_ref];
+
+  // For each possible selected reference frame work out a cost.
+  for (i = 0; i < MAX_REF_FRAMES; i++) {
+    if (seg_ref_active && seg_ref_count == 1) {
+      cost = 0;
+    } else {
+      pred_flag = (i == pred_ref);
+
+      // Get the prediction for the current mb
+      cost = weighted_cost(&pred_prob, &new_pred_prob, 0,
+                           pred_flag, cpi->seg0_progress);
+      if (cost > 1024) cost = 768; // i.e. account for 4 bits max.
+
+      // for incorrectly predicted cases
+      if (!pred_flag) {
+        vp9_prob curframe_mod_refprobs[3];
+
+        if (cpi->seg0_progress) {
+          estimate_curframe_refprobs(cpi, curframe_mod_refprobs, pred_ref);
+        } else {
+          vpx_memset(curframe_mod_refprobs, 0, sizeof(curframe_mod_refprobs));
+        }
+
+        cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 0,
+                              (i != INTRA_FRAME), cpi->seg0_progress);
+        if (i != INTRA_FRAME) {
+          cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 1,
+                                (i != LAST_FRAME), cpi->seg0_progress);
+          if (i != LAST_FRAME) {
+            cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 2,
+                                  (i != GOLDEN_FRAME), cpi->seg0_progress);
+          }
+        }
+      }
+    }
+
+    ref_costs[i] = cost;
+  }
+}
+
+static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+                                 int mode_index,
+                                 PARTITION_INFO *partition,
+                                 int_mv *ref_mv,
+                                 int_mv *second_ref_mv,
+                                 int single_pred_diff,
+                                 int comp_pred_diff,
+                                 int hybrid_pred_diff,
+                                 int64_t txfm_size_diff[NB_TXFM_MODES]) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+
+  // Take a snapshot of the coding context so it can be
+  // restored if we decide to encode this way
+  ctx->best_mode_index = mode_index;
+  vpx_memcpy(&ctx->mic, xd->mode_info_context,
+             sizeof(MODE_INFO));
+  if (partition)
+    vpx_memcpy(&ctx->partition_info, partition,
+               sizeof(PARTITION_INFO));
+  ctx->best_ref_mv.as_int = ref_mv->as_int;
+  ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
+
+  // ctx[mb_index].rddiv = x->rddiv;
+  // ctx[mb_index].rdmult = x->rdmult;
+
+  ctx->single_pred_diff = single_pred_diff;
+  ctx->comp_pred_diff   = comp_pred_diff;
+  ctx->hybrid_pred_diff = hybrid_pred_diff;
+
+  if (txfm_size_diff) {
+    memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
+  } else {
+    memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
+  }
+}
+
+static void inter_mode_cost(VP9_COMP *cpi, MACROBLOCK *x, int this_mode,
+                            int *rate2, int *distortion2, int *rate_y,
+                            int *distortion, int* rate_uv, int *distortion_uv,
+                            int *skippable, int64_t txfm_cache[NB_TXFM_MODES]) {
+  int y_skippable, uv_skippable;
+
+  // Y cost and distortion
+  macro_block_yrd(cpi, x, rate_y, distortion, &y_skippable, txfm_cache);
+
+  *rate2 += *rate_y;
+  *distortion2 += *distortion;
+
+  // UV cost and distortion
+  if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4)
+    rd_inter16x16_uv_8x8(cpi, x, rate_uv, distortion_uv,
+                         cpi->common.full_pixel, &uv_skippable);
+  else
+    rd_inter16x16_uv(cpi, x, rate_uv, distortion_uv, cpi->common.full_pixel,
+                     &uv_skippable);
+  *rate2 += *rate_uv;
+  *distortion2 += *distortion_uv;
+  *skippable = y_skippable && uv_skippable;
+}
+
+#define MIN(x,y) (((x)<(y))?(x):(y))
+#define MAX(x,y) (((x)>(y))?(x):(y))
+static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+                               int idx, int frame_type,
+                               int recon_yoffset, int recon_uvoffset,
+                               int_mv frame_nearest_mv[4],
+                               int_mv frame_near_mv[4],
+                               int_mv frame_best_ref_mv[4],
+                               int frame_mdcounts[4][4],
+                               unsigned char *y_buffer[4],
+                               unsigned char *u_buffer[4],
+                               unsigned char *v_buffer[4]) {
+  YV12_BUFFER_CONFIG *yv12 = &cpi->common.yv12_fb[idx];
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+
+  vp9_find_near_mvs(xd, xd->mode_info_context,
+                    xd->prev_mode_info_context,
+                    &frame_nearest_mv[frame_type], &frame_near_mv[frame_type],
+                    &frame_best_ref_mv[frame_type], frame_mdcounts[frame_type],
+                    frame_type, cpi->common.ref_frame_sign_bias);
+
+  y_buffer[frame_type] = yv12->y_buffer + recon_yoffset;
+  u_buffer[frame_type] = yv12->u_buffer + recon_uvoffset;
+  v_buffer[frame_type] = yv12->v_buffer + recon_uvoffset;
+
+#if CONFIG_NEWBESTREFMV
+  vp9_find_mv_refs(xd, xd->mode_info_context,
+                   xd->prev_mode_info_context,
+                   frame_type,
+                   mbmi->ref_mvs[frame_type],
+                   cpi->common.ref_frame_sign_bias);
+
+  vp9_find_best_ref_mvs(xd, y_buffer[frame_type],
+                        yv12->y_stride,
+                        mbmi->ref_mvs[frame_type],
+                        &frame_best_ref_mv[frame_type],
+                        &frame_nearest_mv[frame_type],
+                        &frame_near_mv[frame_type]);
+#endif
+}
+
+static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                 enum BlockSize block_size,
+                                 int *saddone, int near_sadidx[],
+                                 int mdcounts[4], int64_t txfm_cache[],
+                                 int *rate2, int *distortion, int *skippable,
+                                 int *compmode_cost,
+                                 int *rate_y, int *distortion_y,
+                                 int *rate_uv, int *distortion_uv,
+                                 int *mode_excluded, int *disable_skip,
+                                 int recon_yoffset, int mode_index,
+                                 int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
+                                 int_mv frame_best_ref_mv[4]) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  BLOCK *b = &x->block[0];
+  BLOCKD *d = &xd->block[0];
+  const int is_comp_pred = (mbmi->second_ref_frame != 0);
+  const int num_refs = is_comp_pred ? 2 : 1;
+  const int this_mode = mbmi->mode;
+  int i;
+  int refs[2] = { mbmi->ref_frame, mbmi->second_ref_frame };
+  int_mv cur_mv[2];
+  int_mv mvp;
+  int64_t this_rd = 0;
+
+  switch (this_mode) {
+    case NEWMV:
+      if (is_comp_pred) {
+        if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV ||
+            frame_mv[NEWMV][refs[1]].as_int == INVALID_MV)
+          return INT64_MAX;
+        *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[0]],
+                                  &frame_best_ref_mv[refs[0]],
+                                  XMVCOST, 96,
+                                  x->e_mbd.allow_high_precision_mv);
+        *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[1]],
+                                  &frame_best_ref_mv[refs[1]],
+                                  XMVCOST, 96,
+                                  x->e_mbd.allow_high_precision_mv);
+      } else {
+        int bestsme = INT_MAX;
+        int further_steps, step_param = cpi->sf.first_step;
+        int sadpb = x->sadperbit16;
+        int_mv mvp_full, tmp_mv;
+        // search range obtained from mv_pred(), expressed in step_param
+        // levels (0-7)
+        int sr = 0;
+
+        int tmp_col_min = x->mv_col_min;
+        int tmp_col_max = x->mv_col_max;
+        int tmp_row_min = x->mv_row_min;
+        int tmp_row_max = x->mv_row_max;
+
+        vp9_clamp_mv_min_max(x, &frame_best_ref_mv[refs[0]]);
+
+        if (!*saddone) {
+          cal_sad(cpi, xd, x, recon_yoffset, &near_sadidx[0], block_size);
+          *saddone = 1;
+        }
+
+        vp9_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
+                    mbmi->ref_frame, cpi->common.ref_frame_sign_bias,
+                    &sr, &near_sadidx[0]);
+
+        mvp_full.as_mv.col = mvp.as_mv.col >> 3;
+        mvp_full.as_mv.row = mvp.as_mv.row >> 3;
+
+        // adjust search range according to sr from mv prediction
+        step_param = MAX(step_param, sr);
+
+        // Further step/diamond searches as necessary
+        further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+        bestsme = vp9_full_pixel_diamond(cpi, x, b, d, &mvp_full, step_param,
+                                         sadpb, further_steps, 1,
+                                         &cpi->fn_ptr[block_size],
+                                         &frame_best_ref_mv[refs[0]], &tmp_mv);
+
+        x->mv_col_min = tmp_col_min;
+        x->mv_col_max = tmp_col_max;
+        x->mv_row_min = tmp_row_min;
+        x->mv_row_max = tmp_row_max;
+
+        if (bestsme < INT_MAX) {
+          int dis; /* TODO: use dis in distortion calculation later. */
+          unsigned int sse;
+          cpi->find_fractional_mv_step(x, b, d, &tmp_mv,
+                                       &frame_best_ref_mv[refs[0]],
+                                       x->errorperbit,
+                                       &cpi->fn_ptr[block_size],
+                                       XMVCOST, &dis, &sse);
+        }
+        d->bmi.as_mv.first.as_int = tmp_mv.as_int;
+        frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv.first.as_int;
+
+        // Add the new motion vector cost to our rolling cost variable
+        *rate2 += vp9_mv_bit_cost(&tmp_mv, &frame_best_ref_mv[refs[0]],
+                                  XMVCOST, 96, xd->allow_high_precision_mv);
+      }
+      break;
+    case NEARESTMV:
+    case NEARMV:
+      // Do not bother proceeding if the vector (from newmv, nearest or
+      // near) is 0,0 as this should then be coded using the zeromv mode.
+      for (i = 0; i < num_refs; ++i)
+        if (frame_mv[this_mode][refs[i]].as_int == 0)
+          return INT64_MAX;
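+      // fall through: NEARESTMV/NEARMV share the break below with ZEROMV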
+    case ZEROMV:
+    default:
+      break;
+  }
+  for (i = 0; i < num_refs; ++i) {
+    cur_mv[i] = frame_mv[this_mode][refs[i]];
+    // Clip "next_nearest" so that it does not extend to far out of image
+    clamp_mv2(&cur_mv[i], xd);
+    if (mv_check_bounds(x, &cur_mv[i]))
+      return INT64_MAX;
+    mbmi->mv[i].as_int = cur_mv[i].as_int;
+  }
+
+#if CONFIG_PRED_FILTER
+  // Filtered prediction:
+  mbmi->pred_filter_enabled = vp9_mode_order[mode_index].pred_filter_flag;
+  *rate2 += vp9_cost_bit(cpi->common.prob_pred_filter_off,
+                         mbmi->pred_filter_enabled);
+#endif
+  if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+    const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
+    const int m = vp9_switchable_interp_map[mbmi->interp_filter];
+    *rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
+  }
+
+  /* We don't include the cost of the second reference here, because there
+   * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
+   * words if you present them in that order, the second one is always known
+   * if the first is known */
+  *compmode_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP),
+                                is_comp_pred);
+  *rate2 += vp9_cost_mv_ref(cpi, this_mode, mdcounts);
+
+  if (block_size == BLOCK_16X16) {
+    vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
+    if (is_comp_pred)
+      vp9_build_2nd_inter16x16_predictors_mby(xd, xd->predictor, 16);
+  } else {
+#if CONFIG_SUPERBLOCKS
+    vp9_build_inter32x32_predictors_sb(xd,
+                                       xd->dst.y_buffer,
+                                       xd->dst.u_buffer,
+                                       xd->dst.v_buffer,
+                                       xd->dst.y_stride,
+                                       xd->dst.uv_stride);
+#endif
+  }
+
+  if (cpi->active_map_enabled && x->active_ptr[0] == 0)
+    x->skip = 1;
+  else if (x->encode_breakout) {
+    unsigned int sse, var;
+    int threshold = (xd->block[0].dequant[1]
+                     * xd->block[0].dequant[1] >> 4);
+
+    if (threshold < x->encode_breakout)
+      threshold = x->encode_breakout;
+
+    if (block_size == BLOCK_16X16) {
+      var = vp9_variance16x16(*(b->base_src), b->src_stride,
+                              xd->predictor, 16, &sse);
+    } else {
+#if CONFIG_SUPERBLOCKS
+      var = vp9_variance32x32(*(b->base_src), b->src_stride,
+                              xd->dst.y_buffer, xd->dst.y_stride, &sse);
+#endif
+    }
+
+    if (sse < threshold) {
+      unsigned int q2dc = xd->block[24].dequant[0];
+      /* If there is no codeable 2nd order dc
+         or a very small uniform pixel change */
+      if ((sse - var < q2dc * q2dc >> 4) ||
+          (sse / 2 > var && sse - var < 64)) {
+        // Check u and v to make sure skip is ok
+        int sse2;
+
+        if (block_size == BLOCK_16X16) {
+          sse2 = vp9_uvsse(x);
+        } else {
+          unsigned int sse2u, sse2v;
+          var = vp9_variance16x16(x->src.u_buffer, x->src.uv_stride,
+                                  xd->dst.u_buffer, xd->dst.uv_stride, &sse2u);
+          var = vp9_variance16x16(x->src.v_buffer, x->src.uv_stride,
+                                  xd->dst.v_buffer, xd->dst.uv_stride, &sse2v);
+          sse2 = sse2u + sse2v;
+        }
+
+        if (sse2 * 2 < threshold) {
+          x->skip = 1;
+          *distortion = sse + sse2;
+          *rate2 = 500;
+
+          /* for best_yrd calculation */
+          *rate_uv = 0;
+          *distortion_uv = sse2;
+
+          *disable_skip = 1;
+          this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
+        }
+      }
+    }
+  }
+
+  if (!x->skip) {
+    if (block_size == BLOCK_16X16) {
+      vp9_build_1st_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
+                                               &xd->predictor[320], 8);
+      if (is_comp_pred)
+        vp9_build_2nd_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
+                                                 &xd->predictor[320], 8);
+      inter_mode_cost(cpi, x, this_mode, rate2, distortion,
+                      rate_y, distortion_y, rate_uv, distortion_uv,
+                      skippable, txfm_cache);
+    } else {
+#if CONFIG_SUPERBLOCKS
+      int skippable_y, skippable_uv;
+
+      // Y cost and distortion - FIXME support other transform sizes
+      super_block_yrd_8x8(x, rate_y, distortion_y,
+                          IF_RTCD(&cpi->rtcd), &skippable_y);
+      *rate2 += *rate_y;
+      *distortion += *distortion_y;
+
+      rd_inter32x32_uv_8x8(cpi, x, rate_uv, distortion_uv,
+                           cm->full_pixel, &skippable_uv);
+
+      *rate2 += *rate_uv;
+      *distortion += *distortion_uv;
+      *skippable = skippable_y && skippable_uv;
+#endif
+    }
+  }
+  if (is_comp_pred) {
+    *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
+  } else {
+    *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
+  }
+
+  return this_rd;  // if 0, this will be re-calculated by caller
+}
+
+void vp9_rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                            int recon_yoffset, int recon_uvoffset,
+                            int *returnrate, int *returndistortion,
+                            int64_t *returnintra) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  union b_mode_info best_bmodes[16];
+  MB_MODE_INFO best_mbmode;
+  PARTITION_INFO best_partition;
+  int_mv best_ref_mv, second_best_ref_mv;
+  MB_PREDICTION_MODE this_mode;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  int i, best_mode_index = 0;
+  int mode8x8[2][4];
+  unsigned char segment_id = mbmi->segment_id;
+
+  int mode_index;
+  int mdcounts[4];
+  int rate, distortion;
+  int rate2, distortion2;
+  int64_t best_txfm_rd[NB_TXFM_MODES];
+  int64_t best_txfm_diff[NB_TXFM_MODES];
+  int64_t best_pred_diff[NB_PREDICTION_TYPES];
+  int64_t best_pred_rd[NB_PREDICTION_TYPES];
+  int64_t best_rd = INT64_MAX, best_intra_rd = INT64_MAX;
+#if CONFIG_PRED_FILTER
+  int64_t best_overall_rd = INT64_MAX;
+#endif
+  int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
+  int uv_intra_skippable = 0;
+  int uv_intra_rate_8x8 = 0, uv_intra_distortion_8x8 = 0;
+  int uv_intra_rate_tokenonly_8x8 = 0;
+  int uv_intra_skippable_8x8 = 0;
+  int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
+  int distortion_uv = INT_MAX;
+  int64_t best_yrd = INT64_MAX;
+#if CONFIG_PRED_FILTER
+  int best_filter_state;
+#endif
+  int switchable_filter_index = 0;
+
+  MB_PREDICTION_MODE uv_intra_mode;
+  MB_PREDICTION_MODE uv_intra_mode_8x8 = 0;
+
+  int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  int saddone = 0;
+
+  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+  int_mv frame_best_ref_mv[4];
+  int frame_mdcounts[4][4];
+  unsigned char *y_buffer[4], *u_buffer[4], *v_buffer[4];
+
+  unsigned int ref_costs[MAX_REF_FRAMES];
+  int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1];
+
+  vpx_memset(mode8x8, 0, sizeof(mode8x8));
+  vpx_memset(&frame_mv, 0, sizeof(frame_mv));
+  vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+  vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));
+  vpx_memset(&x->mb_context[xd->mb_index], 0, sizeof(PICK_MODE_CONTEXT));
+
+  for (i = 0; i < MAX_REF_FRAMES; i++)
+    frame_mv[NEWMV][i].as_int = INVALID_MV;
+  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+    best_pred_rd[i] = INT64_MAX;
+  for (i = 0; i < NB_TXFM_MODES; i++)
+    best_txfm_rd[i] = INT64_MAX;
+
+  for (i = 0; i < NB_PARTITIONINGS; i++) {
+    int j, k;
+
+    for (j = 0; j < 16; j++)
+      for (k = 0; k < MAX_REF_FRAMES - 1; k++)
+        seg_mvs[i][j][k].as_int = INVALID_MV;
+  }
+
+  if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+    setup_buffer_inter(cpi, x, cpi->common.lst_fb_idx, LAST_FRAME,
+                       recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
+                       frame_mv[NEARMV], frame_best_ref_mv,
+                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
+  }
+
+  if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+    setup_buffer_inter(cpi, x, cpi->common.gld_fb_idx, GOLDEN_FRAME,
+                       recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
+                       frame_mv[NEARMV], frame_best_ref_mv,
+                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
+  }
+
+  if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
+    setup_buffer_inter(cpi, x, cpi->common.alt_fb_idx, ALTREF_FRAME,
+                       recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
+                       frame_mv[NEARMV], frame_best_ref_mv,
+                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
+  }
+
+  *returnintra = INT64_MAX;
+
+  x->skip = 0;
+
+  mbmi->ref_frame = INTRA_FRAME;
+
+  /* Initialize zbin mode boost for uv costing */
+  cpi->zbin_mode_boost = 0;
+  vp9_update_zbin_extra(cpi, x);
+
+  rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate,
+                          &uv_intra_rate_tokenonly, &uv_intra_distortion,
+                          &uv_intra_skippable);
+  uv_intra_mode = mbmi->uv_mode;
+
+  /* rough estimate for now */
+  if (cpi->common.txfm_mode != ONLY_4X4) {
+    rd_pick_intra_mbuv_mode_8x8(cpi, x, &uv_intra_rate_8x8,
+                                &uv_intra_rate_tokenonly_8x8,
+                                &uv_intra_distortion_8x8,
+                                &uv_intra_skippable_8x8);
+    uv_intra_mode_8x8 = mbmi->uv_mode;
+  }
+
+  // Get estimates of the signaling cost of each reference frame;
+  // these depend on the current prediction context etc.
+  estimate_ref_frame_costs(cpi, segment_id, ref_costs);
+
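+  // Main mode loop. When the switchable interpolation filter is in use,
+  // each inter mode is evaluated once per filter: mode_index only
+  // advances after switchable_filter_index has wrapped back to zero.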
+  for (mode_index = 0; mode_index < MAX_MODES;
+       mode_index += (!switchable_filter_index)) {
+    int64_t this_rd = INT64_MAX;
+    int disable_skip = 0, skippable = 0;
+    int other_cost = 0;
+    int compmode_cost = 0;
+    int mode_excluded = 0;
+    int64_t txfm_cache[NB_TXFM_MODES] = { 0 };
+
+    // These variables hold the rolling total cost and distortion for this mode
+    rate2 = 0;
+    distortion2 = 0;
+    rate_y = 0;
+    rate_uv = 0;
+
+    this_mode = vp9_mode_order[mode_index].mode;
+    mbmi->mode = this_mode;
+    mbmi->uv_mode = DC_PRED;
+    mbmi->ref_frame = vp9_mode_order[mode_index].ref_frame;
+    mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
+#if CONFIG_PRED_FILTER
+    mbmi->pred_filter_enabled = 0;
+#endif
+    if (cpi->common.mcomp_filter_type == SWITCHABLE &&
+        this_mode >= NEARESTMV && this_mode <= SPLITMV) {
+      mbmi->interp_filter =
+          vp9_switchable_interp[switchable_filter_index++];
+      if (switchable_filter_index == VP9_SWITCHABLE_FILTERS)
+        switchable_filter_index = 0;
+    } else {
+      mbmi->interp_filter = cpi->common.mcomp_filter_type;
+    }
+    vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+
+    // Test best rd so far against threshold for trying this mode.
+    if (best_rd <= cpi->rd_threshes[mode_index])
+      continue;
+
+    // current coding mode under rate-distortion optimization test loop
+#if CONFIG_COMP_INTRA_PRED
+    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+    mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+    // If the segment reference frame feature is enabled,
+    // skip this mode if the current ref frame is not allowed.
+    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+        !vp9_check_segref(xd, segment_id, mbmi->ref_frame)) {
+      continue;
+    // If the segment mode feature is enabled,
+    // skip this mode if the current mode is not allowed.
+    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
+               (this_mode !=
+                vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) {
+      continue;
+    // Disable this drop-out case if either the mode or ref frame
+    // segment level feature is enabled for this segment. This is to
+    // prevent the possibility that we end up unable to pick any mode.
+    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+               !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+      // Only consider ZEROMV/ALTREF_FRAME for alt ref frames,
+      // unless ARNR filtering is enabled, in which case we want
+      // an unfiltered alternative.
+      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+        if (this_mode != ZEROMV ||
+            mbmi->ref_frame != ALTREF_FRAME) {
+          continue;
+        }
+      }
+    }
+
+    /* everything but intra */
+    if (mbmi->ref_frame) {
+      int ref = mbmi->ref_frame;
+
+      xd->pre.y_buffer = y_buffer[ref];
+      xd->pre.u_buffer = u_buffer[ref];
+      xd->pre.v_buffer = v_buffer[ref];
+      best_ref_mv = frame_best_ref_mv[ref];
+      vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts));
+    }
+
+    if (mbmi->second_ref_frame) {
+      int ref = mbmi->second_ref_frame;
+
+      xd->second_pre.y_buffer = y_buffer[ref];
+      xd->second_pre.u_buffer = u_buffer[ref];
+      xd->second_pre.v_buffer = v_buffer[ref];
+      second_best_ref_mv  = frame_best_ref_mv[ref];
+    }
+
+    // Experimental code. Special case for gf and arf zeromv modes.
+    // Increase zbin size to suppress noise
+    if (cpi->zbin_mode_boost_enabled) {
+      if (vp9_mode_order[mode_index].ref_frame == INTRA_FRAME)
+        cpi->zbin_mode_boost = 0;
+      else {
+        if (vp9_mode_order[mode_index].mode == ZEROMV) {
+          if (vp9_mode_order[mode_index].ref_frame != LAST_FRAME)
+            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+          else
+            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+        } else if (vp9_mode_order[mode_index].mode == SPLITMV)
+          cpi->zbin_mode_boost = 0;
+        else
+          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+      }
+
+      vp9_update_zbin_extra(cpi, x);
+    }
+
+    // Intra
+    if (!mbmi->ref_frame) {
+      switch (this_mode) {
+        default:
+        case DC_PRED:
+        case V_PRED:
+        case H_PRED:
+        case TM_PRED:
+        case D45_PRED:
+        case D135_PRED:
+        case D117_PRED:
+        case D153_PRED:
+        case D27_PRED:
+        case D63_PRED:
+          mbmi->ref_frame = INTRA_FRAME;
+          // FIXME compound intra prediction
+          vp9_build_intra_predictors_mby(&x->e_mbd);
+          macro_block_yrd(cpi, x, &rate_y, &distortion, &skippable, txfm_cache);
+          rate2 += rate_y;
+          distortion2 += distortion;
+          rate2 += x->mbmode_cost[xd->frame_type][mbmi->mode];
+          if (mbmi->txfm_size != TX_4X4) {
+            rate2 += uv_intra_rate_8x8;
+            rate_uv = uv_intra_rate_tokenonly_8x8;
+            distortion2 += uv_intra_distortion_8x8;
+            distortion_uv = uv_intra_distortion_8x8;
+            skippable = skippable && uv_intra_skippable_8x8;
+          } else {
+            rate2 += uv_intra_rate;
+            rate_uv = uv_intra_rate_tokenonly;
+            distortion2 += uv_intra_distortion;
+            distortion_uv = uv_intra_distortion;
+            skippable = skippable && uv_intra_skippable;
+          }
+          break;
+        case B_PRED: {
+          int64_t tmp_rd;
+
+          // Note: the rate value returned here includes the cost of coding
+          // the B_PRED mode: x->mbmode_cost[xd->frame_type][B_PRED];
+          mbmi->txfm_size = TX_4X4;
+          tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion, best_yrd,
+#if CONFIG_COMP_INTRA_PRED
+                                             0,
+#endif
+                                             0);
+          rate2 += rate;
+          distortion2 += distortion;
+
+          if (tmp_rd < best_yrd) {
+            rate2 += uv_intra_rate;
+            rate_uv = uv_intra_rate_tokenonly;
+            distortion2 += uv_intra_distortion;
+            distortion_uv = uv_intra_distortion;
+          } else {
+            this_rd = INT64_MAX;
+            disable_skip = 1;
+          }
+        }
+        break;
+        case I8X8_PRED: {
+          int cost0 = vp9_cost_bit(cm->prob_tx[0], 0);
+          int cost1 = vp9_cost_bit(cm->prob_tx[0], 1);
+          int64_t tmp_rd_4x4s, tmp_rd_8x8s;
+          int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd;
+          int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8;
+          mbmi->txfm_size = TX_4X4;
+          tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4,
+                                                 &d4x4, best_yrd);
+          mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
+          mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
+          mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
+          mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+          mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
+          mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
+          mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
+          mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
+#endif
+          mbmi->txfm_size = TX_8X8;
+          tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8,
+                                                 &d8x8, best_yrd);
+          txfm_cache[ONLY_4X4]  = tmp_rd_4x4;
+          txfm_cache[ALLOW_8X8] = tmp_rd_8x8;
+          txfm_cache[ALLOW_16X16] = tmp_rd_8x8;
+          tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0);
+          tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0);
+          txfm_cache[TX_MODE_SELECT] = tmp_rd_4x4s < tmp_rd_8x8s ?
+                                       tmp_rd_4x4s : tmp_rd_8x8s;
+          if (cm->txfm_mode == TX_MODE_SELECT) {
+            if (tmp_rd_4x4s < tmp_rd_8x8s) {
+              rate = r4x4 + cost0;
+              rate_y = tok4x4 + cost0;
+              distortion = d4x4;
+              mbmi->txfm_size = TX_4X4;
+              tmp_rd = tmp_rd_4x4s;
+            } else {
+              rate = r8x8 + cost1;
+              rate_y = tok8x8 + cost1;
+              distortion = d8x8;
+              mbmi->txfm_size = TX_8X8;
+              tmp_rd = tmp_rd_8x8s;
+
+              mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
+              mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
+              mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
+              mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+              mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
+              mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
+              mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
+              mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
+#endif
+            }
+          } else if (cm->txfm_mode == ONLY_4X4) {
+            rate = r4x4;
+            rate_y = tok4x4;
+            distortion = d4x4;
+            mbmi->txfm_size = TX_4X4;
+            tmp_rd = tmp_rd_4x4;
+          } else {
+            rate = r8x8;
+            rate_y = tok8x8;
+            distortion = d8x8;
+            mbmi->txfm_size = TX_8X8;
+            tmp_rd = tmp_rd_8x8;
+
+            mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
+            mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
+            mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
+            mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+            mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
+            mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
+            mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
+            mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
+#endif
+          }
+
+          rate2 += rate;
+          distortion2 += distortion;
+
+          /* TODO: UV rate may be over-estimated here since a UV intra
+                   mode is coded as part of I8X8_PRED prediction */
+          if (tmp_rd < best_yrd) {
+            rate2 += uv_intra_rate;
+            rate_uv = uv_intra_rate_tokenonly;
+            distortion2 += uv_intra_distortion;
+            distortion_uv = uv_intra_distortion;
+          } else {
+            this_rd = INT64_MAX;
+            disable_skip = 1;
+          }
+        }
+        break;
+      }
+    }
+    // Split MV. The code is very different from the other inter modes so
+    // special case it.
+    else if (this_mode == SPLITMV) {
+      const int is_comp_pred = mbmi->second_ref_frame != 0;
+      int64_t tmp_rd, this_rd_thresh;
+      int_mv *second_ref = is_comp_pred ? &second_best_ref_mv : NULL;
+
+      this_rd_thresh = (mbmi->ref_frame == LAST_FRAME) ?
+          cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA];
+      this_rd_thresh = (mbmi->ref_frame == GOLDEN_FRAME) ?
+          cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
+
+      tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
+                                           second_ref, best_yrd, mdcounts,
+                                           &rate, &rate_y, &distortion,
+                                           &skippable,
+                                           this_rd_thresh, seg_mvs,
+                                           txfm_cache);
+      rate2 += rate;
+      distortion2 += distortion;
+
+      if (cpi->common.mcomp_filter_type == SWITCHABLE)
+        rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
+            [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
+                [vp9_switchable_interp_map[mbmi->interp_filter]];
+      // If even the 'Y' rd value of split is higher than the best so far,
+      // then don't bother looking at UV.
+      if (tmp_rd < best_yrd) {
+        int uv_skippable;
+
+        rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
+                       cpi->common.full_pixel);
+        rate2 += rate_uv;
+        distortion2 += distortion_uv;
+        skippable = skippable && uv_skippable;
+      } else {
+        this_rd = INT64_MAX;
+        disable_skip = 1;
+      }
+
+      if (is_comp_pred)
+        mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
+      else
+        mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
+
+      compmode_cost =
+        vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
+      mbmi->mode = this_mode;
+    }
+    else {
+      this_rd = handle_inter_mode(cpi, x, BLOCK_16X16,
+                                  &saddone, near_sadidx, mdcounts, txfm_cache,
+                                  &rate2, &distortion2, &skippable,
+                                  &compmode_cost, &rate_y, &distortion,
+                                  &rate_uv, &distortion_uv,
+                                  &mode_excluded, &disable_skip, recon_yoffset,
+                                  mode_index, frame_mv, frame_best_ref_mv);
+      if (this_rd == INT64_MAX)
+        continue;
+    }
+
+    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION)
+      rate2 += compmode_cost;
+
+    // Estimate the reference frame signaling cost and add it
+    // to the rolling cost variable.
+    rate2 += ref_costs[mbmi->ref_frame];
+
+    if (!disable_skip) {
+      // Test for the condition where skip block will be activated
+      // because there are no non-zero coefficients and make any
+      // necessary adjustment for rate. Ignore if skip is coded at
+      // segment level as the cost won't have been added in.
+      if (cpi->common.mb_no_coeff_skip) {
+        int mb_skip_allowed;
+
+        // Is MB-level skip allowed for this MB?
+        mb_skip_allowed =
+          !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+          vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+
+        if (skippable) {
+          mbmi->mb_skip_coeff = 1;
+
+          // Back out the coefficient coding costs
+          rate2 -= (rate_y + rate_uv);
+          // for best_yrd calculation
+          rate_uv = 0;
+
+          if (mb_skip_allowed) {
+            int prob_skip_cost;
+
+            // Cost the skip mb case
+            vp9_prob skip_prob =
+              vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP);
+
+            if (skip_prob) {
+              prob_skip_cost = vp9_cost_bit(skip_prob, 1);
+              rate2 += prob_skip_cost;
+              other_cost += prob_skip_cost;
+            }
+          }
+        }
+        // Add in the cost of the no-skip flag.
+        else {
+          mbmi->mb_skip_coeff = 0;
+          if (mb_skip_allowed) {
+            int prob_skip_cost = vp9_cost_bit(
+                   vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP), 0);
+            rate2 += prob_skip_cost;
+            other_cost += prob_skip_cost;
+          }
+        }
+      }
+
+      // Calculate the final RD estimate for this mode.
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+    }
+
+    // Keep record of best intra distortion
+    if ((mbmi->ref_frame == INTRA_FRAME) &&
+        (this_rd < best_intra_rd)) {
+      best_intra_rd = this_rd;
+      *returnintra = distortion2;
+    }
+
+    if (!disable_skip && mbmi->ref_frame == INTRA_FRAME)
+      for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+        best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
+
+#if CONFIG_PRED_FILTER
+    // Keep track of the best mode irrespective of prediction filter state
+    if (this_rd < best_overall_rd) {
+      best_overall_rd = this_rd;
+      best_filter_state = mbmi->pred_filter_enabled;
+    }
+
+    // Ignore modes where the prediction filter state doesn't
+    // match the state signaled at the frame level
+    if ((cm->pred_filter_mode == 2) ||
+        (cm->pred_filter_mode ==
+         mbmi->pred_filter_enabled)) {
+#endif
+      // Did this mode help, i.e. is it the new best mode?
+      if (this_rd < best_rd || x->skip) {
+        if (!mode_excluded) {
+          // Note index of best mode so far
+          best_mode_index = mode_index;
+
+          if (this_mode <= B_PRED) {
+            if (mbmi->txfm_size != TX_4X4
+                && this_mode != B_PRED
+                && this_mode != I8X8_PRED)
+              mbmi->uv_mode = uv_intra_mode_8x8;
+            else
+              mbmi->uv_mode = uv_intra_mode;
+            /* required for left and above block mv */
+            mbmi->mv[0].as_int = 0;
+          }
+
+          other_cost += ref_costs[mbmi->ref_frame];
+
+          /* Calculate the final y RD estimate for this mode */
+          best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
+                            (distortion2 - distortion_uv));
+
+          *returnrate = rate2;
+          *returndistortion = distortion2;
+          best_rd = this_rd;
+          vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
+          vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
+
+          if ((this_mode == B_PRED)
+              || (this_mode == I8X8_PRED)
+              || (this_mode == SPLITMV))
+            for (i = 0; i < 16; i++) {
+              best_bmodes[i] = xd->block[i].bmi;
+            }
+        }
+
+        // Testing this mode gave rise to an improvement in best error score.
+        // Lower the threshold a bit for next time.
+        cpi->rd_thresh_mult[mode_index] =
+            (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
+            cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+        cpi->rd_threshes[mode_index] =
+            (cpi->rd_baseline_thresh[mode_index] >> 7) *
+            cpi->rd_thresh_mult[mode_index];
+      }
+      // If the mode did not help improve the best error case then raise the
+      // threshold for testing that mode next time around.
+      else {
+        cpi->rd_thresh_mult[mode_index] += 4;
+
+        if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+          cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+        cpi->rd_threshes[mode_index] =
+            (cpi->rd_baseline_thresh[mode_index] >> 7) *
+            cpi->rd_thresh_mult[mode_index];
+      }
+
+      /* keep record of best compound/single-only prediction */
+      if (!disable_skip &&
+          mbmi->ref_frame != INTRA_FRAME) {
+        int64_t single_rd, hybrid_rd;
+        int single_rate, hybrid_rate;
+
+        if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+          single_rate = rate2 - compmode_cost;
+          hybrid_rate = rate2;
+        } else {
+          single_rate = rate2;
+          hybrid_rate = rate2 + compmode_cost;
+        }
+
+        single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+        hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+        if (mbmi->second_ref_frame == INTRA_FRAME &&
+            single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
+          best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
+        } else if (mbmi->second_ref_frame != INTRA_FRAME &&
+                   single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
+          best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
+        }
+        if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
+          best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
+      }
+
+      /* keep record of best txfm size */
+      if (!mode_excluded && this_rd != INT64_MAX) {
+        for (i = 0; i < NB_TXFM_MODES; i++) {
+          int64_t adj_rd;
+          if (this_mode != B_PRED) {
+            adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->txfm_mode];
+          } else {
+            adj_rd = this_rd;
+          }
+          if (adj_rd < best_txfm_rd[i])
+            best_txfm_rd[i] = adj_rd;
+        }
+      }
+#if CONFIG_PRED_FILTER
+    }
+#endif
+
+    if (x->skip && !mode_excluded)
+      break;
+  }
+
+#if CONFIG_PRED_FILTER
+  // Update counts for prediction filter usage
+  if (best_filter_state != 0)
+    ++cpi->pred_filter_on_count;
+  else
+    ++cpi->pred_filter_off_count;
+#endif
+  if (cpi->common.mcomp_filter_type == SWITCHABLE &&
+      best_mbmode.mode >= NEARESTMV &&
+      best_mbmode.mode <= SPLITMV) {
+    ++cpi->switchable_interp_count
+        [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
+        [vp9_switchable_interp_map[best_mbmode.interp_filter]];
+  }
+
+  // Reduce the activation RD thresholds for the best choice mode
+  if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
+      (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
+    int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
+
+    cpi->rd_thresh_mult[best_mode_index] =
+        (cpi->rd_thresh_mult[best_mode_index] >=
+         (MIN_THRESHMULT + best_adjustment)) ?
+        cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
+    cpi->rd_threshes[best_mode_index] =
+        (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
+        cpi->rd_thresh_mult[best_mode_index];
+  }
+
+  // This code forces (ALTREF, 0, 0) and skip for the frame that overlays
+  // an altref, unless the altref is filtered. However, this is unsafe if
+  // segment-level coding of ref frame or mode is enabled for this
+  // segment.
+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+      !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
+      cpi->is_src_frame_alt_ref &&
+      (cpi->oxcf.arnr_max_frames == 0) &&
+      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
+    mbmi->mode = ZEROMV;
+    if (cm->txfm_mode != TX_MODE_SELECT)
+      mbmi->txfm_size = cm->txfm_mode;
+    else
+      mbmi->txfm_size = TX_16X16;
+    mbmi->ref_frame = ALTREF_FRAME;
+    mbmi->mv[0].as_int = 0;
+    mbmi->uv_mode = DC_PRED;
+    mbmi->mb_skip_coeff =
+      (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+    mbmi->partitioning = 0;
+
+    vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
+    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
+    goto end;
+  }
+
+  // macroblock modes
+  vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+  if (best_mbmode.mode == B_PRED) {
+    for (i = 0; i < 16; i++) {
+      xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
+      xd->block[i].bmi.as_mode = xd->mode_info_context->bmi[i].as_mode;
+    }
+  }
+
+  if (best_mbmode.mode == I8X8_PRED)
+    set_i8x8_block_modes(x, mode8x8);
+
+  if (best_mbmode.mode == SPLITMV) {
+    for (i = 0; i < 16; i++)
+      xd->mode_info_context->bmi[i].as_mv.first.as_int =
+          best_bmodes[i].as_mv.first.as_int;
+    if (mbmi->second_ref_frame)
+      for (i = 0; i < 16; i++)
+        xd->mode_info_context->bmi[i].as_mv.second.as_int =
+            best_bmodes[i].as_mv.second.as_int;
+
+    vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
+
+    mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
+    mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
+  }
+
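+  // Record how the best RD cost of each prediction type compares with
+  // the overall best mode; INT_MIN marks a prediction type that was
+  // never evaluated.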
+  for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+    if (best_pred_rd[i] == INT64_MAX)
+      best_pred_diff[i] = INT_MIN;
+    else
+      best_pred_diff[i] = best_rd - best_pred_rd[i];
+  }
+
+  if (!x->skip) {
+    for (i = 0; i < NB_TXFM_MODES; i++) {
+      if (best_txfm_rd[i] == INT64_MAX)
+        best_txfm_diff[i] = INT_MIN;
+      else
+        best_txfm_diff[i] = best_rd - best_txfm_rd[i];
+    }
+  } else {
+    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
+  }
+
+end:
+  store_coding_context(x, &x->mb_context[xd->mb_index], best_mode_index,
+                       &best_partition,
+                       &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
+                       &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
+                       best_pred_diff[0], best_pred_diff[1], best_pred_diff[2],
+                       best_txfm_diff);
+}
+
+#if CONFIG_SUPERBLOCKS
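+// Superblock (32x32) intra mode selection: picks the best luma and chroma
+// intra modes and returns the combined rate and distortion (the chroma
+// distortion is weighted by 1/4).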
+void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                               int *returnrate,
+                               int *returndist) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int rate_y, rate_uv;
+  int rate_y_tokenonly, rate_uv_tokenonly;
+  int error_y, error_uv;
+  int dist_y, dist_uv;
+  int y_skip, uv_skip;
+
+  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+
+  error_uv = rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+                                     &dist_uv, &uv_skip);
+  error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+                                   &dist_y, &y_skip);
+
+  if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
+    *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
+                  vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
+    *returndist = dist_y + (dist_uv >> 2);
+  } else {
+    *returnrate = rate_y + rate_uv;
+    if (cpi->common.mb_no_coeff_skip)
+      *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+    *returndist = dist_y + (dist_uv >> 2);
+  }
+}
+#endif
+
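+// Macroblock intra mode selection: compares the best 16x16, 8x8
+// (I8X8_PRED) and 4x4 (B_PRED) luma alternatives, pairs each with a
+// suitable chroma mode, and returns the winner's rate and distortion.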
+void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                            int *returnrate, int *returndist) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+  int64_t error4x4, error16x16;
+#if CONFIG_COMP_INTRA_PRED
+  int64_t error4x4d;
+  int rate4x4d, dist4x4d;
+#endif
+  int rate4x4, rate16x16 = 0, rateuv, rateuv8x8;
+  int dist4x4, dist16x16, distuv, distuv8x8;
+  int rate;
+  int rate4x4_tokenonly = 0;
+  int rate16x16_tokenonly = 0;
+  int rateuv_tokenonly = 0, rateuv8x8_tokenonly = 0;
+  int64_t error8x8;
+  int rate8x8_tokenonly = 0;
+  int rate8x8, dist8x8;
+  int mode16x16;
+  int mode8x8[2][4];
+  int dist;
+  int modeuv, modeuv8x8, uv_intra_skippable, uv_intra_skippable_8x8;
+  int y_intra16x16_skippable;
+  int64_t txfm_cache[NB_TXFM_MODES];
+  TX_SIZE txfm_size_16x16;
+  int i;
+
+  mbmi->ref_frame = INTRA_FRAME;
+  rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv,
+                          &uv_intra_skippable);
+  modeuv = mbmi->uv_mode;
+  if (cpi->common.txfm_mode != ONLY_4X4) {
+    rd_pick_intra_mbuv_mode_8x8(cpi, x, &rateuv8x8, &rateuv8x8_tokenonly,
+                                &distuv8x8, &uv_intra_skippable_8x8);
+    modeuv8x8 = mbmi->uv_mode;
+  } else {
+    uv_intra_skippable_8x8 = uv_intra_skippable;
+    rateuv8x8 = rateuv;
+    distuv8x8 = distuv;
+    rateuv8x8_tokenonly = rateuv_tokenonly;
+    modeuv8x8 = modeuv;
+  }
+
+  // current macroblock under rate-distortion optimization test loop
+  error16x16 = rd_pick_intra16x16mby_mode(cpi, x, &rate16x16,
+                                          &rate16x16_tokenonly, &dist16x16,
+                                          &y_intra16x16_skippable, txfm_cache);
+  mode16x16 = mbmi->mode;
+  txfm_size_16x16 = mbmi->txfm_size;
+
+  // FIXME(rbultje) support transform-size selection
+  mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? TX_4X4 : TX_8X8;
+  error8x8 = rd_pick_intra8x8mby_modes(cpi, x, &rate8x8, &rate8x8_tokenonly,
+                                       &dist8x8, error16x16);
+  mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
+  mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
+  mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
+  mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
+#if CONFIG_COMP_INTRA_PRED
+  mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
+  mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
+  mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
+  mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
+#endif
+
+  error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
+                                       &rate4x4, &rate4x4_tokenonly,
+                                       &dist4x4, error16x16,
+#if CONFIG_COMP_INTRA_PRED
+                                       0,
+#endif
+                                       0);
+#if CONFIG_COMP_INTRA_PRED
+  error4x4d = rd_pick_intra4x4mby_modes(cpi, x,
+                                        &rate4x4d, &rate4x4_tokenonly,
+                                        &dist4x4d, error16x16, 1, 0);
+#endif
+
+  mbmi->mb_skip_coeff = 0;
+  if (cpi->common.mb_no_coeff_skip &&
+      y_intra16x16_skippable && uv_intra_skippable_8x8) {
+    mbmi->mb_skip_coeff = 1;
+    mbmi->mode = mode16x16;
+    mbmi->uv_mode = modeuv;
+    rate = rateuv8x8 + rate16x16 - rateuv8x8_tokenonly - rate16x16_tokenonly +
+           vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
+    dist = dist16x16 + (distuv8x8 >> 2);
+    mbmi->txfm_size = txfm_size_16x16;
+    memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
+           sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+  } else if (error8x8 > error16x16) {
+    if (error4x4 < error16x16) {
+      rate = rateuv;
+#if CONFIG_COMP_INTRA_PRED
+      rate += (error4x4d < error4x4) ? rate4x4d : rate4x4;
+      if (error4x4d >= error4x4) // FIXME save original modes etc.
+        error4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4,
+                                             &rate4x4_tokenonly,
+                                             &dist4x4, error16x16, 0,
+                                             cpi->update_context);
+#else
+      rate += rate4x4;
+#endif
+      mbmi->mode = B_PRED;
+      mbmi->txfm_size = TX_4X4;
+      dist = dist4x4 + (distuv >> 2);
+      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
+             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+    } else {
+      mbmi->txfm_size = txfm_size_16x16;
+      mbmi->mode = mode16x16;
+      rate = rate16x16 + rateuv8x8;
+      dist = dist16x16 + (distuv8x8 >> 2);
+      for (i = 0; i < NB_TXFM_MODES; i++) {
+        x->mb_context[xd->mb_index].txfm_rd_diff[i] = error16x16 - txfm_cache[i];
+      }
+    }
+    if (cpi->common.mb_no_coeff_skip)
+      rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+  } else {
+    if (error4x4 < error8x8) {
+      rate = rateuv;
+#if CONFIG_COMP_INTRA_PRED
+      rate += (error4x4d < error4x4) ? rate4x4d : rate4x4;
+      if (error4x4d >= error4x4) // FIXME save original modes etc.
+        error4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4,
+                                             &rate4x4_tokenonly,
+                                             &dist4x4, error16x16, 0,
+                                             cpi->update_context);
+#else
+      rate += rate4x4;
+#endif
+      mbmi->mode = B_PRED;
+      mbmi->txfm_size = TX_4X4;
+      dist = dist4x4 + (distuv >> 2);
+      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
+             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+    } else {
+      // FIXME(rbultje) support transform-size selection
+      mbmi->mode = I8X8_PRED;
+      mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? TX_4X4 : TX_8X8;
+      set_i8x8_block_modes(x, mode8x8);
+      rate = rate8x8 + rateuv;
+      dist = dist8x8 + (distuv >> 2);
+      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
+             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+    }
+    if (cpi->common.mb_no_coeff_skip)
+      rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+  }
+
+  *returnrate = rate;
+  *returndist = dist;
+}
+
+#if CONFIG_SUPERBLOCKS
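+// Inter mode selection for 32x32 superblocks. This mirrors
+// vp9_rd_pick_inter_mode(), except that intra and SPLITMV modes are not
+// yet supported here and prediction is done on BLOCK_32X32 via
+// handle_inter_mode().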
+int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                                  int recon_yoffset, int recon_uvoffset,
+                                  int *returnrate, int *returndistortion) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  MB_PREDICTION_MODE this_mode;
+  MV_REFERENCE_FRAME ref_frame;
+  unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
+  int comp_pred;
+  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+  int_mv frame_best_ref_mv[4];
+  int frame_mdcounts[4][4];
+  unsigned char *y_buffer[4];
+  unsigned char *u_buffer[4];
+  unsigned char *v_buffer[4];
+  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                    VP9_ALT_FLAG };
+  int idx_list[4] = { 0, cpi->common.lst_fb_idx, cpi->common.gld_fb_idx,
+                      cpi->common.alt_fb_idx };
+  int mdcounts[4];
+  int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+  int saddone = 0;
+  int64_t best_rd = INT64_MAX;
+  int64_t best_comp_rd = INT64_MAX;
+  int64_t best_single_rd = INT64_MAX;
+  int64_t best_hybrid_rd = INT64_MAX;
+  int64_t best_yrd = INT64_MAX;
+  MB_MODE_INFO best_mbmode;
+  int mode_index, best_mode_index;
+  unsigned int ref_costs[MAX_REF_FRAMES];
+
+  x->skip = 0;
+  xd->mode_info_context->mbmi.segment_id = segment_id;
+  estimate_ref_frame_costs(cpi, segment_id, ref_costs);
+  vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+      setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame,
+                         recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
+                         frame_mv[NEARMV], frame_best_ref_mv,
+                         frame_mdcounts, y_buffer, u_buffer, v_buffer);
+    }
+    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+    frame_mv[ZEROMV][ref_frame].as_int = 0;
+  }
+
+  for (mode_index = 0; mode_index < MAX_MODES; mode_index++) {
+    int mode_excluded;
+    int64_t this_rd = INT64_MAX;
+    int disable_skip = 0;
+    int other_cost = 0;
+    int compmode_cost = 0;
+    int rate2 = 0, rate_y = 0, rate_uv = 0;
+    int distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+    int skippable;
+    int64_t txfm_cache[NB_TXFM_MODES];
+
+    // Test best rd so far against threshold for trying this mode.
+    if (best_rd <= cpi->rd_threshes[mode_index]) {
+      continue;
+    }
+
+    this_mode = vp9_mode_order[mode_index].mode;
+    ref_frame = vp9_mode_order[mode_index].ref_frame;
+    mbmi->ref_frame = ref_frame;
+    comp_pred = vp9_mode_order[mode_index].second_ref_frame != INTRA_FRAME;
+    mbmi->mode = this_mode;
+    mbmi->uv_mode = DC_PRED;
+#if CONFIG_COMP_INTRA_PRED
+    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+    mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+    if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
+      continue;
+
+    // Not yet supported, or not applicable to superblocks.
+    // TODO(rbultje): support intra coding
+    if (ref_frame == INTRA_FRAME || this_mode == SPLITMV)
+      continue;
+
+    if (comp_pred) {
+      int second_ref;
+
+      if (ref_frame == ALTREF_FRAME) {
+        second_ref = LAST_FRAME;
+      } else {
+        second_ref = ref_frame + 1;
+      }
+      if (!(cpi->ref_frame_flags & flag_list[second_ref]))
+        continue;
+      mbmi->second_ref_frame = second_ref;
+
+      xd->second_pre.y_buffer = y_buffer[second_ref];
+      xd->second_pre.u_buffer = u_buffer[second_ref];
+      xd->second_pre.v_buffer = v_buffer[second_ref];
+      mode_excluded = cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
+    } else {
+      mbmi->second_ref_frame = INTRA_FRAME;
+      mode_excluded = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
+    }
+
+    xd->pre.y_buffer = y_buffer[ref_frame];
+    xd->pre.u_buffer = u_buffer[ref_frame];
+    xd->pre.v_buffer = v_buffer[ref_frame];
+    vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
+
+    // If the segment reference frame feature is enabled,
+    // skip this mode if the current ref frame is not allowed.
+    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+        !vp9_check_segref(xd, segment_id, ref_frame)) {
+      continue;
+    // If the segment mode feature is enabled,
+    // skip this mode if the current mode is not allowed.
+    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
+               (this_mode != vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) {
+      continue;
+    // Disable this drop-out case if either the mode or ref frame
+    // segment level feature is enabled for this segment. This is to
+    // prevent the possibility that we end up unable to pick any mode.
+    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+               !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+      // Only consider ZEROMV/ALTREF_FRAME for alt ref frames,
+      // unless ARNR filtering is enabled, in which case we want
+      // an unfiltered alternative.
+      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+        if (this_mode != ZEROMV || ref_frame != ALTREF_FRAME) {
+          continue;
+        }
+      }
+    }
+
+    this_rd = handle_inter_mode(cpi, x, BLOCK_32X32,
+                                &saddone, near_sadidx, mdcounts, txfm_cache,
+                                &rate2, &distortion2, &skippable,
+                                &compmode_cost, &rate_y, &distortion_y,
+                                &rate_uv, &distortion_uv,
+                                &mode_excluded, &disable_skip, recon_yoffset,
+                                mode_index, frame_mv, frame_best_ref_mv);
+    if (this_rd == INT64_MAX)
+      continue;
+
+    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+      rate2 += compmode_cost;
+    }
+
+    // Estimate the reference frame signaling cost and add it
+    // to the rolling cost variable.
+    rate2 += ref_costs[xd->mode_info_context->mbmi.ref_frame];
+
+    if (!disable_skip) {
+      // Test for the condition where skip block will be activated
+      // because there are no non-zero coefficients and make any
+      // necessary adjustment for rate. Ignore if skip is coded at
+      // segment level as the cost won't have been added in.
+      if (cpi->common.mb_no_coeff_skip) {
+        int mb_skip_allowed;
+
+        // Is MB-level skip allowed for this MB?
+        mb_skip_allowed =
+          !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+          vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+
+        if (skippable) {
+          // Back out the coefficient coding costs
+          rate2 -= (rate_y + rate_uv);
+          // for best_yrd calculation
+          rate_uv = 0;
+
+          if (mb_skip_allowed) {
+            int prob_skip_cost;
+
+            // Cost the skip mb case
+            vp9_prob skip_prob =
+              vp9_get_pred_prob(cm, xd, PRED_MBSKIP);
+
+            if (skip_prob) {
+              prob_skip_cost = vp9_cost_bit(skip_prob, 1);
+              rate2 += prob_skip_cost;
+              other_cost += prob_skip_cost;
+            }
+          }
+        }
+        // Add in the cost of the no-skip flag.
+        else if (mb_skip_allowed) {
+          int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
+                                                          PRED_MBSKIP), 0);
+          rate2 += prob_skip_cost;
+          other_cost += prob_skip_cost;
+        }
+      }
+
+      // Calculate the final RD estimate for this mode.
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+    }
+
+#if 0
+    // Keep record of best intra distortion
+    if ((xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
+        (this_rd < best_intra_rd)) {
+      best_intra_rd = this_rd;
+      *returnintra = distortion2;
+    }
+#endif
+
+    if (!disable_skip && xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+      if (this_rd < best_comp_rd)
+        best_comp_rd = this_rd;
+      if (this_rd < best_single_rd)
+        best_single_rd = this_rd;
+      if (this_rd < best_hybrid_rd)
+        best_hybrid_rd = this_rd;
+    }
+
+    // Did this mode help, i.e. is it the new best mode?
+    if (this_rd < best_rd || x->skip) {
+      if (!mode_excluded) {
+        // Note index of best mode so far
+        best_mode_index = mode_index;
+
+#if 0
+        if (this_mode <= B_PRED) {
+          xd->mode_info_context->mbmi.uv_mode = uv_intra_mode_8x8;
+          /* required for left and above block mv */
+          xd->mode_info_context->mbmi.mv.as_int = 0;
+        }
+#endif
+
+        other_cost += ref_costs[xd->mode_info_context->mbmi.ref_frame];
+
+        /* Calculate the final y RD estimate for this mode */
+        best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
+                          (distortion2 - distortion_uv));
+
+        *returnrate = rate2;
+        *returndistortion = distortion2;
+        best_rd = this_rd;
+        vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
+      }
+#if 0
+      // Testing this mode gave rise to an improvement in best error score.
+      // Lower the threshold a bit for next time.
+      cpi->rd_thresh_mult[mode_index] =
+          (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
+          cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+      cpi->rd_threshes[mode_index] =
+          (cpi->rd_baseline_thresh[mode_index] >> 7) *
+          cpi->rd_thresh_mult[mode_index];
+#endif
+    }
+    // If the mode did not help improve the best error case then raise
+    // the threshold for testing that mode next time around.
+    else {
+#if 0
+      cpi->rd_thresh_mult[mode_index] += 4;
+
+      if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+        cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+      cpi->rd_threshes[mode_index] =
+          (cpi->rd_baseline_thresh[mode_index] >> 7) *
+          cpi->rd_thresh_mult[mode_index];
+#endif
+    }
+
+    /* keep record of best compound/single-only prediction */
+    if (!disable_skip && mbmi->ref_frame != INTRA_FRAME) {
+      int single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+      if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+        single_rate = rate2 - compmode_cost;
+        hybrid_rate = rate2;
+      } else {
+        single_rate = rate2;
+        hybrid_rate = rate2 + compmode_cost;
+      }
+
+      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+      if (mbmi->second_ref_frame == INTRA_FRAME && single_rd < best_single_rd) {
+        best_single_rd = single_rd;
+      } else if (mbmi->second_ref_frame != INTRA_FRAME &&
+                 single_rd < best_comp_rd) {
+        best_comp_rd = single_rd;
+      }
+      if (hybrid_rd < best_hybrid_rd) {
+        best_hybrid_rd = hybrid_rd;
+      }
+    }
+
+    if (x->skip && !mode_excluded)
+      break;
+  }
+
+  // TODO(rbultje) integrate with RD thresholding
+#if 0
+  // Reduce the activation RD thresholds for the best choice mode
+  if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
+      (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
+    int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
+
+    cpi->rd_thresh_mult[best_mode_index] =
+      (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ?
+      cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
+    cpi->rd_threshes[best_mode_index] =
+      (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
+  }
+#endif
+
+  // This code forces (ALTREF, 0, 0) and skip for the frame that overlays
+  // an altref, unless the altref is filtered. However, this is unsafe if
+  // segment-level coding of ref frame or mode is enabled for this
+  // segment.
+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+      !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
+      cpi->is_src_frame_alt_ref &&
+      (cpi->oxcf.arnr_max_frames == 0) &&
+      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
+    mbmi->mode = ZEROMV;
+    mbmi->ref_frame = ALTREF_FRAME;
+    mbmi->second_ref_frame = 0;
+    mbmi->mv[0].as_int = 0;
+    mbmi->uv_mode = DC_PRED;
+    mbmi->mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+    mbmi->partitioning = 0;
+    mbmi->txfm_size = TX_8X8;
+
+    if (best_rd != INT64_MAX)
+      store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
+                           &frame_best_ref_mv[mbmi->ref_frame],
+                           &frame_best_ref_mv[mbmi->second_ref_frame],
+                           0, 0, 0, NULL);
+    return best_rd;
+  }
+
+  // macroblock modes
+  vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+  mbmi->txfm_size = TX_8X8;
+
+  if (best_rd != INT64_MAX)
+    store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
+                         &frame_best_ref_mv[mbmi->ref_frame],
+                         &frame_best_ref_mv[mbmi->second_ref_frame],
+                         (best_single_rd == INT64_MAX) ? INT_MIN :
+                                        (best_rd - best_single_rd),
+                         (best_comp_rd   == INT64_MAX) ? INT_MIN :
+                                        (best_rd - best_comp_rd),
+                         (best_hybrid_rd == INT64_MAX) ? INT_MIN :
+                                        (best_rd - best_hybrid_rd),
+                         NULL);
+
+  return best_rd;
+}
+#endif
+
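+// Per-macroblock entry point for inter-frame mode selection: applies the
+// segment's encode-breakout setting, runs the rate-distortion mode search
+// and stores the resulting distortion and intra error for the caller.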
+void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
+                                    int recon_yoffset,
+                                    int recon_uvoffset,
+                                    int *totalrate, int *totaldist) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
+  int rate, distortion;
+  int64_t intra_error = 0;
+  unsigned char *segment_id = &mbmi->segment_id;
+
+  if (xd->segmentation_enabled)
+    x->encode_breakout = cpi->segment_encode_breakout[*segment_id];
+  else
+    x->encode_breakout = cpi->oxcf.encode_breakout;
+
+  // if (cpi->sf.RD)
+  // For now this codebase is limited to a single RD encode path.
+  {
+    int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
+
+    vp9_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
+                           &distortion, &intra_error);
+
+    /* restore cpi->zbin_mode_boost_enabled */
+    cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
+  }
+  // else
+  // The non-RD encode path has been deleted from this codebase
+  // to simplify development:
+  //    vp9_pick_inter_mode
+
+  // Store metrics so they can be added into the totals if this mode is picked.
+  x->mb_context[xd->mb_index].distortion  = distortion;
+  x->mb_context[xd->mb_index].intra_error = intra_error;
+
+  *totalrate = rate;
+  *totaldist = distortion;
+}
--- /dev/null
+++ b/vp9/encoder/rdopt.h
@@ -1,0 +1,41 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RDOPT_H
+#define __INC_RDOPT_H
+
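+/* Rate-distortion cost in fixed point: the rate term is weighted by the
+ * rate multiplier RM with 8 fractional bits (the 128 provides rounding),
+ * and the distortion term is weighted by the distortion multiplier DM:
+ *   cost = ((R * RM + 128) >> 8) + D * DM */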
+#define RDCOST(RM, DM, R, D) \
+  (((128 + ((int64_t)(R)) * (RM)) >> 8) + ((int64_t)(DM)) * (D))
+#define RDCOST_8x8(RM, DM, R, D) \
+  (((128 + ((int64_t)(R)) * (RM)) >> 8) + ((int64_t)(DM)) * (D))
+
+extern void vp9_initialize_rd_consts(VP9_COMP *cpi, int Qvalue);
+
+extern void vp9_rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                   int recon_yoffset, int recon_uvoffset,
+                                   int *returnrate, int *returndistortion,
+                                   int64_t *returnintra);
+
+extern void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                   int *r, int *d);
+
+extern void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                                      int *r, int *d);
+
+extern void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCKD *xd,
+                        const MODE_INFO *here, int_mv *mvp,
+                        int refframe, int *ref_frame_sign_bias,
+                        int *sr, int near_sadidx[]);
+
+extern void vp9_init_me_luts(void);
+
+extern void vp9_set_mbmode_and_mvs(MACROBLOCK *x,
+                                   MB_PREDICTION_MODE mb, int_mv *mv);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/sad_c.c
@@ -1,0 +1,480 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "vp9/common/sadmxn.h"
+#include "vpx_ports/config.h"
+#include "vpx/vpx_integer.h"
+
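+/* Plain C reference implementations of the sum of absolute differences
+ * (SAD). The max_sad argument exists for signature compatibility with
+ * optimized versions that may exit early once the threshold is exceeded;
+ * the C versions ignore it and always compute the full SAD. */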
+unsigned int vp9_sad32x32_c(const unsigned char *src_ptr,
+                            int  src_stride,
+                            const unsigned char *ref_ptr,
+                            int  ref_stride,
+                            int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
+}
+
+unsigned int vp9_sad16x16_c(const unsigned char *src_ptr,
+                            int  src_stride,
+                            const unsigned char *ref_ptr,
+                            int  ref_stride,
+                            int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
+}
+
+unsigned int vp9_sad8x8_c(const unsigned char *src_ptr,
+                          int  src_stride,
+                          const unsigned char *ref_ptr,
+                          int  ref_stride,
+                          int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
+}
+
+
+unsigned int vp9_sad16x8_c(const unsigned char *src_ptr,
+                           int  src_stride,
+                           const unsigned char *ref_ptr,
+                           int  ref_stride,
+                           int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
+}
+
+unsigned int vp9_sad8x16_c(const unsigned char *src_ptr,
+                           int  src_stride,
+                           const unsigned char *ref_ptr,
+                           int  ref_stride,
+                           int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
+}
+
+
+unsigned int vp9_sad4x4_c(const unsigned char *src_ptr,
+                          int  src_stride,
+                          const unsigned char *ref_ptr,
+                          int  ref_stride,
+                          int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
+}
+
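+/* The x3/x8 variants return the SADs at 3 or 8 consecutive horizontal
+ * offsets of the reference pointer, letting the motion search evaluate a
+ * run of candidate positions in one call. */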
+void vp9_sad32x32x3_c(const unsigned char *src_ptr,
+                      int  src_stride,
+                      const unsigned char *ref_ptr,
+                      int  ref_stride,
+                      unsigned int *sad_array) {
+  sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,
+                                ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,
+                                ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride,
+                                ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad32x32x8_c(const unsigned char *src_ptr,
+                      int  src_stride,
+                      const unsigned char *ref_ptr,
+                      int  ref_stride,
+                      unsigned short *sad_array) {
+  sad_array[0] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+                                                ref_ptr, ref_stride,
+                                                0x7fffffff);
+  sad_array[1] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+                                                ref_ptr + 1, ref_stride,
+                                                0x7fffffff);
+  sad_array[2] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+                                                ref_ptr + 2, ref_stride,
+                                                0x7fffffff);
+  sad_array[3] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+                                                ref_ptr + 3, ref_stride,
+                                                0x7fffffff);
+  sad_array[4] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+                                                ref_ptr + 4, ref_stride,
+                                                0x7fffffff);
+  sad_array[5] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+                                                ref_ptr + 5, ref_stride,
+                                                0x7fffffff);
+  sad_array[6] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+                                                ref_ptr + 6, ref_stride,
+                                                0x7fffffff);
+  sad_array[7] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
+                                                ref_ptr + 7, ref_stride,
+                                                0x7fffffff);
+}
+
+void vp9_sad16x16x3_c(const unsigned char *src_ptr,
+                      int  src_stride,
+                      const unsigned char *ref_ptr,
+                      int  ref_stride,
+                      unsigned int *sad_array) {
+  sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,
+                                ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride,
+                                ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride,
+                                ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad16x16x8_c(const unsigned char *src_ptr,
+                      int  src_stride,
+                      const unsigned char *ref_ptr,
+                      int  ref_stride,
+                      unsigned short *sad_array) {
+  sad_array[0] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+                                                ref_ptr, ref_stride,
+                                                0x7fffffff);
+  sad_array[1] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+                                                ref_ptr + 1, ref_stride,
+                                                0x7fffffff);
+  sad_array[2] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+                                                ref_ptr + 2, ref_stride,
+                                                0x7fffffff);
+  sad_array[3] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+                                                ref_ptr + 3, ref_stride,
+                                                0x7fffffff);
+  sad_array[4] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+                                                ref_ptr + 4, ref_stride,
+                                                0x7fffffff);
+  sad_array[5] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+                                                ref_ptr + 5, ref_stride,
+                                                0x7fffffff);
+  sad_array[6] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+                                                ref_ptr + 6, ref_stride,
+                                                0x7fffffff);
+  sad_array[7] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
+                                                ref_ptr + 7, ref_stride,
+                                                0x7fffffff);
+}
+
+void vp9_sad16x8x3_c(const unsigned char *src_ptr,
+                     int  src_stride,
+                     const unsigned char *ref_ptr,
+                     int  ref_stride,
+                     unsigned int *sad_array) {
+  sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,
+                               ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride,
+                               ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride,
+                               ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad16x8x8_c(const unsigned char *src_ptr,
+                     int  src_stride,
+                     const unsigned char *ref_ptr,
+                     int  ref_stride,
+                     unsigned short *sad_array) {
+  sad_array[0] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+                                               ref_ptr, ref_stride,
+                                               0x7fffffff);
+  sad_array[1] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+                                               ref_ptr + 1, ref_stride,
+                                               0x7fffffff);
+  sad_array[2] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+                                               ref_ptr + 2, ref_stride,
+                                               0x7fffffff);
+  sad_array[3] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+                                               ref_ptr + 3, ref_stride,
+                                               0x7fffffff);
+  sad_array[4] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+                                               ref_ptr + 4, ref_stride,
+                                               0x7fffffff);
+  sad_array[5] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+                                               ref_ptr + 5, ref_stride,
+                                               0x7fffffff);
+  sad_array[6] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+                                               ref_ptr + 6, ref_stride,
+                                               0x7fffffff);
+  sad_array[7] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
+                                               ref_ptr + 7, ref_stride,
+                                               0x7fffffff);
+}
+
+void vp9_sad8x8x3_c(const unsigned char *src_ptr,
+                    int  src_stride,
+                    const unsigned char *ref_ptr,
+                    int  ref_stride,
+                    unsigned int *sad_array) {
+  sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,
+                              ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride,
+                              ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride,
+                              ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad8x8x8_c(const unsigned char *src_ptr,
+                    int  src_stride,
+                    const unsigned char *ref_ptr,
+                    int  ref_stride,
+                    unsigned short *sad_array) {
+  sad_array[0] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+                                              ref_ptr, ref_stride,
+                                              0x7fffffff);
+  sad_array[1] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+                                              ref_ptr + 1, ref_stride,
+                                              0x7fffffff);
+  sad_array[2] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+                                              ref_ptr + 2, ref_stride,
+                                              0x7fffffff);
+  sad_array[3] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+                                              ref_ptr + 3, ref_stride,
+                                              0x7fffffff);
+  sad_array[4] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+                                              ref_ptr + 4, ref_stride,
+                                              0x7fffffff);
+  sad_array[5] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+                                              ref_ptr + 5, ref_stride,
+                                              0x7fffffff);
+  sad_array[6] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+                                              ref_ptr + 6, ref_stride,
+                                              0x7fffffff);
+  sad_array[7] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
+                                              ref_ptr + 7, ref_stride,
+                                              0x7fffffff);
+}
+
+void vp9_sad8x16x3_c(const unsigned char *src_ptr,
+                     int  src_stride,
+                     const unsigned char *ref_ptr,
+                     int  ref_stride,
+                     unsigned int *sad_array) {
+  sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,
+                               ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride,
+                               ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride,
+                               ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad8x16x8_c(const unsigned char *src_ptr,
+                     int  src_stride,
+                     const unsigned char *ref_ptr,
+                     int  ref_stride,
+                     unsigned short *sad_array) {
+  sad_array[0] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+                                               ref_ptr, ref_stride,
+                                               0x7fffffff);
+  sad_array[1] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+                                               ref_ptr + 1, ref_stride,
+                                               0x7fffffff);
+  sad_array[2] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+                                               ref_ptr + 2, ref_stride,
+                                               0x7fffffff);
+  sad_array[3] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+                                               ref_ptr + 3, ref_stride,
+                                               0x7fffffff);
+  sad_array[4] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+                                               ref_ptr + 4, ref_stride,
+                                               0x7fffffff);
+  sad_array[5] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+                                               ref_ptr + 5, ref_stride,
+                                               0x7fffffff);
+  sad_array[6] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+                                               ref_ptr + 6, ref_stride,
+                                               0x7fffffff);
+  sad_array[7] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
+                                               ref_ptr + 7, ref_stride,
+                                               0x7fffffff);
+}
+
+void vp9_sad4x4x3_c(const unsigned char *src_ptr,
+                    int  src_stride,
+                    const unsigned char *ref_ptr,
+                    int  ref_stride,
+                    unsigned int *sad_array) {
+  sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,
+                              ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride,
+                              ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride,
+                              ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad4x4x8_c(const unsigned char *src_ptr,
+                    int  src_stride,
+                    const unsigned char *ref_ptr,
+                    int  ref_stride,
+                    unsigned short *sad_array) {
+  sad_array[0] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+                                              ref_ptr, ref_stride,
+                                              0x7fffffff);
+  sad_array[1] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+                                              ref_ptr + 1, ref_stride,
+                                              0x7fffffff);
+  sad_array[2] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+                                              ref_ptr + 2, ref_stride,
+                                              0x7fffffff);
+  sad_array[3] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+                                              ref_ptr + 3, ref_stride,
+                                              0x7fffffff);
+  sad_array[4] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+                                              ref_ptr + 4, ref_stride,
+                                              0x7fffffff);
+  sad_array[5] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+                                              ref_ptr + 5, ref_stride,
+                                              0x7fffffff);
+  sad_array[6] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+                                              ref_ptr + 6, ref_stride,
+                                              0x7fffffff);
+  sad_array[7] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
+                                              ref_ptr + 7, ref_stride,
+                                              0x7fffffff);
+}
+
+void vp9_sad32x32x4d_c(const unsigned char *src_ptr,
+                       int  src_stride,
+                       unsigned char *ref_ptr[],
+                       int  ref_stride,
+                       unsigned int *sad_array) {
+  sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,
+                                ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,
+                                ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride,
+                                ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad32x32_c(src_ptr, src_stride,
+                                ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad16x16x4d_c(const unsigned char *src_ptr,
+                       int  src_stride,
+                       unsigned char *ref_ptr[],
+                       int  ref_stride,
+                       unsigned int *sad_array) {
+  sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,
+                                ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride,
+                                ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride,
+                                ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad16x16_c(src_ptr, src_stride,
+                                ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad16x8x4d_c(const unsigned char *src_ptr,
+                      int  src_stride,
+                      unsigned char *ref_ptr[],
+                      int  ref_stride,
+                      unsigned int *sad_array) {
+  sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,
+                               ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride,
+                               ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride,
+                               ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad16x8_c(src_ptr, src_stride,
+                               ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad8x8x4d_c(const unsigned char *src_ptr,
+                     int  src_stride,
+                     unsigned char *ref_ptr[],
+                     int  ref_stride,
+                     unsigned int *sad_array) {
+  sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,
+                              ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride,
+                              ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride,
+                              ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad8x8_c(src_ptr, src_stride,
+                              ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad8x16x4d_c(const unsigned char *src_ptr,
+                      int  src_stride,
+                      unsigned char *ref_ptr[],
+                      int  ref_stride,
+                      unsigned int *sad_array) {
+  sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,
+                               ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride,
+                               ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride,
+                               ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad8x16_c(src_ptr, src_stride,
+                               ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad4x4x4d_c(const unsigned char *src_ptr,
+                     int  src_stride,
+                     unsigned char *ref_ptr[],
+                     int  ref_stride,
+                     unsigned int *sad_array) {
+  sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,
+                              ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride,
+                              ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride,
+                              ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad4x4_c(src_ptr, src_stride,
+                              ref_ptr[3], ref_stride, 0x7fffffff);
+}
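The x3/x8 variants above score three or eight horizontally adjacent candidates (ref_ptr + 0, + 1, ...) so that vectorized versions can share loads; the x4d variants score four arbitrary candidates passed as an array, which is the shape diamond-pattern motion searches want. A hedged usage sketch (the candidate positions and the helper name are hypothetical):

/* Illustrative only: pick the best of four hypothetical candidate
   positions using the 4-pointer SAD variant. */
static int best_of_four_sketch(const unsigned char *src, int src_stride,
                               unsigned char *ref, int ref_stride) {
  unsigned int sad[4];
  unsigned char *cand[4];
  int i, best = 0;

  cand[0] = ref;                /* hypothetical candidate set */
  cand[1] = ref + 1;
  cand[2] = ref + ref_stride;
  cand[3] = ref + ref_stride + 1;

  vp9_sad16x16x4d_c(src, src_stride, cand, ref_stride, sad);
  for (i = 1; i < 4; i++)
    if (sad[i] < sad[best])
      best = i;
  return best;                  /* index of the lowest-SAD candidate */
}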
+
+/* Copy a 32-pixel-wide strip (two macroblocks) to a buffer */
+void vp9_copy32xn_c(unsigned char *src_ptr,
+                    int  src_stride,
+                    unsigned char *dst_ptr,
+                    int  dst_stride,
+                    int height) {
+  int r;
+
+  for (r = 0; r < height; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+    dst_ptr[0] = src_ptr[0];
+    dst_ptr[1] = src_ptr[1];
+    dst_ptr[2] = src_ptr[2];
+    dst_ptr[3] = src_ptr[3];
+    dst_ptr[4] = src_ptr[4];
+    dst_ptr[5] = src_ptr[5];
+    dst_ptr[6] = src_ptr[6];
+    dst_ptr[7] = src_ptr[7];
+    dst_ptr[8] = src_ptr[8];
+    dst_ptr[9] = src_ptr[9];
+    dst_ptr[10] = src_ptr[10];
+    dst_ptr[11] = src_ptr[11];
+    dst_ptr[12] = src_ptr[12];
+    dst_ptr[13] = src_ptr[13];
+    dst_ptr[14] = src_ptr[14];
+    dst_ptr[15] = src_ptr[15];
+    dst_ptr[16] = src_ptr[16];
+    dst_ptr[17] = src_ptr[17];
+    dst_ptr[18] = src_ptr[18];
+    dst_ptr[19] = src_ptr[19];
+    dst_ptr[20] = src_ptr[20];
+    dst_ptr[21] = src_ptr[21];
+    dst_ptr[22] = src_ptr[22];
+    dst_ptr[23] = src_ptr[23];
+    dst_ptr[24] = src_ptr[24];
+    dst_ptr[25] = src_ptr[25];
+    dst_ptr[26] = src_ptr[26];
+    dst_ptr[27] = src_ptr[27];
+    dst_ptr[28] = src_ptr[28];
+    dst_ptr[29] = src_ptr[29];
+    dst_ptr[30] = src_ptr[30];
+    dst_ptr[31] = src_ptr[31];
+#else
+    ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0];
+    ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1];
+    ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2];
+    ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3];
+    ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4];
+    ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5];
+    ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6];
+    ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7];
+#endif
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
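Both branches of vp9_copy32xn_c copy the same 32 bytes per row; the CONFIG_FAST_UNALIGNED path simply moves eight 32-bit words at a time on targets whose loads and stores tolerate misalignment. A portable per-row equivalent (a sketch, not the committed code) is just:

#include <string.h>

/* Sketch: per-row equivalent of either branch of vp9_copy32xn_c. */
static void copy32_row_sketch(unsigned char *dst, const unsigned char *src) {
  memcpy(dst, src, 32);  /* the same 32 bytes either branch copies */
}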
--- /dev/null
+++ b/vp9/encoder/satd_c.c
@@ -1,0 +1,47 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "vpx_ports/mem.h"
+#include "./vpx_rtcd.h"
+unsigned int vp9_satd16x16_c(const unsigned char *src_ptr,
+                             int  src_stride,
+                             const unsigned char *ref_ptr,
+                             int  ref_stride,
+                             unsigned int *psatd) {
+  int r, c, i;
+  unsigned int satd = 0;
+  DECLARE_ALIGNED(16, short, diff_in[256]);
+  DECLARE_ALIGNED(16, short, diff_out[16]);
+  short *in;
+
+  for (r = 0; r < 16; r++) {
+    for (c = 0; c < 16; c++) {
+      diff_in[r * 16 + c] = src_ptr[c] - ref_ptr[c];
+    }
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+
+  in = diff_in;
+  for (r = 0; r < 16; r += 4) {
+    for (c = 0; c < 16; c += 4) {
+      vp9_short_walsh4x4_c(in + c, diff_out, 32);
+      for (i = 0; i < 16; i++)
+        satd += abs(diff_out[i]);
+    }
+    in += 64;
+  }
+
+  if (psatd)
+    *psatd = satd;
+
+  return satd;
+}
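vp9_satd16x16_c computes a sum of absolute transformed differences: it forms the 16x16 residual, runs a 4x4 Walsh-Hadamard transform over each of the sixteen 4x4 sub-blocks, and accumulates the absolute transform coefficients. For intuition only, an unscaled 4x4 Hadamard (butterflies along rows, then columns) looks like the sketch below; the committed vp9_short_walsh4x4_c lives elsewhere in the tree and applies its own pitch handling and scaling:

/* Sketch of an unscaled 4x4 Hadamard transform; illustrative only. */
static void hadamard4x4_sketch(const short in[16], short out[16]) {
  short tmp[16];
  int i;

  for (i = 0; i < 4; i++) {  /* transform each row */
    short a = in[i * 4 + 0] + in[i * 4 + 2];
    short b = in[i * 4 + 1] + in[i * 4 + 3];
    short c = in[i * 4 + 0] - in[i * 4 + 2];
    short d = in[i * 4 + 1] - in[i * 4 + 3];
    tmp[i * 4 + 0] = a + b;
    tmp[i * 4 + 1] = a - b;
    tmp[i * 4 + 2] = c + d;
    tmp[i * 4 + 3] = c - d;
  }
  for (i = 0; i < 4; i++) {  /* then each column */
    short a = tmp[0 * 4 + i] + tmp[2 * 4 + i];
    short b = tmp[1 * 4 + i] + tmp[3 * 4 + i];
    short c = tmp[0 * 4 + i] - tmp[2 * 4 + i];
    short d = tmp[1 * 4 + i] - tmp[3 * 4 + i];
    out[0 * 4 + i] = a + b;
    out[1 * 4 + i] = a - b;
    out[2 * 4 + i] = c + d;
    out[3 * 4 + i] = c - d;
  }
}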
--- /dev/null
+++ b/vp9/encoder/segmentation.c
@@ -1,0 +1,327 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "limits.h"
+#include "vpx_mem/vpx_mem.h"
+#include "segmentation.h"
+#include "vp9/common/pred_common.h"
+
+void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x) {
+  int mb_row, mb_col;
+
+  MODE_INFO *this_mb_mode_info = cm->mi;
+
+  x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
+
+  if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) {
+    // Reset GF usage monitors
+    vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+    cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+  } else {
+    // for each macroblock row in image
+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+      // for each macroblock col in image
+      for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+
+        // If this MB references the golden or altref frame, set the GF
+        // active flag if it is not already set. If it uses last-frame
+        // 0,0 mode, leave the flag as-is; otherwise (non-0,0 motion or
+        // intra modes), clear the flag if it is currently set.
+        if ((this_mb_mode_info->mbmi.ref_frame == GOLDEN_FRAME) ||
+            (this_mb_mode_info->mbmi.ref_frame == ALTREF_FRAME)) {
+          if (*(x->gf_active_ptr) == 0) {
+            *(x->gf_active_ptr) = 1;
+            cpi->gf_active_count++;
+          }
+        } else if ((this_mb_mode_info->mbmi.mode != ZEROMV) &&
+                   *(x->gf_active_ptr)) {
+          *(x->gf_active_ptr) = 0;
+          cpi->gf_active_count--;
+        }
+
+        x->gf_active_ptr++;          // Step onto next entry
+        this_mb_mode_info++;         // skip to next mb
+
+      }
+
+      // this is to account for the border
+      this_mb_mode_info++;
+    }
+  }
+}
+
+void vp9_enable_segmentation(VP9_PTR ptr) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+  // Set the appropriate feature bit
+  cpi->mb.e_mbd.segmentation_enabled = 1;
+  cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+  cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+}
+
+void vp9_disable_segmentation(VP9_PTR ptr) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+  // Clear the appropriate feature bit
+  cpi->mb.e_mbd.segmentation_enabled = 0;
+}
+
+void vp9_set_segmentation_map(VP9_PTR ptr,
+                              unsigned char *segmentation_map) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+  // Copy in the new segmentation map
+  vpx_memcpy(cpi->segmentation_map, segmentation_map,
+             (cpi->common.mb_rows * cpi->common.mb_cols));
+
+  // Signal that the map should be updated.
+  cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+  cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+}
+
+void vp9_set_segment_data(VP9_PTR ptr,
+                          signed char *feature_data,
+                          unsigned char abs_delta) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+
+  cpi->mb.e_mbd.mb_segment_abs_delta = abs_delta;
+
+  vpx_memcpy(cpi->mb.e_mbd.segment_feature_data, feature_data,
+             sizeof(cpi->mb.e_mbd.segment_feature_data));
+
+  // TBD ?? Set the feature mask
+  // vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0,
+  //            sizeof(cpi->mb.e_mbd.segment_feature_mask));
+}
+
+// Based on set of segment counts calculate a probability tree
+static void calc_segtree_probs(MACROBLOCKD *xd,
+                               int *segcounts,
+                               vp9_prob *segment_tree_probs) {
+  int count1, count2;
+  int tot_count;
+  int i;
+
+  // Blank the structure to start with
+  vpx_memset(segment_tree_probs, 0,
+             MB_FEATURE_TREE_PROBS * sizeof(*segment_tree_probs));
+
+  // Total count for all segments
+  count1 = segcounts[0] + segcounts[1];
+  count2 = segcounts[2] + segcounts[3];
+  tot_count = count1 + count2;
+
+  // Work out probabilities of each segment
+  if (tot_count)
+    segment_tree_probs[0] = (count1 * 255) / tot_count;
+  if (count1 > 0)
+    segment_tree_probs[1] = (segcounts[0] * 255) / count1;
+  if (count2 > 0)
+    segment_tree_probs[2] = (segcounts[2] * 255) / count2;
+
+  // Clamp probabilities to minimum allowed value
+  for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
+    if (segment_tree_probs[i] == 0)
+      segment_tree_probs[i] = 1;
+  }
+}
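The tree has three internal nodes: segment_tree_probs[0] splits segments {0,1} from {2,3}, [1] splits 0 from 1, and [2] splits 2 from 3, each scaled into the 1..255 range. A worked example with hypothetical counts:

/* Hypothetical counts: segcounts = {10, 20, 30, 40}
 *   count1 = 10 + 20 = 30, count2 = 30 + 40 = 70, tot_count = 100
 *   segment_tree_probs[0] = (30 * 255) / 100 = 76   P(segment in {0,1})
 *   segment_tree_probs[1] = (10 * 255) / 30  = 85   P(segment 0 | {0,1})
 *   segment_tree_probs[2] = (30 * 255) / 70  = 109  P(segment 2 | {2,3})
 * Any entry that comes out 0 is then clamped up to 1. */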
+
+// Based on set of segment counts and probabilities calculate a cost estimate
+static int cost_segmap(MACROBLOCKD *xd,
+                       int *segcounts,
+                       vp9_prob *probs) {
+  int cost;
+  int count1, count2;
+
+  // Cost the top node of the tree
+  count1 = segcounts[0] + segcounts[1];
+  count2 = segcounts[2] + segcounts[3];
+  cost = count1 * vp9_cost_zero(probs[0]) +
+         count2 * vp9_cost_one(probs[0]);
+
+  // Now add the cost of each individual segment branch
+  if (count1 > 0)
+    cost += segcounts[0] * vp9_cost_zero(probs[1]) +
+            segcounts[1] * vp9_cost_one(probs[1]);
+
+  if (count2 > 0)
+    cost += segcounts[2] * vp9_cost_zero(probs[2]) +
+            segcounts[3] * vp9_cost_one(probs[2]);
+
+  return cost;
+}
+
+void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+
+  const int mis = cm->mode_info_stride;
+  int i;
+  int tot_count;
+  int no_pred_cost;
+  int t_pred_cost = INT_MAX;
+  int pred_context;
+
+  int mb_row, mb_col;
+  int segmap_index = 0;
+  unsigned char segment_id;
+
+  int temporal_predictor_count[PREDICTION_PROBS][2];
+  int no_pred_segcounts[MAX_MB_SEGMENTS];
+  int t_unpred_seg_counts[MAX_MB_SEGMENTS];
+
+  vp9_prob no_pred_tree[MB_FEATURE_TREE_PROBS];
+  vp9_prob t_pred_tree[MB_FEATURE_TREE_PROBS];
+  vp9_prob t_nopred_prob[PREDICTION_PROBS];
+
+  // Set default state for the segment tree probabilities and the
+  // temporal coding probabilities
+  vpx_memset(xd->mb_segment_tree_probs, 255,
+             sizeof(xd->mb_segment_tree_probs));
+  vpx_memset(cm->segment_pred_probs, 255,
+             sizeof(cm->segment_pred_probs));
+
+  vpx_memset(no_pred_segcounts, 0, sizeof(no_pred_segcounts));
+  vpx_memset(t_unpred_seg_counts, 0, sizeof(t_unpred_seg_counts));
+  vpx_memset(temporal_predictor_count, 0, sizeof(temporal_predictor_count));
+
+  // First of all generate stats regarding how well the last segment map
+  // predicts this one
+
+  // Initialize macroblock decoder mode info context for the first mb
+  // in the frame
+  xd->mode_info_context = cm->mi;
+
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 2) {
+      for (i = 0; i < 4; i++) {
+        static const int dx[4] = { +1, -1, +1, +1 };
+        static const int dy[4] = {  0, +1,  0, -1 };
+        int x_idx = i & 1, y_idx = i >> 1;
+
+        if (mb_col + x_idx >= cm->mb_cols ||
+            mb_row + y_idx >= cm->mb_rows) {
+          goto end;
+        }
+
+        xd->mb_to_top_edge = -((mb_row * 16) << 3);
+        xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+        xd->mb_to_left_edge = -((mb_col * 16) << 3);
+        xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+        segmap_index = (mb_row + y_idx) * cm->mb_cols + mb_col + x_idx;
+        segment_id = xd->mode_info_context->mbmi.segment_id;
+#if CONFIG_SUPERBLOCKS
+        if (xd->mode_info_context->mbmi.encoded_as_sb) {
+          if (mb_col + 1 < cm->mb_cols)
+            segment_id = segment_id &&
+                         xd->mode_info_context[1].mbmi.segment_id;
+          if (mb_row + 1 < cm->mb_rows) {
+            segment_id = segment_id &&
+                         xd->mode_info_context[mis].mbmi.segment_id;
+            if (mb_col + 1 < cm->mb_cols)
+              segment_id = segment_id &&
+                           xd->mode_info_context[mis + 1].mbmi.segment_id;
+          }
+        }
+#endif
+
+        // Count the number of hits on each segment with no prediction
+        no_pred_segcounts[segment_id]++;
+
+        // Temporal prediction not allowed on key frames
+        if (cm->frame_type != KEY_FRAME) {
+          // Test to see if the segment id matches the predicted value.
+          int seg_predicted =
+            (segment_id == vp9_get_pred_mb_segid(cm, xd, segmap_index));
+
+          // Get the segment id prediction context
+          pred_context =
+            vp9_get_pred_context(cm, xd, PRED_SEG_ID);
+
+          // Store the prediction status for this mb and update counts
+          // as appropriate
+          vp9_set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
+          temporal_predictor_count[pred_context][seg_predicted]++;
+
+          if (!seg_predicted)
+            // Update the "unpredicted" segment count
+            t_unpred_seg_counts[segment_id]++;
+        }
+
+#if CONFIG_SUPERBLOCKS
+        if (xd->mode_info_context->mbmi.encoded_as_sb) {
+          assert(!i);
+          xd->mode_info_context += 2;
+          break;
+        }
+#endif
+      end:
+        xd->mode_info_context += dx[i] + dy[i] * cm->mode_info_stride;
+      }
+    }
+
+    // this is to account for the border in mode_info_context
+    xd->mode_info_context -= mb_col;
+    xd->mode_info_context += cm->mode_info_stride * 2;
+  }
+
+  // Work out probability tree for coding segments without prediction
+  // and the cost.
+  calc_segtree_probs(xd, no_pred_segcounts, no_pred_tree);
+  no_pred_cost = cost_segmap(xd, no_pred_segcounts, no_pred_tree);
+
+  // Key frames cannot use temporal prediction
+  if (cm->frame_type != KEY_FRAME) {
+    // Work out probability tree for coding those segments not
+    // predicted using the temporal method and the cost.
+    calc_segtree_probs(xd, t_unpred_seg_counts, t_pred_tree);
+    t_pred_cost = cost_segmap(xd, t_unpred_seg_counts, t_pred_tree);
+
+    // Add in the cost of the signalling for each prediction context
+    for (i = 0; i < PREDICTION_PROBS; i++) {
+      tot_count = temporal_predictor_count[i][0] +
+                  temporal_predictor_count[i][1];
+
+      // Work out the context probabilities for the segment
+      // prediction flag
+      if (tot_count) {
+        t_nopred_prob[i] = (temporal_predictor_count[i][0] * 255) /
+                           tot_count;
+
+        // Clamp to minimum allowed value
+        if (t_nopred_prob[i] < 1)
+          t_nopred_prob[i] = 1;
+      } else
+        t_nopred_prob[i] = 1;
+
+      // Add in the predictor signaling cost
+      t_pred_cost += (temporal_predictor_count[i][0] *
+                      vp9_cost_zero(t_nopred_prob[i])) +
+                     (temporal_predictor_count[i][1] *
+                      vp9_cost_one(t_nopred_prob[i]));
+    }
+  }
+
+  // Now choose which coding method to use.
+  if (t_pred_cost < no_pred_cost) {
+    cm->temporal_update = 1;
+    vpx_memcpy(xd->mb_segment_tree_probs,
+               t_pred_tree, sizeof(t_pred_tree));
+    vpx_memcpy(&cm->segment_pred_probs,
+               t_nopred_prob, sizeof(t_nopred_prob));
+  } else {
+    cm->temporal_update = 0;
+    vpx_memcpy(xd->mb_segment_tree_probs,
+               no_pred_tree, sizeof(no_pred_tree));
+  }
+}
--- /dev/null
+++ b/vp9/encoder/segmentation.h
@@ -1,0 +1,46 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "string.h"
+#include "vp9/common/blockd.h"
+#include "onyx_int.h"
+
+#ifndef __INC_SEGMENTATION_H__
+#define __INC_SEGMENTATION_H__ 1
+
+extern void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm,
+                                      MACROBLOCK *x);
+
+extern void vp9_enable_segmentation(VP9_PTR ptr);
+extern void vp9_disable_segmentation(VP9_PTR ptr);
+
+// Valid values for a segment are 0 to 3
+// Segmentation map is arranged as [Rows][Columns]
+extern void vp9_set_segmentation_map(VP9_PTR ptr,
+                                     unsigned char *segmentation_map);
+
+// The values given for each segment can be either deltas (from the default
+// value chosen for the frame) or absolute values.
+//
+// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for
+// SEGMENT_ALT_LF)
+// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for
+// SEGMENT_ALT_LF)
+//
+// Pass abs_delta = SEGMENT_DELTADATA to interpret the values as deltas,
+// or abs_delta = SEGMENT_ABSDATA to use the absolute values given.
+//
+extern void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data,
+                                 unsigned char abs_delta);
+
+extern void vp9_choose_segmap_coding_method(VP9_COMP *cpi);
+
+#endif /* __INC_SEGMENTATION_H__ */
--- /dev/null
+++ b/vp9/encoder/ssim.c
@@ -1,0 +1,147 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyx_int.h"
+
+void vp9_ssim_parms_16x16_c(unsigned char *s, int sp, unsigned char *r,
+                            int rp, unsigned long *sum_s, unsigned long *sum_r,
+                            unsigned long *sum_sq_s, unsigned long *sum_sq_r,
+                            unsigned long *sum_sxr) {
+  int i, j;
+  for (i = 0; i < 16; i++, s += sp, r += rp) {
+    for (j = 0; j < 16; j++) {
+      *sum_s += s[j];
+      *sum_r += r[j];
+      *sum_sq_s += s[j] * s[j];
+      *sum_sq_r += r[j] * r[j];
+      *sum_sxr += s[j] * r[j];
+    }
+  }
+}
+void vp9_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp,
+                          unsigned long *sum_s, unsigned long *sum_r,
+                          unsigned long *sum_sq_s, unsigned long *sum_sq_r,
+                          unsigned long *sum_sxr) {
+  int i, j;
+  for (i = 0; i < 8; i++, s += sp, r += rp) {
+    for (j = 0; j < 8; j++) {
+      *sum_s += s[j];
+      *sum_r += r[j];
+      *sum_sq_s += s[j] * s[j];
+      *sum_sq_r += r[j] * r[j];
+      *sum_sxr += s[j] * r[j];
+    }
+  }
+}
+
+static const int64_t cc1 =  26634; // 64^2*(.01*255)^2
+static const int64_t cc2 = 239708; // 64^2*(.03*255)^2
+
+static double similarity(unsigned long sum_s, unsigned long sum_r,
+                         unsigned long sum_sq_s, unsigned long sum_sq_r,
+                         unsigned long sum_sxr, int count) {
+  int64_t ssim_n, ssim_d;
+  int64_t c1, c2;
+
+  // scale the constants by number of pixels
+  c1 = (cc1 * count * count) >> 12;
+  c2 = (cc2 * count * count) >> 12;
+
+  ssim_n = (2 * sum_s * sum_r + c1) * ((int64_t) 2 * count * sum_sxr -
+                                       (int64_t) 2 * sum_s * sum_r + c2);
+
+  ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
+           ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
+            (int64_t)count * sum_sq_r - (int64_t) sum_r * sum_r + c2);
+
+  return ssim_n * 1.0 / ssim_d;
+}
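similarity() is the standard SSIM comparison evaluated directly from raw pixel sums; the >> 12 pre-scales the constants by count^2 (4096 = 64^2, matching the 64^2 factor baked into cc1/cc2) so that means and variances never have to be divided out. In the usual notation:

  \mathrm{SSIM}(x,y) =
    \frac{(2\mu_x\mu_y + C_1)(2\sigma_{xy} + C_2)}
         {(\mu_x^2 + \mu_y^2 + C_1)(\sigma_x^2 + \sigma_y^2 + C_2)},
  \qquad C_1 = (0.01 \cdot 255)^2, \quad C_2 = (0.03 \cdot 255)^2

ssim_n and ssim_d in the code are that numerator and denominator multiplied through by count^2.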
+
+static double ssim_16x16(unsigned char *s, int sp, unsigned char *r, int rp) {
+  unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+  vp9_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+                       &sum_sxr);
+  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
+}
+static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) {
+  unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+  vp9_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+                     &sum_sxr);
+  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
+}
+
+// We use an 8x8 moving window whose starting location steps along the
+// 4x4 pixel grid. This arrangement lets the windows overlap block
+// boundaries, penalizing blocking artifacts.
+double vp9_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1,
+                 int stride_img2, int width, int height) {
+  int i, j;
+  int samples = 0;
+  double ssim_total = 0;
+
+  // sample points start at each 4x4 location
+  for (i = 0; i < height - 8;
+       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+    for (j = 0; j < width - 8; j += 4) {
+      double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
+      ssim_total += v;
+      samples++;
+    }
+  }
+  ssim_total /= samples;
+  return ssim_total;
+}
+double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+                     int lumamask, double *weight) {
+  double a, b, c;
+  double ssimv;
+
+  a = vp9_ssim2(source->y_buffer, dest->y_buffer,
+                source->y_stride, dest->y_stride, source->y_width,
+                source->y_height);
+
+  b = vp9_ssim2(source->u_buffer, dest->u_buffer,
+                source->uv_stride, dest->uv_stride, source->uv_width,
+                source->uv_height);
+
+  c = vp9_ssim2(source->v_buffer, dest->v_buffer,
+                source->uv_stride, dest->uv_stride, source->uv_width,
+                source->uv_height);
+
+  ssimv = a * .8 + .1 * (b + c);
+
+  *weight = 1;
+
+  return ssimv;
+}
+
+double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+                      double *ssim_y, double *ssim_u, double *ssim_v) {
+  double ssim_all = 0;
+  double a, b, c;
+
+  a = vp9_ssim2(source->y_buffer, dest->y_buffer,
+                source->y_stride, dest->y_stride, source->y_width,
+                source->y_height);
+
+  b = vp9_ssim2(source->u_buffer, dest->u_buffer,
+                source->uv_stride, dest->uv_stride, source->uv_width,
+                source->uv_height);
+
+  c = vp9_ssim2(source->v_buffer, dest->v_buffer,
+                source->uv_stride, dest->uv_stride, source->uv_width,
+                source->uv_height);
+  *ssim_y = a;
+  *ssim_u = b;
+  *ssim_v = c;
+  ssim_all = (a * 4 + b + c) / 6;
+
+  return ssim_all;
+}
--- /dev/null
+++ b/vp9/encoder/temporal_filter.c
@@ -1,0 +1,516 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/onyxc_int.h"
+#include "onyx_int.h"
+#include "vp9/common/systemdependent.h"
+#include "quantize.h"
+#include "vp9/common/alloccommon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "psnr.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp9/common/extend.h"
+#include "ratectrl.h"
+#include "vp9/common/quant_common.h"
+#include "segmentation.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/swapyv12buffer.h"
+#include "vpx_ports/vpx_timer.h"
+
+#include <math.h>
+#include <limits.h>
+
+#define ALT_REF_MC_ENABLED 1     // enable/disable MC in AltRef filtering
+#define ALT_REF_SUBPEL_ENABLED 1 // enable/disable subpel in MC AltRef filtering
+
+#if VP9_TEMPORAL_ALT_REF
+
+
+static void temporal_filter_predictors_mb_c
+(
+  MACROBLOCKD *xd,
+  unsigned char *y_mb_ptr,
+  unsigned char *u_mb_ptr,
+  unsigned char *v_mb_ptr,
+  int stride,
+  int mv_row,
+  int mv_col,
+  unsigned char *pred
+) {
+  int offset;
+  unsigned char *yptr, *uptr, *vptr;
+  int omv_row, omv_col;
+
+  // Y
+  yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);
+
+  if ((mv_row | mv_col) & 7) {
+    xd->subpixel_predict16x16(yptr, stride,
+                             (mv_col & 7) << 1, (mv_row & 7) << 1, &pred[0], 16);
+  } else {
+    vp9_copy_mem16x16(yptr, stride, &pred[0], 16);
+  }
+
+  // U & V
+  omv_row = mv_row;
+  omv_col = mv_col;
+  mv_row >>= 1;
+  mv_col >>= 1;
+  stride = (stride + 1) >> 1;
+  offset = (mv_row >> 3) * stride + (mv_col >> 3);
+  uptr = u_mb_ptr + offset;
+  vptr = v_mb_ptr + offset;
+
+  if ((omv_row | omv_col) & 15) {
+    xd->subpixel_predict8x8(uptr, stride,
+                           (omv_col & 15), (omv_row & 15), &pred[256], 8);
+    xd->subpixel_predict8x8(vptr, stride,
+                           (omv_col & 15), (omv_row & 15), &pred[320], 8);
+  } else {
+    vp9_copy_mem8x8(uptr, stride, &pred[256], 8);
+    vp9_copy_mem8x8(vptr, stride, &pred[320], 8);
+  }
+}
+void vp9_temporal_filter_apply_c
+(
+  unsigned char *frame1,
+  unsigned int stride,
+  unsigned char *frame2,
+  unsigned int block_size,
+  int strength,
+  int filter_weight,
+  unsigned int *accumulator,
+  unsigned short *count
+) {
+  unsigned int i, j, k;
+  int modifier;
+  int byte = 0;
+
+  for (i = 0, k = 0; i < block_size; i++) {
+    for (j = 0; j < block_size; j++, k++) {
+
+      int src_byte = frame1[byte];
+      int pixel_value = *frame2++;
+
+      modifier   = src_byte - pixel_value;
+      // This is an integer approximation of:
+      // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+      // modifier = (int)roundf(coeff > 16 ? 0 : 16 - coeff);
+      modifier  *= modifier;
+      modifier  *= 3;
+      modifier  += 1 << (strength - 1);
+      modifier >>= strength;
+
+      if (modifier > 16)
+        modifier = 16;
+
+      modifier = 16 - modifier;
+      modifier *= filter_weight;
+
+      count[k] += modifier;
+      accumulator[k] += modifier * pixel_value;
+
+      byte++;
+    }
+
+    byte += stride - block_size;
+  }
+}
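The modifier computation maps a per-pixel difference d to a blending weight of roughly 16 - 3*d^2 / 2^strength, rounded and clamped to [0, 16], so identical pixels get the full weight and large differences contribute nothing. A worked example with hypothetical inputs:

/* Hypothetical: strength = 6, src_byte = 104, pixel_value = 100
 *   d          = 104 - 100 = 4
 *   3 * d^2    = 48
 *   + (1 << 5) = 80    rounding term, 1 << (strength - 1)
 *   >> 6       = 1     i.e. 80 / 64, truncated
 *   clamp      = min(1, 16) = 1
 *   weight     = 16 - 1 = 15, then scaled by filter_weight */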
+
+#if ALT_REF_MC_ENABLED
+
+static int temporal_filter_find_matching_mb_c
+(
+  VP9_COMP *cpi,
+  YV12_BUFFER_CONFIG *arf_frame,
+  YV12_BUFFER_CONFIG *frame_ptr,
+  int mb_offset,
+  int error_thresh
+) {
+  MACROBLOCK *x = &cpi->mb;
+  int step_param;
+  int further_steps;
+  int sadpb = x->sadperbit16;
+  int bestsme = INT_MAX;
+
+  BLOCK *b = &x->block[0];
+  BLOCKD *d = &x->e_mbd.block[0];
+  int_mv best_ref_mv1;
+  int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+  // Save input state
+  unsigned char **base_src = b->base_src;
+  int src = b->src;
+  int src_stride = b->src_stride;
+  unsigned char **base_pre = d->base_pre;
+  int pre = d->pre;
+  int pre_stride = d->pre_stride;
+
+  best_ref_mv1.as_int = 0;
+  best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >> 3;
+  best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >> 3;
+
+  // Setup frame pointers
+  b->base_src = &arf_frame->y_buffer;
+  b->src_stride = arf_frame->y_stride;
+  b->src = mb_offset;
+
+  d->base_pre = &frame_ptr->y_buffer;
+  d->pre_stride = frame_ptr->y_stride;
+  d->pre = mb_offset;
+
+  // Further step/diamond searches as necessary
+  if (cpi->Speed < 8) {
+    step_param = cpi->sf.first_step +
+                 ((cpi->Speed > 5) ? 1 : 0);
+    further_steps =
+      (cpi->sf.max_step_search_steps - 1) - step_param;
+  } else {
+    step_param = cpi->sf.first_step + 2;
+    further_steps = 0;
+  }
+
+  /*cpi->sf.search_method == HEX*/
+  // TODO Check that the 16x16 vf & sdf are selected here
+  // Ignore mv costing by sending NULL pointer instead of cost arrays
+  bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv.first,
+                           step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16],
+                           NULLMVCOST, NULLMVCOST,
+                           &best_ref_mv1);
+
+#if ALT_REF_SUBPEL_ENABLED
+  // Try sub-pixel MC?
+  // if (bestsme > error_thresh && bestsme < INT_MAX)
+  {
+    int distortion;
+    unsigned int sse;
+    // Ignore mv costing by sending NULL pointer instead of cost array
+    bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv.first,
+                                           &best_ref_mv1,
+                                           x->errorperbit,
+                                           &cpi->fn_ptr[BLOCK_16X16],
+                                           NULLMVCOST,
+                                           &distortion, &sse);
+  }
+#endif
+
+  // Restore input state
+  b->base_src = base_src;
+  b->src = src;
+  b->src_stride = src_stride;
+  d->base_pre = base_pre;
+  d->pre = pre;
+  d->pre_stride = pre_stride;
+
+  return bestsme;
+}
+#endif
+
+static void temporal_filter_iterate_c
+(
+  VP9_COMP *cpi,
+  int frame_count,
+  int alt_ref_index,
+  int strength
+) {
+  int byte;
+  int frame;
+  int mb_col, mb_row;
+  unsigned int filter_weight;
+  int mb_cols = cpi->common.mb_cols;
+  int mb_rows = cpi->common.mb_rows;
+  int mb_y_offset = 0;
+  int mb_uv_offset = 0;
+  DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 + 8 * 8 + 8 * 8);
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16 * 16 + 8 * 8 + 8 * 8);
+  MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+  YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
+  unsigned char *dst1, *dst2;
+  DECLARE_ALIGNED_ARRAY(16, unsigned char,  predictor, 16 * 16 + 8 * 8 + 8 * 8);
+
+  // Save input state
+  unsigned char *y_buffer = mbd->pre.y_buffer;
+  unsigned char *u_buffer = mbd->pre.u_buffer;
+  unsigned char *v_buffer = mbd->pre.v_buffer;
+
+  for (mb_row = 0; mb_row < mb_rows; mb_row++) {
+#if ALT_REF_MC_ENABLED
+    // Source frames are extended to 16 pixels.  This is different than
+    //  L/A/G reference frames that have a border of 32 (VP8BORDERINPIXELS)
+    // A 6/8 tap filter is used for motion search.  This requires 2 pixels
+    //  before and 3 pixels after.  So the largest Y mv on a border would
+    //  then be 16 - INTERP_EXTEND. The UV blocks are half the size of the Y and
+    //  therefore only extended by 8.  The largest mv that a UV block
+    //  can support is 8 - INTERP_EXTEND.  A UV mv is half of a Y mv, i.e.
+    //  (16 - INTERP_EXTEND) >> 1, which is greater than 8 - INTERP_EXTEND.
+    // To keep the mv in play for both Y and UV planes the max that it
+    //  can be on a border is therefore 16 - (2*INTERP_EXTEND+1).
+    cpi->mb.mv_row_min = -((mb_row * 16) + (17 - 2 * INTERP_EXTEND));
+    cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16)
+                         + (17 - 2 * INTERP_EXTEND);
+#endif
+
+    for (mb_col = 0; mb_col < mb_cols; mb_col++) {
+      int i, j, k;
+      int stride;
+
+      vpx_memset(accumulator, 0, 384 * sizeof(unsigned int));
+      vpx_memset(count, 0, 384 * sizeof(unsigned short));
+
+#if ALT_REF_MC_ENABLED
+      cpi->mb.mv_col_min = -((mb_col * 16) + (17 - 2 * INTERP_EXTEND));
+      cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16)
+                           + (17 - 2 * INTERP_EXTEND);
+#endif
+
+      for (frame = 0; frame < frame_count; frame++) {
+        if (cpi->frames[frame] == NULL)
+          continue;
+
+        mbd->block[0].bmi.as_mv.first.as_mv.row = 0;
+        mbd->block[0].bmi.as_mv.first.as_mv.col = 0;
+
+        if (frame == alt_ref_index) {
+          filter_weight = 2;
+        } else {
+          int err = 0;
+#if ALT_REF_MC_ENABLED
+#define THRESH_LOW   10000
+#define THRESH_HIGH  20000
+
+          // Find best match in this frame by MC
+          err = temporal_filter_find_matching_mb_c
+                (cpi,
+                 cpi->frames[alt_ref_index],
+                 cpi->frames[frame],
+                 mb_y_offset,
+                 THRESH_LOW);
+#endif
+          // Assign a higher weight to the matching MB if its error
+          // score is lower. Without MC, the default behavior is to
+          // weight all MBs equally.
+          filter_weight = err < THRESH_LOW
+                          ? 2 : err < THRESH_HIGH ? 1 : 0;
+        }
+
+        if (filter_weight != 0) {
+          // Construct the predictors
+          temporal_filter_predictors_mb_c
+          (mbd,
+           cpi->frames[frame]->y_buffer + mb_y_offset,
+           cpi->frames[frame]->u_buffer + mb_uv_offset,
+           cpi->frames[frame]->v_buffer + mb_uv_offset,
+           cpi->frames[frame]->y_stride,
+           mbd->block[0].bmi.as_mv.first.as_mv.row,
+           mbd->block[0].bmi.as_mv.first.as_mv.col,
+           predictor);
+
+          // Apply the filter (YUV)
+          TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
+          (f->y_buffer + mb_y_offset,
+           f->y_stride,
+           predictor,
+           16,
+           strength,
+           filter_weight,
+           accumulator,
+           count);
+
+          TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
+          (f->u_buffer + mb_uv_offset,
+           f->uv_stride,
+           predictor + 256,
+           8,
+           strength,
+           filter_weight,
+           accumulator + 256,
+           count + 256);
+
+          TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)
+          (f->v_buffer + mb_uv_offset,
+           f->uv_stride,
+           predictor + 320,
+           8,
+           strength,
+           filter_weight,
+           accumulator + 320,
+           count + 320);
+        }
+      }
+
+      // Normalize filter output to produce AltRef frame
+      dst1 = cpi->alt_ref_buffer.y_buffer;
+      stride = cpi->alt_ref_buffer.y_stride;
+      byte = mb_y_offset;
+      for (i = 0, k = 0; i < 16; i++) {
+        for (j = 0; j < 16; j++, k++) {
+          unsigned int pval = accumulator[k] + (count[k] >> 1);
+          pval *= cpi->fixed_divide[count[k]];
+          pval >>= 19;
+
+          dst1[byte] = (unsigned char)pval;
+
+          // move to next pixel
+          byte++;
+        }
+
+        byte += stride - 16;
+      }
+
+      dst1 = cpi->alt_ref_buffer.u_buffer;
+      dst2 = cpi->alt_ref_buffer.v_buffer;
+      stride = cpi->alt_ref_buffer.uv_stride;
+      byte = mb_uv_offset;
+      for (i = 0, k = 256; i < 8; i++) {
+        for (j = 0; j < 8; j++, k++) {
+          int m = k + 64;
+
+          // U
+          unsigned int pval = accumulator[k] + (count[k] >> 1);
+          pval *= cpi->fixed_divide[count[k]];
+          pval >>= 19;
+          dst1[byte] = (unsigned char)pval;
+
+          // V
+          pval = accumulator[m] + (count[m] >> 1);
+          pval *= cpi->fixed_divide[count[m]];
+          pval >>= 19;
+          dst2[byte] = (unsigned char)pval;
+
+          // move to next pixel
+          byte++;
+        }
+
+        byte += stride - 8;
+      }
+
+      mb_y_offset += 16;
+      mb_uv_offset += 8;
+    }
+
+    mb_y_offset += 16 * (f->y_stride - mb_cols);
+    mb_uv_offset += 8 * (f->uv_stride - mb_cols);
+  }
+
+  // Restore input state
+  mbd->pre.y_buffer = y_buffer;
+  mbd->pre.u_buffer = u_buffer;
+  mbd->pre.v_buffer = v_buffer;
+}
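The normalization step divides each accumulator by its count without an integer division. Assuming cpi->fixed_divide[n] holds (1 << 19) / n (an assumption about a table initialized elsewhere in the encoder), (acc + count/2) * fixed_divide[count] >> 19 is a rounded acc / count:

/* Assuming fixed_divide[n] == (1 << 19) / n:
 *   count = 6, accumulator = 300
 *   fixed_divide[6]   = 524288 / 6 = 87381
 *   (300 + 3) * 87381 = 26476443
 *   26476443 >> 19    = 50          exact: 300 / 6 = 50
 * The + (count >> 1) term gives round-to-nearest behavior. */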
+
+void vp9_temporal_filter_prepare_c
+(
+  VP9_COMP *cpi,
+  int distance
+) {
+  int frame = 0;
+
+  int num_frames_backward = 0;
+  int num_frames_forward = 0;
+  int frames_to_blur_backward = 0;
+  int frames_to_blur_forward = 0;
+  int frames_to_blur = 0;
+  int start_frame = 0;
+
+  int strength = cpi->oxcf.arnr_strength;
+
+  int blur_type = cpi->oxcf.arnr_type;
+
+  int max_frames = cpi->active_arnr_frames;
+
+  num_frames_backward = distance;
+  num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
+                       - (num_frames_backward + 1);
+
+  switch (blur_type) {
+    case 1:
+      /////////////////////////////////////////
+      // Backward Blur
+
+      frames_to_blur_backward = num_frames_backward;
+
+      if (frames_to_blur_backward >= max_frames)
+        frames_to_blur_backward = max_frames - 1;
+
+      frames_to_blur = frames_to_blur_backward + 1;
+      break;
+
+    case 2:
+      /////////////////////////////////////////
+      // Forward Blur
+
+      frames_to_blur_forward = num_frames_forward;
+
+      if (frames_to_blur_forward >= max_frames)
+        frames_to_blur_forward = max_frames - 1;
+
+      frames_to_blur = frames_to_blur_forward + 1;
+      break;
+
+    case 3:
+    default:
+      /////////////////////////////////////////
+      // Center Blur
+      frames_to_blur_forward = num_frames_forward;
+      frames_to_blur_backward = num_frames_backward;
+
+      if (frames_to_blur_forward > frames_to_blur_backward)
+        frames_to_blur_forward = frames_to_blur_backward;
+
+      if (frames_to_blur_backward > frames_to_blur_forward)
+        frames_to_blur_backward = frames_to_blur_forward;
+
+      // When max_frames is even we have 1 more frame backward than forward
+      if (frames_to_blur_forward > (max_frames - 1) / 2)
+        frames_to_blur_forward = ((max_frames - 1) / 2);
+
+      if (frames_to_blur_backward > (max_frames / 2))
+        frames_to_blur_backward = (max_frames / 2);
+
+      frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
+      break;
+  }
+
+  start_frame = distance + frames_to_blur_forward;
+
+#ifdef DEBUGFWG
+  // DEBUG FWG
+  printf("max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d start:%d"
+, max_frames
+, num_frames_backward
+, num_frames_forward
+, frames_to_blur
+, frames_to_blur_backward
+, frames_to_blur_forward
+, cpi->source_encode_index
+, cpi->last_alt_ref_sei
+, start_frame);
+#endif
+
+  // Set up frame pointers; NULL indicates a frame not included in the filter
+  vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *));
+  for (frame = 0; frame < frames_to_blur; frame++) {
+    int which_buffer =  start_frame - frame;
+    struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
+                                                     which_buffer);
+    cpi->frames[frames_to_blur - 1 - frame] = &buf->img;
+  }
+
+  temporal_filter_iterate_c(
+    cpi,
+    frames_to_blur,
+    frames_to_blur_backward,
+    strength);
+}
+#endif
--- /dev/null
+++ b/vp9/encoder/temporal_filter.h
@@ -1,0 +1,47 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TEMPORAL_FILTER_H
+#define __INC_TEMPORAL_FILTER_H
+
+#define prototype_apply(sym)\
+  void (sym) \
+  ( \
+    unsigned char *frame1, \
+    unsigned int stride, \
+    unsigned char *frame2, \
+    unsigned int block_size, \
+    int strength, \
+    int filter_weight, \
+    unsigned int *accumulator, \
+    unsigned short *count \
+  )
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/temporal_filter_x86.h"
+#endif
+
+#ifndef vp9_temporal_filter_apply
+#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+#endif
+extern prototype_apply(vp9_temporal_filter_apply);
+
+typedef struct {
+  prototype_apply(*apply);
+} vp9_temporal_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define TEMPORAL_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define TEMPORAL_INVOKE(ctx,fn) vp9_temporal_filter_##fn
+#endif
+
+#endif // __INC_TEMPORAL_FILTER_H
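TEMPORAL_INVOKE follows the dispatch pattern used throughout this tree: with CONFIG_RUNTIME_CPU_DETECT the call goes through a vtable filled in at init time, otherwise it compiles to a direct call. Both expansions of the invocation used in temporal_filter.c:

/* With CONFIG_RUNTIME_CPU_DETECT:
 *   TEMPORAL_INVOKE(&cpi->rtcd.temporal, apply)(args...)
 *     -> (&cpi->rtcd.temporal)->apply(args...)   function-pointer call
 * Without it:
 *     -> vp9_temporal_filter_apply(args...)      direct call, which the
 *        #define above resolves to vp9_temporal_filter_apply_c (or an
 *        arch-specific override pulled in via temporal_filter_x86.h). */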
--- /dev/null
+++ b/vp9/encoder/tokenize.c
@@ -1,0 +1,868 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "onyx_int.h"
+#include "tokenize.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/pred_common.h"
+#include "vp9/common/seg_common.h"
+#include "vp9/common/entropy.h"
+
+/* Global event counters used for accumulating statistics across several
+   compressions, then generating context.c = initial stats. */
+
+#ifdef ENTROPY_STATS
+INT64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+INT64 hybrid_context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+INT64 context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+INT64 hybrid_context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+INT64 context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+INT64 hybrid_context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+
+extern unsigned int tree_update_hist[BLOCK_TYPES][COEF_BANDS]
+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES][2];
+extern unsigned int hybrid_tree_update_hist[BLOCK_TYPES][COEF_BANDS]
+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES][2];
+extern unsigned int tree_update_hist_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
+extern unsigned int hybrid_tree_update_hist_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
+extern unsigned int tree_update_hist_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
+extern unsigned int hybrid_tree_update_hist_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
+#endif  /* ENTROPY_STATS */
+
+void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run);
+void vp9_fix_contexts(MACROBLOCKD *xd);
+
+static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
+const TOKENVALUE *vp9_dct_value_tokens_ptr;
+static int dct_value_cost[DCT_MAX_VALUE * 2];
+const int *vp9_dct_value_cost_ptr;
+
+static void fill_value_tokens(void) {
+
+  TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
+  vp9_extra_bit_struct *const e = vp9_extra_bits;
+
+  int i = -DCT_MAX_VALUE;
+  int sign = 1;
+
+  do {
+    if (!i)
+      sign = 0;
+
+    {
+      const int a = sign ? -i : i;
+      int eb = sign;
+
+      if (a > 4) {
+        int j = 4;
+
+        while (++j < 11  &&  e[j].base_val <= a) {}
+
+        t[i].Token = --j;
+        eb |= (a - e[j].base_val) << 1;
+      } else
+        t[i].Token = a;
+
+      t[i].Extra = eb;
+    }
+
+    // Initialize the cost for extra bits for all possible coefficient values.
+    {
+      int cost = 0;
+      vp9_extra_bit_struct *p = vp9_extra_bits + t[i].Token;
+
+      if (p->base_val) {
+        const int extra = t[i].Extra;
+        const int Length = p->Len;
+
+        if (Length)
+          cost += treed_cost(p->tree, p->prob, extra >> 1, Length);
+
+        cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */
+        dct_value_cost[i + DCT_MAX_VALUE] = cost;
+      }
+
+    }
+
+  } while (++i < DCT_MAX_VALUE);
+
+  vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
+  vp9_dct_value_cost_ptr   = dct_value_cost + DCT_MAX_VALUE;
+}
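+
+/* Worked example (assuming the usual token category base values
+ * 5, 7, 11, 19, 35, 67): a coefficient of -10 has magnitude 10, which falls
+ * in category 2 (base_val 7), so Token = DCT_VAL_CATEGORY2 and
+ * Extra = ((10 - 7) << 1) | 1 = 7, the low bit carrying the sign. */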
+
+static void tokenize_b(VP9_COMP *cpi,
+                       MACROBLOCKD *xd,
+                       const BLOCKD * const b,
+                       TOKENEXTRA **tp,
+                       PLANE_TYPE type,
+                       ENTROPY_CONTEXT *a,
+                       ENTROPY_CONTEXT *l,
+                       TX_SIZE tx_size,
+                       int dry_run) {
+  int pt; /* near block/prev token context index */
+  int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0;
+  const int eob = b->eob;     /* one beyond last nonzero coeff */
+  TOKENEXTRA *t = *tp;        /* store tokens starting here */
+  const short *qcoeff_ptr = b->qcoeff;
+  int seg_eob;
+  int segment_id = xd->mode_info_context->mbmi.segment_id;
+  const int *bands, *scan;
+  unsigned int (*counts)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+  vp9_prob (*probs)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+                          get_tx_type(xd, b) : DCT_DCT;
+
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  switch (tx_size) {
+    default:
+    case TX_4X4:
+      seg_eob = 16;
+      bands = vp9_coef_bands;
+      scan = vp9_default_zig_zag1d;
+      if (tx_type != DCT_DCT) {
+        counts = cpi->hybrid_coef_counts;
+        probs = cpi->common.fc.hybrid_coef_probs;
+        if (tx_type == ADST_DCT) {
+          scan = vp9_row_scan;
+        } else if (tx_type == DCT_ADST) {
+          scan = vp9_col_scan;
+        }
+      } else {
+        counts = cpi->coef_counts;
+        probs = cpi->common.fc.coef_probs;
+      }
+      break;
+    case TX_8X8:
+      if (type == PLANE_TYPE_Y2) {
+        seg_eob = 4;
+        bands = vp9_coef_bands;
+        scan = vp9_default_zig_zag1d;
+      } else {
+        seg_eob = 64;
+        bands = vp9_coef_bands_8x8;
+        scan = vp9_default_zig_zag1d_8x8;
+      }
+      if (tx_type != DCT_DCT) {
+        counts = cpi->hybrid_coef_counts_8x8;
+        probs = cpi->common.fc.hybrid_coef_probs_8x8;
+      } else {
+        counts = cpi->coef_counts_8x8;
+        probs = cpi->common.fc.coef_probs_8x8;
+      }
+      break;
+    case TX_16X16:
+      seg_eob = 256;
+      bands = vp9_coef_bands_16x16;
+      scan = vp9_default_zig_zag1d_16x16;
+      if (tx_type != DCT_DCT) {
+        counts = cpi->hybrid_coef_counts_16x16;
+        probs = cpi->common.fc.hybrid_coef_probs_16x16;
+      } else {
+        counts = cpi->coef_counts_16x16;
+        probs = cpi->common.fc.coef_probs_16x16;
+      }
+      break;
+  }
+
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
+    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+
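+  /* SEG_LVL_EOB lets a segment cap how many coefficients may be coded;
+     the tokenization loop below never scans past that limit. */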
+  do {
+    const int band = bands[c];
+    int token;
+
+    if (c < eob) {
+      const int rc = scan[c];
+      const int v = qcoeff_ptr[rc];
+
+      assert(-DCT_MAX_VALUE <= v  &&  v < DCT_MAX_VALUE);
+
+      t->Extra = vp9_dct_value_tokens_ptr[v].Extra;
+      token    = vp9_dct_value_tokens_ptr[v].Token;
+    } else {
+      token = DCT_EOB_TOKEN;
+    }
+
+    t->Token = token;
+    t->context_tree = probs[type][band][pt];
+    t->skip_eob_node = (pt == 0) && ((band > 0 && type != PLANE_TYPE_Y_NO_DC) ||
+                                     (band > 1 && type == PLANE_TYPE_Y_NO_DC));
+    assert(vp9_coef_encodings[t->Token].Len - t->skip_eob_node > 0);
+    if (!dry_run) {
+      ++counts[type][band][pt][token];
+    }
+    pt = vp9_prev_token_class[token];
+    ++t;
+  } while (c < eob && ++c < seg_eob);
+
+  *tp = t;
+  *a = *l = (c != !type); /* 0 <-> all coeff data is zero */
+}
+
+int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block) {
+  int skip = 1;
+  int i = 0;
+
+  if (has_y2_block) {
+    for (i = 0; i < 16; i++)
+      skip &= (xd->block[i].eob < 2);
+    skip &= (!xd->block[24].eob);
+  } else {
+    for (i = 0; i < 16; i++)
+      skip &= (!xd->block[i].eob);
+  }
+  return skip;
+}
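+
+/* With a Y2 block the per-block DC coefficient is carried by block 24, so a
+ * luma block whose eob is below 2 contributes no transmitted coefficients
+ * and the macroblock may still be skippable. */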
+
+int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd) {
+  int skip = 1;
+  int i;
+
+  for (i = 16; i < 24; i++)
+    skip &= (!xd->block[i].eob);
+  return skip;
+}
+
+static int mb_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block) {
+  return (vp9_mby_is_skippable_4x4(xd, has_y2_block) &
+          vp9_mbuv_is_skippable_4x4(xd));
+}
+
+int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block) {
+  int skip = 1;
+  int i = 0;
+
+  if (has_y2_block) {
+    for (i = 0; i < 16; i += 4)
+      skip &= (xd->block[i].eob < 2);
+    skip &= (!xd->block[24].eob);
+  } else {
+    for (i = 0; i < 16; i += 4)
+      skip &= (!xd->block[i].eob);
+  }
+  return skip;
+}
+
+int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd) {
+  return (!xd->block[16].eob) & (!xd->block[20].eob);
+}
+
+static int mb_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block) {
+  return (vp9_mby_is_skippable_8x8(xd, has_y2_block) &
+          vp9_mbuv_is_skippable_8x8(xd));
+}
+
+static int mb_is_skippable_8x8_4x4uv(MACROBLOCKD *xd, int has_y2_block) {
+  return (vp9_mby_is_skippable_8x8(xd, has_y2_block) &
+          vp9_mbuv_is_skippable_4x4(xd));
+}
+
+int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd) {
+  int skip = 1;
+  skip &= !xd->block[0].eob;
+  return skip;
+}
+
+static int mb_is_skippable_16x16(MACROBLOCKD *xd) {
+  return (vp9_mby_is_skippable_16x16(xd) & vp9_mbuv_is_skippable_8x8(xd));
+}
+
+void vp9_tokenize_mb(VP9_COMP *cpi,
+                     MACROBLOCKD *xd,
+                     TOKENEXTRA **t,
+                     int dry_run) {
+  PLANE_TYPE plane_type;
+  int has_y2_block;
+  int b;
+  int tx_size = xd->mode_info_context->mbmi.txfm_size;
+  int mb_skip_context = vp9_get_pred_context(&cpi->common, xd, PRED_MBSKIP);
+  TOKENEXTRA *t_backup = *t;
+  ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *) xd->above_context;
+  ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *) xd->left_context;
+
+  // If the MB is going to be skipped because of a segment level flag,
+  // exclude this from the skip count stats used to calculate the
+  // transmitted skip probability.
+  int skip_inc;
+  int segment_id = xd->mode_info_context->mbmi.segment_id;
+
+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+      (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0)) {
+    skip_inc = 1;
+  } else
+    skip_inc = 0;
+
+  has_y2_block = (tx_size != TX_16X16
+                  && xd->mode_info_context->mbmi.mode != B_PRED
+                  && xd->mode_info_context->mbmi.mode != I8X8_PRED
+                  && xd->mode_info_context->mbmi.mode != SPLITMV);
+
+  switch (tx_size) {
+    case TX_16X16:
+      xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_16x16(xd);
+      break;
+    case TX_8X8:
+      if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
+          xd->mode_info_context->mbmi.mode == SPLITMV)
+        xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_8x8_4x4uv(xd, 0);
+      else
+        xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_8x8(xd, has_y2_block);
+      break;
+
+    default:
+      xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_4x4(xd, has_y2_block);
+      break;
+  }
+
+  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
+    if (!dry_run)
+      cpi->skip_true_count[mb_skip_context] += skip_inc;
+    if (!cpi->common.mb_no_coeff_skip) {
+      vp9_stuff_mb(cpi, xd, t, dry_run);
+    } else {
+      vp9_fix_contexts(xd);
+    }
+    if (dry_run)
+      *t = t_backup;
+    return;
+  }
+
+  if (!dry_run)
+    cpi->skip_false_count[mb_skip_context] += skip_inc;
+
+  if (has_y2_block) {
+    if (tx_size == TX_8X8) {
+      tokenize_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
+                 A + vp9_block2above_8x8[24], L + vp9_block2left_8x8[24],
+                 TX_8X8, dry_run);
+    } else {
+      tokenize_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
+                 A + vp9_block2above[24], L + vp9_block2left[24],
+                 TX_4X4, dry_run);
+    }
+
+    plane_type = PLANE_TYPE_Y_NO_DC;
+  } else
+    plane_type = PLANE_TYPE_Y_WITH_DC;
+
+  if (tx_size == TX_16X16) {
+    tokenize_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC,
+               A, L, TX_16X16, dry_run);
+    A[1] = A[2] = A[3] = A[0];
+    L[1] = L[2] = L[3] = L[0];
+
+    for (b = 16; b < 24; b += 4) {
+      tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+                 A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+                 TX_8X8, dry_run);
+      A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+      L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+    }
+    vpx_memset(&A[8], 0, sizeof(A[8]));
+    vpx_memset(&L[8], 0, sizeof(L[8]));
+  } else if (tx_size == TX_8X8) {
+    for (b = 0; b < 16; b += 4) {
+      tokenize_b(cpi, xd, xd->block + b, t, plane_type,
+                 A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+                 TX_8X8, dry_run);
+      A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+      L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+    }
+    if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
+        xd->mode_info_context->mbmi.mode == SPLITMV) {
+      for (b = 16; b < 24; b++) {
+        tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+                   A + vp9_block2above[b], L + vp9_block2left[b],
+                   TX_4X4, dry_run);
+      }
+    } else {
+      for (b = 16; b < 24; b += 4) {
+        tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+                   A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+                   TX_8X8, dry_run);
+        A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+        L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+      }
+    }
+  } else {
+    for (b = 0; b < 16; b++) {
+      tokenize_b(cpi, xd, xd->block + b, t, plane_type,
+                 A + vp9_block2above[b], L + vp9_block2left[b],
+                 TX_4X4, dry_run);
+    }
+
+    for (b = 16; b < 24; b++) {
+      tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+                 A + vp9_block2above[b], L + vp9_block2left[b],
+                 TX_4X4, dry_run);
+    }
+  }
+  if (dry_run)
+    *t = t_backup;
+}
+
+
+#ifdef ENTROPY_STATS
+void init_context_counters(void) {
+  FILE *f = fopen("context.bin", "rb");
+  if (!f) {
+    vpx_memset(context_counters, 0, sizeof(context_counters));
+    vpx_memset(context_counters_8x8, 0, sizeof(context_counters_8x8));
+    vpx_memset(context_counters_16x16, 0, sizeof(context_counters_16x16));
+  } else {
+    fread(context_counters, sizeof(context_counters), 1, f);
+    fread(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
+    fread(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
+    fclose(f);
+  }
+
+  f = fopen("treeupdate.bin", "rb");
+  if (!f) {
+    vpx_memset(tree_update_hist, 0, sizeof(tree_update_hist));
+    vpx_memset(tree_update_hist_8x8, 0, sizeof(tree_update_hist_8x8));
+    vpx_memset(tree_update_hist_16x16, 0, sizeof(tree_update_hist_16x16));
+  } else {
+    fread(tree_update_hist, sizeof(tree_update_hist), 1, f);
+    fread(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
+    fread(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
+    fclose(f);
+  }
+}
+
+void print_context_counters(void) {
+  int type, band, pt, t;
+  FILE *f = fopen("context.c", "w");
+
+  fprintf(f, "#include \"entropy.h\"\n");
+  fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
+  fprintf(f, "static const unsigned int\n"
+          "vp9_default_coef_counts[BLOCK_TYPES]\n"
+          "                      [COEF_BANDS]\n"
+          "                      [PREV_COEF_CONTEXTS]\n"
+          "                      [MAX_ENTROPY_TOKENS]={\n");
+
+#define Comma(X) ((X) ? "," : "")
+  type = 0;
+  do {
+    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+    band = 0;
+    do {
+      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+      pt = 0;
+      do {
+        fprintf(f, "%s\n      {", Comma(pt));
+
+        t = 0;
+        do {
+          const INT64 x = context_counters [type] [band] [pt] [t];
+          const int y = (int) x;
+          assert(x == (INT64) y);  /* no overflow handling yet */
+          fprintf(f, "%s %d", Comma(t), y);
+        } while (++t < MAX_ENTROPY_TOKENS);
+        fprintf(f, "}");
+      } while (++pt < PREV_COEF_CONTEXTS);
+      fprintf(f, "\n    }");
+    } while (++band < COEF_BANDS);
+    fprintf(f, "\n  }");
+  } while (++type < BLOCK_TYPES);
+  fprintf(f, "\n};\n");
+
+  fprintf(f, "static const unsigned int\nvp9_default_coef_counts_8x8"
+          "[BLOCK_TYPES_8X8] [COEF_BANDS]"
+          "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
+  type = 0;
+  do {
+    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+    band = 0;
+    do {
+      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+      pt = 0;
+      do {
+        fprintf(f, "%s\n      {", Comma(pt));
+        t = 0;
+        do {
+          const INT64 x = context_counters_8x8 [type] [band] [pt] [t];
+          const int y = (int) x;
+
+          assert(x == (INT64) y);  /* no overflow handling yet */
+          fprintf(f, "%s %d", Comma(t), y);
+
+        } while (++t < MAX_ENTROPY_TOKENS);
+
+        fprintf(f, "}");
+      } while (++pt < PREV_COEF_CONTEXTS);
+
+      fprintf(f, "\n    }");
+
+    } while (++band < COEF_BANDS);
+
+    fprintf(f, "\n  }");
+  } while (++type < BLOCK_TYPES_8X8);
+  fprintf(f, "\n};\n");
+
+  fprintf(f, "static const unsigned int\nvp9_default_coef_counts_16x16"
+          "[BLOCK_TYPES_16X16] [COEF_BANDS]"
+          "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
+  type = 0;
+  do {
+    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+    band = 0;
+    do {
+      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+      pt = 0;
+      do {
+        fprintf(f, "%s\n      {", Comma(pt));
+        t = 0;
+        do {
+          const INT64 x = context_counters_16x16 [type] [band] [pt] [t];
+          const int y = (int) x;
+
+          assert(x == (INT64) y);  /* no overflow handling yet */
+          fprintf(f, "%s %d", Comma(t), y);
+
+        } while (++t < MAX_ENTROPY_TOKENS);
+
+        fprintf(f, "}");
+      } while (++pt < PREV_COEF_CONTEXTS);
+
+      fprintf(f, "\n    }");
+
+    } while (++band < COEF_BANDS);
+
+    fprintf(f, "\n  }");
+  } while (++type < BLOCK_TYPES_16X16);
+  fprintf(f, "\n};\n");
+
+  fprintf(f, "static const vp9_prob\n"
+          "vp9_default_coef_probs[BLOCK_TYPES] [COEF_BANDS] \n"
+          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
+  type = 0;
+  do {
+    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+    band = 0;
+    do {
+      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+      pt = 0;
+      do {
+        unsigned int branch_ct [ENTROPY_NODES] [2];
+        unsigned int coef_counts[MAX_ENTROPY_TOKENS];
+        vp9_prob coef_probs[ENTROPY_NODES];
+        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+          coef_counts[t] = context_counters [type] [band] [pt] [t];
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, coef_counts, 256, 1);
+        fprintf(f, "%s\n      {", Comma(pt));
+
+        t = 0;
+        do {
+          fprintf(f, "%s %d", Comma(t), coef_probs[t]);
+
+        } while (++t < ENTROPY_NODES);
+
+        fprintf(f, "}");
+      } while (++pt < PREV_COEF_CONTEXTS);
+      fprintf(f, "\n    }");
+    } while (++band < COEF_BANDS);
+    fprintf(f, "\n  }");
+  } while (++type < BLOCK_TYPES);
+  fprintf(f, "\n};\n");
+
+  fprintf(f, "static const vp9_prob\n"
+          "vp9_default_coef_probs_8x8[BLOCK_TYPES_8X8] [COEF_BANDS]\n"
+          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
+  type = 0;
+  do {
+    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+    band = 0;
+    do {
+      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+      pt = 0;
+      do {
+        unsigned int branch_ct [ENTROPY_NODES] [2];
+        unsigned int coef_counts[MAX_ENTROPY_TOKENS];
+        vp9_prob coef_probs[ENTROPY_NODES];
+        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+          coef_counts[t] = context_counters_8x8[type] [band] [pt] [t];
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, coef_counts, 256, 1);
+        fprintf(f, "%s\n      {", Comma(pt));
+
+        t = 0;
+        do {
+          fprintf(f, "%s %d", Comma(t), coef_probs[t]);
+        } while (++t < ENTROPY_NODES);
+        fprintf(f, "}");
+      } while (++pt < PREV_COEF_CONTEXTS);
+      fprintf(f, "\n    }");
+    } while (++band < COEF_BANDS);
+    fprintf(f, "\n  }");
+  } while (++type < BLOCK_TYPES_8X8);
+  fprintf(f, "\n};\n");
+
+  fprintf(f, "static const vp9_prob\n"
+          "vp9_default_coef_probs_16x16[BLOCK_TYPES_16X16] [COEF_BANDS]\n"
+          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
+  type = 0;
+  do {
+    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+    band = 0;
+    do {
+      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+      pt = 0;
+      do {
+        unsigned int branch_ct [ENTROPY_NODES] [2];
+        unsigned int coef_counts[MAX_ENTROPY_TOKENS];
+        vp9_prob coef_probs[ENTROPY_NODES];
+        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+          coef_counts[t] = context_counters_16x16[type] [band] [pt] [t];
+        vp9_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
+          coef_probs, branch_ct, coef_counts, 256, 1);
+        fprintf(f, "%s\n      {", Comma(pt));
+
+        t = 0;
+        do {
+          fprintf(f, "%s %d", Comma(t), coef_probs[t]);
+        } while (++t < ENTROPY_NODES);
+        fprintf(f, "}");
+      } while (++pt < PREV_COEF_CONTEXTS);
+      fprintf(f, "\n    }");
+    } while (++band < COEF_BANDS);
+    fprintf(f, "\n  }");
+  } while (++type < BLOCK_TYPES_16X16);
+  fprintf(f, "\n};\n");
+
+  fclose(f);
+
+  f = fopen("context.bin", "wb");
+  fwrite(context_counters, sizeof(context_counters), 1, f);
+  fwrite(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
+  fwrite(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
+  fclose(f);
+}
+#endif
+
+void vp9_tokenize_initialize(void) {
+  fill_value_tokens();
+}
+
+static __inline void stuff_b(VP9_COMP *cpi,
+                             MACROBLOCKD *xd,
+                             const BLOCKD * const b,
+                             TOKENEXTRA **tp,
+                             PLANE_TYPE type,
+                             ENTROPY_CONTEXT *a,
+                             ENTROPY_CONTEXT *l,
+                             TX_SIZE tx_size,
+                             int dry_run) {
+  const int *bands;
+  unsigned int (*counts)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+  vp9_prob (*probs)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+  int pt, band;
+  TOKENEXTRA *t = *tp;
+  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+                          get_tx_type(xd, b) : DCT_DCT;
+  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+  switch (tx_size) {
+    default:
+    case TX_4X4:
+      bands = vp9_coef_bands;
+      if (tx_type != DCT_DCT) {
+        counts = cpi->hybrid_coef_counts;
+        probs = cpi->common.fc.hybrid_coef_probs;
+      } else {
+        counts = cpi->coef_counts;
+        probs = cpi->common.fc.coef_probs;
+      }
+      break;
+    case TX_8X8:
+      bands = vp9_coef_bands_8x8;
+      if (tx_type != DCT_DCT) {
+        counts = cpi->hybrid_coef_counts_8x8;
+        probs = cpi->common.fc.hybrid_coef_probs_8x8;
+      } else {
+        counts = cpi->coef_counts_8x8;
+        probs = cpi->common.fc.coef_probs_8x8;
+      }
+      break;
+    case TX_16X16:
+      bands = vp9_coef_bands_16x16;
+      if (tx_type != DCT_DCT) {
+        counts = cpi->hybrid_coef_counts_16x16;
+        probs = cpi->common.fc.hybrid_coef_probs_16x16;
+      } else {
+        counts = cpi->coef_counts_16x16;
+        probs = cpi->common.fc.coef_probs_16x16;
+      }
+      break;
+  }
+  band = bands[(type == PLANE_TYPE_Y_NO_DC) ? 1 : 0];
+  t->Token = DCT_EOB_TOKEN;
+  t->context_tree = probs[type][band][pt];
+  t->skip_eob_node = 0;
+  ++t;
+  *tp = t;
+  *a = *l = 0;
+  if (!dry_run) {
+    ++counts[type][band][pt][DCT_EOB_TOKEN];
+  }
+}
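+
+/* Stuffing emits exactly one DCT_EOB_TOKEN per block so the entropy contexts
+ * (and, outside dry runs, the token counts) stay consistent even though the
+ * macroblock codes no residual. */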
+
+static void stuff_mb_8x8(VP9_COMP *cpi, MACROBLOCKD *xd,
+                         TOKENEXTRA **t, int dry_run) {
+  ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
+  ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
+  PLANE_TYPE plane_type;
+  int b;
+  const int has_y2_block = (xd->mode_info_context->mbmi.mode != B_PRED &&
+                            xd->mode_info_context->mbmi.mode != I8X8_PRED &&
+                            xd->mode_info_context->mbmi.mode != SPLITMV);
+
+  if (has_y2_block) {
+    stuff_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
+            A + vp9_block2above_8x8[24], L + vp9_block2left_8x8[24],
+            TX_8X8, dry_run);
+    plane_type = PLANE_TYPE_Y_NO_DC;
+  } else {
+    plane_type = PLANE_TYPE_Y_WITH_DC;
+  }
+
+  for (b = 0; b < 16; b += 4) {
+    stuff_b(cpi, xd, xd->block + b, t, plane_type, A + vp9_block2above_8x8[b],
+            L + vp9_block2left_8x8[b], TX_8X8, dry_run);
+    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+  }
+
+  for (b = 16; b < 24; b += 4) {
+    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+            A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+            TX_8X8, dry_run);
+    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+  }
+}
+
+static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd,
+                           TOKENEXTRA **t, int dry_run) {
+  ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)xd->above_context;
+  ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)xd->left_context;
+  int b;
+
+  stuff_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC, A, L, TX_16X16, dry_run);
+  A[1] = A[2] = A[3] = A[0];
+  L[1] = L[2] = L[3] = L[0];
+  for (b = 16; b < 24; b += 4) {
+    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
+            A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+            TX_8X8, dry_run);
+    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+  }
+  vpx_memset(&A[8], 0, sizeof(A[8]));
+  vpx_memset(&L[8], 0, sizeof(L[8]));
+}
+
+static void stuff_mb_4x4(VP9_COMP *cpi, MACROBLOCKD *xd,
+                         TOKENEXTRA **t, int dry_run) {
+  ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
+  ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
+  int b;
+  PLANE_TYPE plane_type;
+  const int has_y2_block = (xd->mode_info_context->mbmi.mode != B_PRED &&
+                            xd->mode_info_context->mbmi.mode != I8X8_PRED &&
+                            xd->mode_info_context->mbmi.mode != SPLITMV);
+
+  if (has_y2_block) {
+    stuff_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2, A + vp9_block2above[24],
+            L + vp9_block2left[24], TX_4X4, dry_run);
+    plane_type = PLANE_TYPE_Y_NO_DC;
+  } else {
+    plane_type = PLANE_TYPE_Y_WITH_DC;
+  }
+
+  for (b = 0; b < 16; b++)
+    stuff_b(cpi, xd, xd->block + b, t, plane_type, A + vp9_block2above[b],
+            L + vp9_block2left[b], TX_4X4, dry_run);
+
+  for (b = 16; b < 24; b++)
+    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
+            L + vp9_block2left[b], TX_4X4, dry_run);
+}
+
+static void stuff_mb_8x8_4x4uv(VP9_COMP *cpi, MACROBLOCKD *xd,
+                               TOKENEXTRA **t, int dry_run) {
+  ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
+  ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
+  int b;
+
+  for (b = 0; b < 16; b += 4) {
+    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_Y_WITH_DC,
+            A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
+            TX_8X8, dry_run);
+    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
+    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+  }
+
+  for (b = 16; b < 24; b++)
+    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
+            L + vp9_block2left[b], TX_4X4, dry_run);
+}
+
+void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
+  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+  TOKENEXTRA * const t_backup = *t;
+
+  if (tx_size == TX_16X16) {
+    stuff_mb_16x16(cpi, xd, t, dry_run);
+  } else if (tx_size == TX_8X8) {
+    if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
+        xd->mode_info_context->mbmi.mode == SPLITMV) {
+      stuff_mb_8x8_4x4uv(cpi, xd, t, dry_run);
+    } else {
+      stuff_mb_8x8(cpi, xd, t, dry_run);
+    }
+  } else {
+    stuff_mb_4x4(cpi, xd, t, dry_run);
+  }
+
+  if (dry_run) {
+    *t = t_backup;
+  }
+}
+
+void vp9_fix_contexts(MACROBLOCKD *xd) {
+  /* Clear the coefficient entropy contexts. When the mode has no Y2 block
+     and the transform is not 16x16, the final (Y2) entry is left intact. */
+  if ((xd->mode_info_context->mbmi.mode != B_PRED
+      && xd->mode_info_context->mbmi.mode != I8X8_PRED
+      && xd->mode_info_context->mbmi.mode != SPLITMV)
+      || xd->mode_info_context->mbmi.txfm_size == TX_16X16
+      ) {
+    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+  } else {
+    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
+    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
+  }
+}
--- /dev/null
+++ b/vp9/encoder/tokenize.h
@@ -1,0 +1,59 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef tokenize_h
+#define tokenize_h
+
+#include "vp9/common/entropy.h"
+#include "block.h"
+
+void vp9_tokenize_initialize(void);
+
+typedef struct {
+  short Token;
+  short Extra;
+} TOKENVALUE;
+
+typedef struct {
+  const vp9_prob *context_tree;
+  short           Extra;
+  unsigned char   Token;
+  unsigned char   skip_eob_node;
+} TOKENEXTRA;
+
+int rd_cost_mby(MACROBLOCKD *);
+
+extern int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block);
+extern int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd);
+extern int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block);
+extern int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd);
+extern int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);
+
+#ifdef ENTROPY_STATS
+void init_context_counters(void);
+void print_context_counters(void);
+
+extern INT64 context_counters[BLOCK_TYPES][COEF_BANDS]
+                             [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+extern INT64 context_counters_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
+                                 [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+extern INT64 context_counters_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
+                                   [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+#endif
+
+extern const int *vp9_dct_value_cost_ptr;
+/* TODO: The Token field should be broken out into a separate char array to
+ *  improve cache locality, since it's needed for costing when the rest of the
+ *  fields are not.
+ */
+extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
+
+#endif  /* tokenize_h */
--- /dev/null
+++ b/vp9/encoder/treewriter.c
@@ -1,0 +1,39 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "treewriter.h"
+
+static void cost(
+  int *const C,
+  vp9_tree T,
+  const vp9_prob *const P,
+  int i,
+  int c
+) {
+  const vp9_prob p = P [i >> 1];
+
+  do {
+    const vp9_tree_index j = T[i];
+    const int d = c + vp9_cost_bit(p, i & 1);
+
+    if (j <= 0)
+      C[-j] = d;
+    else
+      cost(C, T, P, j, d);
+  } while (++i & 1);
+}
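+
+/* cost() recursively walks every path from node i, accumulating per-bit
+ * costs; each leaf (j <= 0) stores the total for token -j, so a single call
+ * fills the whole cost array. Starting at index 2, as vp9_cost_tokens_skip()
+ * does, prices tokens as if the first tree bit were already decided,
+ * presumably matching the skip_eob_node case in tokenize.c. */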
+void vp9_cost_tokens(int *c, const vp9_prob *p, vp9_tree t) {
+  cost(c, t, p, 0, 0);
+}
+
+void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t) {
+  cost(c, t, p, 2, 0);
+}
--- /dev/null
+++ b/vp9/encoder/treewriter.h
@@ -1,0 +1,108 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TREEWRITER_H
+#define __INC_TREEWRITER_H
+
+/* Trees map alphabets into Huffman-like codes suitable for an arithmetic
+   bit coder.  Timothy S Murphy  11 October 2004 */
+
+#include "vp9/common/treecoder.h"
+
+#include "boolhuff.h"       /* for now */
+
+typedef BOOL_CODER vp9_writer;
+
+#define vp9_write encode_bool
+#define vp9_write_literal vp9_encode_value
+#define vp9_write_bit(W, V) vp9_write(W, V, vp9_prob_half)
+
+/* Approximate length of an encoded bool in 256ths of a bit at given prob */
+
+#define vp9_cost_zero(x) (vp9_prob_cost[x])
+#define vp9_cost_one(x) vp9_cost_zero(vp9_complement(x))
+
+#define vp9_cost_bit(x, b) vp9_cost_zero((b) ? vp9_complement(x) : (x))
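+
+/* For example, assuming vp9_prob_cost[] is the usual -256*log2(p/256) table,
+ * a coin-flip bit costs vp9_cost_zero(128) == 256, i.e. exactly one bit at
+ * this 1/256th-of-a-bit scale. */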
+
+/* VP8BC version is scaled by 2^20 rather than 2^8; see bool_coder.h */
+
+
+/* cost_branch() returns whole bits; cost_branch256() keeps the
+   1/256th-of-a-bit scale. */
+
+static __inline unsigned int cost_branch(const unsigned int ct[2],
+                                         vp9_prob p) {
+  /* Imitate existing calculation */
+  return ((ct[0] * vp9_cost_zero(p))
+          + (ct[1] * vp9_cost_one(p))) >> 8;
+}
+
+static __inline unsigned int cost_branch256(const unsigned int ct[2],
+                                            vp9_prob p) {
+  /* Imitate existing calculation */
+  return ((ct[0] * vp9_cost_zero(p))
+          + (ct[1] * vp9_cost_one(p)));
+}
+
+/* Small functions to write explicit values and tokens, as well as
+   estimate their lengths. */
+
+static __inline void treed_write(vp9_writer *const w,
+                                 vp9_tree t,
+                                 const vp9_prob *const p,
+                                 int v,
+                                 /* number of bits in v, assumed nonzero */
+                                 int n) {
+  vp9_tree_index i = 0;
+
+  do {
+    const int b = (v >> --n) & 1;
+    vp9_write(w, b, p[i >> 1]);
+    i = t[i + b];
+  } while (n);
+}
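+
+/* Traversal sketch: encoding a token with { value = 2, Len = 2 } writes bit 1
+ * with probability p[0], steps to i = t[1], then writes bit 0 with p[i >> 1].
+ * treed_cost() below walks the identical path, summing vp9_cost_bit(). */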
+
+static __inline void write_token(vp9_writer *const w,
+                                 vp9_tree t,
+                                 const vp9_prob *const p,
+                                 vp9_token *const x) {
+  treed_write(w, t, p, x->value, x->Len);
+}
+
+static __inline int treed_cost(vp9_tree t,
+                               const vp9_prob *const p,
+                               int v,
+                               /* number of bits in v, assumed nonzero */
+                               int n) {
+  int c = 0;
+  vp9_tree_index i = 0;
+
+  do {
+    const int b = (v >> --n) & 1;
+    c += vp9_cost_bit(p[i >> 1], b);
+    i = t[i + b];
+  } while (n);
+
+  return c;
+}
+
+static __inline int cost_token(vp9_tree t,
+                               const vp9_prob *const p,
+                               vp9_token *const x) {
+  return treed_cost(t, p, x->value, x->Len);
+}
+
+/* Fill array of costs for all possible token values. */
+
+void vp9_cost_tokens(int *Costs, const vp9_prob *, vp9_tree);
+
+void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t);
+
+#endif
--- /dev/null
+++ b/vp9/encoder/variance.h
@@ -1,0 +1,84 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VARIANCE_H
+#define VARIANCE_H
+
+typedef unsigned int (*vp9_sad_fn_t)(const unsigned char *src_ptr,
+                                    int source_stride,
+                                    const unsigned char *ref_ptr,
+                                    int ref_stride,
+                                    unsigned int max_sad);
+
+typedef void (*vp9_copy32xn_fn_t)(const unsigned char *src_ptr,
+                                  int source_stride,
+                                  const unsigned char *ref_ptr,
+                                  int ref_stride,
+                                  int n);
+
+typedef void (*vp9_sad_multi_fn_t)(const unsigned char *src_ptr,
+                                   int source_stride,
+                                   const unsigned char *ref_ptr,
+                                   int  ref_stride,
+                                   unsigned int *sad_array);
+
+typedef void (*vp9_sad_multi1_fn_t)(const unsigned char *src_ptr,
+                                    int source_stride,
+                                    const unsigned char *ref_ptr,
+                                    int  ref_stride,
+                                    unsigned short *sad_array);
+
+typedef void (*vp9_sad_multi_d_fn_t)(const unsigned char *src_ptr,
+                                     int source_stride,
+                                     const unsigned char * const ref_ptr[],
+                                     int  ref_stride, unsigned int *sad_array);
+
+typedef unsigned int (*vp9_variance_fn_t)(const unsigned char *src_ptr,
+                                          int source_stride,
+                                          const unsigned char *ref_ptr,
+                                          int ref_stride,
+                                          unsigned int *sse);
+
+typedef unsigned int (*vp9_subpixvariance_fn_t)(const unsigned char *src_ptr,
+                                                int source_stride,
+                                                int xoffset,
+                                                int yoffset,
+                                                const unsigned char *ref_ptr,
+                                                int ref_stride,
+                                                unsigned int *sse);
+
+typedef void (*vp9_ssimpf_fn_t)(unsigned char *s, int sp, unsigned char *r,
+                                int rp, unsigned long *sum_s,
+                                unsigned long *sum_r, unsigned long *sum_sq_s,
+                                unsigned long *sum_sq_r,
+                                unsigned long *sum_sxr);
+
+typedef unsigned int (*vp9_getmbss_fn_t)(const short *);
+
+typedef unsigned int (*vp9_get16x16prederror_fn_t)(const unsigned char *src_ptr,
+                                                   int source_stride,
+                                                   const unsigned char *ref_ptr,
+                                                   int  ref_stride);
+
+typedef struct variance_vtable {
+  vp9_sad_fn_t            sdf;
+  vp9_variance_fn_t       vf;
+  vp9_subpixvariance_fn_t svf;
+  vp9_variance_fn_t       svf_halfpix_h;
+  vp9_variance_fn_t       svf_halfpix_v;
+  vp9_variance_fn_t       svf_halfpix_hv;
+  vp9_sad_multi_fn_t      sdx3f;
+  vp9_sad_multi1_fn_t     sdx8f;
+  vp9_sad_multi_d_fn_t    sdx4df;
+  vp9_copy32xn_fn_t       copymem;
+} vp9_variance_fn_ptr_t;
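+
+/* Illustrative wiring (a sketch; the encoder fills one table per block size
+ * at init, and the SAD function name below is an assumption):
+ *
+ *   vp9_variance_fn_ptr_t fn;
+ *   fn.sdf = vp9_sad16x16_c;       // best-match search metric
+ *   fn.vf  = vp9_variance16x16_c;  // true variance for RD decisions
+ *   unsigned int sse;
+ *   unsigned int v = fn.vf(src, src_stride, ref, ref_stride, &sse);
+ */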
+
+#endif
--- /dev/null
+++ b/vp9/encoder/variance_c.c
@@ -1,0 +1,540 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "vp9/common/filter.h"
+
+
+unsigned int vp9_get_mb_ss_c(const short *src_ptr) {
+  unsigned int i, sum = 0;
+
+  for (i = 0; i < 256; i++) {
+    sum += (src_ptr[i] * src_ptr[i]);
+  }
+
+  return sum;
+}
+
+
+static void variance(const unsigned char *src_ptr,
+                     int  source_stride,
+                     const unsigned char *ref_ptr,
+                     int  recon_stride,
+                     int  w,
+                     int  h,
+                     unsigned int *sse,
+                     int *sum) {
+  int i, j;
+  int diff;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      diff = src_ptr[j] - ref_ptr[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    src_ptr += source_stride;
+    ref_ptr += recon_stride;
+  }
+}
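+
+/* The vp9_variance*_c() wrappers below apply the textbook identity
+ *   variance = sse - sum * sum / N
+ * where N is the pixel count of the block, so the shift is log2(N)
+ * (e.g. >> 8 for 16x16, N = 256). */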
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_variance32x32_c(const unsigned char *src_ptr,
+                                 int  source_stride,
+                                 const unsigned char *ref_ptr,
+                                 int  recon_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 10));
+}
+#endif
+
+unsigned int vp9_variance16x16_c(const unsigned char *src_ptr,
+                                 int  source_stride,
+                                 const unsigned char *ref_ptr,
+                                 int  recon_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 8));
+}
+
+unsigned int vp9_variance8x16_c(const unsigned char *src_ptr,
+                                int  source_stride,
+                                const unsigned char *ref_ptr,
+                                int  recon_stride,
+                                unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 7));
+}
+
+unsigned int vp9_variance16x8_c(const unsigned char *src_ptr,
+                                int  source_stride,
+                                const unsigned char *ref_ptr,
+                                int  recon_stride,
+                                unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 7));
+}
+
+
+unsigned int vp9_variance8x8_c(const unsigned char *src_ptr,
+                               int  source_stride,
+                               const unsigned char *ref_ptr,
+                               int  recon_stride,
+                               unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 6));
+}
+
+unsigned int vp9_variance4x4_c(const unsigned char *src_ptr,
+                               int  source_stride,
+                               const unsigned char *ref_ptr,
+                               int  recon_stride,
+                               unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 4));
+}
+
+
+unsigned int vp9_mse16x16_c(const unsigned char *src_ptr,
+                            int  source_stride,
+                            const unsigned char *ref_ptr,
+                            int  recon_stride,
+                            unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
+  *sse = var;
+  return var;
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : var_filter_block2d_bil_first_pass
+ *
+ *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
+ *                  UINT32 src_pixels_per_line : Stride of input block.
+ *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
+ *                  UINT32 output_height     : Input block height.
+ *                  UINT32 output_width      : Input block width.
+ *                  INT16  *vp9_filter       : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
+ *                  either horizontal or vertical direction to produce the
+ *                  filtered output block. Used to implement first-pass
+ *                  of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Produces UINT16 output to retain precision for the next pass.
+ *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
+ *                  pixel_step defines whether the filter is applied
+ *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ *                  It defines the offset required to move from one input
+ *                  to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_first_pass(const unsigned char *src_ptr,
+                                              unsigned short *output_ptr,
+                                              unsigned int src_pixels_per_line,
+                                              int pixel_step,
+                                              unsigned int output_height,
+                                              unsigned int output_width,
+                                              const short *vp9_filter) {
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      // Apply bilinear filter
+      output_ptr[j] = (((int)src_ptr[0]          * vp9_filter[0]) +
+                       ((int)src_ptr[pixel_step] * vp9_filter[1]) +
+                       (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
+      src_ptr++;
+    }
+
+    // Next row...
+    src_ptr    += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
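+
+/* Worked half-pel example (taking VP9_FILTER_WEIGHT == 128,
+ * VP9_FILTER_SHIFT == 7 and half-pel taps { 64, 64 } as assumptions drawn
+ * from vp9/common/filter.h): neighbours 100 and 104 filter to
+ * (100 * 64 + 104 * 64 + 64) >> 7 = 102. */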
+
+/****************************************************************************
+ *
+ *  ROUTINE       : var_filter_block2d_bil_second_pass
+ *
+ *  INPUTS        : UINT16 *src_ptr          : Pointer to source block.
+ *                  UINT32 src_pixels_per_line : Stride of input block.
+ *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
+ *                  UINT32 output_height     : Input block height.
+ *                  UINT32 output_width      : Input block width.
+ *                  INT16  *vp9_filter       : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT8  *output_ptr       : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
+ *                  either horizontal or vertical direction to produce the
+ *                  filtered output block. Used to implement second-pass
+ *                  of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Requires 16-bit input as produced by var_filter_block2d_bil_first_pass.
+ *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
+ *                  pixel_step defines whether the filter is applied
+ *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ *                  It defines the offset required to move from one input
+ *                  to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_second_pass(const unsigned short *src_ptr,
+                                               unsigned char *output_ptr,
+                                               unsigned int src_pixels_per_line,
+                                               unsigned int pixel_step,
+                                               unsigned int output_height,
+                                               unsigned int output_width,
+                                               const short *vp9_filter) {
+  unsigned int  i, j;
+  int  Temp;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      // Apply filter
+      Temp = ((int)src_ptr[0]          * vp9_filter[0]) +
+             ((int)src_ptr[pixel_step] * vp9_filter[1]) +
+             (VP9_FILTER_WEIGHT / 2);
+      output_ptr[j] = (unsigned char)(Temp >> VP9_FILTER_SHIFT);
+      src_ptr++;
+    }
+
+    // Next row...
+    src_ptr    += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
+
+unsigned int vp9_sub_pixel_variance4x4_c(const unsigned char  *src_ptr,
+                                         int  src_pixels_per_line,
+                                         int  xoffset,
+                                         int  yoffset,
+                                         const unsigned char *dst_ptr,
+                                         int dst_pixels_per_line,
+                                         unsigned int *sse) {
+  unsigned char  temp2[20 * 16];
+  const short *HFilter, *VFilter;
+  unsigned short FData3[5 * 4]; // Temp data buffer used in filtering
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  // First filter 1-D horizontally
+  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
+
+  // Now filter vertically
+  var_filter_block2d_bil_second_pass(FData3, temp2, 4,  4,  4,  4, VFilter);
+
+  return vp9_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
+}
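+
+/* The first pass produces one extra row (5 for a 4-tall block) because the
+ * vertical second pass reads src_ptr[pixel_step], one full row below each
+ * output pixel. */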
+
+
+unsigned int vp9_sub_pixel_variance8x8_c(const unsigned char  *src_ptr,
+                                         int  src_pixels_per_line,
+                                         int  xoffset,
+                                         int  yoffset,
+                                         const unsigned char *dst_ptr,
+                                         int dst_pixels_per_line,
+                                         unsigned int *sse) {
+  unsigned short FData3[9 * 8]; // Temp data buffer used in filtering
+  unsigned char  temp2[20 * 16];
+  const short *HFilter, *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
+
+  return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance16x16_c(const unsigned char  *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           const unsigned char *dst_ptr,
+                                           int dst_pixels_per_line,
+                                           unsigned int *sse) {
+  unsigned short FData3[17 * 16]; // Temp data buffer used in filtering
+  unsigned char  temp2[20 * 16];
+  const short *HFilter, *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
+
+  return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_sub_pixel_variance32x32_c(const unsigned char  *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           const unsigned char *dst_ptr,
+                                           int dst_pixels_per_line,
+                                           unsigned int *sse) {
+  unsigned short FData3[33 * 32]; // Temp data buffer used in filtering
+  unsigned char  temp2[36 * 32];
+  const short *HFilter, *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);
+
+  return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+#endif
+
+unsigned int vp9_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr,
+                                              int  source_stride,
+                                              const unsigned char *ref_ptr,
+                                              int  recon_stride,
+                                              unsigned int *sse) {
+  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 0,
+                                       ref_ptr, recon_stride, sse);
+}
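+
+/* Offset 8, assuming the 16-step subpel grid used by vp9_bilinear_filters,
+ * is the half-pixel position, so these halfpix wrappers simply reuse the
+ * generic subpel path with fixed offsets. */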
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_variance_halfpixvar32x32_h_c(const unsigned char *src_ptr,
+                                              int  source_stride,
+                                              const unsigned char *ref_ptr,
+                                              int  recon_stride,
+                                              unsigned int *sse) {
+  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0,
+                                       ref_ptr, recon_stride, sse);
+}
+#endif
+
+
+unsigned int vp9_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr,
+                                              int  source_stride,
+                                              const unsigned char *ref_ptr,
+                                              int  recon_stride,
+                                              unsigned int *sse) {
+  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8,
+                                       ref_ptr, recon_stride, sse);
+}
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_variance_halfpixvar32x32_v_c(const unsigned char *src_ptr,
+                                              int  source_stride,
+                                              const unsigned char *ref_ptr,
+                                              int  recon_stride,
+                                              unsigned int *sse) {
+  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8,
+                                       ref_ptr, recon_stride, sse);
+}
+#endif
+
+unsigned int vp9_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr,
+                                               int  source_stride,
+                                               const unsigned char *ref_ptr,
+                                               int  recon_stride,
+                                               unsigned int *sse) {
+  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 8,
+                                       ref_ptr, recon_stride, sse);
+}
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_variance_halfpixvar32x32_hv_c(const unsigned char *src_ptr,
+                                               int  source_stride,
+                                               const unsigned char *ref_ptr,
+                                               int  recon_stride,
+                                               unsigned int *sse) {
+  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8,
+                                       ref_ptr, recon_stride, sse);
+}
+#endif
+
+unsigned int vp9_sub_pixel_mse16x16_c(const unsigned char  *src_ptr,
+                                      int  src_pixels_per_line,
+                                      int  xoffset,
+                                      int  yoffset,
+                                      const unsigned char *dst_ptr,
+                                      int dst_pixels_per_line,
+                                      unsigned int *sse) {
+  vp9_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line,
+                                xoffset, yoffset, dst_ptr,
+                                dst_pixels_per_line, sse);
+  return *sse;
+}
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp9_sub_pixel_mse32x32_c(const unsigned char  *src_ptr,
+                                      int  src_pixels_per_line,
+                                      int  xoffset,
+                                      int  yoffset,
+                                      const unsigned char *dst_ptr,
+                                      int dst_pixels_per_line,
+                                      unsigned int *sse) {
+  vp9_sub_pixel_variance32x32_c(src_ptr, src_pixels_per_line,
+                                xoffset, yoffset, dst_ptr,
+                                dst_pixels_per_line, sse);
+  return *sse;
+}
+#endif
+
+unsigned int vp9_sub_pixel_variance16x8_c(const unsigned char  *src_ptr,
+                                          int  src_pixels_per_line,
+                                          int  xoffset,
+                                          int  yoffset,
+                                          const unsigned char *dst_ptr,
+                                          int dst_pixels_per_line,
+                                          unsigned int *sse) {
+  unsigned short FData3[16 * 9];  // Temp data buffer used in filtering
+  unsigned char  temp2[20 * 16];
+  const short *HFilter, *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
+
+  return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance8x16_c(const unsigned char  *src_ptr,
+                                          int  src_pixels_per_line,
+                                          int  xoffset,
+                                          int  yoffset,
+                                          const unsigned char *dst_ptr,
+                                          int dst_pixels_per_line,
+                                          unsigned int *sse) {
+  unsigned short FData3[9 * 16];  // Temp data buffer used in filtering
+  unsigned char  temp2[20 * 16];
+  const short *HFilter, *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
+                                    1, 17, 8, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
+
+  return vp9_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+#if CONFIG_NEWBESTREFMV
+unsigned int vp9_variance2x16_c(const unsigned char *src_ptr,
+                                const int  source_stride,
+                                const unsigned char *ref_ptr,
+                                const int  recon_stride,
+                                unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 2, 16, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 5));
+}
+
+unsigned int vp9_variance16x2_c(const unsigned char *src_ptr,
+                                const int  source_stride,
+                                const unsigned char *ref_ptr,
+                                const int  recon_stride,
+                                unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 2, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 5));
+}
+
+unsigned int vp9_sub_pixel_variance16x2_c(const unsigned char  *src_ptr,
+                                          const int  src_pixels_per_line,
+                                          const int  xoffset,
+                                          const int  yoffset,
+                                          const unsigned char *dst_ptr,
+                                          const int dst_pixels_per_line,
+                                          unsigned int *sse) {
+  unsigned short FData3[16 * 3];  // Temp data buffer used in filtering
+  unsigned char  temp2[20 * 16];
+  const short *HFilter, *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  var_filter_block2d_bil_first_pass(src_ptr, FData3,
+                                    src_pixels_per_line, 1, 3, 16, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 2, 16, VFilter);
+
+  return vp9_variance16x2_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance2x16_c(const unsigned char  *src_ptr,
+                                          const int  src_pixels_per_line,
+                                          const int  xoffset,
+                                          const int  yoffset,
+                                          const unsigned char *dst_ptr,
+                                          const int dst_pixels_per_line,
+                                          unsigned int *sse) {
+  unsigned short FData3[2 * 17];  // Temp data buffer used in filtering
+  unsigned char  temp2[2 * 16];
+  const short *HFilter, *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
+  var_filter_block2d_bil_first_pass(src_ptr, FData3,
+                                    src_pixels_per_line, 1, 17, 2, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 2, 2, 16, 2, VFilter);
+
+  return vp9_variance2x16_c(temp2, 2, dst_ptr, dst_pixels_per_line, sse);
+}
+#endif
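
The C variance routines in this file all reduce to one integer identity:
accumulate the sum of differences and the sum of squared differences over
the block, then return sse - sum*sum/N, where N is the pixel count (hence
the >> 5 above for the 32-pixel 2x16 and 16x2 blocks). The variance()
helper is defined earlier in the file and not visible in this hunk; the
following is a minimal C sketch with the signature taken from the call
sites and the body inferred, not copied from the patched source:

    static void variance(const unsigned char *src_ptr, int source_stride,
                         const unsigned char *ref_ptr, int recon_stride,
                         int w, int h, unsigned int *sse, int *sum) {
      int i, j;
      *sum = 0;
      *sse = 0;
      for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++) {
          /* accumulate per-pixel difference and squared difference */
          int diff = src_ptr[j] - ref_ptr[j];
          *sum += diff;
          *sse += diff * diff;
        }
        src_ptr += source_stride;
        ref_ptr += recon_stride;
      }
    }

With this, vp9_variance2x16_c returns var - ((avg * avg) >> 5) because its
block holds 2 * 16 = 32 = 1 << 5 pixels.
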
--- /dev/null
+++ b/vp9/encoder/x86/dct_mmx.asm
@@ -1,0 +1,241 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch)
+global sym(vp9_short_fdct4x4_mmx)
+sym(vp9_short_fdct4x4_mmx):
+    push        rbp
+    mov         rbp,        rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0)      ; input
+        mov         rdi,        arg(1)      ; output
+
+        movsxd      rax,        dword ptr arg(2) ;pitch
+
+        lea         rcx,        [rsi + rax*2]
+        ; read the input data
+        movq        mm0,        [rsi]
+        movq        mm1,        [rsi + rax]
+
+        movq        mm2,        [rcx]
+        movq        mm4,        [rcx + rax]
+
+        ; transpose for the first stage
+        movq        mm3,        mm0         ; 00 01 02 03
+        movq        mm5,        mm2         ; 20 21 22 23
+
+        punpcklwd   mm0,        mm1         ; 00 10 01 11
+        punpckhwd   mm3,        mm1         ; 02 12 03 13
+
+        punpcklwd   mm2,        mm4         ; 20 30 21 31
+        punpckhwd   mm5,        mm4         ; 22 32 23 33
+
+        movq        mm1,        mm0         ; 00 10 01 11
+        punpckldq   mm0,        mm2         ; 00 10 20 30
+
+        punpckhdq   mm1,        mm2         ; 01 11 21 31
+
+        movq        mm2,        mm3         ; 02 12 03 13
+        punpckldq   mm2,        mm5         ; 02 12 22 32
+
+        punpckhdq   mm3,        mm5         ; 03 13 23 33
+
+        ; mm0 0
+        ; mm1 1
+        ; mm2 2
+        ; mm3 3
+
+        ; first stage
+        movq        mm5,        mm0
+        movq        mm4,        mm1
+
+        paddw       mm0,        mm3         ; a1 = 0 + 3
+        paddw       mm1,        mm2         ; b1 = 1 + 2
+
+        psubw       mm4,        mm2         ; c1 = 1 - 2
+        psubw       mm5,        mm3         ; d1 = 0 - 3
+
+        psllw       mm5,        3
+        psllw       mm4,        3
+
+        psllw       mm0,        3
+        psllw       mm1,        3
+
+        ; output 0 and 2
+        movq        mm2,        mm0         ; a1
+
+        paddw       mm0,        mm1         ; op[0] = a1 + b1
+        psubw       mm2,        mm1         ; op[2] = a1 - b1
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movq        mm1,        mm5         ; d1
+        punpcklwd   mm1,        mm4         ; c1 d1
+        punpckhwd   mm5,        mm4         ; c1 d1
+
+        movq        mm3,        mm1
+        movq        mm4,        mm5
+
+        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]
+        paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]
+        paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]
+        paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]
+
+        psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+        psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+
+        packssdw    mm1,        mm4         ; op[1]
+        packssdw    mm3,        mm5         ; op[3]
+
+        ; done with vertical
+        ; transpose for the second stage
+        movq        mm4,        mm0         ; 00 10 20 30
+        movq        mm5,        mm2         ; 02 12 22 32
+
+        punpcklwd   mm0,        mm1         ; 00 01 10 11
+        punpckhwd   mm4,        mm1         ; 20 21 30 31
+
+        punpcklwd   mm2,        mm3         ; 02 03 12 13
+        punpckhwd   mm5,        mm3         ; 22 23 32 33
+
+        movq        mm1,        mm0         ; 00 01 10 11
+        punpckldq   mm0,        mm2         ; 00 01 02 03
+
+        punpckhdq   mm1,        mm2         ; 10 11 12 13
+
+        movq        mm2,        mm4         ; 20 21 30 31
+        punpckldq   mm2,        mm5         ; 20 21 22 23
+
+        punpckhdq   mm4,        mm5         ; 30 31 32 33
+
+        ; mm0 0
+        ; mm1 1
+        ; mm2 2
+        ; mm4 3
+
+        movq        mm5,        mm0
+        movq        mm3,        mm1
+
+        paddw       mm0,        mm4         ; a1 = 0 + 3
+        paddw       mm1,        mm2         ; b1 = 1 + 2
+
+        psubw       mm3,        mm2         ; c1 = 1 - 2
+        psubw       mm5,        mm4         ; d1 = 0 - 3
+
+        pxor        mm6,        mm6         ; zero out for compare
+
+        pcmpeqw     mm6,        mm5         ; d1 != 0
+
+        pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; clear upper,
+                                                                ; and keep bit 0 of lower
+
+        ; output 0 and 2
+        movq        mm2,        mm0         ; a1
+
+        paddw       mm0,        mm1         ; a1 + b1
+        psubw       mm2,        mm1         ; a1 - b1
+
+        paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]
+        paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]
+
+        psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4
+        psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4
+
+        movq        MMWORD PTR[rdi + 0 ],  mm0
+        movq        MMWORD PTR[rdi + 16],  mm2
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movq        mm1,        mm5         ; d1
+        punpcklwd   mm1,        mm3         ; c1 d1
+        punpckhwd   mm5,        mm3         ; c1 d1
+
+        movq        mm3,        mm1
+        movq        mm4,        mm5
+
+        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]
+        paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]
+        paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]
+        paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]
+
+        psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+        psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+
+        packssdw    mm1,        mm4         ; op[4]
+        packssdw    mm3,        mm5         ; op[12]
+
+        paddw       mm1,        mm6         ; op[4] += (d1!=0)
+
+        movq        MMWORD PTR[rdi + 8 ],  mm1
+        movq        MMWORD PTR[rdi + 24],  mm3
+
+     ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 8
+_5352_2217:
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
+align 8
+_2217_neg5352:
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
+align 8
+_cmp_mask:
+    times 4 dw 1
+align 8
+_7w:
+    times 4 dw 7
+align 8
+_14500:
+    times 2 dd 14500
+align 8
+_7500:
+    times 2 dd 7500
+align 8
+_12000:
+    times 2 dd 12000
+align 8
+_51000:
+    times 2 dd 51000
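
dct_mmx.asm vectorizes the 4x4 forward DCT, processing four 16-bit elements
per MMX register and transposing between the two passes. A scalar C sketch
of the same arithmetic (the function name is hypothetical; the constants
and rounding match the assembly above, including the (d1 != 0) correction
that the pcmpeqw/pandn mask adds to op[4]):

    static void short_fdct4x4_sketch(short *input, short *output, int pitch) {
      int i, a1, b1, c1, d1;
      short *ip = input;
      short *op = output;

      for (i = 0; i < 4; i++) {        /* first pass: transform each row */
        a1 = (ip[0] + ip[3]) << 3;
        b1 = (ip[1] + ip[2]) << 3;
        c1 = (ip[1] - ip[2]) << 3;
        d1 = (ip[0] - ip[3]) << 3;

        op[0] = a1 + b1;
        op[2] = a1 - b1;
        op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
        op[3] = (d1 * 2217 - c1 * 5352 +  7500) >> 12;

        ip += pitch / 2;               /* pitch is in bytes, data in shorts */
        op += 4;
      }

      ip = output;
      op = output;
      for (i = 0; i < 4; i++) {        /* second pass: transform each column */
        a1 = ip[0] + ip[12];
        b1 = ip[4] + ip[8];
        c1 = ip[4] - ip[8];
        d1 = ip[0] - ip[12];

        op[0]  = (a1 + b1 + 7) >> 4;
        op[8]  = (a1 - b1 + 7) >> 4;
        op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
        op[12] =  (d1 * 2217 - c1 * 5352 + 51000) >> 16;

        ip++;
        op++;
      }
    }
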
--- /dev/null
+++ b/vp9/encoder/x86/dct_sse2.asm
@@ -1,0 +1,432 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro STACK_FRAME_CREATE 0
+%if ABI_IS_32BIT
+  %define       input       rsi
+  %define       output      rdi
+  %define       pitch       rax
+    push        rbp
+    mov         rbp, rsp
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rsi, arg(0)
+    mov         rdi, arg(1)
+
+    movsxd      rax, dword ptr arg(2)
+    lea         rcx, [rsi + rax*2]
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    %define     input       rcx
+    %define     output      rdx
+    %define     pitch       r8
+    SAVE_XMM 7, u
+  %else
+    %define     input       rdi
+    %define     output      rsi
+    %define     pitch       rdx
+  %endif
+%endif
+%endmacro
+
+%macro STACK_FRAME_DESTROY 0
+  %define     input
+  %define     output
+  %define     pitch
+
+%if ABI_IS_32BIT
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    pop         rbp
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    RESTORE_XMM
+  %endif
+%endif
+    ret
+%endmacro
+
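+; The two macros above hide the calling-convention differences between
+; targets: 32-bit builds read the arguments from the stack (hence the
+; explicit loads into rsi/rdi/rax), Win64 passes them in rcx/rdx/r8, and
+; SysV x86-64 in rdi/rsi/rdx, so the transform bodies below can refer to
+; input/output/pitch uniformly.
+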
+;void vp9_short_fdct4x4_sse2(short *input, short *output, int pitch)
+global sym(vp9_short_fdct4x4_sse2)
+sym(vp9_short_fdct4x4_sse2):
+
+    STACK_FRAME_CREATE
+
+    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
+    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
+    lea         input,          [input+2*pitch]
+    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
+    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30
+
+    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
+    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20
+
+    movdqa      xmm2, xmm0
+    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
+    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
+    movdqa      xmm1, xmm0
+    punpckldq   xmm0, xmm2                      ;31 30 21 20 11 10 01 00
+    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
+    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx
+
+    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
+    movdqa      xmm3, xmm0
+    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
+    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
+    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
+    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
+
+    movdqa      xmm1, xmm0
+    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
+    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
+    movdqa      xmm4, xmm3
+    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
+    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
+
+    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
+    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
+    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
+    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12
+
+    packssdw    xmm0, xmm1                      ;op[2] op[0]
+    packssdw    xmm3, xmm4                      ;op[3] op[1]
+    ; 23 22 21 20 03 02 01 00
+    ;
+    ; 33 32 31 30 13 12 11 10
+    ;
+    movdqa      xmm2, xmm0
+    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
+    punpckhqdq  xmm2, xmm3                      ;23 22 21 20 33 32 31 30
+
+    movdqa      xmm3, xmm0
+    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
+    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
+    movdqa      xmm2, xmm0
+    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
+    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20
+
+    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
+    pshufd      xmm2, xmm2, 04eh
+    movdqa      xmm3, xmm0
+    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
+    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1
+
+    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
+    movdqa      xmm2, xmm3                      ;save d1 for compare
+    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
+    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
+    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
+    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
+    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
+    movdqa      xmm1, xmm0
+    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
+    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
+
+    pxor        xmm4, xmm4                      ;zero out for compare
+    paddd       xmm0, xmm5
+    paddd       xmm1, xmm5
+    pcmpeqw     xmm2, xmm4
+    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
+    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
+    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
+                                                     ;and keep bit 0 of lower
+
+    movdqa      xmm4, xmm3
+    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
+    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
+    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
+    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
+    packssdw    xmm0, xmm1                      ;op[8] op[0]
+    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
+    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16
+
+    packssdw    xmm3, xmm4                      ;op[12] op[4]
+    movdqa      xmm1, xmm0
+    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
+    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
+    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]
+
+    movdqa      XMMWORD PTR[output +  0], xmm0
+    movdqa      XMMWORD PTR[output + 16], xmm1
+
+    STACK_FRAME_DESTROY
+
+;void vp9_short_fdct8x4_sse2(short *input, short *output, int pitch)
+global sym(vp9_short_fdct8x4_sse2)
+sym(vp9_short_fdct8x4_sse2):
+
+    STACK_FRAME_CREATE
+
+        ; read the input data
+        movdqa      xmm0,       [input        ]
+        movdqa      xmm2,       [input+  pitch]
+        lea         input,      [input+2*pitch]
+        movdqa      xmm4,       [input        ]
+        movdqa      xmm3,       [input+  pitch]
+
+        ; transpose for the first stage
+        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
+        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27
+
+        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
+        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17
+
+        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
+        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37
+
+        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
+        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31
+
+        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33
+
+        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
+        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35
+
+        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
+        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33
+
+        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
+        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36
+
+        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
+        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34
+
+        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35
+
+        ; xmm0 0
+        ; xmm1 1
+        ; xmm2 2
+        ; xmm3 3
+
+        ; first stage
+        movdqa      xmm5,       xmm0
+        movdqa      xmm4,       xmm1
+
+        paddw       xmm0,       xmm3        ; a1 = 0 + 3
+        paddw       xmm1,       xmm2        ; b1 = 1 + 2
+
+        psubw       xmm4,       xmm2        ; c1 = 1 - 2
+        psubw       xmm5,       xmm3        ; d1 = 0 - 3
+
+        psllw       xmm5,        3
+        psllw       xmm4,        3
+
+        psllw       xmm0,        3
+        psllw       xmm1,        3
+
+        ; output 0 and 2
+        movdqa      xmm2,       xmm0        ; a1
+
+        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
+        psubw       xmm2,       xmm1        ; op[2] = a1 - b1
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movdqa      xmm1,       xmm5        ; d1
+        punpcklwd   xmm1,       xmm4        ; c1 d1
+        punpckhwd   xmm5,       xmm4        ; c1 d1
+
+        movdqa      xmm3,       xmm1
+        movdqa      xmm4,       xmm5
+
+        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
+        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
+        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
+        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]
+
+        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+
+        packssdw    xmm1,       xmm4        ; op[1]
+        packssdw    xmm3,       xmm5        ; op[3]
+
+        ; done with vertical
+        ; transpose for the second stage
+        movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
+        movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36
+
+        punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
+        punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
+
+        punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
+        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
+
+        movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
+        punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13
+
+        punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33
+
+        movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
+        punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17
+
+        punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
+        movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33
+
+        punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
+        punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27
+
+        movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
+        punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07
+
+        punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17
+
+        ; xmm0 0
+        ; xmm4 1
+        ; xmm1 2
+        ; xmm3 3
+
+        movdqa      xmm5,       xmm0
+        movdqa      xmm2,       xmm1
+
+        paddw       xmm0,       xmm3        ; a1 = 0 + 3
+        paddw       xmm1,       xmm4        ; b1 = 1 + 2
+
+        psubw       xmm4,       xmm2        ; c1 = 1 - 2
+        psubw       xmm5,       xmm3        ; d1 = 0 - 3
+
+        pxor        xmm6,       xmm6        ; zero out for compare
+
+        pcmpeqw     xmm6,       xmm5        ; d1 != 0
+
+        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; clear upper,
+                                                                    ; and keep bit 0 of lower
+
+        ; output 0 and 2
+        movdqa      xmm2,       xmm0        ; a1
+
+        paddw       xmm0,       xmm1        ; a1 + b1
+        psubw       xmm2,       xmm1        ; a1 - b1
+
+        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
+        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]
+
+        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
+        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movdqa      xmm1,       xmm5        ; d1
+        punpcklwd   xmm1,       xmm4        ; c1 d1
+        punpckhwd   xmm5,       xmm4        ; c1 d1
+
+        movdqa      xmm3,       xmm1
+        movdqa      xmm4,       xmm5
+
+        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
+        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
+        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
+        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]
+
+        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+
+        packssdw    xmm1,       xmm4        ; op[4]
+        packssdw    xmm3,       xmm5        ; op[12]
+
+        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)
+
+        movdqa      xmm4,       xmm0
+        movdqa      xmm5,       xmm2
+
+        punpcklqdq  xmm0,       xmm1
+        punpckhqdq  xmm4,       xmm1
+
+        punpcklqdq  xmm2,       xmm3
+        punpckhqdq  xmm5,       xmm3
+
+        movdqa      XMMWORD PTR[output + 0 ],  xmm0
+        movdqa      XMMWORD PTR[output + 16],  xmm2
+        movdqa      XMMWORD PTR[output + 32],  xmm4
+        movdqa      XMMWORD PTR[output + 48],  xmm5
+
+    STACK_FRAME_DESTROY
+
+SECTION_RODATA
+align 16
+_5352_2217:
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
+align 16
+_2217_neg5352:
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
+align 16
+_mult_add:
+    times 8 dw 1
+align 16
+_cmp_mask:
+    times 4 dw 1
+    times 4 dw 0
+align 16
+_cmp_mask8x4:
+    times 8 dw 1
+align 16
+_mult_sub:
+    dw 1
+    dw -1
+    dw 1
+    dw -1
+    dw 1
+    dw -1
+    dw 1
+    dw -1
+align 16
+_7:
+    times 4 dd 7
+align 16
+_7w:
+    times 8 dw 7
+align 16
+_14500:
+    times 4 dd 14500
+align 16
+_7500:
+    times 4 dd 7500
+align 16
+_12000:
+    times 4 dd 12000
+align 16
+_51000:
+    times 4 dd 51000
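
vp9_short_fdct8x4_sse2 behaves like the 4x4 transform applied to two
horizontally adjacent blocks, with each 128-bit register holding one row of
both blocks. In terms of the scalar sketch given after dct_mmx.asm above
(names hypothetical):

    static void short_fdct8x4_sketch(short *input, short *output, int pitch) {
      short_fdct4x4_sketch(input,     output,      pitch);  /* left block */
      short_fdct4x4_sketch(input + 4, output + 16, pitch);  /* right block */
    }
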
--- /dev/null
+++ b/vp9/encoder/x86/encodeopt.asm
@@ -1,0 +1,386 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;int vp9_block_error_xmm(short *coeff_ptr,  short *dcoef_ptr)
+global sym(vp9_block_error_xmm)
+sym(vp9_block_error_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;coeff_ptr
+        mov         rdi,        arg(1) ;dcoef_ptr
+
+        movdqa      xmm0,       [rsi]
+        movdqa      xmm1,       [rdi]
+
+        movdqa      xmm2,       [rsi+16]
+        movdqa      xmm3,       [rdi+16]
+
+        psubw       xmm0,       xmm1
+        psubw       xmm2,       xmm3
+
+        pmaddwd     xmm0,       xmm0
+        pmaddwd     xmm2,       xmm2
+
+        paddd       xmm0,       xmm2
+
+        pxor        xmm5,       xmm5
+        movdqa      xmm1,       xmm0
+
+        punpckldq   xmm0,       xmm5
+        punpckhdq   xmm1,       xmm5
+
+        paddd       xmm0,       xmm1
+        movdqa      xmm1,       xmm0
+
+        psrldq      xmm0,       8
+        paddd       xmm0,       xmm1
+
+        movq        rax,        xmm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;int vp9_block_error_mmx(short *coeff_ptr,  short *dcoef_ptr)
+global sym(vp9_block_error_mmx)
+sym(vp9_block_error_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov         rsi,        arg(0) ;coeff_ptr
+        pxor        mm7,        mm7
+
+        mov         rdi,        arg(1) ;dcoef_ptr
+        movq        mm3,        [rsi]
+
+        movq        mm4,        [rdi]
+        movq        mm5,        [rsi+8]
+
+        movq        mm6,        [rdi+8]
+        pxor        mm1,        mm1 ; dc == 0 here (was: movd mm1, dc)
+
+        movq        mm2,        mm7
+        psubw       mm5,        mm6
+
+        por         mm1,        mm2
+        pmaddwd     mm5,        mm5
+
+        pcmpeqw     mm1,        mm7
+        psubw       mm3,        mm4
+
+        pand        mm1,        mm3
+        pmaddwd     mm1,        mm1
+
+        paddd       mm1,        mm5
+        movq        mm3,        [rsi+16]
+
+        movq        mm4,        [rdi+16]
+        movq        mm5,        [rsi+24]
+
+        movq        mm6,        [rdi+24]
+        psubw       mm5,        mm6
+
+        pmaddwd     mm5,        mm5
+        psubw       mm3,        mm4
+
+        pmaddwd     mm3,        mm3
+        paddd       mm3,        mm5
+
+        paddd       mm1,        mm3
+        movq        mm0,        mm1
+
+        psrlq       mm1,        32
+        paddd       mm0,        mm1
+
+        movq        rax,        mm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp9_mbblock_error_mmx_impl)
+sym(vp9_mbblock_error_mmx_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov         rsi,        arg(0) ;coeff_ptr
+        pxor        mm7,        mm7
+
+        mov         rdi,        arg(1) ;dcoef_ptr
+        pxor        mm2,        mm2
+
+        movd        mm1,        dword ptr arg(2) ;dc
+        por         mm1,        mm2
+
+        pcmpeqw     mm1,        mm7
+        mov         rcx,        16
+
+.mberror_loop_mmx:
+        movq        mm3,       [rsi]
+        movq        mm4,       [rdi]
+
+        movq        mm5,       [rsi+8]
+        movq        mm6,       [rdi+8]
+
+
+        psubw       mm5,        mm6
+        pmaddwd     mm5,        mm5
+
+        psubw       mm3,        mm4
+        pand        mm3,        mm1
+
+        pmaddwd     mm3,        mm3
+        paddd       mm2,        mm5
+
+        paddd       mm2,        mm3
+        movq        mm3,       [rsi+16]
+
+        movq        mm4,       [rdi+16]
+        movq        mm5,       [rsi+24]
+
+        movq        mm6,       [rdi+24]
+        psubw       mm5,        mm6
+
+        pmaddwd     mm5,        mm5
+        psubw       mm3,        mm4
+
+        pmaddwd     mm3,        mm3
+        paddd       mm2,        mm5
+
+        paddd       mm2,        mm3
+        add         rsi,        32
+
+        add         rdi,        32
+        sub         rcx,        1
+
+        jnz         .mberror_loop_mmx
+
+        movq        mm0,        mm2
+        psrlq       mm2,        32
+
+        paddd       mm0,        mm2
+        movq        rax,        mm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp9_mbblock_error_xmm_impl)
+sym(vp9_mbblock_error_xmm_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM 6
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov         rsi,        arg(0) ;coeff_ptr
+        pxor        xmm6,       xmm6
+
+        mov         rdi,        arg(1) ;dcoef_ptr
+        pxor        xmm4,       xmm4
+
+        movd        xmm5,       dword ptr arg(2) ;dc
+        por         xmm5,       xmm4
+
+        pcmpeqw     xmm5,       xmm6
+        mov         rcx,        16
+
+.mberror_loop:
+        movdqa      xmm0,       [rsi]
+        movdqa      xmm1,       [rdi]
+
+        movdqa      xmm2,       [rsi+16]
+        movdqa      xmm3,       [rdi+16]
+
+
+        psubw       xmm2,       xmm3
+        pmaddwd     xmm2,       xmm2
+
+        psubw       xmm0,       xmm1
+        pand        xmm0,       xmm5
+
+        pmaddwd     xmm0,       xmm0
+        add         rsi,        32
+
+        add         rdi,        32
+
+        sub         rcx,        1
+        paddd       xmm4,       xmm2
+
+        paddd       xmm4,       xmm0
+        jnz         .mberror_loop
+
+        movdqa      xmm0,       xmm4
+        punpckldq   xmm0,       xmm6
+
+        punpckhdq   xmm4,       xmm6
+        paddd       xmm0,       xmm4
+
+        movdqa      xmm1,       xmm0
+        psrldq      xmm0,       8
+
+        paddd       xmm0,       xmm1
+        movq        rax,        xmm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+global sym(vp9_mbuverror_mmx_impl)
+sym(vp9_mbuverror_mmx_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov             rsi,        arg(0) ;s_ptr
+        mov             rdi,        arg(1) ;d_ptr
+
+        mov             rcx,        16
+        pxor            mm7,        mm7
+
+.mbuverror_loop_mmx:
+
+        movq            mm1,        [rsi]
+        movq            mm2,        [rdi]
+
+        psubw           mm1,        mm2
+        pmaddwd         mm1,        mm1
+
+
+        movq            mm3,        [rsi+8]
+        movq            mm4,        [rdi+8]
+
+        psubw           mm3,        mm4
+        pmaddwd         mm3,        mm3
+
+
+        paddd           mm7,        mm1
+        paddd           mm7,        mm3
+
+
+        add             rsi,        16
+        add             rdi,        16
+
+        dec             rcx
+        jnz             .mbuverror_loop_mmx
+
+        movq            mm0,        mm7
+        psrlq           mm7,        32
+
+        paddd           mm0,        mm7
+        movq            rax,        mm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+global sym(vp9_mbuverror_xmm_impl)
+sym(vp9_mbuverror_xmm_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov             rsi,        arg(0) ;s_ptr
+        mov             rdi,        arg(1) ;d_ptr
+
+        mov             rcx,        16
+        pxor            xmm3,       xmm3
+
+.mbuverror_loop:
+
+        movdqa          xmm1,       [rsi]
+        movdqa          xmm2,       [rdi]
+
+        psubw           xmm1,       xmm2
+        pmaddwd         xmm1,       xmm1
+
+        paddd           xmm3,       xmm1
+
+        add             rsi,        16
+        add             rdi,        16
+
+        dec             rcx
+        jnz             .mbuverror_loop
+
+        pxor        xmm0,           xmm0
+        movdqa      xmm1,           xmm3
+
+        movdqa      xmm2,           xmm1
+        punpckldq   xmm1,           xmm0
+
+        punpckhdq   xmm2,           xmm0
+        paddd       xmm1,           xmm2
+
+        movdqa      xmm2,           xmm1
+
+        psrldq      xmm1,           8
+        paddd       xmm1,           xmm2
+
+        movq            rax,            xmm1
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
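
All the routines in encodeopt.asm compute the same distortion measure: the
sum of squared differences between the original transform coefficients and
their dequantized counterparts. A minimal C sketch (the name is
hypothetical; the dc argument of the *_impl variants controls whether the
DC coefficient of each block is excluded, which is what the pcmpeqw/pand
masking implements):

    static int block_error_sketch(const short *coeff, const short *dqcoeff,
                                  int n, int skip_dc) {
      int i, error = 0;
      for (i = skip_dc ? 1 : 0; i < n; i++) {
        int diff = coeff[i] - dqcoeff[i];
        error += diff * diff;
      }
      return error;
    }

vp9_block_error_* use n = 16 for a single block, the mbblock variants
iterate this over the 16 luma blocks of a macroblock, and the mbuverror
variants over the eight 4x4 chroma blocks.
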
--- /dev/null
+++ b/vp9/encoder/x86/fwalsh_sse2.asm
@@ -1,0 +1,164 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_short_walsh4x4_sse2(short *input, short *output, int pitch)
+global sym(vp9_short_walsh4x4_sse2)
+sym(vp9_short_walsh4x4_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov     rsi, arg(0)           ; input
+    mov     rdi, arg(1)           ; output
+    movsxd  rdx, dword ptr arg(2) ; pitch
+
+    ; first for loop
+    movq    xmm0, MMWORD PTR [rsi]           ; load input
+    movq    xmm1, MMWORD PTR [rsi + rdx]
+    lea     rsi,  [rsi + rdx*2]
+    movq    xmm2, MMWORD PTR [rsi]
+    movq    xmm3, MMWORD PTR [rsi + rdx]
+
+    punpcklwd xmm0,  xmm1
+    punpcklwd xmm2,  xmm3
+
+    movdqa    xmm1, xmm0
+    punpckldq xmm0, xmm2           ; ip[1] ip[0]
+    punpckhdq xmm1, xmm2           ; ip[3] ip[2]
+
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
+
+    psllw     xmm0, 2              ; d1  a1
+    psllw     xmm2, 2              ; c1  b1
+
+    movdqa    xmm1, xmm0
+    punpcklqdq xmm0, xmm2          ; b1  a1
+    punpckhqdq xmm1, xmm2          ; c1  d1
+
+    pxor      xmm6, xmm6
+    movq      xmm6, xmm0
+    pxor      xmm7, xmm7
+    pcmpeqw   xmm7, xmm6
+    paddw     xmm7, [GLOBAL(c1)]
+
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1           ; b1+c1  a1+d1
+    psubw     xmm2, xmm1           ; b1-c1  a1-d1
+    paddw     xmm0, xmm7           ; b1+c1  a1+d1+(a1!=0)
+
+    ; second for loop
+    ; input: 13  9  5  1 12  8  4  0 (xmm0)
+    ;        14 10  6  2 15 11  7  3 (xmm2)
+    ; after shuffle:
+    ;        13  5  9  1 12  4  8  0 (xmm0)
+    ;        14  6 10  2 15  7 11  3 (xmm1)
+    pshuflw   xmm3, xmm0, 0xd8
+    pshufhw   xmm0, xmm3, 0xd8
+    pshuflw   xmm3, xmm2, 0xd8
+    pshufhw   xmm1, xmm3, 0xd8
+
+    movdqa    xmm2, xmm0
+    pmaddwd   xmm0, [GLOBAL(c1)]    ; d11 a11 d10 a10
+    pmaddwd   xmm2, [GLOBAL(cn1)]   ; c11 b11 c10 b10
+    movdqa    xmm3, xmm1
+    pmaddwd   xmm1, [GLOBAL(c1)]    ; d12 a12 d13 a13
+    pmaddwd   xmm3, [GLOBAL(cn1)]   ; c12 b12 c13 b13
+
+    pshufd    xmm4, xmm0, 0xd8      ; d11 d10 a11 a10
+    pshufd    xmm5, xmm2, 0xd8      ; c11 c10 b11 b10
+    pshufd    xmm6, xmm1, 0x72      ; d13 d12 a13 a12
+    pshufd    xmm7, xmm3, 0x72      ; c13 c12 b13 b12
+
+    movdqa    xmm0, xmm4
+    punpcklqdq xmm0, xmm5           ; b11 b10 a11 a10
+    punpckhqdq xmm4, xmm5           ; c11 c10 d11 d10
+    movdqa    xmm1, xmm6
+    punpcklqdq xmm1, xmm7           ; b13 b12 a13 a12
+    punpckhqdq xmm6, xmm7           ; c13 c12 d13 d12
+
+    movdqa    xmm2, xmm0
+    paddd     xmm0, xmm4            ; b21 b20 a21 a20
+    psubd     xmm2, xmm4            ; c21 c20 d21 d20
+    movdqa    xmm3, xmm1
+    paddd     xmm1, xmm6            ; b23 b22 a23 a22
+    psubd     xmm3, xmm6            ; c23 c22 d23 d22
+
+    pxor      xmm4, xmm4
+    movdqa    xmm5, xmm4
+    pcmpgtd   xmm4, xmm0
+    pcmpgtd   xmm5, xmm2
+    pand      xmm4, [GLOBAL(cd1)]
+    pand      xmm5, [GLOBAL(cd1)]
+
+    pxor      xmm6, xmm6
+    movdqa    xmm7, xmm6
+    pcmpgtd   xmm6, xmm1
+    pcmpgtd   xmm7, xmm3
+    pand      xmm6, [GLOBAL(cd1)]
+    pand      xmm7, [GLOBAL(cd1)]
+
+    paddd     xmm0, xmm4
+    paddd     xmm2, xmm5
+    paddd     xmm0, [GLOBAL(cd3)]
+    paddd     xmm2, [GLOBAL(cd3)]
+    paddd     xmm1, xmm6
+    paddd     xmm3, xmm7
+    paddd     xmm1, [GLOBAL(cd3)]
+    paddd     xmm3, [GLOBAL(cd3)]
+
+    psrad     xmm0, 3
+    psrad     xmm1, 3
+    psrad     xmm2, 3
+    psrad     xmm3, 3
+    movdqa    xmm4, xmm0
+    punpcklqdq xmm0, xmm1           ; a23 a22 a21 a20
+    punpckhqdq xmm4, xmm1           ; b23 b22 b21 b20
+    movdqa    xmm5, xmm2
+    punpckhqdq xmm2, xmm3           ; c23 c22 c21 c20
+    punpcklqdq xmm5, xmm3           ; d23 d22 d21 d20
+
+    packssdw  xmm0, xmm4            ; b23 b22 b21 b20 a23 a22 a21 a20
+    packssdw  xmm2, xmm5            ; d23 d22 d21 d20 c23 c22 c21 c20
+
+    movdqa  XMMWORD PTR [rdi], xmm0
+    movdqa  XMMWORD PTR [rdi + 16], xmm2
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+c1:
+    dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
+align 16
+cn1:
+    dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
+align 16
+cd1:
+    dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
+align 16
+cd3:
+    dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
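
vp9_short_walsh4x4_sse2 is the 4x4 Walsh-Hadamard transform applied to the
second-order (Y2) DC block. A scalar C sketch of the same arithmetic (name
hypothetical; it mirrors the rounding in the assembly: inputs scaled by 4,
the (a1 != 0) correction from the pcmpeqw/paddw pair in pass one, and the
sign bias plus (x + 3) >> 3 rounding from pcmpgtd and the cd1/cd3 constants
in pass two):

    static void short_walsh4x4_sketch(short *input, short *output, int pitch) {
      int i, a1, b1, c1, d1, a2, b2, c2, d2;
      short *ip = input;
      short *op = output;

      for (i = 0; i < 4; i++) {              /* first pass: rows */
        a1 = (ip[0] + ip[2]) << 2;
        d1 = (ip[1] + ip[3]) << 2;
        c1 = (ip[1] - ip[3]) << 2;
        b1 = (ip[0] - ip[2]) << 2;

        op[0] = a1 + d1 + (a1 != 0);
        op[1] = b1 + c1;
        op[2] = b1 - c1;
        op[3] = a1 - d1;

        ip += pitch / 2;
        op += 4;
      }

      ip = output;
      op = output;
      for (i = 0; i < 4; i++) {              /* second pass: columns */
        a1 = ip[0] + ip[8];
        d1 = ip[4] + ip[12];
        c1 = ip[4] - ip[12];
        b1 = ip[0] - ip[8];

        a2 = a1 + d1;
        b2 = b1 + c1;
        c2 = b1 - c1;
        d2 = a1 - d1;

        a2 += a2 < 0;                        /* bias negatives before the shift */
        b2 += b2 < 0;
        c2 += c2 < 0;
        d2 += d2 < 0;

        op[0]  = (a2 + 3) >> 3;
        op[4]  = (b2 + 3) >> 3;
        op[8]  = (c2 + 3) >> 3;
        op[12] = (d2 + 3) >> 3;

        ip++;
        op++;
      }
    }
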
--- /dev/null
+++ b/vp9/encoder/x86/mcomp_x86.h
@@ -1,0 +1,40 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef MCOMP_X86_H
+#define MCOMP_X86_H
+
+#if HAVE_SSE3
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp9_search_full_search
+#define vp9_search_full_search vp9_full_search_sadx3
+
+#undef  vp9_search_refining_search
+#define vp9_search_refining_search vp9_refining_search_sadx4
+
+#undef  vp9_search_diamond_search
+#define vp9_search_diamond_search vp9_diamond_search_sadx4
+
+#endif
+#endif
+
+#if HAVE_SSE4_1
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp9_search_full_search
+#define vp9_search_full_search vp9_full_search_sadx8
+
+#endif
+#endif
+
+#endif
+
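The header works by compile-time rebinding: with runtime CPU detection
disabled, redefining the generic search names makes every call site resolve
directly to the SIMD routine, with no function-pointer indirection. The
pattern, reduced to a hypothetical example:

    /* Illustration only; generic_search and simd_search are made-up names. */
    int generic_search(int arg);          /* C reference implementation */
    int simd_search(int arg);             /* SIMD version, same signature */

    #undef  generic_search
    #define generic_search simd_search    /* callers now bind to the SIMD one */
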
--- /dev/null
+++ b/vp9/encoder/x86/quantize_mmx.asm
@@ -1,0 +1,286 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;int vp9_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
+;                           short *qcoeff_ptr,short *dequant_ptr,
+;                           short *scan_mask, short *round_ptr,
+;                           short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp9_fast_quantize_b_impl_mmx)
+sym(vp9_fast_quantize_b_impl_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov             rsi,        arg(0) ;coeff_ptr
+        movq            mm0,        [rsi]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movq            mm1,        [rax]
+
+        movq            mm3,        mm0
+        psraw           mm0,        15
+
+        pxor            mm3,        mm0
+        psubw           mm3,        mm0         ; abs
+
+        movq            mm2,        mm3
+        pcmpgtw         mm1,        mm2
+
+        pandn           mm1,        mm2
+        movq            mm3,        mm1
+
+        mov             rdx,        arg(6) ;quant_ptr
+        movq            mm1,        [rdx]
+
+        mov             rcx,        arg(5) ;round_ptr
+        movq            mm2,        [rcx]
+
+        paddw           mm3,        mm2
+        pmulhuw         mm3,        mm1
+
+        pxor            mm3,        mm0
+        psubw           mm3,        mm0     ;restore the sign
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+        movq            mm0,        mm3
+
+        movq            [rdi],      mm3
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movq            mm2,        [rax]
+
+        pmullw          mm3,        mm2
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movq            [rax],      mm3
+
+        ; next 8
+        movq            mm4,        [rsi+8]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movq            mm5,        [rax+8]
+
+        movq            mm7,        mm4
+        psraw           mm4,        15
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4         ; abs
+
+        movq            mm6,        mm7
+        pcmpgtw         mm5,        mm6
+
+        pandn           mm5,        mm6
+        movq            mm7,        mm5
+
+        movq            mm5,        [rdx+8]
+        movq            mm6,        [rcx+8]
+
+        paddw           mm7,        mm6
+        pmulhuw         mm7,        mm5
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4     ;restore the sign
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+
+        movq            mm1,        mm7
+        movq            [rdi+8],    mm7
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movq            mm6,        [rax+8]
+
+        pmullw          mm7,        mm6
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movq            [rax+8],    mm7
+
+
+                ; next 8
+        movq            mm4,        [rsi+16]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movq            mm5,        [rax+16]
+
+        movq            mm7,        mm4
+        psraw           mm4,        15
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4         ; abs
+
+        movq            mm6,        mm7
+        pcmpgtw         mm5,        mm6
+
+        pandn           mm5,        mm6
+        movq            mm7,        mm5
+
+        movq            mm5,        [rdx+16]
+        movq            mm6,        [rcx+16]
+
+        paddw           mm7,        mm6
+        pmulhuw         mm7,        mm5
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4     ;restore the sign
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+
+        movq            mm1,        mm7
+        movq            [rdi+16],   mm7
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movq            mm6,        [rax+16]
+
+        pmullw          mm7,        mm6
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movq            [rax+16],   mm7
+
+
+                ; next 8
+        movq            mm4,        [rsi+24]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movq            mm5,        [rax+24]
+
+        movq            mm7,        mm4
+        psraw           mm4,        15
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4         ; abs
+
+        movq            mm6,        mm7
+        pcmpgtw         mm5,        mm6
+
+        pandn           mm5,        mm6
+        movq            mm7,        mm5
+
+        movq            mm5,        [rdx+24]
+        movq            mm6,        [rcx+24]
+
+        paddw           mm7,        mm6
+        pmulhuw         mm7,        mm5
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4     ;restore the sign
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+
+        movq            mm1,        mm7
+        movq            [rdi+24],   mm7
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movq            mm6,        [rax+24]
+
+        pmullw          mm7,        mm6
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movq            [rax+24],   mm7
+
+
+
+        mov             rdi,        arg(4) ;scan_mask
+        mov             rsi,        arg(2) ;qcoeff_ptr
+
+        pxor            mm5,        mm5
+        pxor            mm7,        mm7
+
+        movq            mm0,        [rsi]
+        movq            mm1,        [rsi+8]
+
+        movq            mm2,        [rdi]
+        movq            mm3,        [rdi+8]
+
+        pcmpeqw         mm0,        mm7
+        pcmpeqw         mm1,        mm7
+
+        pcmpeqw         mm6,        mm6
+        pxor            mm0,        mm6
+
+        pxor            mm1,        mm6
+        psrlw           mm0,        15
+
+        psrlw           mm1,        15
+        pmaddwd         mm0,        mm2
+
+        pmaddwd         mm1,        mm3
+        movq            mm5,        mm0
+
+        paddd           mm5,        mm1
+
+        movq            mm0,        [rsi+16]
+        movq            mm1,        [rsi+24]
+
+        movq            mm2,        [rdi+16]
+        movq            mm3,        [rdi+24]
+
+        pcmpeqw         mm0,        mm7
+        pcmpeqw         mm1,        mm7
+
+        pcmpeqw         mm6,        mm6
+        pxor            mm0,        mm6
+
+        pxor            mm1,        mm6
+        psrlw           mm0,        15
+
+        psrlw           mm1,        15
+        pmaddwd         mm0,        mm2
+
+        pmaddwd         mm1,        mm3
+        paddd           mm5,        mm0
+
+        paddd           mm5,        mm1
+        movq            mm0,        mm5
+
+        psrlq           mm5,        32
+        paddd           mm0,        mm5
+
+        ; eob adjustment begins here
+        movq            rcx,        mm0
+        and             rcx,        0xffff
+
+        xor             rdx,        rdx
+        sub             rdx,        rcx ; rdx=-rcx
+
+        bsr             rax,        rcx
+        inc             rax
+
+        sar             rdx,        31
+        and             rax,        rdx
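+        ; What the sequence above computes: rcx accumulates one scan_mask
+        ; bit per nonzero quantized coefficient, so bsr finds the zig-zag
+        ; index of the last nonzero coefficient and inc turns that into a
+        ; count. bsr leaves rax undefined when rcx == 0, so rdx = -rcx is
+        ; arithmetically shifted into an all-ones mask only when rcx != 0,
+        ; forcing eob to 0 for an all-zero block.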
+        ; This replaces the old mixed MMX assembly/C implementation; the
+        ; original logic is kept below for reference.
+        ;    movq            rcx,        mm0
+        ;    bsr             rax,        rcx
+        ;
+        ;    mov             eob,        rax
+        ;    mov             eee,        rcx
+        ;
+        ;if(eee==0)
+        ;{
+        ;    eob=-1;
+        ;}
+        ;else if(eee<0)
+        ;{
+        ;    eob=15;
+        ;}
+        ;d->eob = eob+1;
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
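
Per coefficient, the fast quantizer above applies a dead zone, quantizes,
restores the sign, and dequantizes, eight coefficients per MMX pass. A
scalar C sketch (name hypothetical; pmulhuw supplies the unsigned
high-word multiply, i.e. the >> 16):

    static void fast_quantize_sketch(const short *coeff, const short *zbin,
                                     const short *round,
                                     const unsigned short *quant,
                                     const short *dequant,
                                     short *qcoeff, short *dqcoeff) {
      int i;
      for (i = 0; i < 16; i++) {
        int z  = coeff[i];
        int sz = z >> 31;                /* 0 or -1: sign of z */
        int x  = (z ^ sz) - sz;          /* abs(z) */
        if (x < zbin[i])                 /* dead zone: below zbin drops to 0 */
          x = 0;
        x = ((x + round[i]) * quant[i]) >> 16;
        x = (x ^ sz) - sz;               /* restore the sign */
        qcoeff[i]  = (short)x;
        dqcoeff[i] = (short)(x * dequant[i]);
      }
    }

The scan_mask/bsr sequence at the end of the routine then derives eob from
which coefficients survived quantization.
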
--- /dev/null
+++ b/vp9/encoder/x86/quantize_sse2.asm
@@ -1,0 +1,380 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
+
+
+; void vp9_regular_quantize_b_sse2 | arg
+;  (BLOCK  *b,                     |  0
+;   BLOCKD *d)                     |  1
+
+global sym(vp9_regular_quantize_b_sse2)
+sym(vp9_regular_quantize_b_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SAVE_XMM 7
+    GET_GOT     rbx
+
+%if ABI_IS_32BIT
+    push        rdi
+    push        rsi
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    push        rdi
+    push        rsi
+  %endif
+%endif
+
+    ALIGN_STACK 16, rax
+    %define zrun_zbin_boost   0  ;  8
+    %define abs_minus_zbin    8  ; 32
+    %define temp_qcoeff       40 ; 32
+    %define qcoeff            72 ; 32
+    %define stack_size        104
+    sub         rsp, stack_size
+    ; end prolog
+
+%if ABI_IS_32BIT
+    mov         rdi, arg(0)                 ; BLOCK *b
+    mov         rsi, arg(1)                 ; BLOCKD *d
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    mov         rdi, rcx                    ; BLOCK *b
+    mov         rsi, rdx                    ; BLOCKD *d
+  %else
+    ;mov         rdi, rdi                    ; BLOCK *b
+    ;mov         rsi, rsi                    ; BLOCKD *d
+  %endif
+%endif
+
+    mov         rdx, [rdi + vp9_block_coeff] ; coeff_ptr
+    mov         rcx, [rdi + vp9_block_zbin] ; zbin_ptr
+    movd        xmm7, [rdi + vp9_block_zbin_extra] ; zbin_oq_value
+
+    ; z
+    movdqa      xmm0, [rdx]
+    movdqa      xmm4, [rdx + 16]
+    mov         rdx, [rdi + vp9_block_round] ; round_ptr
+
+    pshuflw     xmm7, xmm7, 0
+    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm5, xmm4
+
+    ; sz
+    psraw       xmm0, 15
+    psraw       xmm4, 15
+
+    ; (z ^ sz)
+    pxor        xmm1, xmm0
+    pxor        xmm5, xmm4
+
+    ; x = abs(z)
+    psubw       xmm1, xmm0
+    psubw       xmm5, xmm4
+
+    movdqa      xmm2, [rcx]
+    movdqa      xmm3, [rcx + 16]
+    mov         rcx, [rdi + vp9_block_quant] ; quant_ptr
+
+    ; *zbin_ptr + zbin_oq_value
+    paddw       xmm2, xmm7
+    paddw       xmm3, xmm7
+
+    ; x - (*zbin_ptr + zbin_oq_value)
+    psubw       xmm1, xmm2
+    psubw       xmm5, xmm3
+    movdqa      [rsp + abs_minus_zbin], xmm1
+    movdqa      [rsp + abs_minus_zbin + 16], xmm5
+
+    ; add (zbin_ptr + zbin_oq_value) back
+    paddw       xmm1, xmm2
+    paddw       xmm5, xmm3
+
+    movdqa      xmm2, [rdx]
+    movdqa      xmm6, [rdx + 16]
+
+    movdqa      xmm3, [rcx]
+    movdqa      xmm7, [rcx + 16]
+
+    ; x + round
+    paddw       xmm1, xmm2
+    paddw       xmm5, xmm6
+
+    ; y = x * quant_ptr >> 16
+    pmulhw      xmm3, xmm1
+    pmulhw      xmm7, xmm5
+
+    ; y += x
+    paddw       xmm1, xmm3
+    paddw       xmm5, xmm7
+
+    movdqa      [rsp + temp_qcoeff], xmm1
+    movdqa      [rsp + temp_qcoeff + 16], xmm5
+
+    pxor        xmm6, xmm6
+    ; zero qcoeff
+    movdqa      [rsp + qcoeff], xmm6
+    movdqa      [rsp + qcoeff + 16], xmm6
+
+    mov         rdx, [rdi + vp9_block_zrun_zbin_boost] ; zbin_boost_ptr
+    mov         rax, [rdi + vp9_block_quant_shift] ; quant_shift_ptr
+    mov         [rsp + zrun_zbin_boost], rdx
+
+%macro ZIGZAG_LOOP 1
+    ; x
+    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
+
+    ; if (x >= zbin)
+    sub         cx, WORD PTR[rdx]           ; x - zbin
+    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
+    jl          .rq_zigzag_loop_%1           ; x < zbin
+
+    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
+
+    ; downshift by quant_shift[rc]
+    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
+    sar         edi, cl                     ; also sets Z bit
+    je          .rq_zigzag_loop_%1           ; !y
+    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+    mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
+.rq_zigzag_loop_%1:
+%endmacro
+; in vp9_default_zig_zag1d order: see vp9/common/entropy.c
+ZIGZAG_LOOP  0
+ZIGZAG_LOOP  1
+ZIGZAG_LOOP  4
+ZIGZAG_LOOP  8
+ZIGZAG_LOOP  5
+ZIGZAG_LOOP  2
+ZIGZAG_LOOP  3
+ZIGZAG_LOOP  6
+ZIGZAG_LOOP  9
+ZIGZAG_LOOP 12
+ZIGZAG_LOOP 13
+ZIGZAG_LOOP 10
+ZIGZAG_LOOP  7
+ZIGZAG_LOOP 11
+ZIGZAG_LOOP 14
+ZIGZAG_LOOP 15
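+; The sixteen expansions above visit the coefficients in zig-zag order. The
+; effective zero bin grows with the distance from the last nonzero
+; coefficient: zbin_boost_ptr advances one entry per coefficient and is
+; reset to the start of the boost table whenever a coefficient quantizes to
+; a nonzero value.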
+
+    movdqa      xmm2, [rsp + qcoeff]
+    movdqa      xmm3, [rsp + qcoeff + 16]
+
+    mov         rcx, [rsi + vp9_blockd_dequant] ; dequant_ptr
+    mov         rdi, [rsi + vp9_blockd_dqcoeff] ; dqcoeff_ptr
+
+    ; y ^ sz
+    pxor        xmm2, xmm0
+    pxor        xmm3, xmm4
+    ; x = (y ^ sz) - sz
+    psubw       xmm2, xmm0
+    psubw       xmm3, xmm4
+
+    ; dequant
+    movdqa      xmm0, [rcx]
+    movdqa      xmm1, [rcx + 16]
+
+    mov         rcx, [rsi + vp9_blockd_qcoeff] ; qcoeff_ptr
+
+    pmullw      xmm0, xmm2
+    pmullw      xmm1, xmm3
+
+    movdqa      [rcx], xmm2        ; store qcoeff
+    movdqa      [rcx + 16], xmm3
+    movdqa      [rdi], xmm0        ; store dqcoeff
+    movdqa      [rdi + 16], xmm1
+
+    ; select the last value (in zig_zag order) for EOB
+    pcmpeqw     xmm2, xmm6
+    pcmpeqw     xmm3, xmm6
+    ; invert: 0xffff where qcoeff != 0
+    pcmpeqw     xmm6, xmm6
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm6
+    ; mask inv_zig_zag
+    pand        xmm2, [GLOBAL(inv_zig_zag)]
+    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
+    ; select the max value
+    pmaxsw      xmm2, xmm3
+    pshufd      xmm3, xmm2, 00001110b
+    pmaxsw      xmm2, xmm3
+    pshuflw     xmm3, xmm2, 00001110b
+    pmaxsw      xmm2, xmm3
+    pshuflw     xmm3, xmm2, 00000001b
+    pmaxsw      xmm2, xmm3
+    movd        eax, xmm2
+    and         eax, 0xff
+    mov         [rsi + vp9_blockd_eob], eax
+
+    ; begin epilog
+    add         rsp, stack_size
+    pop         rsp
+%if ABI_IS_32BIT
+    pop         rsi
+    pop         rdi
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    pop         rsi
+    pop         rdi
+  %endif
+%endif
+    RESTORE_GOT
+    RESTORE_XMM
+    pop         rbp
+    ret
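+
+; For reference, a C sketch of the scalar quantizer the code above
+; vectorizes (field names are assumed from the BLOCK/BLOCKD offsets used
+; here; qcoeff/dqcoeff are taken as already zeroed, as done above):
+;
+;   int eob = -1;
+;   for (int i = 0; i < 16; i++) {
+;     int rc   = vp9_default_zig_zag1d[i];
+;     int z    = coeff_ptr[rc];
+;     int zbin = zbin_ptr[rc] + *zbin_boost_ptr++ + zbin_oq_value;
+;     int sz   = z >> 31;                     /* all ones if z < 0 */
+;     int x    = (z ^ sz) - sz;               /* abs(z) */
+;     if (x >= zbin) {
+;       int y;
+;       x += round_ptr[rc];
+;       y  = (((x * quant_ptr[rc]) >> 16) + x) >> quant_shift_ptr[rc];
+;       x  = (y ^ sz) - sz;                   /* restore the sign */
+;       qcoeff_ptr[rc]  = x;
+;       dqcoeff_ptr[rc] = x * dequant_ptr[rc];
+;       if (y) {
+;         eob = i;                            /* last nonzero, zig-zag order */
+;         zbin_boost_ptr = zrun_zbin_boost;   /* reset the zero-run boost */
+;       }
+;     }
+;   }
+;   d->eob = eob + 1;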
+
+; void vp9_fast_quantize_b_sse2 | arg
+;  (BLOCK  *b,                  |  0
+;   BLOCKD *d)                  |  1
+
+global sym(vp9_fast_quantize_b_sse2)
+sym(vp9_fast_quantize_b_sse2):
+    push        rbp
+    mov         rbp, rsp
+    GET_GOT     rbx
+
+%if ABI_IS_32BIT
+    push        rdi
+    push        rsi
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    push        rdi
+    push        rsi
+  %else
+    ; these registers are used for passing arguments
+  %endif
+%endif
+
+    ; end prolog
+
+%if ABI_IS_32BIT
+    mov         rdi, arg(0)                 ; BLOCK *b
+    mov         rsi, arg(1)                 ; BLOCKD *d
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    mov         rdi, rcx                    ; BLOCK *b
+    mov         rsi, rdx                    ; BLOCKD *d
+  %else
+    ;mov         rdi, rdi                    ; BLOCK *b
+    ;mov         rsi, rsi                    ; BLOCKD *d
+  %endif
+%endif
+
+    mov         rax, [rdi + vp9_block_coeff]
+    mov         rcx, [rdi + vp9_block_round]
+    mov         rdx, [rdi + vp9_block_quant_fast]
+
+    ; z = coeff
+    movdqa      xmm0, [rax]
+    movdqa      xmm4, [rax + 16]
+
+    ; dup z so we can save sz
+    movdqa      xmm1, xmm0
+    movdqa      xmm5, xmm4
+
+    ; sz = z >> 15
+    psraw       xmm0, 15
+    psraw       xmm4, 15
+
+    ; x = abs(z) = (z ^ sz) - sz
+    pxor        xmm1, xmm0
+    pxor        xmm5, xmm4
+    psubw       xmm1, xmm0
+    psubw       xmm5, xmm4
+
+    ; x += round
+    paddw       xmm1, [rcx]
+    paddw       xmm5, [rcx + 16]
+
+    mov         rax, [rsi + vp9_blockd_qcoeff]
+    mov         rcx, [rsi + vp9_blockd_dequant]
+    mov         rdi, [rsi + vp9_blockd_dqcoeff]
+
+    ; y = x * quant >> 16
+    pmulhw      xmm1, [rdx]
+    pmulhw      xmm5, [rdx + 16]
+
+    ; x = (y ^ sz) - sz
+    pxor        xmm1, xmm0
+    pxor        xmm5, xmm4
+    psubw       xmm1, xmm0
+    psubw       xmm5, xmm4
+
+    ; qcoeff = x
+    movdqa      [rax], xmm1
+    movdqa      [rax + 16], xmm5
+
+    ; x * dequant
+    movdqa      xmm2, xmm1
+    movdqa      xmm3, xmm5
+    pmullw      xmm2, [rcx]
+    pmullw      xmm3, [rcx + 16]
+
+    ; dqcoeff = x * dequant
+    movdqa      [rdi], xmm2
+    movdqa      [rdi + 16], xmm3
+
+    pxor        xmm4, xmm4                  ;clear all bits
+    pcmpeqw     xmm1, xmm4
+    pcmpeqw     xmm5, xmm4
+
+    pcmpeqw     xmm4, xmm4                  ;set all bits
+    pxor        xmm1, xmm4
+    pxor        xmm5, xmm4
+
+    pand        xmm1, [GLOBAL(inv_zig_zag)]
+    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]
+
+    pmaxsw      xmm1, xmm5
+
+    ; now down to 8
+    pshufd      xmm5, xmm1, 00001110b
+
+    pmaxsw      xmm1, xmm5
+
+    ; only 4 left
+    pshuflw     xmm5, xmm1, 00001110b
+
+    pmaxsw      xmm1, xmm5
+
+    ; okay, just 2!
+    pshuflw     xmm5, xmm1, 00000001b
+
+    pmaxsw      xmm1, xmm5
+
+    movd        eax, xmm1
+    and         eax, 0xff
+    mov         [rsi + vp9_blockd_eob], eax
+
+    ; begin epilog
+%if ABI_IS_32BIT
+    pop         rsi
+    pop         rdi
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    pop         rsi
+    pop         rdi
+  %endif
+%endif
+
+    RESTORE_GOT
+    pop         rbp
+    ret
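+
+; A C sketch of this fast path (names assumed as in the regular version):
+; no zero bin and no zero-run boost, just round, multiply-high and sign
+; restore, with the EOB taken as the largest inv_zig_zag[] entry among
+; nonzero coefficients:
+;
+;   int eob = 0;
+;   for (int rc = 0; rc < 16; rc++) {
+;     int z  = coeff_ptr[rc];
+;     int sz = z >> 31;
+;     int x  = (z ^ sz) - sz;                          /* abs(z) */
+;     int y  = ((x + round_ptr[rc]) * quant_fast_ptr[rc]) >> 16;
+;     x = (y ^ sz) - sz;
+;     qcoeff_ptr[rc]  = x;
+;     dqcoeff_ptr[rc] = x * dequant_ptr[rc];
+;     if (x && inv_zig_zag[rc] > eob)
+;       eob = inv_zig_zag[rc];                         /* 1-based position */
+;   }
+;   d->eob = eob;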
+
+SECTION_RODATA
+align 16
+inv_zig_zag:
+  dw 0x0001, 0x0002, 0x0006, 0x0007
+  dw 0x0003, 0x0005, 0x0008, 0x000d
+  dw 0x0004, 0x0009, 0x000c, 0x000e
+  dw 0x000a, 0x000b, 0x000f, 0x0010
--- /dev/null
+++ b/vp9/encoder/x86/quantize_sse4.asm
@@ -1,0 +1,254 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
+
+
+; void vp9_regular_quantize_b_sse4 | arg
+;  (BLOCK  *b,                     |  0
+;   BLOCKD *d)                     |  1
+
+global sym(vp9_regular_quantize_b_sse4)
+sym(vp9_regular_quantize_b_sse4):
+
+%if ABI_IS_32BIT
+    push        rbp
+    mov         rbp, rsp
+    GET_GOT     rbx
+    push        rdi
+    push        rsi
+
+    ALIGN_STACK 16, rax
+    %define qcoeff      0 ; 32
+    %define stack_size 32
+    sub         rsp, stack_size
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    SAVE_XMM 8, u
+    push        rdi
+    push        rsi
+  %endif
+%endif
+    ; end prolog
+
+%if ABI_IS_32BIT
+    mov         rdi, arg(0)                 ; BLOCK *b
+    mov         rsi, arg(1)                 ; BLOCKD *d
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    mov         rdi, rcx                    ; BLOCK *b
+    mov         rsi, rdx                    ; BLOCKD *d
+  %else
+    ;mov         rdi, rdi                    ; BLOCK *b
+    ;mov         rsi, rsi                    ; BLOCKD *d
+  %endif
+%endif
+
+    mov         rax, [rdi + vp9_block_coeff]
+    mov         rcx, [rdi + vp9_block_zbin]
+    mov         rdx, [rdi + vp9_block_round]
+    movd        xmm7, [rdi + vp9_block_zbin_extra]
+
+    ; z
+    movdqa      xmm0, [rax]
+    movdqa      xmm1, [rax + 16]
+
+    ; duplicate zbin_oq_value
+    pshuflw     xmm7, xmm7, 0
+    punpcklwd   xmm7, xmm7
+
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm1
+
+    ; sz
+    psraw       xmm0, 15
+    psraw       xmm1, 15
+
+    ; (z ^ sz)
+    pxor        xmm2, xmm0
+    pxor        xmm3, xmm1
+
+    ; x = abs(z)
+    psubw       xmm2, xmm0
+    psubw       xmm3, xmm1
+
+    ; zbin
+    movdqa      xmm4, [rcx]
+    movdqa      xmm5, [rcx + 16]
+
+    ; *zbin_ptr + zbin_oq_value
+    paddw       xmm4, xmm7
+    paddw       xmm5, xmm7
+
+    movdqa      xmm6, xmm2
+    movdqa      xmm7, xmm3
+
+    ; x - (*zbin_ptr + zbin_oq_value)
+    psubw       xmm6, xmm4
+    psubw       xmm7, xmm5
+
+    ; round
+    movdqa      xmm4, [rdx]
+    movdqa      xmm5, [rdx + 16]
+
+    mov         rax, [rdi + vp9_block_quant_shift]
+    mov         rcx, [rdi + vp9_block_quant]
+    mov         rdx, [rdi + vp9_block_zrun_zbin_boost]
+
+    ; x + round
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+
+    ; quant
+    movdqa      xmm4, [rcx]
+    movdqa      xmm5, [rcx + 16]
+
+    ; y = x * quant_ptr >> 16
+    pmulhw      xmm4, xmm2
+    pmulhw      xmm5, xmm3
+
+    ; y += x
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+
+    pxor        xmm4, xmm4
+%if ABI_IS_32BIT
+    movdqa      [rsp + qcoeff], xmm4
+    movdqa      [rsp + qcoeff + 16], xmm4
+%else
+    pxor        xmm8, xmm8
+%endif
+
+    ; quant_shift
+    movdqa      xmm5, [rax]
+
+    ; save the zrun_zbin_boost base pointer so the loop can reset to it
+    mov         rax, rdx
+
+%macro ZIGZAG_LOOP 5
+    ; x
+    pextrw      ecx, %4, %2
+
+    ; if (x >= zbin)
+    sub         cx, WORD PTR[rdx]           ; x - zbin
+    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
+    jl          .rq_zigzag_loop_%1          ; x < zbin
+
+    pextrw      edi, %3, %2                 ; y
+
+    ; downshift by quant_shift[rc]
+    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
+    sar         edi, cl                     ; also sets Z bit
+    je          .rq_zigzag_loop_%1          ; !y
+%if ABI_IS_32BIT
+    mov         WORD PTR[rsp + qcoeff + %1 * 2], di
+%else
+    pinsrw      %5, edi, %2                 ; qcoeff[rc]
+%endif
+    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
+.rq_zigzag_loop_%1:
+%endmacro
+; in vp9_default_zig_zag1d order: see vp9/common/entropy.c
+ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
+ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
+ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
+ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
+
+    mov         rcx, [rsi + vp9_blockd_dequant]
+    mov         rdi, [rsi + vp9_blockd_dqcoeff]
+
+%if ABI_IS_32BIT
+    movdqa      xmm4, [rsp + qcoeff]
+    movdqa      xmm5, [rsp + qcoeff + 16]
+%else
+    %define     xmm5 xmm8
+%endif
+
+    ; y ^ sz
+    pxor        xmm4, xmm0
+    pxor        xmm5, xmm1
+    ; x = (y ^ sz) - sz
+    psubw       xmm4, xmm0
+    psubw       xmm5, xmm1
+
+    ; dequant
+    movdqa      xmm0, [rcx]
+    movdqa      xmm1, [rcx + 16]
+
+    mov         rcx, [rsi + vp9_blockd_qcoeff]
+
+    pmullw      xmm0, xmm4
+    pmullw      xmm1, xmm5
+
+    ; store qcoeff
+    movdqa      [rcx], xmm4
+    movdqa      [rcx + 16], xmm5
+
+    ; store dqcoeff
+    movdqa      [rdi], xmm0
+    movdqa      [rdi + 16], xmm1
+
+    ; select the last value (in zig_zag order) for EOB
+    pxor        xmm6, xmm6
+    pcmpeqw     xmm4, xmm6
+    pcmpeqw     xmm5, xmm6
+
+    packsswb    xmm4, xmm5
+    pshufb      xmm4, [GLOBAL(zig_zag1d)]
+    pmovmskb    edx, xmm4
+    xor         rdi, rdi
+    mov         eax, -1
+    xor         dx, ax                      ;flip the bits for bsr
+    bsr         eax, edx
+    sub         edi, edx                    ;check for all zeros in bit mask
+    sar         edi, 31                     ;0 or -1
+    add         eax, 1
+    and         eax, edi                    ;if the bit mask was all zero,
+                                            ;then eob = 0
+
+    mov         [rsi + vp9_blockd_eob], eax
+
+    ; begin epilog
+%if ABI_IS_32BIT
+    add         rsp, stack_size
+    pop         rsp
+
+    pop         rsi
+    pop         rdi
+    RESTORE_GOT
+    pop         rbp
+%else
+  %undef xmm5
+  %ifidn __OUTPUT_FORMAT__,x64
+    pop         rsi
+    pop         rdi
+    RESTORE_XMM
+  %endif
+%endif
+
+    ret
+
+SECTION_RODATA
+align 16
+; vp9/common/entropy.c: vp9_default_zig_zag1d
+zig_zag1d:
+    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
--- /dev/null
+++ b/vp9/encoder/x86/quantize_ssse3.asm
@@ -1,0 +1,138 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
+
+
+; void vp9_fast_quantize_b_ssse3 | arg
+;  (BLOCK  *b,                   |  0
+;   BLOCKD *d)                   |  1
+;
+
+global sym(vp9_fast_quantize_b_ssse3)
+sym(vp9_fast_quantize_b_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    GET_GOT     rbx
+
+%if ABI_IS_32BIT
+    push        rdi
+    push        rsi
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    push        rdi
+    push        rsi
+  %endif
+%endif
+    ; end prolog
+
+%if ABI_IS_32BIT
+    mov         rdi, arg(0)                 ; BLOCK *b
+    mov         rsi, arg(1)                 ; BLOCKD *d
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    mov         rdi, rcx                    ; BLOCK *b
+    mov         rsi, rdx                    ; BLOCKD *d
+  %else
+    ;mov         rdi, rdi                    ; BLOCK *b
+    ;mov         rsi, rsi                    ; BLOCKD *d
+  %endif
+%endif
+
+    mov         rax, [rdi + vp9_block_coeff]
+    mov         rcx, [rdi + vp9_block_round]
+    mov         rdx, [rdi + vp9_block_quant_fast]
+
+    ; coeff
+    movdqa      xmm0, [rax]
+    movdqa      xmm4, [rax + 16]
+
+    ; round
+    movdqa      xmm2, [rcx]
+    movdqa      xmm3, [rcx + 16]
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm5, xmm4
+
+    ; sz = z >> 15
+    psraw       xmm0, 15
+    psraw       xmm4, 15
+
+    pabsw       xmm1, xmm1
+    pabsw       xmm5, xmm5
+
+    paddw       xmm1, xmm2
+    paddw       xmm5, xmm3
+
+    ; quant_fast
+    pmulhw      xmm1, [rdx]
+    pmulhw      xmm5, [rdx + 16]
+
+    mov         rax, [rsi + vp9_blockd_qcoeff]
+    mov         rdi, [rsi + vp9_blockd_dequant]
+    mov         rcx, [rsi + vp9_blockd_dqcoeff]
+
+    pxor        xmm1, xmm0
+    pxor        xmm5, xmm4
+    psubw       xmm1, xmm0
+    psubw       xmm5, xmm4
+
+    movdqa      [rax], xmm1
+    movdqa      [rax + 16], xmm5
+
+    movdqa      xmm2, [rdi]
+    movdqa      xmm3, [rdi + 16]
+
+    pxor        xmm4, xmm4
+    pmullw      xmm2, xmm1
+    pmullw      xmm3, xmm5
+
+    pcmpeqw     xmm1, xmm4                  ;zero mask: 0xffff where qcoeff == 0
+    pcmpeqw     xmm5, xmm4                  ;zero mask
+    packsswb    xmm1, xmm5
+    pshufb      xmm1, [GLOBAL(zz_shuf)]
+
+    pmovmskb    edx, xmm1
+
+    xor         rdi, rdi
+    mov         eax, -1
+    xor         dx, ax                      ;flip the bits for bsr
+    bsr         eax, edx
+
+    movdqa      [rcx], xmm2                 ;store dqcoeff
+    movdqa      [rcx + 16], xmm3            ;store dqcoeff
+
+    sub         edi, edx                    ;check for all zeros in bit mask
+    sar         edi, 31                     ;0 or -1
+    add         eax, 1
+    and         eax, edi                    ;if the bit mask was all zero,
+                                            ;then eob = 0
+    mov         [rsi + vp9_blockd_eob], eax
+
+    ; begin epilog
+%if ABI_IS_32BIT
+    pop         rsi
+    pop         rdi
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    pop         rsi
+    pop         rdi
+  %endif
+%endif
+
+    RESTORE_GOT
+    pop         rbp
+    ret
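+
+; The EOB here is computed branch-free from a 16-bit nonzero mask whose
+; bit i (after the pshufb with zz_shuf) corresponds to zig-zag position i;
+; a C sketch, with bsr() standing in for the BSR instruction (whose result
+; is simply masked away when no bit is set):
+;
+;   mask ^= 0xffff;                 /* pcmpeqw built a "zero" mask; invert */
+;   eob   = bsr(mask) + 1;          /* highest set bit, 1-based            */
+;   eob  &= (0 - mask) >> 31;       /* arithmetic shift: 0 if mask == 0    */
+;   d->eob = eob;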
+
+SECTION_RODATA
+align 16
+zz_shuf:
+    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
--- /dev/null
+++ b/vp9/encoder/x86/quantize_x86.h
@@ -1,0 +1,48 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+#ifndef QUANTIZE_X86_H
+#define QUANTIZE_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code.
+ */
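+
+/* For example, a runtime-detection build would instead pick the mapping at
+ * initialization; a hypothetical sketch (the rtcd field names here are
+ * illustrative only):
+ *
+ *   if (cpu_flags & HAS_SSE2)
+ *     rtcd->quantize.quantb = vp9_regular_quantize_b_sse2;
+ *   if (cpu_flags & HAS_SSE4_1)
+ *     rtcd->quantize.quantb = vp9_regular_quantize_b_sse4;
+ */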
+#if HAVE_MMX
+
+#endif /* HAVE_MMX */
+
+
+#if HAVE_SSE2
+extern prototype_quantize_block(vp9_regular_quantize_b_sse2);
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp9_quantize_quantb
+#define vp9_quantize_quantb vp9_regular_quantize_b_sse2
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_SSE2 */
+
+
+#if HAVE_SSE4_1
+extern prototype_quantize_block(vp9_regular_quantize_b_sse4);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp9_quantize_quantb
+#define vp9_quantize_quantb vp9_regular_quantize_b_sse4
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_SSE4_1 */
+
+#endif /* QUANTIZE_X86_H */
--- /dev/null
+++ b/vp9/encoder/x86/sad_mmx.asm
@@ -1,0 +1,427 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+global sym(vp9_sad16x16_mmx)
+global sym(vp9_sad8x16_mmx)
+global sym(vp9_sad8x8_mmx)
+global sym(vp9_sad4x4_mmx)
+global sym(vp9_sad16x8_mmx)
+
+;unsigned int vp9_sad16x16_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vp9_sad16x16_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]
+
+        lea             rcx,        [rcx+rax*8]
+        pxor            mm7,        mm7
+
+        pxor            mm6,        mm6
+
+.x16x16sad_mmx_loop:
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm2,        QWORD PTR [rsi+8]
+
+        movq            mm1,        QWORD PTR [rdi]
+        movq            mm3,        QWORD PTR [rdi+8]
+
+        movq            mm4,        mm0
+        movq            mm5,        mm2
+
+        psubusb         mm0,        mm1
+        psubusb         mm1,        mm4
+
+        psubusb         mm2,        mm3
+        psubusb         mm3,        mm5
+
+        por             mm0,        mm1
+        por             mm2,        mm3
+
+        movq            mm1,        mm0
+        movq            mm3,        mm2
+
+        punpcklbw       mm0,        mm6
+        punpcklbw       mm2,        mm6
+
+        punpckhbw       mm1,        mm6
+        punpckhbw       mm3,        mm6
+
+        paddw           mm0,        mm2
+        paddw           mm1,        mm3
+
+
+        lea             rsi,        [rsi+rax]
+        add             rdi,        rdx
+
+        paddw           mm7,        mm0
+        paddw           mm7,        mm1
+
+        cmp             rsi,        rcx
+        jne             .x16x16sad_mmx_loop
+
+
+        movq            mm0,        mm7
+
+        punpcklwd       mm0,        mm6
+        punpckhwd       mm7,        mm6
+
+        paddw           mm0,        mm7
+        movq            mm7,        mm0
+
+
+        psrlq           mm0,        32
+        paddw           mm7,        mm0
+
+        movq            rax,        mm7
+
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
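+
+; For reference, each vp9_sadWxH_mmx above computes the plain C sum
+;
+;   unsigned int sad = 0;
+;   for (r = 0; r < H; r++, src_ptr += src_stride, ref_ptr += ref_stride)
+;     for (c = 0; c < W; c++)
+;       sad += abs(src_ptr[c] - ref_ptr[c]);
+;
+; psadbw is an SSE extension, so these MMX-only versions take |a-b| per
+; byte with paired psubusb/por, widen bytes to words, and fold the word
+; accumulator down to a scalar at the end.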
+
+
+;unsigned int vp9_sad8x16_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vp9_sad8x16_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]
+
+        lea             rcx,        [rcx+rax*8]
+        pxor            mm7,        mm7
+
+        pxor            mm6,        mm6
+
+.x8x16sad_mmx_loop:
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rdi]
+
+        movq            mm2,        mm0
+        psubusb         mm0,        mm1
+
+        psubusb         mm1,        mm2
+        por             mm0,        mm1
+
+        movq            mm2,        mm0
+        punpcklbw       mm0,        mm6
+
+        punpckhbw       mm2,        mm6
+        lea             rsi,        [rsi+rax]
+
+        add             rdi,        rdx
+        paddw           mm7,        mm0
+
+        paddw           mm7,        mm2
+        cmp             rsi,        rcx
+
+        jne             .x8x16sad_mmx_loop
+
+        movq            mm0,        mm7
+        punpcklwd       mm0,        mm6
+
+        punpckhwd       mm7,        mm6
+        paddw           mm0,        mm7
+
+        movq            mm7,        mm0
+        psrlq           mm0,        32
+
+        paddw           mm7,        mm0
+        movq            rax,        mm7
+
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp9_sad8x8_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vp9_sad8x8_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]
+        pxor            mm7,        mm7
+
+        pxor            mm6,        mm6
+
+.x8x8sad_mmx_loop:
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rdi]
+
+        movq            mm2,        mm0
+        psubusb         mm0,        mm1
+
+        psubusb         mm1,        mm2
+        por             mm0,        mm1
+
+        movq            mm2,        mm0
+        punpcklbw       mm0,        mm6
+
+        punpckhbw       mm2,        mm6
+        paddw           mm0,        mm2
+
+        lea             rsi,        [rsi+rax]
+        add             rdi,        rdx
+
+        paddw           mm7,        mm0
+        cmp             rsi,        rcx
+
+        jne             .x8x8sad_mmx_loop
+
+        movq            mm0,        mm7
+        punpcklwd       mm0,        mm6
+
+        punpckhwd       mm7,        mm6
+        paddw           mm0,        mm7
+
+        movq            mm7,        mm0
+        psrlq           mm0,        32
+
+        paddw           mm7,        mm0
+        movq            rax,        mm7
+
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp9_sad4x4_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vp9_sad4x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        movd            mm0,        DWORD PTR [rsi]
+        movd            mm1,        DWORD PTR [rdi]
+
+        movd            mm2,        DWORD PTR [rsi+rax]
+        movd            mm3,        DWORD PTR [rdi+rdx]
+
+        punpcklbw       mm0,        mm2
+        punpcklbw       mm1,        mm3
+
+        movq            mm2,        mm0
+        psubusb         mm0,        mm1
+
+        psubusb         mm1,        mm2
+        por             mm0,        mm1
+
+        movq            mm2,        mm0
+        pxor            mm3,        mm3
+
+        punpcklbw       mm0,        mm3
+        punpckhbw       mm2,        mm3
+
+        paddw           mm0,        mm2
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        movd            mm4,        DWORD PTR [rsi]
+        movd            mm5,        DWORD PTR [rdi]
+
+        movd            mm6,        DWORD PTR [rsi+rax]
+        movd            mm7,        DWORD PTR [rdi+rdx]
+
+        punpcklbw       mm4,        mm6
+        punpcklbw       mm5,        mm7
+
+        movq            mm6,        mm4
+        psubusb         mm4,        mm5
+
+        psubusb         mm5,        mm6
+        por             mm4,        mm5
+
+        movq            mm5,        mm4
+        punpcklbw       mm4,        mm3
+
+        punpckhbw       mm5,        mm3
+        paddw           mm4,        mm5
+
+        paddw           mm0,        mm4
+        movq            mm1,        mm0
+
+        punpcklwd       mm0,        mm3
+        punpckhwd       mm1,        mm3
+
+        paddw           mm0,        mm1
+        movq            mm1,        mm0
+
+        psrlq           mm0,        32
+        paddw           mm0,        mm1
+
+        movq            rax,        mm0
+
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp9_sad16x8_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vp9_sad16x8_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]
+        pxor            mm7,        mm7
+
+        pxor            mm6,        mm6
+
+.x16x8sad_mmx_loop:
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rdi]
+
+        movq            mm2,        QWORD PTR [rsi+8]
+        movq            mm3,        QWORD PTR [rdi+8]
+
+        movq            mm4,        mm0
+        movq            mm5,        mm2
+
+        psubusb         mm0,        mm1
+        psubusb         mm1,        mm4
+
+        psubusb         mm2,        mm3
+        psubusb         mm3,        mm5
+
+        por             mm0,        mm1
+        por             mm2,        mm3
+
+        movq            mm1,        mm0
+        movq            mm3,        mm2
+
+        punpcklbw       mm0,        mm6
+        punpckhbw       mm1,        mm6
+
+        punpcklbw       mm2,        mm6
+        punpckhbw       mm3,        mm6
+
+
+        paddw           mm0,        mm2
+        paddw           mm1,        mm3
+
+        paddw           mm0,        mm1
+        lea             rsi,        [rsi+rax]
+
+        add             rdi,        rdx
+        paddw           mm7,        mm0
+
+        cmp             rsi,        rcx
+        jne             .x16x8sad_mmx_loop
+
+        movq            mm0,        mm7
+        punpcklwd       mm0,        mm6
+
+        punpckhwd       mm7,        mm6
+        paddw           mm0,        mm7
+
+        movq            mm7,        mm0
+        psrlq           mm0,        32
+
+        paddw           mm7,        mm0
+        movq            rax,        mm7
+
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
--- /dev/null
+++ b/vp9/encoder/x86/sad_sse2.asm
@@ -1,0 +1,410 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp9_sad16x16_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+global sym(vp9_sad16x16_wmt)
+sym(vp9_sad16x16_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    SAVE_XMM 6
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]
+
+        lea             rcx,        [rcx+rax*8]
+        pxor            xmm6,       xmm6
+
+.x16x16sad_wmt_loop:
+
+        movq            xmm0,       QWORD PTR [rsi]
+        movq            xmm2,       QWORD PTR [rsi+8]
+
+        movq            xmm1,       QWORD PTR [rdi]
+        movq            xmm3,       QWORD PTR [rdi+8]
+
+        movq            xmm4,       QWORD PTR [rsi+rax]
+        movq            xmm5,       QWORD PTR [rdi+rdx]
+
+
+        punpcklbw       xmm0,       xmm2
+        punpcklbw       xmm1,       xmm3
+
+        psadbw          xmm0,       xmm1
+        movq            xmm2,       QWORD PTR [rsi+rax+8]
+
+        movq            xmm3,       QWORD PTR [rdi+rdx+8]
+        lea             rsi,        [rsi+rax*2]
+
+        lea             rdi,        [rdi+rdx*2]
+        punpcklbw       xmm4,       xmm2
+
+        punpcklbw       xmm5,       xmm3
+        psadbw          xmm4,       xmm5
+
+        paddw           xmm6,       xmm0
+        paddw           xmm6,       xmm4
+
+        cmp             rsi,        rcx
+        jne             .x16x16sad_wmt_loop
+
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movq            rax,        xmm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
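+
+; The _wmt versions lean on psadbw, which per 64-bit lane behaves like the
+; C model below (two lanes per xmm register, one per mm register):
+;
+;   uint64_t psadbw_lane(const uint8_t a[8], const uint8_t b[8]) {
+;     uint64_t s = 0;
+;     for (int i = 0; i < 8; i++)
+;       s += (a[i] > b[i]) ? a[i] - b[i] : b[i] - a[i];
+;     return s;
+;   }
+;
+; so the epilog only needs one psrldq/paddw to add the high lane into the low.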
+
+;unsigned int vp9_sad8x16_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  max_err)
+global sym(vp9_sad8x16_wmt)
+sym(vp9_sad8x16_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rbx,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rbx*8]
+
+        lea             rcx,        [rcx+rbx*8]
+        pxor            mm7,        mm7
+
+.x8x16sad_wmt_loop:
+
+        movq            rax,        mm7
+        cmp             eax,        arg(4)
+        jg              .x8x16sad_wmt_early_exit
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rdi]
+
+        movq            mm2,        QWORD PTR [rsi+rbx]
+        movq            mm3,        QWORD PTR [rdi+rdx]
+
+        psadbw          mm0,        mm1
+        psadbw          mm2,        mm3
+
+        lea             rsi,        [rsi+rbx*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        paddw           mm7,        mm0
+        paddw           mm7,        mm2
+
+        cmp             rsi,        rcx
+        jne             .x8x16sad_wmt_loop
+
+        movq            rax,        mm7
+
+.x8x16sad_wmt_early_exit:
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    pop         rbx
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
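+
+; The arg(4) check at the top of the loop is an early exit, roughly
+;
+;   if (sad > max_err)
+;     return sad;    /* already worse than the best candidate so far */
+;
+; letting motion search abandon a hopeless candidate after every two rows;
+; vp9_sad8x8_wmt and vp9_sad16x8_wmt below use the same pattern.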
+
+
+;unsigned int vp9_sad8x8_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  max_err)
+global sym(vp9_sad8x8_wmt)
+sym(vp9_sad8x8_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rbx,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rbx*8]
+        pxor            mm7,        mm7
+
+.x8x8sad_wmt_loop:
+
+        movq            rax,        mm7
+        cmp             eax,        arg(4)
+        jg              .x8x8sad_wmt_early_exit
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rdi]
+
+        psadbw          mm0,        mm1
+        lea             rsi,        [rsi+rbx]
+
+        add             rdi,        rdx
+        paddw           mm7,        mm0
+
+        cmp             rsi,        rcx
+        jne             .x8x8sad_wmt_loop
+
+        movq            rax,        mm7
+.x8x8sad_wmt_early_exit:
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    pop         rbx
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;unsigned int vp9_sad4x4_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+global sym(vp9_sad4x4_wmt)
+sym(vp9_sad4x4_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        movd            mm0,        DWORD PTR [rsi]
+        movd            mm1,        DWORD PTR [rdi]
+
+        movd            mm2,        DWORD PTR [rsi+rax]
+        movd            mm3,        DWORD PTR [rdi+rdx]
+
+        punpcklbw       mm0,        mm2
+        punpcklbw       mm1,        mm3
+
+        psadbw          mm0,        mm1
+        lea             rsi,        [rsi+rax*2]
+
+        lea             rdi,        [rdi+rdx*2]
+        movd            mm4,        DWORD PTR [rsi]
+
+        movd            mm5,        DWORD PTR [rdi]
+        movd            mm6,        DWORD PTR [rsi+rax]
+
+        movd            mm7,        DWORD PTR [rdi+rdx]
+        punpcklbw       mm4,        mm6
+
+        punpcklbw       mm5,        mm7
+        psadbw          mm4,        mm5
+
+        paddw           mm0,        mm4
+        movq            rax,        mm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp9_sad16x8_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  max_err)
+global sym(vp9_sad16x8_wmt)
+sym(vp9_sad16x8_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rbx,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rbx*8]
+        pxor            mm7,        mm7
+
+.x16x8sad_wmt_loop:
+
+        movq            rax,        mm7
+        cmp             eax,        arg(4)
+        jg              .x16x8sad_wmt_early_exit
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm2,        QWORD PTR [rsi+8]
+
+        movq            mm1,        QWORD PTR [rdi]
+        movq            mm3,        QWORD PTR [rdi+8]
+
+        movq            mm4,        QWORD PTR [rsi+rbx]
+        movq            mm5,        QWORD PTR [rdi+rdx]
+
+        psadbw          mm0,        mm1
+        psadbw          mm2,        mm3
+
+        movq            mm1,        QWORD PTR [rsi+rbx+8]
+        movq            mm3,        QWORD PTR [rdi+rdx+8]
+
+        psadbw          mm4,        mm5
+        psadbw          mm1,        mm3
+
+        lea             rsi,        [rsi+rbx*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        paddw           mm0,        mm2
+        paddw           mm4,        mm1
+
+        paddw           mm7,        mm0
+        paddw           mm7,        mm4
+
+        cmp             rsi,        rcx
+        jne             .x16x8sad_wmt_loop
+
+        movq            rax,        mm7
+
+.x16x8sad_wmt_early_exit:
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    pop         rbx
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_copy32xn_sse2(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *dst_ptr,
+;    int  dst_stride,
+;    int height);
+global sym(vp9_copy32xn_sse2)
+sym(vp9_copy32xn_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;dst_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;dst_stride
+        movsxd          rcx,        dword ptr arg(4) ;height
+
+.block_copy_sse2_loopx4:
+        movdqu          xmm0,       XMMWORD PTR [rsi]
+        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
+        movdqu          xmm2,       XMMWORD PTR [rsi + rax]
+        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]
+
+        lea             rsi,        [rsi+rax*2]
+
+        movdqu          xmm4,       XMMWORD PTR [rsi]
+        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
+        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
+        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]
+
+        lea             rsi,    [rsi+rax*2]
+
+        movdqa          XMMWORD PTR [rdi], xmm0
+        movdqa          XMMWORD PTR [rdi + 16], xmm1
+        movdqa          XMMWORD PTR [rdi + rdx], xmm2
+        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3
+
+        lea             rdi,    [rdi+rdx*2]
+
+        movdqa          XMMWORD PTR [rdi], xmm4
+        movdqa          XMMWORD PTR [rdi + 16], xmm5
+        movdqa          XMMWORD PTR [rdi + rdx], xmm6
+        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7
+
+        lea             rdi,    [rdi+rdx*2]
+
+        sub             rcx,     4
+        cmp             rcx,     4
+        jge             .block_copy_sse2_loopx4
+
+        cmp             rcx, 0
+        je              .copy_is_done
+
+.block_copy_sse2_loop:
+        movdqu          xmm0,       XMMWORD PTR [rsi]
+        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
+        lea             rsi,    [rsi+rax]
+
+        movdqa          XMMWORD PTR [rdi], xmm0
+        movdqa          XMMWORD PTR [rdi + 16], xmm1
+        lea             rdi,    [rdi+rdx]
+
+        sub             rcx,     1
+        jne             .block_copy_sse2_loop
+
+.copy_is_done:
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
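+
+; Behaviorally this is just the C sketch below, unrolled four rows at a
+; time with a single-row loop for the remainder (movdqu loads because the
+; source may be unaligned; the movdqa stores assume a 16-byte-aligned dst):
+;
+;   for (r = 0; r < height; r++, src_ptr += src_stride, dst_ptr += dst_stride)
+;     memcpy(dst_ptr, src_ptr, 32);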
--- /dev/null
+++ b/vp9/encoder/x86/sad_sse3.asm
@@ -1,0 +1,960 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro STACK_FRAME_CREATE_X3 0
+%if ABI_IS_32BIT
+  %define     src_ptr       rsi
+  %define     src_stride    rax
+  %define     ref_ptr       rdi
+  %define     ref_stride    rdx
+  %define     end_ptr       rcx
+  %define     ret_var       rbx
+  %define     result_ptr    arg(4)
+  %define     max_err       arg(4)
+  %define     height        dword ptr arg(4)
+    push        rbp
+    mov         rbp,        rsp
+    push        rsi
+    push        rdi
+    push        rbx
+
+    mov         rsi,        arg(0)              ; src_ptr
+    mov         rdi,        arg(2)              ; ref_ptr
+
+    movsxd      rax,        dword ptr arg(1)    ; src_stride
+    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    SAVE_XMM 7, u
+    %define     src_ptr     rcx
+    %define     src_stride  rdx
+    %define     ref_ptr     r8
+    %define     ref_stride  r9
+    %define     end_ptr     r10
+    %define     ret_var     r11
+    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
+    %define     max_err     [rsp+xmm_stack_space+8+4*8]
+    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
+  %else
+    %define     src_ptr     rdi
+    %define     src_stride  rsi
+    %define     ref_ptr     rdx
+    %define     ref_stride  rcx
+    %define     end_ptr     r9
+    %define     ret_var     r10
+    %define     result_ptr  r8
+    %define     max_err     r8
+    %define     height      r8
+  %endif
+%endif
+
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X3 0
+  %define     src_ptr
+  %define     src_stride
+  %define     ref_ptr
+  %define     ref_stride
+  %define     end_ptr
+  %define     ret_var
+  %define     result_ptr
+  %define     max_err
+  %define     height
+
+%if ABI_IS_32BIT
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    pop         rbp
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    RESTORE_XMM
+  %endif
+%endif
+    ret
+%endmacro
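+
+; These STACK_FRAME_* macros paper over the three calling conventions this
+; file is built for: on 32-bit, arguments are read off the stack via arg();
+; on Win64 the first four arrive in rcx/rdx/r8/r9; on 64-bit SysV they
+; arrive in rdi/rsi/rdx/rcx (then r8/r9). Each macro binds the same
+; symbolic names (src_ptr, ref_stride, ...) to wherever that ABI put them.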
+
+%macro STACK_FRAME_CREATE_X4 0
+%if ABI_IS_32BIT
+  %define     src_ptr       rsi
+  %define     src_stride    rax
+  %define     r0_ptr        rcx
+  %define     r1_ptr        rdx
+  %define     r2_ptr        rbx
+  %define     r3_ptr        rdi
+  %define     ref_stride    rbp
+  %define     result_ptr    arg(4)
+    push        rbp
+    mov         rbp,        rsp
+    push        rsi
+    push        rdi
+    push        rbx
+
+    push        rbp
+    mov         rdi,        arg(2)              ; ref_ptr_base
+
+    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+    mov         rsi,        arg(0)              ; src_ptr
+
+    movsxd      rbx,        dword ptr arg(1)    ; src_stride
+    movsxd      rbp,        dword ptr arg(3)    ; ref_stride
+
+    xchg        rbx,        rax
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    SAVE_XMM 7, u
+    %define     src_ptr     rcx
+    %define     src_stride  rdx
+    %define     r0_ptr      rsi
+    %define     r1_ptr      r10
+    %define     r2_ptr      r11
+    %define     r3_ptr      r8
+    %define     ref_stride  r9
+    %define     result_ptr  [rsp+xmm_stack_space+16+4*8]
+    push        rsi
+
+    LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
+  %else
+    %define     src_ptr     rdi
+    %define     src_stride  rsi
+    %define     r0_ptr      r9
+    %define     r1_ptr      r10
+    %define     r2_ptr      r11
+    %define     r3_ptr      rdx
+    %define     ref_stride  rcx
+    %define     result_ptr  r8
+
+    LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
+
+  %endif
+%endif
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X4 0
+  %define     src_ptr
+  %define     src_stride
+  %define     r0_ptr
+  %define     r1_ptr
+  %define     r2_ptr
+  %define     r3_ptr
+  %define     ref_stride
+  %define     result_ptr
+
+%if ABI_IS_32BIT
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    pop         rbp
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    pop         rsi
+    RESTORE_XMM
+  %endif
+%endif
+    ret
+%endmacro
+
+%macro PROCESS_16X2X3 5
+%if %1==0
+        movdqa          xmm0,       XMMWORD PTR [%2]
+        lddqu           xmm5,       XMMWORD PTR [%3]
+        lddqu           xmm6,       XMMWORD PTR [%3+1]
+        lddqu           xmm7,       XMMWORD PTR [%3+2]
+
+        psadbw          xmm5,       xmm0
+        psadbw          xmm6,       xmm0
+        psadbw          xmm7,       xmm0
+%else
+        movdqa          xmm0,       XMMWORD PTR [%2]
+        lddqu           xmm1,       XMMWORD PTR [%3]
+        lddqu           xmm2,       XMMWORD PTR [%3+1]
+        lddqu           xmm3,       XMMWORD PTR [%3+2]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endif
+        movdqa          xmm0,       XMMWORD PTR [%2+%4]
+        lddqu           xmm1,       XMMWORD PTR [%3+%5]
+        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
+        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
+
+%if %1==0 || %1==1
+        lea             %2,         [%2+%4*2]
+        lea             %3,         [%3+%5*2]
+%endif
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endmacro
+
+%macro PROCESS_8X2X3 5
+%if %1==0
+        movq            mm0,       QWORD PTR [%2]
+        movq            mm5,       QWORD PTR [%3]
+        movq            mm6,       QWORD PTR [%3+1]
+        movq            mm7,       QWORD PTR [%3+2]
+
+        psadbw          mm5,       mm0
+        psadbw          mm6,       mm0
+        psadbw          mm7,       mm0
+%else
+        movq            mm0,       QWORD PTR [%2]
+        movq            mm1,       QWORD PTR [%3]
+        movq            mm2,       QWORD PTR [%3+1]
+        movq            mm3,       QWORD PTR [%3+2]
+
+        psadbw          mm1,       mm0
+        psadbw          mm2,       mm0
+        psadbw          mm3,       mm0
+
+        paddw           mm5,       mm1
+        paddw           mm6,       mm2
+        paddw           mm7,       mm3
+%endif
+        movq            mm0,       QWORD PTR [%2+%4]
+        movq            mm1,       QWORD PTR [%3+%5]
+        movq            mm2,       QWORD PTR [%3+%5+1]
+        movq            mm3,       QWORD PTR [%3+%5+2]
+
+%if %1==0 || %1==1
+        lea             %2,        [%2+%4*2]
+        lea             %3,        [%3+%5*2]
+%endif
+
+        psadbw          mm1,       mm0
+        psadbw          mm2,       mm0
+        psadbw          mm3,       mm0
+
+        paddw           mm5,       mm1
+        paddw           mm6,       mm2
+        paddw           mm7,       mm3
+%endmacro
+
+%macro LOAD_X4_ADDRESSES 5
+        mov             %2,         [%1+REG_SZ_BYTES*0]
+        mov             %3,         [%1+REG_SZ_BYTES*1]
+
+        mov             %4,         [%1+REG_SZ_BYTES*2]
+        mov             %5,         [%1+REG_SZ_BYTES*3]
+%endmacro
+
+%macro PROCESS_16X2X4 8
+%if %1==0
+        movdqa          xmm0,       XMMWORD PTR [%2]
+        lddqu           xmm4,       XMMWORD PTR [%3]
+        lddqu           xmm5,       XMMWORD PTR [%4]
+        lddqu           xmm6,       XMMWORD PTR [%5]
+        lddqu           xmm7,       XMMWORD PTR [%6]
+
+        psadbw          xmm4,       xmm0
+        psadbw          xmm5,       xmm0
+        psadbw          xmm6,       xmm0
+        psadbw          xmm7,       xmm0
+%else
+        movdqa          xmm0,       XMMWORD PTR [%2]
+        lddqu           xmm1,       XMMWORD PTR [%3]
+        lddqu           xmm2,       XMMWORD PTR [%4]
+        lddqu           xmm3,       XMMWORD PTR [%5]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm4,       xmm1
+        lddqu           xmm1,       XMMWORD PTR [%6]
+        paddw           xmm5,       xmm2
+        paddw           xmm6,       xmm3
+
+        psadbw          xmm1,       xmm0
+        paddw           xmm7,       xmm1
+%endif
+        movdqa          xmm0,       XMMWORD PTR [%2+%7]
+        lddqu           xmm1,       XMMWORD PTR [%3+%8]
+        lddqu           xmm2,       XMMWORD PTR [%4+%8]
+        lddqu           xmm3,       XMMWORD PTR [%5+%8]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm4,       xmm1
+        lddqu           xmm1,       XMMWORD PTR [%6+%8]
+        paddw           xmm5,       xmm2
+        paddw           xmm6,       xmm3
+
+%if %1==0 || %1==1
+        lea             %2,         [%2+%7*2]
+        lea             %3,         [%3+%8*2]
+
+        lea             %4,         [%4+%8*2]
+        lea             %5,         [%5+%8*2]
+
+        lea             %6,         [%6+%8*2]
+%endif
+        psadbw          xmm1,       xmm0
+        paddw           xmm7,       xmm1
+
+%endmacro
+
+%macro PROCESS_8X2X4 8
+%if %1==0
+        movq            mm0,        QWORD PTR [%2]
+        movq            mm4,        QWORD PTR [%3]
+        movq            mm5,        QWORD PTR [%4]
+        movq            mm6,        QWORD PTR [%5]
+        movq            mm7,        QWORD PTR [%6]
+
+        psadbw          mm4,        mm0
+        psadbw          mm5,        mm0
+        psadbw          mm6,        mm0
+        psadbw          mm7,        mm0
+%else
+        movq            mm0,        QWORD PTR [%2]
+        movq            mm1,        QWORD PTR [%3]
+        movq            mm2,        QWORD PTR [%4]
+        movq            mm3,        QWORD PTR [%5]
+
+        psadbw          mm1,        mm0
+        psadbw          mm2,        mm0
+        psadbw          mm3,        mm0
+
+        paddw           mm4,        mm1
+        movq            mm1,        QWORD PTR [%6]
+        paddw           mm5,        mm2
+        paddw           mm6,        mm3
+
+        psadbw          mm1,        mm0
+        paddw           mm7,        mm1
+%endif
+        movq            mm0,        QWORD PTR [%2+%7]
+        movq            mm1,        QWORD PTR [%3+%8]
+        movq            mm2,        QWORD PTR [%4+%8]
+        movq            mm3,        QWORD PTR [%5+%8]
+
+        psadbw          mm1,        mm0
+        psadbw          mm2,        mm0
+        psadbw          mm3,        mm0
+
+        paddw           mm4,        mm1
+        movq            mm1,        QWORD PTR [%6+%8]
+        paddw           mm5,        mm2
+        paddw           mm6,        mm3
+
+%if %1==0 || %1==1
+        lea             %2,         [%2+%7*2]
+        lea             %3,         [%3+%8*2]
+
+        lea             %4,         [%4+%8*2]
+        lea             %5,         [%5+%8*2]
+
+        lea             %6,         [%6+%8*2]
+%endif
+        psadbw          mm1,        mm0
+        paddw           mm7,        mm1
+
+%endmacro
+
+;void vp9_sad16x16x3_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad16x16x3_sse3)
+sym(vp9_sad16x16x3_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+        mov             rcx,        result_ptr
+
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rcx],      xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rcx+4],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rcx+8],    xmm0
+
+    STACK_FRAME_DESTROY_X3
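+
+; Each x3 routine scores three horizontally adjacent candidates in a single
+; pass, equivalent to this C sketch (sad16x16() standing for the scalar
+; reference SAD):
+;
+;   for (i = 0; i < 3; i++)
+;     results[i] = sad16x16(src_ptr, src_stride, ref_ptr + i, ref_stride);
+;
+; the unaligned lddqu loads at [ref_ptr], [ref_ptr+1] and [ref_ptr+2] feed
+; three separate psadbw accumulators (xmm5/xmm6/xmm7).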
+
+;void vp9_sad16x8x3_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad16x8x3_sse3)
+sym(vp9_sad16x8x3_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+        mov             rcx,        result_ptr
+
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rcx],      xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rcx+4],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rcx+8],    xmm0
+
+    STACK_FRAME_DESTROY_X3
+
+;void vp9_sad8x16x3_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad8x16x3_sse3)
+sym(vp9_sad8x16x3_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+        mov             rcx,        result_ptr
+
+        punpckldq       mm5,        mm6
+
+        movq            [rcx],      mm5
+        movd            [rcx+8],    mm7
+
+    STACK_FRAME_DESTROY_X3
+
+;void vp9_sad8x8x3_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad8x8x3_sse3)
+sym(vp9_sad8x8x3_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+        mov             rcx,        result_ptr
+
+        punpckldq       mm5,        mm6
+
+        movq            [rcx],      mm5
+        movd            [rcx+8],    mm7
+
+    STACK_FRAME_DESTROY_X3
+
+;void vp9_sad4x4x3_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad4x4x3_sse3)
+sym(vp9_sad4x4x3_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+        movd            mm0,        DWORD PTR [src_ptr]
+        movd            mm1,        DWORD PTR [ref_ptr]
+
+        movd            mm2,        DWORD PTR [src_ptr+src_stride]
+        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
+
+        punpcklbw       mm0,        mm2
+        punpcklbw       mm1,        mm3
+
+        movd            mm4,        DWORD PTR [ref_ptr+1]
+        movd            mm5,        DWORD PTR [ref_ptr+2]
+
+        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
+        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
+
+        psadbw          mm1,        mm0
+
+        punpcklbw       mm4,        mm2
+        punpcklbw       mm5,        mm3
+
+        psadbw          mm4,        mm0
+        psadbw          mm5,        mm0
+
+        lea             src_ptr,    [src_ptr+src_stride*2]
+        lea             ref_ptr,    [ref_ptr+ref_stride*2]
+
+        movd            mm0,        DWORD PTR [src_ptr]
+        movd            mm2,        DWORD PTR [ref_ptr]
+
+        movd            mm3,        DWORD PTR [src_ptr+src_stride]
+        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
+
+        punpcklbw       mm0,        mm3
+        punpcklbw       mm2,        mm6
+
+        movd            mm3,        DWORD PTR [ref_ptr+1]
+        movd            mm7,        DWORD PTR [ref_ptr+2]
+
+        psadbw          mm2,        mm0
+
+        paddw           mm1,        mm2
+
+        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
+        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
+
+        punpcklbw       mm3,        mm2
+        punpcklbw       mm7,        mm6
+
+        psadbw          mm3,        mm0
+        psadbw          mm7,        mm0
+
+        paddw           mm3,        mm4
+        paddw           mm7,        mm5
+
+        mov             rcx,        result_ptr
+
+        punpckldq       mm1,        mm3
+
+        movq            [rcx],      mm1
+        movd            [rcx+8],    mm7
+
+    STACK_FRAME_DESTROY_X3
+
+;unsigned int vp9_sad16x16_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  max_err)
+;%define lddqu movdqu
+global sym(vp9_sad16x16_sse3)
+sym(vp9_sad16x16_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+        mov             end_ptr,    4
+        pxor            xmm7,        xmm7
+
+.vp9_sad16x16_sse3_loop:
+        movdqa          xmm0,       XMMWORD PTR [src_ptr]
+        movdqu          xmm1,       XMMWORD PTR [ref_ptr]
+        movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
+        movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]
+
+        lea             src_ptr,    [src_ptr+src_stride*2]
+        lea             ref_ptr,    [ref_ptr+ref_stride*2]
+
+        movdqa          xmm4,       XMMWORD PTR [src_ptr]
+        movdqu          xmm5,       XMMWORD PTR [ref_ptr]
+        movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]
+
+        psadbw          xmm0,       xmm1
+
+        movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]
+
+        psadbw          xmm2,       xmm3
+        psadbw          xmm4,       xmm5
+        psadbw          xmm6,       xmm1
+
+        lea             src_ptr,    [src_ptr+src_stride*2]
+        lea             ref_ptr,    [ref_ptr+ref_stride*2]
+
+        paddw           xmm7,        xmm0
+        paddw           xmm7,        xmm2
+        paddw           xmm7,        xmm4
+        paddw           xmm7,        xmm6
+
+        sub             end_ptr,     1
+        jne             .vp9_sad16x16_sse3_loop
+
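+        ; psadbw left two 64-bit partial sums in the low and high qwords of
+        ; xmm7; fold the high half onto the low and return the total in rax.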
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+        paddw           xmm0,       xmm7
+        movq            rax,        xmm0
+
+    STACK_FRAME_DESTROY_X3
+
+;void vp9_copy32xn_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *dst_ptr,
+;    int  dst_stride,
+;    int height);
+global sym(vp9_copy32xn_sse3)
+sym(vp9_copy32xn_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+.block_copy_sse3_loopx4:
+        lea             end_ptr,    [src_ptr+src_stride*2]
+
+        movdqu          xmm0,       XMMWORD PTR [src_ptr]
+        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
+        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
+        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
+        movdqu          xmm4,       XMMWORD PTR [end_ptr]
+        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
+        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
+        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]
+
+        lea             src_ptr,    [src_ptr+src_stride*4]
+
+        lea             end_ptr,    [ref_ptr+ref_stride*2]
+
+        movdqa          XMMWORD PTR [ref_ptr], xmm0
+        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
+        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
+        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
+        movdqa          XMMWORD PTR [end_ptr], xmm4
+        movdqa          XMMWORD PTR [end_ptr + 16], xmm5
+        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
+        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
+
+        lea             ref_ptr,    [ref_ptr+ref_stride*4]
+
+        sub             height,     4
+        cmp             height,     4
+        jge             .block_copy_sse3_loopx4
+
+        ;Check to see if there are more rows that need to be copied.
+        cmp             height, 0
+        je              .copy_is_done
+
+.block_copy_sse3_loop:
+        movdqu          xmm0,       XMMWORD PTR [src_ptr]
+        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
+        lea             src_ptr,    [src_ptr+src_stride]
+
+        movdqa          XMMWORD PTR [ref_ptr], xmm0
+        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
+        lea             ref_ptr,    [ref_ptr+ref_stride]
+
+        sub             height,     1
+        jne             .block_copy_sse3_loop
+
+.copy_is_done:
+    STACK_FRAME_DESTROY_X3
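+
+; Note: despite reusing the ref_ptr/ref_stride register names from the SAD
+; stack frame, this is a plain 32-byte-wide block copy; roughly (illustrative
+; C only):
+;
+;   for (int r = 0; r < height; r++)
+;     memcpy(dst_ptr + r * dst_stride, src_ptr + r * src_stride, 32);
+;
+; with the main loop handling four rows per iteration and the tail loop
+; finishing the remainder one row at a time.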
+
+;void vp9_sad16x16x4d_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr_base,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad16x16x4d_sse3)
+sym(vp9_sad16x16x4d_sse3):
+
+    STACK_FRAME_CREATE_X4
+
+        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+        pop             rbp
+%endif
+        mov             rcx,        result_ptr
+
+        movq            xmm0,       xmm4
+        psrldq          xmm4,       8
+
+        paddw           xmm0,       xmm4
+        movd            [rcx],      xmm0
+;-
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rcx+4],    xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rcx+8],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rcx+12],   xmm0
+
+    STACK_FRAME_DESTROY_X4
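+
+; Note: the x4d kernels differ from the x3 kernels in that the four reference
+; blocks are independent candidate pointers (r0_ptr..r3_ptr, set up by
+; STACK_FRAME_CREATE_X4) rather than 1-byte offsets of one pointer; roughly
+; (illustrative C only, sad16x16 being a hypothetical scalar helper):
+;
+;   for (int i = 0; i < 4; i++)
+;     results[i] = sad16x16(src_ptr, src_stride, ref[i], ref_stride);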
+
+;void vp9_sad16x8x4d_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr_base,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad16x8x4d_sse3)
+sym(vp9_sad16x8x4d_sse3):
+
+    STACK_FRAME_CREATE_X4
+
+        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+        pop             rbp
+%endif
+        mov             rcx,        result_ptr
+
+        movq            xmm0,       xmm4
+        psrldq          xmm4,       8
+
+        paddw           xmm0,       xmm4
+        movd            [rcx],      xmm0
+;-
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rcx+4],    xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rcx+8],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rcx+12],   xmm0
+
+    STACK_FRAME_DESTROY_X4
+
+;void vp9_sad8x16x4d_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad8x16x4d_sse3)
+sym(vp9_sad8x16x4d_sse3):
+
+    STACK_FRAME_CREATE_X4
+
+        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+        pop             rbp
+%endif
+        mov             rcx,        result_ptr
+
+        punpckldq       mm4,        mm5
+        punpckldq       mm6,        mm7
+
+        movq            [rcx],      mm4
+        movq            [rcx+8],    mm6
+
+    STACK_FRAME_DESTROY_X4
+
+;void vp9_sad8x8x4d_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad8x8x4d_sse3)
+sym(vp9_sad8x8x4d_sse3):
+
+    STACK_FRAME_CREATE_X4
+
+        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+        pop             rbp
+%endif
+        mov             rcx,        result_ptr
+
+        punpckldq       mm4,        mm5
+        punpckldq       mm6,        mm7
+
+        movq            [rcx],      mm4
+        movq            [rcx+8],    mm6
+
+    STACK_FRAME_DESTROY_X4
+
+;void vp9_sad4x4x4d_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad4x4x4d_sse3)
+sym(vp9_sad4x4x4d_sse3):
+
+    STACK_FRAME_CREATE_X4
+
+        movd            mm0,        DWORD PTR [src_ptr]
+        movd            mm1,        DWORD PTR [r0_ptr]
+
+        movd            mm2,        DWORD PTR [src_ptr+src_stride]
+        movd            mm3,        DWORD PTR [r0_ptr+ref_stride]
+
+        punpcklbw       mm0,        mm2
+        punpcklbw       mm1,        mm3
+
+        movd            mm4,        DWORD PTR [r1_ptr]
+        movd            mm5,        DWORD PTR [r2_ptr]
+
+        movd            mm6,        DWORD PTR [r3_ptr]
+        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
+
+        movd            mm3,        DWORD PTR [r2_ptr+ref_stride]
+        movd            mm7,        DWORD PTR [r3_ptr+ref_stride]
+
+        psadbw          mm1,        mm0
+
+        punpcklbw       mm4,        mm2
+        punpcklbw       mm5,        mm3
+
+        punpcklbw       mm6,        mm7
+        psadbw          mm4,        mm0
+
+        psadbw          mm5,        mm0
+        psadbw          mm6,        mm0
+
+
+
+        lea             src_ptr,    [src_ptr+src_stride*2]
+        lea             r0_ptr,     [r0_ptr+ref_stride*2]
+
+        lea             r1_ptr,     [r1_ptr+ref_stride*2]
+        lea             r2_ptr,     [r2_ptr+ref_stride*2]
+
+        lea             r3_ptr,     [r3_ptr+ref_stride*2]
+
+        movd            mm0,        DWORD PTR [src_ptr]
+        movd            mm2,        DWORD PTR [r0_ptr]
+
+        movd            mm3,        DWORD PTR [src_ptr+src_stride]
+        movd            mm7,        DWORD PTR [r0_ptr+ref_stride]
+
+        punpcklbw       mm0,        mm3
+        punpcklbw       mm2,        mm7
+
+        movd            mm3,        DWORD PTR [r1_ptr]
+        movd            mm7,        DWORD PTR [r2_ptr]
+
+        psadbw          mm2,        mm0
+%if ABI_IS_32BIT
+        mov             rax,        rbp
+
+        pop             rbp
+%define     ref_stride    rax
+%endif
+        mov             rsi,        result_ptr
+
+        paddw           mm1,        mm2
+        movd            [rsi],      mm1
+
+        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
+        movd            mm1,        DWORD PTR [r2_ptr+ref_stride]
+
+        punpcklbw       mm3,        mm2
+        punpcklbw       mm7,        mm1
+
+        psadbw          mm3,        mm0
+        psadbw          mm7,        mm0
+
+        movd            mm2,        DWORD PTR [r3_ptr]
+        movd            mm1,        DWORD PTR [r3_ptr+ref_stride]
+
+        paddw           mm3,        mm4
+        paddw           mm7,        mm5
+
+        movd            [rsi+4],    mm3
+        punpcklbw       mm2,        mm1
+
+        movd            [rsi+8],    mm7
+        psadbw          mm2,        mm0
+
+        paddw           mm2,        mm6
+        movd            [rsi+12],   mm2
+
+
+    STACK_FRAME_DESTROY_X4
+
--- /dev/null
+++ b/vp9/encoder/x86/sad_sse4.asm
@@ -1,0 +1,353 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro PROCESS_16X2X8 1
+%if %1
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        movq            xmm1,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        movq            xmm2,       MMWORD PTR [rdi+16]
+        punpcklqdq      xmm1,       xmm3
+        punpcklqdq      xmm3,       xmm2
+
+        movdqa          xmm2,       xmm1
+        mpsadbw         xmm1,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+
+        psrldq          xmm0,       8
+
+        movdqa          xmm4,       xmm3
+        mpsadbw         xmm3,       xmm0,  0x0
+        mpsadbw         xmm4,       xmm0,  0x5
+
+        paddw           xmm1,       xmm2
+        paddw           xmm1,       xmm3
+        paddw           xmm1,       xmm4
+%else
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        movq            xmm5,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        movq            xmm2,       MMWORD PTR [rdi+16]
+        punpcklqdq      xmm5,       xmm3
+        punpcklqdq      xmm3,       xmm2
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+
+        psrldq          xmm0,       8
+
+        movdqa          xmm4,       xmm3
+        mpsadbw         xmm3,       xmm0,  0x0
+        mpsadbw         xmm4,       xmm0,  0x5
+
+        paddw           xmm5,       xmm2
+        paddw           xmm5,       xmm3
+        paddw           xmm5,       xmm4
+
+        paddw           xmm1,       xmm5
+%endif
+        movdqa          xmm0,       XMMWORD PTR [rsi + rax]
+        movq            xmm5,       MMWORD PTR [rdi+ rdx]
+        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
+        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
+        punpcklqdq      xmm5,       xmm3
+        punpcklqdq      xmm3,       xmm2
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+
+        psrldq          xmm0,       8
+        movdqa          xmm4,       xmm3
+        mpsadbw         xmm3,       xmm0,  0x0
+        mpsadbw         xmm4,       xmm0,  0x5
+
+        paddw           xmm5,       xmm2
+        paddw           xmm5,       xmm3
+        paddw           xmm5,       xmm4
+
+        paddw           xmm1,       xmm5
+%endmacro
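+
+; Note on the mpsadbw pairs above: each mpsadbw produces eight 16-bit SADs of
+; one fixed 4-byte source group (second operand, selected by imm[1:0])
+; against eight overlapping 4-byte reference groups that slide one byte at a
+; time through the first operand. Pairing immediates 0x0 and 0x5, and
+; repeating on the psrldq-shifted halves, therefore accumulates in xmm1 the
+; full 16-byte row SAD at each of eight successive horizontal offsets.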
+
+%macro PROCESS_8X2X8 1
+%if %1
+        movq            xmm0,       MMWORD PTR [rsi]
+        movq            xmm1,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        punpcklqdq      xmm1,       xmm3
+
+        movdqa          xmm2,       xmm1
+        mpsadbw         xmm1,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+        paddw           xmm1,       xmm2
+%else
+        movq            xmm0,       MMWORD PTR [rsi]
+        movq            xmm5,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        punpcklqdq      xmm5,       xmm3
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+        paddw           xmm5,       xmm2
+
+        paddw           xmm1,       xmm5
+%endif
+        movq            xmm0,       MMWORD PTR [rsi + rax]
+        movq            xmm5,       MMWORD PTR [rdi+ rdx]
+        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
+        punpcklqdq      xmm5,       xmm3
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        movdqa          xmm2,       xmm5
+        mpsadbw         xmm5,       xmm0,  0x0
+        mpsadbw         xmm2,       xmm0,  0x5
+        paddw           xmm5,       xmm2
+
+        paddw           xmm1,       xmm5
+%endmacro
+
+%macro PROCESS_4X2X8 1
+%if %1
+        movd            xmm0,       [rsi]
+        movq            xmm1,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        punpcklqdq      xmm1,       xmm3
+
+        mpsadbw         xmm1,       xmm0,  0x0
+%else
+        movd            xmm0,       [rsi]
+        movq            xmm5,       MMWORD PTR [rdi]
+        movq            xmm3,       MMWORD PTR [rdi+8]
+        punpcklqdq      xmm5,       xmm3
+
+        mpsadbw         xmm5,       xmm0,  0x0
+
+        paddw           xmm1,       xmm5
+%endif
+        movd            xmm0,       [rsi + rax]
+        movq            xmm5,       MMWORD PTR [rdi+ rdx]
+        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
+        punpcklqdq      xmm5,       xmm3
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        mpsadbw         xmm5,       xmm0,  0x0
+
+        paddw           xmm1,       xmm5
+%endmacro
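+
+; For the 4-wide blocks the source row is a single 4-byte group, so one
+; mpsadbw per row already yields all eight offset SADs and no immediate
+; pairing is needed.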
+
+
+;void vp9_sad16x16x8_sse4(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned short *sad_array);
+global sym(vp9_sad16x16x8_sse4)
+sym(vp9_sad16x16x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_16X2X8 1
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+
+        mov             rdi,        arg(4)           ;Results
+        movdqa          XMMWORD PTR [rdi],    xmm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_sad16x8x8_sse4(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned short *sad_array
+;);
+global sym(vp9_sad16x8x8_sse4)
+sym(vp9_sad16x8x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_16X2X8 1
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+        PROCESS_16X2X8 0
+
+        mov             rdi,        arg(4)           ;Results
+        movdqa          XMMWORD PTR [rdi],    xmm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_sad8x8x8_sse4(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned short *sad_array
+;);
+global sym(vp9_sad8x8x8_sse4)
+sym(vp9_sad8x8x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_8X2X8 1
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+
+        mov             rdi,        arg(4)           ;Results
+        movdqa          XMMWORD PTR [rdi],    xmm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_sad8x16x8_sse4(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned short *sad_array
+;);
+global sym(vp9_sad8x16x8_sse4)
+sym(vp9_sad8x16x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_8X2X8 1
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        PROCESS_8X2X8 0
+        mov             rdi,        arg(4)           ;Results
+        movdqa          XMMWORD PTR [rdi],    xmm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_sad4x4x8_sse4(
+;    const unsigned char *src_ptr,
+;    int  src_stride,
+;    const unsigned char *ref_ptr,
+;    int  ref_stride,
+;    unsigned short *sad_array
+;);
+global sym(vp9_sad4x4x8_sse4)
+sym(vp9_sad4x4x8_sse4):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0)           ;src_ptr
+        mov             rdi,        arg(2)           ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_4X2X8 1
+        PROCESS_4X2X8 0
+
+        mov             rdi,        arg(4)           ;Results
+        movdqa          XMMWORD PTR [rdi],    xmm1
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+
--- /dev/null
+++ b/vp9/encoder/x86/sad_ssse3.asm
@@ -1,0 +1,370 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro PROCESS_16X2X3 1
+%if %1
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        lddqu           xmm5,       XMMWORD PTR [rdi]
+        lddqu           xmm6,       XMMWORD PTR [rdi+1]
+        lddqu           xmm7,       XMMWORD PTR [rdi+2]
+
+        psadbw          xmm5,       xmm0
+        psadbw          xmm6,       xmm0
+        psadbw          xmm7,       xmm0
+%else
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        lddqu           xmm1,       XMMWORD PTR [rdi]
+        lddqu           xmm2,       XMMWORD PTR [rdi+1]
+        lddqu           xmm3,       XMMWORD PTR [rdi+2]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endif
+        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
+        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
+        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
+        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endmacro
+
+%macro PROCESS_16X2X3_OFFSET 2
+%if %1
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        movdqa          xmm4,       XMMWORD PTR [rdi]
+        movdqa          xmm7,       XMMWORD PTR [rdi+16]
+
+        movdqa          xmm5,       xmm7
+        palignr         xmm5,       xmm4,       %2
+
+        movdqa          xmm6,       xmm7
+        palignr         xmm6,       xmm4,       (%2+1)
+
+        palignr         xmm7,       xmm4,       (%2+2)
+
+        psadbw          xmm5,       xmm0
+        psadbw          xmm6,       xmm0
+        psadbw          xmm7,       xmm0
+%else
+        movdqa          xmm0,       XMMWORD PTR [rsi]
+        movdqa          xmm4,       XMMWORD PTR [rdi]
+        movdqa          xmm3,       XMMWORD PTR [rdi+16]
+
+        movdqa          xmm1,       xmm3
+        palignr         xmm1,       xmm4,       %2
+
+        movdqa          xmm2,       xmm3
+        palignr         xmm2,       xmm4,       (%2+1)
+
+        palignr         xmm3,       xmm4,       (%2+2)
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endif
+        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
+        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
+        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]
+
+        movdqa          xmm1,       xmm3
+        palignr         xmm1,       xmm4,       %2
+
+        movdqa          xmm2,       xmm3
+        palignr         xmm2,       xmm4,       (%2+1)
+
+        palignr         xmm3,       xmm4,       (%2+2)
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endmacro
+
+%macro PROCESS_16X16X3_OFFSET 2
+%2_aligned_by_%1:
+
+        sub             rdi,        %1
+
+        PROCESS_16X2X3_OFFSET 1, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+
+        jmp             %2_store_off
+
+%endmacro
+
+%macro PROCESS_16X8X3_OFFSET 2
+%2_aligned_by_%1:
+
+        sub             rdi,        %1
+
+        PROCESS_16X2X3_OFFSET 1, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+
+        jmp             %2_store_off
+
+%endmacro
+
+;void vp9_sad16x16x3_ssse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad16x16x3_ssse3)
+sym(vp9_sad16x16x3_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rcx
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        mov             rdx,        0xf
+        and             rdx,        rdi
+
+        jmp .vp9_sad16x16x3_ssse3_skiptable
+.vp9_sad16x16x3_ssse3_jumptable:
+        dd .vp9_sad16x16x3_ssse3_aligned_by_0  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_1  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_2  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_3  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_4  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_5  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_6  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_7  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_8  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_9  - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump
+        dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump
+.vp9_sad16x16x3_ssse3_skiptable:
+
+        call .vp9_sad16x16x3_ssse3_do_jump
+.vp9_sad16x16x3_ssse3_do_jump:
+        pop             rcx                         ; get the address of do_jump
+        mov             rax,  .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump
+        add             rax,  rcx  ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable
+
+        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
+        add             rcx,        rax
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        jmp             rcx
+
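+; The call/pop pair above is the classic position-independent way to take the
+; address of a label: the return address pushed by call is exactly
+; .vp9_sad16x16x3_ssse3_do_jump. Adding the jump table's 32-bit relative
+; entry, indexed by the low four bits of ref_ptr, then dispatches to the
+; palignr variant specialized for that misalignment; vp9_sad16x8x3_ssse3
+; below uses the same pattern.
+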
+        PROCESS_16X16X3_OFFSET 0,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 1,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 2,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 3,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 4,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 5,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 6,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 7,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 8,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 9,  .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3
+
+.vp9_sad16x16x3_ssse3_aligned_by_15:
+        PROCESS_16X2X3 1
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+
+.vp9_sad16x16x3_ssse3_store_off:
+        mov             rdi,        arg(4) ;Results
+
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rdi],      xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rdi+4],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rdi+8],    xmm0
+
+    ; begin epilog
+    pop         rcx
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_sad16x8x3_ssse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp9_sad16x8x3_ssse3)
+sym(vp9_sad16x8x3_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rcx
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        mov             rdx,        0xf
+        and             rdx,        rdi
+
+        jmp .vp9_sad16x8x3_ssse3_skiptable
+.vp9_sad16x8x3_ssse3_jumptable:
+        dd .vp9_sad16x8x3_ssse3_aligned_by_0  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_1  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_2  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_3  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_4  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_5  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_6  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_7  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_8  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_9  - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump
+        dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump
+.vp9_sad16x8x3_ssse3_skiptable:
+
+        call .vp9_sad16x8x3_ssse3_do_jump
+.vp9_sad16x8x3_ssse3_do_jump:
+        pop             rcx                         ; get the address of do_jump
+        mov             rax,  .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump
+        add             rax,  rcx  ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable
+
+        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
+        add             rcx,        rax
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        jmp             rcx
+
+        PROCESS_16X8X3_OFFSET 0,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 1,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 2,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 3,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 4,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 5,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 6,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 7,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 8,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 9,  .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3
+
+.vp9_sad16x8x3_ssse3_aligned_by_15:
+
+        PROCESS_16X2X3 1
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+
+.vp9_sad16x8x3_ssse3_store_off:
+        mov             rdi,        arg(4) ;Results
+
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rdi],      xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rdi+4],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rdi+8],    xmm0
+
+    ; begin epilog
+    pop         rcx
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
--- /dev/null
+++ b/vp9/encoder/x86/ssim_opt.asm
@@ -1,0 +1,216 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; TABULATE_SSIM - accumulates sum_s, sum_r, sum_sq_s, sum_sq_r and sum_sxr
+%macro TABULATE_SSIM 0
+        paddusw         xmm15, xmm3  ; sum_s
+        paddusw         xmm14, xmm4  ; sum_r
+        movdqa          xmm1, xmm3
+        pmaddwd         xmm1, xmm1
+        paddd           xmm13, xmm1 ; sum_sq_s
+        movdqa          xmm2, xmm4
+        pmaddwd         xmm2, xmm2
+        paddd           xmm12, xmm2 ; sum_sq_r
+        pmaddwd         xmm3, xmm4
+        paddd           xmm11, xmm3  ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+        movdqa          xmm2,%1
+        punpckldq       %1,xmm0
+        punpckhdq       xmm2,xmm0
+        paddq           %1,xmm2
+        movdqa          xmm2,%1
+        punpcklqdq      %1,xmm0
+        punpckhqdq      xmm2,xmm0
+        paddq           %1,xmm2
+%endmacro
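+
+; SUM_ACROSS_Q reduces the four dwords of %1 to a single total in its low
+; qword: punpckldq/punpckhdq against the zeroed xmm0 widen the dwords to
+; qwords, paddq folds the two halves, and the punpcklqdq/punpckhqdq pair with
+; a final paddq folds the remaining two qwords together.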
+
+; Sum across the register %1 starting with words
+%macro SUM_ACROSS_W 1
+        movdqa          xmm1, %1
+        punpcklwd       %1,xmm0
+        punpckhwd       xmm1,xmm0
+        paddd           %1, xmm1
+        SUM_ACROSS_Q    %1
+%endmacro
+;void vp9_ssim_parms_16x16_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parameter passing through a structure; we probably don't need the
+; pxors (the calling app will initialize to 0), could easily fit everything
+; in sse2 without too much hassle, and can probably do better estimates with
+; psadbw or pavgb. At this point this is just meant to be a first pass for
+; calculating all the parms needed for 16x16 ssim so we can play with dssim
+; as distortion in the mode selection code.
+global sym(vp9_ssim_parms_16x16_sse2)
+sym(vp9_ssim_parms_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;s
+    mov             rcx,        arg(1) ;sp
+    mov             rdi,        arg(2) ;r
+    mov             rax,        arg(3) ;rp
+
+    pxor            xmm0, xmm0
+    pxor            xmm15,xmm15  ;sum_s
+    pxor            xmm14,xmm14  ;sum_r
+    pxor            xmm13,xmm13  ;sum_sq_s
+    pxor            xmm12,xmm12  ;sum_sq_r
+    pxor            xmm11,xmm11  ;sum_sxr
+
+    mov             rdx, 16      ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movdqu          xmm5, [rsi]
+    movdqu          xmm6, [rdi]
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpckhbw       xmm3, xmm0 ; high_s
+    punpckhbw       xmm4, xmm0 ; high_r
+
+    TABULATE_SSIM
+
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpcklbw       xmm3, xmm0 ; low_s
+    punpcklbw       xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add             rsi, rcx   ; next s row
+    add             rdi, rax   ; next r row
+
+    dec             rdx        ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W    xmm15
+    SUM_ACROSS_W    xmm14
+    SUM_ACROSS_Q    xmm13
+    SUM_ACROSS_Q    xmm12
+    SUM_ACROSS_Q    xmm11
+
+    mov             rdi,arg(4)
+    movd            [rdi], xmm15;
+    mov             rdi,arg(5)
+    movd            [rdi], xmm14;
+    mov             rdi,arg(6)
+    movd            [rdi], xmm13;
+    mov             rdi,arg(7)
+    movd            [rdi], xmm12;
+    mov             rdi,arg(8)
+    movd            [rdi], xmm11;
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
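+
+; Note: the five accumulated sums are exactly the per-block statistics SSIM
+; needs. With n pixels, mu_s = sum_s/n, var_s = sum_sq_s/n - mu_s^2 (likewise
+; for r) and cov = sum_sxr/n - mu_s*mu_r, giving
+;   ssim = ((2*mu_s*mu_r + C1) * (2*cov + C2)) /
+;          ((mu_s^2 + mu_r^2 + C1) * (var_s + var_r + C2))
+; for the usual stabilizing constants C1 and C2.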
+
+;void vp9_ssim_parms_8x8_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parameter passing through a structure; we probably don't need the
+; pxors (the calling app will initialize to 0), could easily fit everything
+; in sse2 without too much hassle, and can probably do better estimates with
+; psadbw or pavgb. At this point this is just meant to be a first pass for
+; calculating all the parms needed for 8x8 ssim so we can play with dssim
+; as distortion in the mode selection code.
+global sym(vp9_ssim_parms_8x8_sse2)
+sym(vp9_ssim_parms_8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;s
+    mov             rcx,        arg(1) ;sp
+    mov             rdi,        arg(2) ;r
+    mov             rax,        arg(3) ;rp
+
+    pxor            xmm0, xmm0
+    pxor            xmm15,xmm15  ;sum_s
+    pxor            xmm14,xmm14  ;sum_r
+    pxor            xmm13,xmm13  ;sum_sq_s
+    pxor            xmm12,xmm12  ;sum_sq_r
+    pxor            xmm11,xmm11  ;sum_sxr
+
+    mov             rdx, 8      ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movq            xmm3, [rsi]
+    movq            xmm4, [rdi]
+    punpcklbw       xmm3, xmm0 ; low_s
+    punpcklbw       xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add             rsi, rcx   ; next s row
+    add             rdi, rax   ; next r row
+
+    dec             rdx        ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W    xmm15
+    SUM_ACROSS_W    xmm14
+    SUM_ACROSS_Q    xmm13
+    SUM_ACROSS_Q    xmm12
+    SUM_ACROSS_Q    xmm11
+
+    mov             rdi,arg(4)
+    movd            [rdi], xmm15;
+    mov             rdi,arg(5)
+    movd            [rdi], xmm14;
+    mov             rdi,arg(6)
+    movd            [rdi], xmm13;
+    mov             rdi,arg(7)
+    movd            [rdi], xmm12;
+    mov             rdi,arg(8)
+    movd            [rdi], xmm11;
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
--- /dev/null
+++ b/vp9/encoder/x86/subtract_mmx.asm
@@ -1,0 +1,432 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
+;                            short *diff, unsigned char *Predictor,
+;                            int pitch);
+global sym(vp9_subtract_b_mmx_impl)
+sym(vp9_subtract_b_mmx_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov     rdi,        arg(2) ;diff
+        mov     rax,        arg(3) ;Predictor
+        mov     rsi,        arg(0) ;z
+        movsxd  rdx,        dword ptr arg(1);src_stride;
+        movsxd  rcx,        dword ptr arg(4);pitch
+        pxor    mm7,        mm7
+
+        movd    mm0,        [rsi]
+        movd    mm1,        [rax]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    [rdi],      mm0
+
+
+        movd    mm0,        [rsi+rdx]
+        movd    mm1,        [rax+rcx]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    [rdi+rcx*2],mm0
+
+
+        movd    mm0,        [rsi+rdx*2]
+        movd    mm1,        [rax+rcx*2]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    [rdi+rcx*4],        mm0
+
+        lea     rsi,        [rsi+rdx*2]
+        lea     rcx,        [rcx+rcx*2]
+
+
+
+        movd    mm0,        [rsi+rdx]
+        movd    mm1,        [rax+rcx]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    [rdi+rcx*2],        mm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
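+
+; Roughly (illustrative C only), for the 4x4 block:
+;
+;   for (int r = 0; r < 4; r++)
+;     for (int c = 0; c < 4; c++)
+;       diff[r * pitch + c] = z[r * src_stride + c] - predictor[r * pitch + c];
+;
+; with each row widened to 16 bits before the subtract so results can go
+; negative.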
+
+;void vp9_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
+global sym(vp9_subtract_mby_mmx)
+sym(vp9_subtract_mby_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+
+            mov         rsi,            arg(1) ;src
+            mov         rdi,            arg(0) ;diff
+
+            mov         rax,            arg(2) ;pred
+            movsxd      rdx,            dword ptr arg(3) ;stride
+
+            mov         rcx,            16
+            pxor        mm0,            mm0
+
+.submby_loop:
+
+            movq        mm1,            [rsi]
+            movq        mm3,            [rax]
+
+            movq        mm2,            mm1
+            movq        mm4,            mm3
+
+            punpcklbw   mm1,            mm0
+            punpcklbw   mm3,            mm0
+
+            punpckhbw   mm2,            mm0
+            punpckhbw   mm4,            mm0
+
+            psubw       mm1,            mm3
+            psubw       mm2,            mm4
+
+            movq        [rdi],          mm1
+            movq        [rdi+8],        mm2
+
+
+            movq        mm1,            [rsi+8]
+            movq        mm3,            [rax+8]
+
+            movq        mm2,            mm1
+            movq        mm4,            mm3
+
+            punpcklbw   mm1,            mm0
+            punpcklbw   mm3,            mm0
+
+            punpckhbw   mm2,            mm0
+            punpckhbw   mm4,            mm0
+
+            psubw       mm1,            mm3
+            psubw       mm2,            mm4
+
+            movq        [rdi+16],       mm1
+            movq        [rdi+24],       mm2
+
+
+            add         rdi,            32
+            add         rax,            16
+
+            lea         rsi,            [rsi+rdx]
+
+            sub         rcx,            1
+            jnz         .submby_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+global sym(vp9_subtract_mbuv_mmx)
+sym(vp9_subtract_mbuv_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push rsi
+    push rdi
+    ; end prolog
+
+    ;short *udiff = diff + 256;
+    ;short *vdiff = diff + 320;
+    ;unsigned char *upred = pred + 256;
+    ;unsigned char *vpred = pred + 320;
+
+        ;unsigned char  *z    = usrc;
+        ;unsigned short *diff = udiff;
+        ;unsigned char  *Predictor= upred;
+
+            mov     rdi,        arg(0) ;diff
+            mov     rax,        arg(3) ;pred
+            mov     rsi,        arg(1) ;z = usrc
+            add     rdi,        256*2  ;diff = diff + 256 (shorts)
+            add     rax,        256    ;Predictor = pred + 256
+            movsxd  rdx,        dword ptr arg(4) ;stride;
+            pxor    mm7,        mm7
+
+            movq    mm0,        [rsi]
+            movq    mm1,        [rax]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi],      mm0
+            movq    [rdi+8],    mm3
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+8]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+16],   mm0
+            movq    [rdi+24],   mm3
+
+            movq    mm0,        [rsi+rdx*2]
+            movq    mm1,        [rax+16]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+32],   mm0
+            movq    [rdi+40],   mm3
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+24]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+
+            movq    [rdi+48],   mm0
+            movq    [rdi+56],   mm3
+
+
+            add     rdi,        64
+            add     rax,        32
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi]
+            movq    mm1,        [rax]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi],      mm0
+            movq    [rdi+8],    mm3
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+8]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+16],   mm0
+            movq    [rdi+24],   mm3
+
+            movq    mm0,        [rsi+rdx*2]
+            movq    mm1,        [rax+16]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+32],   mm0
+            movq    [rdi+40],   mm3
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+24]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+
+            movq    [rdi+48],   mm0
+            movq    [rdi+56],   mm3
+
+        ;unsigned char  *z    = vsrc;
+        ;unsigned short *diff = vdiff;
+        ;unsigned char  *Predictor= vpred;
+
+            mov     rdi,        arg(0) ;diff
+            mov     rax,        arg(3) ;pred
+            mov     rsi,        arg(2) ;z = vsrc
+            add     rdi,        320*2  ;diff = diff + 320 (shorts)
+            add     rax,        320    ;Predictor = pred + 320
+            movsxd  rdx,        dword ptr arg(4) ;stride;
+            pxor    mm7,        mm7
+
+            movq    mm0,        [rsi]
+            movq    mm1,        [rax]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi],      mm0
+            movq    [rdi+8],    mm3
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+8]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+16],   mm0
+            movq    [rdi+24],   mm3
+
+            movq    mm0,        [rsi+rdx*2]
+            movq    mm1,        [rax+16]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+32],   mm0
+            movq    [rdi+40],   mm3
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+24]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+
+            movq    [rdi+48],   mm0
+            movq    [rdi+56],   mm3
+
+
+            add     rdi,        64
+            add     rax,        32
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi]
+            movq    mm1,        [rax]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi],      mm0
+            movq    [rdi+8],    mm3
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+8]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+16],   mm0
+            movq    [rdi+24],   mm3
+
+            movq    mm0,        [rsi+rdx*2]
+            movq    mm1,        [rax+16]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+32],   mm0
+            movq    [rdi+40],   mm3
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+24]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+
+            movq    [rdi+48],   mm0
+            movq    [rdi+56],   mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
--- /dev/null
+++ b/vp9/encoder/x86/subtract_sse2.asm
@@ -1,0 +1,356 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
+;                            short *diff, unsigned char *Predictor,
+;                            int pitch);
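+;
+; Rough C model of what this routine computes, for reference only (names
+; follow the prototype above; the 4x4 block size and the use of pitch for
+; both diff and Predictor are read off the address math below):
+;
+;   for (int r = 0; r < 4; r++) {
+;     for (int c = 0; c < 4; c++)
+;       diff[c] = (short)(z[c] - Predictor[c]);
+;     diff += pitch; Predictor += pitch; z += src_stride;
+;   }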
+global sym(vp9_subtract_b_sse2_impl)
+sym(vp9_subtract_b_sse2_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov     rdi,        arg(2) ;diff
+        mov     rax,        arg(3) ;Predictor
+        mov     rsi,        arg(0) ;z
+        movsxd  rdx,        dword ptr arg(1) ;src_stride
+        movsxd  rcx,        dword ptr arg(4) ;pitch
+        pxor    mm7,        mm7
+
+        movd    mm0,        [rsi]
+        movd    mm1,        [rax]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    MMWORD PTR [rdi],      mm0
+
+        movd    mm0,        [rsi+rdx]
+        movd    mm1,        [rax+rcx]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    MMWORD PTR [rdi+rcx*2], mm0
+
+        movd    mm0,        [rsi+rdx*2]
+        movd    mm1,        [rax+rcx*2]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    MMWORD PTR [rdi+rcx*4], mm0
+
+        lea     rsi,        [rsi+rdx*2]
+        lea     rcx,        [rcx+rcx*2]
+
+        movd    mm0,        [rsi+rdx]
+        movd    mm1,        [rax+rcx]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    MMWORD PTR [rdi+rcx*2], mm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
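+;
+; Rough C model, for reference only (pred is laid out 16 bytes per row,
+; as implied by the fixed +16/+32 offsets below):
+;
+;   for (int r = 0; r < 16; r++)
+;     for (int c = 0; c < 16; c++)
+;       diff[r * 16 + c] = (short)(src[r * stride + c] - pred[r * 16 + c]);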
+global sym(vp9_subtract_mby_sse2)
+sym(vp9_subtract_mby_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+            mov         rsi,            arg(1) ;src
+            mov         rdi,            arg(0) ;diff
+
+            mov         rax,            arg(2) ;pred
+            movsxd      rdx,            dword ptr arg(3) ;stride
+
+            mov         rcx,            8      ; 16 rows, two lines per iteration
+
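+            ; The loop widens src - pred to 16 bits without unpacking the
+            ; inputs first: psubb keeps the low byte of each difference,
+            ; and pcmpgtb on the inputs biased by 0x80 (t80) yields 0xFF
+            ; exactly where pred > src, i.e. where the true difference is
+            ; negative. Interleaving those sign bytes above the low bytes
+            ; (punpcklbw/punpckhbw) produces the signed 16-bit results.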
+.submby_loop:
+            movdqa      xmm0,           XMMWORD PTR [rsi]   ; src
+            movdqa      xmm1,           XMMWORD PTR [rax]   ; pred
+
+            movdqa      xmm2,           xmm0
+            psubb       xmm0,           xmm1
+
+            pxor        xmm1,           [GLOBAL(t80)]   ;convert to signed values
+            pxor        xmm2,           [GLOBAL(t80)]
+            pcmpgtb     xmm1,           xmm2            ; obtain sign information
+
+            movdqa      xmm2,    xmm0
+            movdqa      xmm3,    xmm1
+            punpcklbw   xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw   xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa      XMMWORD PTR [rdi],   xmm0
+            movdqa      XMMWORD PTR [rdi +16], xmm2
+
+            movdqa      xmm4,           XMMWORD PTR [rsi + rdx]
+            movdqa      xmm5,           XMMWORD PTR [rax + 16]
+
+            movdqa      xmm6,           xmm4
+            psubb       xmm4,           xmm5
+
+            pxor        xmm5,           [GLOBAL(t80)]   ;convert to signed values
+            pxor        xmm6,           [GLOBAL(t80)]
+            pcmpgtb     xmm5,           xmm6            ; obtain sign information
+
+            movdqa      xmm6,    xmm4
+            movdqa      xmm7,    xmm5
+            punpcklbw   xmm4,    xmm5            ; put sign back to subtraction
+            punpckhbw   xmm6,    xmm7            ; put sign back to subtraction
+
+            movdqa      XMMWORD PTR [rdi +32], xmm4
+            movdqa      XMMWORD PTR [rdi +48], xmm6
+
+            add         rdi,            64
+            add         rax,            32
+            lea         rsi,            [rsi+rdx*2]
+
+            sub         rcx,            1
+            jnz         .submby_loop
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
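+;
+; Rough C model, for reference only (the U and V planes sit at fixed
+; offsets inside diff and pred, per the +256/+320 adjustments below):
+;
+;   for (int r = 0; r < 8; r++)
+;     for (int c = 0; c < 8; c++) {
+;       diff[256 + r * 8 + c] = (short)(usrc[r * stride + c] - pred[256 + r * 8 + c]);
+;       diff[320 + r * 8 + c] = (short)(vsrc[r * stride + c] - pred[320 + r * 8 + c]);
+;     }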
+global sym(vp9_subtract_mbuv_sse2)
+sym(vp9_subtract_mbuv_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+            mov     rdi,        arg(0) ;diff
+            mov     rax,        arg(3) ;pred
+            mov     rsi,        arg(1) ;z = usrc
+            add     rdi,        256*2  ;diff = diff + 256 (shorts)
+            add     rax,        256    ;Predictor = pred + 256
+            movsxd  rdx,        dword ptr arg(4) ;stride
+            lea     rcx,        [rdx + rdx*2]
+
+            ;u
+            ;line 0 1
+            movq       xmm0,    MMWORD PTR [rsi]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rdx]
+            movdqa     xmm1,    XMMWORD PTR [rax]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi],   xmm0
+            movdqa     XMMWORD PTR [rdi +16],   xmm2
+
+            ;line 2 3
+            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rcx]
+            movdqa     xmm1,    XMMWORD PTR [rax+16]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 32],   xmm0
+            movdqa     XMMWORD PTR [rdi + 48],   xmm2
+
+            ;line 4 5
+            lea        rsi,     [rsi + rdx*4]
+
+            movq       xmm0,    MMWORD PTR [rsi]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rdx]
+            movdqa     xmm1,    XMMWORD PTR [rax + 32]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 64],   xmm0
+            movdqa     XMMWORD PTR [rdi + 80],   xmm2
+
+            ;line 6 7
+            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rcx]
+            movdqa     xmm1,    XMMWORD PTR [rax+ 48]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 96],   xmm0
+            movdqa     XMMWORD PTR [rdi + 112],  xmm2
+
+            ;v
+            mov     rsi,        arg(2) ;z = vsrc
+            add     rdi,        64*2  ;diff now points at diff + 320 (shorts)
+            add     rax,        64    ;Predictor now points at pred + 320
+
+            ;line 0 1
+            movq       xmm0,    MMWORD PTR [rsi]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rdx]
+            movdqa     xmm1,    XMMWORD PTR [rax]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi],   xmm0
+            movdqa     XMMWORD PTR [rdi +16],   xmm2
+
+            ;line 2 3
+            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rcx]
+            movdqa     xmm1,    XMMWORD PTR [rax+16]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 32],   xmm0
+            movdqa     XMMWORD PTR [rdi + 48],   xmm2
+
+            ;line 4 5
+            lea        rsi,     [rsi + rdx*4]
+
+            movq       xmm0,    MMWORD PTR [rsi]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rdx]
+            movdqa     xmm1,    XMMWORD PTR [rax + 32]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 64],   xmm0
+            movdqa     XMMWORD PTR [rdi + 80],   xmm2
+
+            ;line 6 7
+            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
+            movq       xmm2,    MMWORD PTR [rsi+rcx]
+            movdqa     xmm1,    XMMWORD PTR [rax+ 48]  ; pred
+            punpcklqdq xmm0,    xmm2
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            movdqa     xmm3,    xmm1
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+
+            movdqa     XMMWORD PTR [rdi + 96],   xmm0
+            movdqa     XMMWORD PTR [rdi + 112],  xmm2
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+t80:
+    times 16 db 0x80
--- /dev/null
+++ b/vp9/encoder/x86/temporal_filter_apply_sse2.asm
@@ -1,0 +1,207 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; void vp9_temporal_filter_apply_sse2 | arg
+;  (unsigned char  *frame1,           |  0
+;   unsigned int    stride,           |  1
+;   unsigned char  *frame2,           |  2
+;   unsigned int    block_size,       |  3
+;   int             strength,         |  4
+;   int             filter_weight,    |  5
+;   unsigned int   *accumulator,      |  6
+;   unsigned short *count)            |  7
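+;
+; Rough C model of the per-pixel update, for reference only (assumes
+; strength >= 1, matching the 0x8000 >> (16 - strength) rounding bit
+; computed below; i runs over the pixels of the block):
+;
+;   int diff = frame1[i] - frame2[i];
+;   int mod  = (3 * diff * diff + (1 << (strength - 1))) >> strength;
+;   if (mod > 16) mod = 16;               /* psubusw saturation below */
+;   mod = (16 - mod) * filter_weight;
+;   count[i]       += mod;
+;   accumulator[i] += mod * frame2[i];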
+global sym(vp9_temporal_filter_apply_sse2)
+sym(vp9_temporal_filter_apply_sse2):
+
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ALIGN_STACK 16, rax
+    %define block_size    0
+    %define strength      16
+    %define filter_weight 32
+    %define rounding_bit  48
+    %define rbp_backup    64
+    %define stack_size    80
+    sub         rsp,           stack_size
+    mov         [rsp + rbp_backup], rbp
+    ; end prolog
+
+        mov         rdx,            arg(3)
+        mov         [rsp + block_size], rdx
+        movd        xmm6,            arg(4)
+        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+        ; calculate the rounding bit outside the loop
+        ; 0x8000 >> (16 - strength)
+        mov         rdx,            16
+        sub         rdx,            arg(4) ; 16 - strength
+        movd        xmm4,           rdx    ; can't use rdx w/ shift
+        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
+        psrlw       xmm5,           xmm4
+        movdqa      [rsp + rounding_bit], xmm5
+
+        mov         rsi,            arg(0) ; src/frame1
+        mov         rdx,            arg(2) ; predictor frame
+        mov         rdi,            arg(6) ; accumulator
+        mov         rax,            arg(7) ; count
+
+        ; dup the filter weight and store for later
+        movd        xmm0,           arg(5) ; filter_weight
+        pshuflw     xmm0,           xmm0, 0
+        punpcklwd   xmm0,           xmm0
+        movdqa      [rsp + filter_weight], xmm0
+
+        mov         rbp,            arg(1) ; stride
+        pxor        xmm7,           xmm7   ; zero for extraction
+
+        lea         rcx,            [rdx + 16*16*1] ; end of pred for a 16x16 block
+        cmp         dword ptr [rsp + block_size], 8
+        jne         .temporal_filter_apply_load_16
+        lea         rcx,            [rdx + 8*8*1]   ; end of pred for an 8x8 block
+
+.temporal_filter_apply_load_8:
+        movq        xmm0,           [rsi]  ; first row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        movq        xmm1,           [rsi]  ; second row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
+        jmp         .temporal_filter_apply_load_finished
+
+.temporal_filter_apply_load_16:
+        movdqa      xmm0,           [rsi]  ; src (frame1)
+        lea         rsi,            [rsi + rbp] ; += stride
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; src[ 8-15]
+
+.temporal_filter_apply_load_finished:
+        movdqa      xmm2,           [rdx]  ; predictor (frame2)
+        movdqa      xmm3,           xmm2
+        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]
+
+        ; modifier = src_byte - pixel_value
+        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
+        psubw       xmm1,           xmm3   ; src - pred[ 8-15]
+
+        ; modifier *= modifier
+        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
+        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2
+
+        ; modifier *= 3
+        pmullw      xmm0,           [GLOBAL(_const_3w)]
+        pmullw      xmm1,           [GLOBAL(_const_3w)]
+
+        ; modifier += 0x8000 >> (16 - strength)
+        paddw       xmm0,           [rsp + rounding_bit]
+        paddw       xmm1,           [rsp + rounding_bit]
+
+        ; modifier >>= strength
+        psrlw       xmm0,           [rsp + strength]
+        psrlw       xmm1,           [rsp + strength]
+
+        ; modifier = 16 - modifier
+        ; saturation takes care of modifier > 16
+        movdqa      xmm3,           [GLOBAL(_const_16w)]
+        movdqa      xmm2,           [GLOBAL(_const_16w)]
+        psubusw     xmm3,           xmm1
+        psubusw     xmm2,           xmm0
+
+        ; modifier *= filter_weight
+        pmullw      xmm2,           [rsp + filter_weight]
+        pmullw      xmm3,           [rsp + filter_weight]
+
+        ; count
+        movdqa      xmm4,           [rax]
+        movdqa      xmm5,           [rax+16]
+        ; += modifier
+        paddw       xmm4,           xmm2
+        paddw       xmm5,           xmm3
+        ; write back
+        movdqa      [rax],          xmm4
+        movdqa      [rax+16],       xmm5
+        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))
+
+        ; load and extract the predictor up to shorts
+        pxor        xmm7,           xmm7
+        movdqa      xmm0,           [rdx]
+        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]
+
+        ; modifier *= pixel_value
+        pmullw      xmm0,           xmm2
+        pmullw      xmm1,           xmm3
+
+        ; expand to double words
+        movdqa      xmm2,           xmm0
+        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
+        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
+        movdqa      xmm3,           xmm1
+        punpcklwd   xmm1,           xmm7   ; [ 8-11]
+        punpckhwd   xmm3,           xmm7   ; [12-15]
+
+        ; accumulator
+        movdqa      xmm4,           [rdi]
+        movdqa      xmm5,           [rdi+16]
+        movdqa      xmm6,           [rdi+32]
+        movdqa      xmm7,           [rdi+48]
+        ; += modifier
+        paddd       xmm4,           xmm0
+        paddd       xmm5,           xmm2
+        paddd       xmm6,           xmm1
+        paddd       xmm7,           xmm3
+        ; write back
+        movdqa      [rdi],          xmm4
+        movdqa      [rdi+16],       xmm5
+        movdqa      [rdi+32],       xmm6
+        movdqa      [rdi+48],       xmm7
+        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+        cmp         rdx,            rcx
+        je          .temporal_filter_apply_epilog
+        pxor        xmm7,           xmm7   ; zero for extraction
+        cmp         dword ptr [rsp + block_size], 16
+        je          .temporal_filter_apply_load_16
+        jmp         .temporal_filter_apply_load_8
+
+.temporal_filter_apply_epilog:
+    ; begin epilog
+    mov         rbp,            [rsp + rbp_backup]
+    add         rsp,            stack_size
+    pop         rsp                    ; restore stack pointer saved by ALIGN_STACK
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+_const_3w:
+    times 8 dw 3
+align 16
+_const_top_bit:
+    times 8 dw 1<<15
+align 16
+_const_16w:
+    times 8 dw 16
--- /dev/null
+++ b/vp9/encoder/x86/temporal_filter_x86.h
@@ -1,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TEMPORAL_FILTER_X86_H
+#define __INC_TEMPORAL_FILTER_X86_H
+
+#if HAVE_SSE2
+extern prototype_apply(vp9_temporal_filter_apply_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp9_temporal_filter_apply
+#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
+
+#endif // !CONFIG_RUNTIME_CPU_DETECT
+
+#endif // HAVE_SSE2
+
+#endif // __INC_TEMPORAL_FILTER_X86_H
--- /dev/null
+++ b/vp9/encoder/x86/variance_impl_mmx.asm
@@ -1,0 +1,851 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
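+;
+; Rough C model, for reference only: the sum of squares over the 256
+; 16-bit values of a macroblock.
+;
+;   unsigned int ss = 0;
+;   for (int i = 0; i < 256; i++)
+;     ss += src_ptr[i] * src_ptr[i];
+;   return ss;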
+global sym(vp9_get_mb_ss_mmx)
+sym(vp9_get_mb_ss_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 8
+    ; end prolog
+
+        mov         rax, arg(0) ;src_ptr
+        mov         rcx, 16
+        pxor        mm4, mm4
+
+.NEXTROW:
+        movq        mm0, [rax]
+        movq        mm1, [rax+8]
+        movq        mm2, [rax+16]
+        movq        mm3, [rax+24]
+        pmaddwd     mm0, mm0
+        pmaddwd     mm1, mm1
+        pmaddwd     mm2, mm2
+        pmaddwd     mm3, mm3
+
+        paddd       mm4, mm0
+        paddd       mm4, mm1
+        paddd       mm4, mm2
+        paddd       mm4, mm3
+
+        add         rax, 32
+        dec         rcx
+        ja          .NEXTROW
+        movq        QWORD PTR [rsp], mm4
+
+        ;return sum[0]+sum[1];
+        movsxd      rax, dword ptr [rsp]
+        movsxd      rcx, dword ptr [rsp+4]
+        add         rax, rcx
+
+
+    ; begin epilog
+    add rsp, 8
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp9_get8x8var_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;    unsigned int *SSE,
+;    int *Sum
+;)
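+;
+; Rough C model, for reference only: stores the sum of differences and
+; the sum of squared differences for an 8x8 block, returning 0.
+;
+;   int sum = 0; unsigned int sse = 0;
+;   for (int r = 0; r < 8; r++)
+;     for (int c = 0; c < 8; c++) {
+;       int d = src_ptr[r * source_stride + c] - ref_ptr[r * recon_stride + c];
+;       sum += d; sse += d * d;
+;     }
+;   *SSE = sse; *Sum = sum; return 0;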
+global sym(vp9_get8x8var_mmx)
+sym(vp9_get8x8var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push rsi
+    push rdi
+    push rbx
+    sub         rsp, 16
+    ; end prolog
+
+
+        pxor        mm5, mm5                    ; Blank mm5
+        pxor        mm6, mm6                    ; Blank mm6
+        pxor        mm7, mm7                    ; Blank mm7
+
+        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
+        mov         rbx, arg(2) ;[ref_ptr]
+        movsxd      rcx, dword ptr arg(1) ;[source_stride]
+        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
+
+        ; Row 1
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+
+        ; Row 2
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 3
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 4
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 5
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 6
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 7
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 8
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Now accumulate the final results.
+        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
+        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
+        movsx       rdx, WORD PTR [rsp+8]
+        movsx       rcx, WORD PTR [rsp+10]
+        movsx       rbx, WORD PTR [rsp+12]
+        movsx       rax, WORD PTR [rsp+14]
+        add         rdx, rcx
+        add         rbx, rax
+        add         rdx, rbx    ;XSum
+        movsxd      rax, DWORD PTR [rsp]
+        movsxd      rcx, DWORD PTR [rsp+4]
+        add         rax, rcx    ;XXSum
+        mov         rsi, arg(4) ;SSE
+        mov         rdi, arg(5) ;Sum
+        mov         dword ptr [rsi], eax
+        mov         dword ptr [rdi], edx
+        xor         rax, rax    ; return 0
+
+
+    ; begin epilog
+    add rsp, 16
+    pop rbx
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+;unsigned int
+;vp9_get4x4var_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;    unsigned int *SSE,
+;    int *Sum
+;)
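+;
+; Same computation as vp9_get8x8var_mmx above, restricted to a 4x4 block.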
+global sym(vp9_get4x4var_mmx)
+sym(vp9_get4x4var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push rsi
+    push rdi
+    push rbx
+    sub         rsp, 16
+    ; end prolog
+
+
+        pxor        mm5, mm5                    ; Blank mm5
+        pxor        mm6, mm6                    ; Blank mm6
+        pxor        mm7, mm7                    ; Blank mm7
+
+        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
+        mov         rbx, arg(2) ;[ref_ptr]
+        movsxd      rcx, dword ptr arg(1) ;[source_stride]
+        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
+
+        ; Row 1
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+
+        ; Row 2
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 3
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 4
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+
+        ; Now accumulate the final results.
+        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
+        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
+        movsx       rdx, WORD PTR [rsp+8]
+        movsx       rcx, WORD PTR [rsp+10]
+        movsx       rbx, WORD PTR [rsp+12]
+        movsx       rax, WORD PTR [rsp+14]
+        add         rdx, rcx
+        add         rbx, rax
+        add         rdx, rbx    ;XSum
+        movsxd      rax, DWORD PTR [rsp]
+        movsxd      rcx, DWORD PTR [rsp+4]
+        add         rax, rcx    ;XXSum
+        mov         rsi, arg(4) ;SSE
+        mov         rdi, arg(5) ;Sum
+        mov         dword ptr [rsi], eax
+        mov         dword ptr [rdi], edx
+        xor         rax, rax    ; return 0
+
+
+    ; begin epilog
+    add rsp, 16
+    pop rbx
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+;unsigned int
+;vp9_get4x4sse_cs_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride
+;)
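+;
+; Rough C model, for reference only: returns the sum of squared
+; differences over a 4x4 block.
+;
+;   unsigned int sse = 0;
+;   for (int r = 0; r < 4; r++)
+;     for (int c = 0; c < 4; c++) {
+;       int d = src_ptr[r * source_stride + c] - ref_ptr[r * recon_stride + c];
+;       sse += d * d;
+;     }
+;   return sse;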
+global sym(vp9_get4x4sse_cs_mmx)
+sym(vp9_get4x4sse_cs_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    push rbx
+    ; end prolog
+
+
+        pxor        mm6, mm6                    ; Blank mm6
+        pxor        mm7, mm7                    ; Blank mm7
+
+        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
+        mov         rbx, arg(2) ;[ref_ptr]
+        movsxd      rcx, dword ptr arg(1) ;[source_stride]
+        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
+        ; Row 1
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        movd        mm1, [rbx]                  ; Copy four bytes to mm1
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movd        mm1, [rbx]                  ; Copy four bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 2
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movd        mm1, [rbx]                  ; Copy four bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 3
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        punpcklbw   mm1, mm6
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movd        mm1, [rbx]                  ; Copy four bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 4
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        movq        mm0,    mm7                 ; copy the two dword partial SSEs
+        psrlq       mm7,    32                  ; move the high dword down
+
+        paddd       mm0,    mm7                 ; fold them together
+        movq        rax,    mm0                 ; total SSE returned in eax
+
+
+    ; begin epilog
+    pop rbx
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%define mmx_filter_shift            7
+
+;void vp9_filter_block2d_bil4x4_var_mmx
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned short *HFilter,
+;    unsigned short *VFilter,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
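+;
+; Rough C model, for reference only (fdata is a hypothetical temporary;
+; each filter is a two-tap pair whose words the caller duplicates across
+; the vector, so only taps [0] and [1] appear here):
+;
+;   unsigned short fdata[5][4];
+;   for (int r = 0; r < 5; r++)                    /* horizontal pass */
+;     for (int c = 0; c < 4; c++)
+;       fdata[r][c] = (ref_ptr[r * ref_pixels_per_line + c]     * HFilter[0] +
+;                      ref_ptr[r * ref_pixels_per_line + c + 1] * HFilter[1] +
+;                      64) >> 7;
+;   int s = 0; unsigned int sse = 0;
+;   for (int r = 0; r < 4; r++)                    /* vertical pass */
+;     for (int c = 0; c < 4; c++) {
+;       int f = (fdata[r][c] * VFilter[0] + fdata[r + 1][c] * VFilter[1] + 64) >> 7;
+;       int d = f - src_ptr[r * src_pixels_per_line + c];
+;       s += d; sse += d * d;
+;     }
+;   *sum = s; *sumsquared = sse;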
+global sym(vp9_filter_block2d_bil4x4_var_mmx)
+sym(vp9_filter_block2d_bil4x4_var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+
+        pxor            mm6,            mm6                 ;
+        pxor            mm7,            mm7                 ;
+
+        mov             rax,            arg(4) ;HFilter             ;
+        mov             rdx,            arg(5) ;VFilter             ;
+
+        mov             rsi,            arg(0) ;ref_ptr              ;
+        mov             rdi,            arg(2) ;src_ptr              ;
+
+        mov             rcx,            4                   ;
+        pxor            mm0,            mm0                 ;
+
+        movd            mm1,            [rsi]               ;
+        movd            mm3,            [rsi+1]             ;
+
+        punpcklbw       mm1,            mm0                 ;
+        pmullw          mm1,            [rax]               ;
+
+        punpcklbw       mm3,            mm0                 ;
+        pmullw          mm3,            [rax+8]             ;
+
+        paddw           mm1,            mm3                 ;
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+
+        psraw           mm1,            mmx_filter_shift    ;
+        movq            mm5,            mm1
+
+%if ABI_IS_32BIT
+        add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ;
+%else
+        movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line    ;
+        add             rsi, r8
+%endif
+
+.filter_block2d_bil4x4_var_mmx_loop:
+
+        movd            mm1,            [rsi]               ;
+        movd            mm3,            [rsi+1]             ;
+
+        punpcklbw       mm1,            mm0                 ;
+        pmullw          mm1,            [rax]               ;
+
+        punpcklbw       mm3,            mm0                 ;
+        pmullw          mm3,            [rax+8]             ;
+
+        paddw           mm1,            mm3                 ;
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+
+        psraw           mm1,            mmx_filter_shift    ;
+        movq            mm3,            mm5                 ;
+
+        movq            mm5,            mm1                 ;
+        pmullw          mm3,            [rdx]               ;
+
+        pmullw          mm1,            [rdx+8]             ;
+        paddw           mm1,            mm3                 ;
+
+
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+        psraw           mm1,            mmx_filter_shift    ;
+
+        movd            mm3,            [rdi]               ;
+        punpcklbw       mm3,            mm0                 ;
+
+        psubw           mm1,            mm3                 ;
+        paddw           mm6,            mm1                 ;
+
+        pmaddwd         mm1,            mm1                 ;
+        paddd           mm7,            mm1                 ;
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
+        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+        add             rsi,            r8
+        add             rdi,            r9
+%endif
+        sub             rcx,            1                   ;
+        jnz             .filter_block2d_bil4x4_var_mmx_loop       ;
+
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rdi,            arg(6) ;sum
+        mov             rsi,            arg(7) ;sumsquared
+
+        movd            dword ptr [rdi],          mm2                 ;
+        movd            dword ptr [rsi],          mm4                 ;
+
+
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+
+;void vp9_filter_block2d_bil_var_mmx
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    unsigned short *HFilter,
+;    unsigned short *VFilter,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
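+;
+; Eight-pixel-wide generalization of vp9_filter_block2d_bil4x4_var_mmx
+; above: the same two-tap horizontal + vertical bilinear filter followed
+; by sum/SSE accumulation, over Height rows of 8 pixels.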
+global sym(vp9_filter_block2d_bil_var_mmx)
+sym(vp9_filter_block2d_bil_var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+        pxor            mm6,            mm6                 ;
+        pxor            mm7,            mm7                 ;
+        mov             rax,            arg(5) ;HFilter             ;
+
+        mov             rdx,            arg(6) ;VFilter             ;
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+
+        pxor            mm0,            mm0                 ;
+        movq            mm1,            [rsi]               ;
+
+        movq            mm3,            [rsi+1]             ;
+        movq            mm2,            mm1                 ;
+
+        movq            mm4,            mm3                 ;
+        punpcklbw       mm1,            mm0                 ;
+
+        punpckhbw       mm2,            mm0                 ;
+        pmullw          mm1,            [rax]               ;
+
+        pmullw          mm2,            [rax]               ;
+        punpcklbw       mm3,            mm0                 ;
+
+        punpckhbw       mm4,            mm0                 ;
+        pmullw          mm3,            [rax+8]             ;
+
+        pmullw          mm4,            [rax+8]             ;
+        paddw           mm1,            mm3                 ;
+
+        paddw           mm2,            mm4                 ;
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+
+        psraw           mm1,            mmx_filter_shift    ;
+        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
+
+        psraw           mm2,            mmx_filter_shift    ;
+        movq            mm5,            mm1
+
+        packuswb        mm5,            mm2                 ;
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
+        add             rsi,            r8
+%endif
+
+.filter_block2d_bil_var_mmx_loop:
+
+        movq            mm1,            [rsi]               ;
+        movq            mm3,            [rsi+1]             ;
+
+        movq            mm2,            mm1                 ;
+        movq            mm4,            mm3                 ;
+
+        punpcklbw       mm1,            mm0                 ;
+        punpckhbw       mm2,            mm0                 ;
+
+        pmullw          mm1,            [rax]               ;
+        pmullw          mm2,            [rax]               ;
+
+        punpcklbw       mm3,            mm0                 ;
+        punpckhbw       mm4,            mm0                 ;
+
+        pmullw          mm3,            [rax+8]             ;
+        pmullw          mm4,            [rax+8]             ;
+
+        paddw           mm1,            mm3                 ;
+        paddw           mm2,            mm4                 ;
+
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+        psraw           mm1,            mmx_filter_shift    ;
+
+        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
+        psraw           mm2,            mmx_filter_shift    ;
+
+        movq            mm3,            mm5                 ;
+        movq            mm4,            mm5                 ;
+
+        punpcklbw       mm3,            mm0                 ;
+        punpckhbw       mm4,            mm0                 ;
+
+        movq            mm5,            mm1                 ;
+        packuswb        mm5,            mm2                 ;
+
+        pmullw          mm3,            [rdx]               ;
+        pmullw          mm4,            [rdx]               ;
+
+        pmullw          mm1,            [rdx+8]             ;
+        pmullw          mm2,            [rdx+8]             ;
+
+        paddw           mm1,            mm3                 ;
+        paddw           mm2,            mm4                 ;
+
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
+
+        psraw           mm1,            mmx_filter_shift    ;
+        psraw           mm2,            mmx_filter_shift    ;
+
+        movq            mm3,            [rdi]               ;
+        movq            mm4,            mm3                 ;
+
+        punpcklbw       mm3,            mm0                 ;
+        punpckhbw       mm4,            mm0                 ;
+
+        psubw           mm1,            mm3                 ;
+        psubw           mm2,            mm4                 ;
+
+        paddw           mm6,            mm1                 ;
+        pmaddwd         mm1,            mm1                 ;
+
+        paddw           mm6,            mm2                 ;
+        pmaddwd         mm2,            mm2                 ;
+
+        paddd           mm7,            mm1                 ;
+        paddd           mm7,            mm2                 ;
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
+        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
+        add             rsi,            r8
+        add             rdi,            r9
+%endif
+        sub             rcx,            1                   ;
+        jnz             .filter_block2d_bil_var_mmx_loop       ;
+
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rdi,            arg(7) ;sum
+        mov             rsi,            arg(8) ;sumsquared
+
+        movd            dword ptr [rdi],          mm2                 ;
+        movd            dword ptr [rsi],          mm4                 ;
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+;short mmx_bi_rd[4] = { 64, 64, 64, 64};
+align 16
+mmx_bi_rd:
+    times 4 dw 64
--- /dev/null
+++ b/vp9/encoder/x86/variance_impl_sse2.asm
@@ -1,0 +1,1367 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift            7
+
+;unsigned int vp9_get_mb_ss_sse2
+;(
+;    short *src_ptr
+;)
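+;
+; Rough C equivalent (an illustrative sketch, not part of the source):
+; the routine returns the sum of squares of the 256 shorts in a 16x16
+; block, consuming 32 values per pass through .NEXTROW.
+;
+;    unsigned int get_mb_ss_ref(const short *src_ptr) {
+;      unsigned int sum = 0;
+;      int i;
+;      for (i = 0; i < 256; i++)
+;        sum += src_ptr[i] * src_ptr[i];
+;      return sum;
+;    }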
+global sym(vp9_get_mb_ss_sse2)
+sym(vp9_get_mb_ss_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 1
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+
+        mov         rax, arg(0) ;[src_ptr]
+        mov         rcx, 8
+        pxor        xmm4, xmm4
+
+.NEXTROW:
+        movdqa      xmm0, [rax]
+        movdqa      xmm1, [rax+16]
+        movdqa      xmm2, [rax+32]
+        movdqa      xmm3, [rax+48]
+        pmaddwd     xmm0, xmm0
+        pmaddwd     xmm1, xmm1
+        pmaddwd     xmm2, xmm2
+        pmaddwd     xmm3, xmm3
+
+        paddd       xmm0, xmm1
+        paddd       xmm2, xmm3
+        paddd       xmm4, xmm0
+        paddd       xmm4, xmm2
+
+        add         rax, 0x40
+        dec         rcx
+        ja          .NEXTROW            ; acts as jnz here: dec sets ZF, and CF is clear from the add above
+
+        movdqa      xmm3,xmm4
+        psrldq      xmm4,8
+        paddd       xmm4,xmm3
+        movdqa      xmm3,xmm4
+        psrldq      xmm4,4
+        paddd       xmm4,xmm3
+        movq        rax,xmm4
+
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp9_get16x16var_sse2
+;(
+;    unsigned char   *  src_ptr,
+;    int             source_stride,
+;    unsigned char   *  ref_ptr,
+;    int             recon_stride,
+;    unsigned int    *  SSE,
+;    int             *  Sum
+;)
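+;
+; Rough C equivalent (an illustrative sketch; the names are ours): the
+; routine fills *Sum with the sum of pixel differences and *SSE with the
+; sum of squared differences over a 16x16 block; callers typically derive
+; the variance as SSE - Sum*Sum/256.
+;
+;    void get16x16var_ref(const unsigned char *src, int src_stride,
+;                         const unsigned char *ref, int ref_stride,
+;                         unsigned int *sse, int *sum) {
+;      int i, j, s = 0;
+;      unsigned int ss = 0;
+;      for (i = 0; i < 16; i++) {
+;        for (j = 0; j < 16; j++) {
+;          const int d = src[j] - ref[j];
+;          s += d;
+;          ss += d * d;
+;        }
+;        src += src_stride;
+;        ref += ref_stride;
+;      }
+;      *sum = s;
+;      *sse = ss;
+;    }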
+global sym(vp9_get16x16var_sse2)
+sym(vp9_get16x16var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+
+        ; Prefetch data
+        lea             rcx,    [rax+rax*2]
+        prefetcht0      [rsi]
+        prefetcht0      [rsi+rax]
+        prefetcht0      [rsi+rax*2]
+        prefetcht0      [rsi+rcx]
+        lea             rbx,    [rsi+rax*4]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+        prefetcht0      [rbx+rax*2]
+        prefetcht0      [rbx+rcx]
+
+        lea             rcx,    [rdx+rdx*2]
+        prefetcht0      [rdi]
+        prefetcht0      [rdi+rdx]
+        prefetcht0      [rdi+rdx*2]
+        prefetcht0      [rdi+rcx]
+        lea             rbx,    [rdi+rdx*4]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+        prefetcht0      [rbx+rdx*2]
+        prefetcht0      [rbx+rcx]
+
+        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
+
+        pxor        xmm6,           xmm6                        ; clear xmm6 for accumulating sse
+        mov         rcx,            16
+
+.var16loop:
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+
+        prefetcht0      [rsi+rax*8]
+        prefetcht0      [rdi+rdx*8]
+
+        movdqa      xmm3,           xmm1
+        movdqa      xmm4,           xmm2
+
+
+        punpcklbw   xmm1,           xmm0
+        punpckhbw   xmm3,           xmm0
+
+        punpcklbw   xmm2,           xmm0
+        punpckhbw   xmm4,           xmm0
+
+
+        psubw       xmm1,           xmm2
+        psubw       xmm3,           xmm4
+
+        paddw       xmm7,           xmm1
+        pmaddwd     xmm1,           xmm1
+
+        paddw       xmm7,           xmm3
+        pmaddwd     xmm3,           xmm3
+
+        paddd       xmm6,           xmm1
+        paddd       xmm6,           xmm3
+
+        add         rsi,            rax
+        add         rdi,            rdx
+
+        sub         rcx,            1
+        jnz         .var16loop
+
+
+        movdqa      xmm1,           xmm6
+        pxor        xmm6,           xmm6
+
+        pxor        xmm5,           xmm5
+        punpcklwd   xmm6,           xmm7
+
+        punpckhwd   xmm5,           xmm7
+        psrad       xmm5,           16
+
+        psrad       xmm6,           16
+        paddd       xmm6,           xmm5
+
+        movdqa      xmm2,           xmm1
+        punpckldq   xmm1,           xmm0
+
+        punpckhdq   xmm2,           xmm0
+        movdqa      xmm7,           xmm6
+
+        paddd       xmm1,           xmm2
+        punpckldq   xmm6,           xmm0
+
+        punpckhdq   xmm7,           xmm0
+        paddd       xmm6,           xmm7
+
+        movdqa      xmm2,           xmm1
+        movdqa      xmm7,           xmm6
+
+        psrldq      xmm1,           8
+        psrldq      xmm6,           8
+
+        paddd       xmm7,           xmm6
+        paddd       xmm1,           xmm2
+
+        mov         rax,            arg(5) ;[Sum]
+        mov         rdi,            arg(4) ;[SSE]
+
+        movd DWORD PTR [rax],       xmm7
+        movd DWORD PTR [rdi],       xmm1
+
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    pop rbx
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+
+;unsigned int vp9_get8x8var_sse2
+;(
+;    unsigned char   *  src_ptr,
+;    int             source_stride,
+;    unsigned char   *  ref_ptr,
+;    int             recon_stride,
+;    unsigned int    *  SSE,
+;    int             *  Sum
+;)
+global sym(vp9_get8x8var_sse2)
+sym(vp9_get8x8var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+
+        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
+
+        movq        xmm1,           QWORD PTR [rsi]
+        movq        xmm2,           QWORD PTR [rdi]
+
+        punpcklbw   xmm1,           xmm0
+        punpcklbw   xmm2,           xmm0
+
+        psubsw      xmm1,           xmm2
+        paddw       xmm7,           xmm1
+
+        pmaddwd     xmm1,           xmm1
+
+        movq        xmm2,           QWORD PTR[rsi + rax]
+        movq        xmm3,           QWORD PTR[rdi + rdx]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+
+        movq        xmm2,           QWORD PTR[rsi + rax * 2]
+        movq        xmm3,           QWORD PTR[rdi + rdx * 2]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+
+        lea         rsi,            [rsi + rax * 2]
+        lea         rdi,            [rdi + rdx * 2]
+        movq        xmm2,           QWORD PTR[rsi + rax]
+        movq        xmm3,           QWORD PTR[rdi + rdx]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+        movq        xmm2,           QWORD PTR[rsi + rax *2]
+        movq        xmm3,           QWORD PTR[rdi + rdx *2]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+
+        lea         rsi,            [rsi + rax * 2]
+        lea         rdi,            [rdi + rdx * 2]
+
+
+        movq        xmm2,           QWORD PTR[rsi + rax]
+        movq        xmm3,           QWORD PTR[rdi + rdx]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+        movq        xmm2,           QWORD PTR[rsi + rax *2]
+        movq        xmm3,           QWORD PTR[rdi + rdx *2]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+
+        lea         rsi,            [rsi + rax * 2]
+        lea         rdi,            [rdi + rdx * 2]
+
+        movq        xmm2,           QWORD PTR[rsi + rax]
+        movq        xmm3,           QWORD PTR[rdi + rdx]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+
+        movdqa      xmm6,           xmm7
+        punpcklwd   xmm6,           xmm0
+
+        punpckhwd   xmm7,           xmm0
+        movdqa      xmm2,           xmm1
+
+        paddw       xmm6,           xmm7
+        punpckldq   xmm1,           xmm0
+
+        punpckhdq   xmm2,           xmm0
+        movdqa      xmm7,           xmm6
+
+        paddd       xmm1,           xmm2
+        punpckldq   xmm6,           xmm0
+
+        punpckhdq   xmm7,           xmm0
+        paddw       xmm6,           xmm7
+
+        movdqa      xmm2,           xmm1
+        movdqa      xmm7,           xmm6
+
+        psrldq      xmm1,           8
+        psrldq      xmm6,           8
+
+        paddw       xmm7,           xmm6
+        paddd       xmm1,           xmm2
+
+        mov         rax,            arg(5) ;[Sum]
+        mov         rdi,            arg(4) ;[SSE]
+
+        movq        rdx,            xmm7
+        movsx       rcx,            dx
+
+        mov  dword ptr [rax],       ecx
+        movd DWORD PTR [rdi],       xmm1
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block2d_bil_var_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int  xoffset,
+;    int  yoffset,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
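+;
+; Per-pixel arithmetic of the two passes in C terms (a hedged sketch;
+; "first"/"above" are illustrative names): each pass blends two taps
+; from the 128-scale table below, rounds by 64 (xmm_bi_rd) and shifts
+; by xmm_filter_shift (7).
+;
+;    first[c]  = (ref[c] * h0 + ref[c + 1] * h1 + 64) >> 7;   /* horizontal */
+;    second[c] = (above[c] * v0 + first[c] * v1 + 64) >> 7;   /* vertical   */
+;    diff      = second[c] - src[c];
+;    *sum        += diff;
+;    *sumsquared += diff * diff;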
+global sym(vp9_filter_block2d_bil_var_sse2)
+sym(vp9_filter_block2d_bil_var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    push rbx
+    ; end prolog
+
+        pxor            xmm6,           xmm6                 ;
+        pxor            xmm7,           xmm7                 ;
+
+        lea             rsi,            [GLOBAL(xmm_bi_rd)]  ; rounding
+        movdqa          xmm4,           XMMWORD PTR [rsi]
+
+        lea             rcx,            [GLOBAL(bilinear_filters_sse2)]
+        movsxd          rax,            dword ptr arg(5)     ; xoffset
+
+        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
+        je              filter_block2d_bil_var_sse2_sp_only
+
+        shl             rax,            5                    ; point to filter coeff with xoffset
+        lea             rax,            [rax + rcx]          ; HFilter
+
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
+        je              filter_block2d_bil_var_sse2_fp_only
+
+        shl             rdx,            5
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+
+        pxor            xmm0,           xmm0                 ;
+        movq            xmm1,           QWORD PTR [rsi]      ;
+        movq            xmm3,           QWORD PTR [rsi+1]    ;
+
+        punpcklbw       xmm1,           xmm0                 ;
+        pmullw          xmm1,           [rax]                ;
+        punpcklbw       xmm3,           xmm0
+        pmullw          xmm3,           [rax+16]             ;
+
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4                 ;
+        psraw           xmm1,           xmm_filter_shift     ;
+        movdqa          xmm5,           xmm1
+
+        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
+        lea             rsi,            [rsi + rbx]
+%if ABI_IS_32BIT=0
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+filter_block2d_bil_var_sse2_loop:
+        movq            xmm1,           QWORD PTR [rsi]               ;
+        movq            xmm3,           QWORD PTR [rsi+1]             ;
+
+        punpcklbw       xmm1,           xmm0                 ;
+        pmullw          xmm1,           [rax]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+        pmullw          xmm3,           [rax+16]             ;
+
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4               ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movdqa          xmm3,           xmm5                 ;
+        movdqa          xmm5,           xmm1                 ;
+
+        pmullw          xmm3,           [rdx]               ;
+        pmullw          xmm1,           [rdx+16]             ;
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4                 ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movq            xmm3,           QWORD PTR [rdi]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+
+        psubw           xmm1,           xmm3                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+
+        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
+%if ABI_IS_32BIT
+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
+%else
+        lea             rdi,            [rdi + r9]
+%endif
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_var_sse2_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_sp_only:
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; skip all if both xoffset=0 and yoffset=0
+        je              filter_block2d_bil_var_sse2_full_pixel
+
+        shl             rdx,            5
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                 ;
+        movq            xmm1,           QWORD PTR [rsi]      ;
+        punpcklbw       xmm1,           xmm0                 ;
+
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+        lea             rsi,            [rsi + rax]
+
+filter_block2d_bil_sp_only_loop:
+        movq            xmm3,           QWORD PTR [rsi]             ;
+        punpcklbw       xmm3,           xmm0                 ;
+        movdqa          xmm5,           xmm3
+
+        pmullw          xmm1,           [rdx]               ;
+        pmullw          xmm3,           [rdx+16]             ;
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4                 ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movq            xmm3,           QWORD PTR [rdi]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+
+        psubw           xmm1,           xmm3                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+
+        movdqa          xmm1,           xmm5                 ;
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_sp_only_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_full_pixel:
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+        pxor            xmm0,           xmm0                 ;
+
+filter_block2d_bil_full_pixel_loop:
+        movq            xmm1,           QWORD PTR [rsi]               ;
+        punpcklbw       xmm1,           xmm0                 ;
+
+        movq            xmm2,           QWORD PTR [rdi]               ;
+        punpcklbw       xmm2,           xmm0                 ;
+
+        psubw           xmm1,           xmm2                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_full_pixel_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_fp_only:
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                 ;
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+
+filter_block2d_bil_fp_only_loop:
+        movq            xmm1,           QWORD PTR [rsi]       ;
+        movq            xmm3,           QWORD PTR [rsi+1]     ;
+
+        punpcklbw       xmm1,           xmm0                 ;
+        pmullw          xmm1,           [rax]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+        pmullw          xmm3,           [rax+16]             ;
+
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4  ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movq            xmm3,           QWORD PTR [rdi]     ;
+        punpcklbw       xmm3,           xmm0                 ;
+
+        psubw           xmm1,           xmm3                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+        lea             rsi,            [rsi + rdx]
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_fp_only_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_variance:
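+        ; Reduction note (added): xmm6 holds signed 16-bit difference sums
+        ; and xmm7 32-bit squared sums.  They are folded into MMX registers,
+        ; the word unpacks against zero place each sum in the top half of a
+        ; dword, and psrad 16 restores the sign before the final stores.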
+        movdq2q         mm6,            xmm6                ;
+        movdq2q         mm7,            xmm7                ;
+
+        psrldq          xmm6,           8
+        psrldq          xmm7,           8
+
+        movdq2q         mm2,            xmm6
+        movdq2q         mm3,            xmm7
+
+        paddw           mm6,            mm2
+        paddd           mm7,            mm3
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rsi,            arg(7) ; sum
+        mov             rdi,            arg(8) ; sumsquared
+
+        movd            [rsi],          mm2    ; xsum
+        movd            [rdi],          mm4    ; xxsum
+
+    ; begin epilog
+    pop rbx
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_half_horiz_vert_variance8x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
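+;
+; Per-pixel arithmetic (a sketch): the half-pel predictor is built from
+; two rounded 2-tap averages (pavgb), approximating the 4-tap average
+; (a + b + c + d + 2) >> 2:
+;
+;    h0 = (row0[c] + row0[c + 1] + 1) >> 1;   /* pavgb, line i   */
+;    h1 = (row1[c] + row1[c + 1] + 1) >> 1;   /* pavgb, line i+1 */
+;    p  = (h0 + h1 + 1) >> 1;                 /* pavgb, vertical */
+;    d  = p - src[c];
+;    sum += d;
+;    sse += d * d;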
+global sym(vp9_half_horiz_vert_variance8x_h_sse2)
+sym(vp9_half_horiz_vert_variance8x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+%if ABI_IS_32BIT=0
+    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
+    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse accumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;
+
+        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s7
+        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s8
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) horizontal line 1
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
+%else
+        add             rsi, r8
+%endif
+
+.half_horiz_vert_variance8x_h_1:
+
+        movq            xmm1,           QWORD PTR [rsi]     ;
+        movq            xmm2,           QWORD PTR [rsi+1]   ;
+        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2) horizontal line i+1
+
+        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the above
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+
+        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
+
+%if ABI_IS_32BIT
+        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
+        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
+%else
+        add             rsi, r8
+        add             rdi, r9
+%endif
+
+        sub             rcx,            1                   ;
+        jnz             .half_horiz_vert_variance8x_h_1     ;
+
+        movdq2q         mm6,            xmm6                ;
+        movdq2q         mm7,            xmm7                ;
+
+        psrldq          xmm6,           8
+        psrldq          xmm7,           8
+
+        movdq2q         mm2,            xmm6
+        movdq2q         mm3,            xmm7
+
+        paddw           mm6,            mm2
+        paddd           mm7,            mm3
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rsi,            arg(5) ; sum
+        mov             rdi,            arg(6) ; sumsquared
+
+        movd            [rsi],          mm2                 ;
+        movd            [rdi],          mm4                 ;
+
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_half_horiz_vert_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp9_half_horiz_vert_variance16x_h_sse2)
+sym(vp9_half_horiz_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse accumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;
+
+        movdqu          xmm5,           XMMWORD PTR [rsi]
+        movdqu          xmm3,           XMMWORD PTR [rsi+1]
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) horizontal line 1
+
+        lea             rsi,            [rsi + rax]
+
+.half_horiz_vert_variance16x_h_1:
+        movdqu          xmm1,           XMMWORD PTR [rsi]     ;
+        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;
+        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2) horizontal line i+1
+
+        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the above
+
+        movdqa          xmm4,           xmm5
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+        punpckhbw       xmm4,           xmm0
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+
+        movq            xmm3,           QWORD PTR [rdi+8]
+        punpcklbw       xmm3,           xmm0
+        psubw           xmm4,           xmm3
+
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm4
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        pmaddwd         xmm4,           xmm4
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm4
+
+        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1                   ;
+        jnz             .half_horiz_vert_variance16x_h_1    ;
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_half_vert_variance8x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp9_half_vert_variance8x_h_sse2)
+sym(vp9_half_vert_variance8x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+%if ABI_IS_32BIT=0
+    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
+    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse accumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;
+.half_vert_variance8x_h_1:
+        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = row i:   s0,s1,s2..s7
+        movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = row i+1: t0,t1,t2..t7
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
+        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
+%else
+        add             rsi, r8
+        add             rdi, r9
+%endif
+
+        sub             rcx,            1                   ;
+        jnz             .half_vert_variance8x_h_1          ;
+
+        movdq2q         mm6,            xmm6                ;
+        movdq2q         mm7,            xmm7                ;
+
+        psrldq          xmm6,           8
+        psrldq          xmm7,           8
+
+        movdq2q         mm2,            xmm6
+        movdq2q         mm3,            xmm7
+
+        paddw           mm6,            mm2
+        paddd           mm7,            mm3
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rsi,            arg(5) ; sum
+        mov             rdi,            arg(6) ; sumsquared
+
+        movd            [rsi],          mm2                 ;
+        movd            [rdi],          mm4                 ;
+
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_half_vert_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp9_half_vert_variance16x_h_sse2)
+sym(vp9_half_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse eaccumulator
+        mov             rsi,            arg(0)              ;ref_ptr
+
+        mov             rdi,            arg(2)              ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)    ;Height
+        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        movdqu          xmm5,           XMMWORD PTR [rsi]
+        lea             rsi,            [rsi + rax]
+        pxor            xmm0,           xmm0
+
+.half_vert_variance16x_h_1:
+        movdqu          xmm3,           XMMWORD PTR [rsi]
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
+        movdqa          xmm4,           xmm5
+        punpcklbw       xmm5,           xmm0
+        punpckhbw       xmm4,           xmm0
+
+        movq            xmm2,           QWORD PTR [rdi]
+        punpcklbw       xmm2,           xmm0
+        psubw           xmm5,           xmm2
+        movq            xmm2,           QWORD PTR [rdi+8]
+        punpcklbw       xmm2,           xmm0
+        psubw           xmm4,           xmm2
+
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm4
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        pmaddwd         xmm4,           xmm4
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm4
+
+        movdqa          xmm5,           xmm3
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1
+        jnz             .half_vert_variance16x_h_1
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp9_half_horiz_variance8x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp9_half_horiz_variance8x_h_sse2)
+sym(vp9_half_horiz_variance8x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+%if ABI_IS_32BIT=0
+    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
+    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse accumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+
+        pxor            xmm0,           xmm0                ;
+.half_horiz_variance8x_h_1:
+        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s7
+        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s8
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
+        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
+%else
+        add             rsi, r8
+        add             rdi, r9
+%endif
+        sub             rcx,            1                   ;
+        jnz             .half_horiz_variance8x_h_1          ;
+
+        movdq2q         mm6,            xmm6                ;
+        movdq2q         mm7,            xmm7                ;
+
+        psrldq          xmm6,           8
+        psrldq          xmm7,           8
+
+        movdq2q         mm2,            xmm6
+        movdq2q         mm3,            xmm7
+
+        paddw           mm6,            mm2
+        paddd           mm7,            mm3
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rsi,            arg(5) ; sum
+        mov             rdi,            arg(6) ; sumsquared
+
+        movd            [rsi],          mm2                 ;
+        movd            [rdi],          mm4                 ;
+
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_half_horiz_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp9_half_horiz_variance16x_h_sse2)
+sym(vp9_half_horiz_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse accumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;
+
+.half_horiz_variance16x_h_1:
+        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
+        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
+        movdqa          xmm1,           xmm5
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+        punpckhbw       xmm1,           xmm0
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+        movq            xmm2,           QWORD PTR [rdi+8]
+        punpcklbw       xmm2,           xmm0
+
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+        psubw           xmm1,           xmm2
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm1
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        pmaddwd         xmm1,           xmm1
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm1
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1                   ;
+        jnz             .half_horiz_variance16x_h_1         ;
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
+align 16
+xmm_bi_rd:
+    times 8 dw 64
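+; Each 32-byte row below holds the two taps for one of the 16 subpel
+; offsets: eight copies of (128 - 8*offset) followed by eight copies of
+; 8*offset, so the taps of a row always sum to 128 (unity gain after
+; the shift by xmm_filter_shift).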
+align 16
+bilinear_filters_sse2:
+    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
+    dw 120, 120, 120, 120, 120, 120, 120, 120,  8,  8,  8,  8,  8,  8,  8,  8
+    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
+    dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
+    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
+    dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
+    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
+    dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
+    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+    dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
+    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
+    dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
+    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
+    dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
+    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
+    dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
--- /dev/null
+++ b/vp9/encoder/x86/variance_impl_ssse3.asm
@@ -1,0 +1,372 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift            7
+
+
+;void vp9_filter_block2d_bil_var_ssse3
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int  xoffset,
+;    int  yoffset,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+;Note: the filter coefficient at offset=0 is 128, which does not fit in
+;the signed-byte operand that pmaddubsw expects for its second source, so
+;the zero-offset (full-pixel) case must be calculated separately.
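+;
+; What pmaddubsw computes per output word here (a sketch): adjacent
+; pixels are interleaved so each word lane evaluates
+;
+;    out[c] = ref[c] * tap0 + ref[c + 1] * tap1;   /* ref unsigned, taps signed */
+;
+; which is the horizontal 2-tap blend in a single instruction; the
+; rounding add of xmm_bi_rd and the shift by 7 follow as usual.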
+global sym(vp9_filter_block2d_bil_var_ssse3)
+sym(vp9_filter_block2d_bil_var_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6
+        pxor            xmm7,           xmm7
+
+        lea             rcx,            [GLOBAL(bilinear_filters_ssse3)]
+        movsxd          rax,            dword ptr arg(5)     ; xoffset
+
+        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
+        je              .filter_block2d_bil_var_ssse3_sp_only
+
+        shl             rax,            4                    ; point to filter coeff with xoffset
+        lea             rax,            [rax + rcx]          ; HFilter
+
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
+        je              .filter_block2d_bil_var_ssse3_fp_only
+
+        shl             rdx,            4
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+
+        movdqu          xmm0,           XMMWORD PTR [rsi]
+        movdqu          xmm1,           XMMWORD PTR [rsi+1]
+        movdqa          xmm2,           xmm0
+
+        punpcklbw       xmm0,           xmm1
+        punpckhbw       xmm2,           xmm1
+        pmaddubsw       xmm0,           [rax]
+        pmaddubsw       xmm2,           [rax]
+
+        paddw           xmm0,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm0,           xmm_filter_shift
+        psraw           xmm2,           xmm_filter_shift
+
+        packuswb        xmm0,           xmm2
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+        lea             rsi,            [rsi + r8]
+%endif
+
+.filter_block2d_bil_var_ssse3_loop:
+        movdqu          xmm1,           XMMWORD PTR [rsi]
+        movdqu          xmm2,           XMMWORD PTR [rsi+1]
+        movdqa          xmm3,           xmm1
+
+        punpcklbw       xmm1,           xmm2
+        punpckhbw       xmm3,           xmm2
+        pmaddubsw       xmm1,           [rax]
+        pmaddubsw       xmm3,           [rax]
+
+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm1,           xmm_filter_shift
+        psraw           xmm3,           xmm_filter_shift
+        packuswb        xmm1,           xmm3
+
+        movdqa          xmm2,           xmm0
+        movdqa          xmm0,           xmm1
+        movdqa          xmm3,           xmm2
+
+        punpcklbw       xmm2,           xmm1
+        punpckhbw       xmm3,           xmm1
+        pmaddubsw       xmm2,           [rdx]
+        pmaddubsw       xmm3,           [rdx]
+
+        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm2,           xmm_filter_shift
+        psraw           xmm3,           xmm_filter_shift
+
+        movq            xmm1,           QWORD PTR [rdi]
+        pxor            xmm4,           xmm4
+        punpcklbw       xmm1,           xmm4
+        movq            xmm5,           QWORD PTR [rdi+8]
+        punpcklbw       xmm5,           xmm4
+
+        psubw           xmm2,           xmm1
+        psubw           xmm3,           xmm5
+        paddw           xmm6,           xmm2
+        paddw           xmm6,           xmm3
+        pmaddwd         xmm2,           xmm2
+        pmaddwd         xmm3,           xmm3
+        paddd           xmm7,           xmm2
+        paddd           xmm7,           xmm3
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
+%else
+        lea             rsi,            [rsi + r8]
+        lea             rdi,            [rdi + r9]
+%endif
+
+        sub             rcx,            1
+        jnz             .filter_block2d_bil_var_ssse3_loop
+
+        jmp             .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_sp_only:
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; skip all if both xoffset=0 and yoffset=0
+        je              .filter_block2d_bil_var_ssse3_full_pixel
+
+        shl             rdx,            4
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        movdqu          xmm1,           XMMWORD PTR [rsi]
+        movdqa          xmm0,           xmm1
+
+%if ABI_IS_32BIT=0
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+        lea             rsi,            [rsi + rax]
+
+.filter_block2d_bil_sp_only_loop:
+        movdqu          xmm3,           XMMWORD PTR [rsi]
+        movdqa          xmm2,           xmm1
+        movdqa          xmm0,           xmm3
+
+        punpcklbw       xmm1,           xmm3
+        punpckhbw       xmm2,           xmm3
+        pmaddubsw       xmm1,           [rdx]
+        pmaddubsw       xmm2,           [rdx]
+
+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm1,           xmm_filter_shift
+        psraw           xmm2,           xmm_filter_shift
+
+        movq            xmm3,           QWORD PTR [rdi]
+        pxor            xmm4,           xmm4
+        punpcklbw       xmm3,           xmm4
+        movq            xmm5,           QWORD PTR [rdi+8]
+        punpcklbw       xmm5,           xmm4
+
+        psubw           xmm1,           xmm3
+        psubw           xmm2,           xmm5
+        paddw           xmm6,           xmm1
+        paddw           xmm6,           xmm2
+        pmaddwd         xmm1,           xmm1
+        pmaddwd         xmm2,           xmm2
+        paddd           xmm7,           xmm1
+        paddd           xmm7,           xmm2
+
+        movdqa          xmm1,           xmm0
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+
+%if ABI_IS_32BIT
+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
+%else
+        lea             rdi,            [rdi + r9]
+%endif
+
+        sub             rcx,            1
+        jnz             .filter_block2d_bil_sp_only_loop
+
+        jmp             .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_full_pixel:
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
+        pxor            xmm0,           xmm0
+
+.filter_block2d_bil_full_pixel_loop:
+        movq            xmm1,           QWORD PTR [rsi]
+        punpcklbw       xmm1,           xmm0
+        movq            xmm2,           QWORD PTR [rsi+8]
+        punpcklbw       xmm2,           xmm0
+
+        movq            xmm3,           QWORD PTR [rdi]
+        punpcklbw       xmm3,           xmm0
+        movq            xmm4,           QWORD PTR [rdi+8]
+        punpcklbw       xmm4,           xmm0
+
+        psubw           xmm1,           xmm3
+        psubw           xmm2,           xmm4
+        paddw           xmm6,           xmm1
+        paddw           xmm6,           xmm2
+        pmaddwd         xmm1,           xmm1
+        pmaddwd         xmm2,           xmm2
+        paddd           xmm7,           xmm1
+        paddd           xmm7,           xmm2
+
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
+        sub             rcx,            1
+        jnz             .filter_block2d_bil_full_pixel_loop
+
+        jmp             .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_fp_only:
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0
+
+%if ABI_IS_32BIT=0
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+.filter_block2d_bil_fp_only_loop:
+        movdqu          xmm1,           XMMWORD PTR [rsi]
+        movdqu          xmm2,           XMMWORD PTR [rsi+1]
+        movdqa          xmm3,           xmm1
+
+        punpcklbw       xmm1,           xmm2
+        punpckhbw       xmm3,           xmm2
+        pmaddubsw       xmm1,           [rax]
+        pmaddubsw       xmm3,           [rax]
+
+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm1,           xmm_filter_shift
+        psraw           xmm3,           xmm_filter_shift
+
+        movq            xmm2,           QWORD PTR [rdi]
+        pxor            xmm4,           xmm4
+        punpcklbw       xmm2,           xmm4
+        movq            xmm5,           QWORD PTR [rdi+8]
+        punpcklbw       xmm5,           xmm4
+
+        psubw           xmm1,           xmm2
+        psubw           xmm3,           xmm5
+        paddw           xmm6,           xmm1
+        paddw           xmm6,           xmm3
+        pmaddwd         xmm1,           xmm1
+        pmaddwd         xmm3,           xmm3
+        paddd           xmm7,           xmm1
+        paddd           xmm7,           xmm3
+
+        lea             rsi,            [rsi + rdx]
+%if ABI_IS_32BIT
+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
+%else
+        lea             rdi,            [rdi + r9]
+%endif
+
+        sub             rcx,            1
+        jnz             .filter_block2d_bil_fp_only_loop
+
+        jmp             .filter_block2d_bil_variance
+
+.filter_block2d_bil_variance:
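+        ; xmm6 holds the running sum of the signed pixel differences as
+        ; eight words and xmm7 the running sum of squares as four
+        ; dwords; sign-extend and reduce both horizontally, then store
+        ; the results through the Sum and SSE pointers.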
+        pxor        xmm0,           xmm0
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(7) ;[Sum]
+        mov         rdi,            arg(8) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+xmm_bi_rd:
+    times 8 dw 64
+align 16
+bilinear_filters_ssse3:
+    times 8 db 128, 0
+    times 8 db 120, 8
+    times 8 db 112, 16
+    times 8 db 104, 24
+    times 8 db  96, 32
+    times 8 db  88, 40
+    times 8 db  80, 48
+    times 8 db  72, 56
+    times 8 db  64, 64
+    times 8 db  56, 72
+    times 8 db  48, 80
+    times 8 db  40, 88
+    times 8 db  32, 96
+    times 8 db  24, 104
+    times 8 db  16, 112
+    times 8 db   8, 120
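+; Layout note: row k of bilinear_filters_ssse3 packs eight
+; (128 - 8k, 8k) byte pairs, matching pmaddubsw's pairwise
+; unsigned-by-signed multiply-add. The two taps in each pair sum to
+; 128, so the filter has unity gain at 7-bit precision; xmm_bi_rd (64)
+; is the round-to-nearest bias for the arithmetic shift that follows,
+; assuming xmm_filter_shift is the usual 7 used by libvpx's bilinear
+; filters.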
--- /dev/null
+++ b/vp9/encoder/x86/variance_mmx.c
@@ -1,0 +1,406 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern void filter_block1d_h6_mmx
+(
+  const unsigned char *src_ptr,
+  unsigned short *output_ptr,
+  unsigned int src_pixels_per_line,
+  unsigned int pixel_step,
+  unsigned int output_height,
+  unsigned int output_width,
+  short *vp7_filter
+);
+extern void filter_block1d_v6_mmx
+(
+  const short *src_ptr,
+  unsigned char *output_ptr,
+  unsigned int pixels_per_line,
+  unsigned int pixel_step,
+  unsigned int output_height,
+  unsigned int output_width,
+  short *vp7_filter
+);
+
+extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
+extern unsigned int vp9_get8x8var_mmx
+(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *SSE,
+  int *Sum
+);
+extern unsigned int vp9_get4x4var_mmx
+(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *SSE,
+  int *Sum
+);
+extern void vp9_filter_block2d_bil4x4_var_mmx
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  const short *HFilter,
+  const short *VFilter,
+  int *sum,
+  unsigned int *sumsquared
+);
+extern void vp9_filter_block2d_bil_var_mmx
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  const short *HFilter,
+  const short *VFilter,
+  int *sum,
+  unsigned int *sumsquared
+);
+
+
+unsigned int vp9_variance4x4_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 4));
+
+}
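+/* The get*var helpers return the sum and the sum of squares of the
+ * pixel differences; the variance of an N-pixel block is then
+ * SSE - Sum*Sum/N, with the division by N approximated by a right
+ * shift of log2(N): >> 4 above for the 16 pixels of a 4x4 block, and
+ * >> 6, >> 7 and >> 8 for the 8x8, 16x8/8x16 and 16x16 sizes below. */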
+
+unsigned int vp9_variance8x8_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
+  *sse = var;
+
+  return (var - ((avg * avg) >> 6));
+
+}
+
+unsigned int vp9_mse16x16_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int sse0, sse1, sse2, sse3, var;
+  int sum0, sum1, sum2, sum3;
+
+
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+  var = sse0 + sse1 + sse2 + sse3;
+  *sse = var;
+  return var;
+}
+
+
+unsigned int vp9_variance16x16_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int sse0, sse1, sse2, sse3, var;
+  int sum0, sum1, sum2, sum3, avg;
+
+
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+  var = sse0 + sse1 + sse2 + sse3;
+  avg = sum0 + sum1 + sum2 + sum3;
+  *sse = var;
+  return (var - ((avg * avg) >> 8));
+}
+
+unsigned int vp9_variance16x8_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int sse0, sse1, var;
+  int sum0, sum1, avg;
+
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+  var = sse0 + sse1;
+  avg = sum0 + sum1;
+  *sse = var;
+  return (var - ((avg * avg) >> 7));
+
+}
+
+
+unsigned int vp9_variance8x16_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int sse0, sse1, var;
+  int sum0, sum1, avg;
+
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
+
+  var = sse0 + sse1;
+  avg = sum0 + sum1;
+  *sse = var;
+
+  return (var - ((avg * avg) >> 7));
+
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// the mmx function that does the bilinear filtering and var calculation //
+// in one pass                                                           //
+///////////////////////////////////////////////////////////////////////////
+DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
+  { 128, 128, 128, 128,  0,  0,  0,  0 },
+  { 120, 120, 120, 120,  8,  8,  8,  8 },
+  { 112, 112, 112, 112, 16, 16, 16, 16 },
+  { 104, 104, 104, 104, 24, 24, 24, 24 },
+  {  96, 96, 96, 96, 32, 32, 32, 32 },
+  {  88, 88, 88, 88, 40, 40, 40, 40 },
+  {  80, 80, 80, 80, 48, 48, 48, 48 },
+  {  72, 72, 72, 72, 56, 56, 56, 56 },
+  {  64, 64, 64, 64, 64, 64, 64, 64 },
+  {  56, 56, 56, 56, 72, 72, 72, 72 },
+  {  48, 48, 48, 48, 80, 80, 80, 80 },
+  {  40, 40, 40, 40, 88, 88, 88, 88 },
+  {  32, 32, 32, 32, 96, 96, 96, 96 },
+  {  24, 24, 24, 24, 104, 104, 104, 104 },
+  {  16, 16, 16, 16, 112, 112, 112, 112 },
+  {   8,  8,  8,  8, 120, 120, 120, 120 }
+};
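+/* Row i holds the two bilinear taps (128 - 8*i, 8*i), each replicated
+ * four times so a single 64-bit load fills an MMX register with one
+ * coefficient. In scalar terms (variable names illustrative only),
+ * each filtered pixel is computed as
+ *
+ *   out = (a * tap0 + b * tap1 + 64) >> 7;
+ *
+ * where a and b are the two neighbouring source pixels; the taps sum
+ * to 128, and 64 rounds the 7-bit fixed-point product to nearest. */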
+
+unsigned int vp9_sub_pixel_variance4x4_mmx
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse)
+
+{
+  int xsum;
+  unsigned int xxsum;
+  vp9_filter_block2d_bil4x4_var_mmx(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line,
+    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+    &xsum, &xxsum
+  );
+  *sse = xxsum;
+  return (xxsum - ((xsum * xsum) >> 4));
+}
+
+
+unsigned int vp9_sub_pixel_variance8x8_mmx
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+
+  int xsum;
+  unsigned int xxsum;
+  vp9_filter_block2d_bil_var_mmx(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line, 8,
+    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+    &xsum, &xxsum
+  );
+  *sse = xxsum;
+  return (xxsum - ((xsum * xsum) >> 6));
+}
+
+unsigned int vp9_sub_pixel_variance16x16_mmx
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+
+  int xsum0, xsum1;
+  unsigned int xxsum0, xxsum1;
+
+  vp9_filter_block2d_bil_var_mmx(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line, 16,
+    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+    &xsum0, &xxsum0
+  );
+
+  vp9_filter_block2d_bil_var_mmx(
+    src_ptr + 8, src_pixels_per_line,
+    dst_ptr + 8, dst_pixels_per_line, 16,
+    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+    &xsum1, &xxsum1
+  );
+
+  xsum0 += xsum1;
+  xxsum0 += xxsum1;
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+unsigned int vp9_sub_pixel_mse16x16_mmx(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  vp9_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+  return *sse;
+}
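+/* The sub-pixel MSE reuses the sub-pixel variance kernel above but
+ * returns the raw SSE, i.e. without subtracting the squared-mean
+ * term. */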
+
+unsigned int vp9_sub_pixel_variance16x8_mmx
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum0, xsum1;
+  unsigned int xxsum0, xxsum1;
+
+
+  vp9_filter_block2d_bil_var_mmx(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line, 8,
+    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+    &xsum0, &xxsum0
+  );
+
+
+  vp9_filter_block2d_bil_var_mmx(
+    src_ptr + 8, src_pixels_per_line,
+    dst_ptr + 8, dst_pixels_per_line, 8,
+    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+    &xsum1, &xxsum1
+  );
+
+  xsum0 += xsum1;
+  xxsum0 += xxsum1;
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp9_sub_pixel_variance8x16_mmx
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum;
+  unsigned int xxsum;
+  vp9_filter_block2d_bil_var_mmx(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line, 16,
+    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+    &xsum, &xxsum
+  );
+  *sse = xxsum;
+  return (xxsum - ((xsum * xsum) >> 7));
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_h_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 0,
+                                         ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_v_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 8,
+                                         ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_hv_mmx(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 8,
+                                         ref_ptr, recon_stride, sse);
+}
--- /dev/null
+++ b/vp9/encoder/x86/variance_sse2.c
@@ -1,0 +1,517 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+#define HALFNDX 8
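+/* Offset index 8 of 16 is the exact half-pixel position, where the
+ * bilinear taps degenerate to (64, 64) -- a plain two-pixel average --
+ * so the functions below dispatch to the specialized half-pel kernels
+ * instead of the general bilinear filter. */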
+
+extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+
+extern void vp9_filter_block2d_bil4x4_var_mmx
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  const short *HFilter,
+  const short *VFilter,
+  int *sum,
+  unsigned int *sumsquared
+);
+
+extern unsigned int vp9_get4x4var_mmx
+(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *SSE,
+  int *Sum
+);
+
+unsigned int vp9_get_mb_ss_sse2
+(
+  const short *src_ptr
+);
+unsigned int vp9_get16x16var_sse2
+(
+  const unsigned char *src_ptr,
+  int source_stride,
+  const unsigned char *ref_ptr,
+  int recon_stride,
+  unsigned int *SSE,
+  int *Sum
+);
+unsigned int vp9_get8x8var_sse2
+(
+  const unsigned char *src_ptr,
+  int source_stride,
+  const unsigned char *ref_ptr,
+  int recon_stride,
+  unsigned int *SSE,
+  int *Sum
+);
+void vp9_filter_block2d_bil_var_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int  xoffset,
+  int  yoffset,
+  int *sum,
+  unsigned int *sumsquared
+);
+void vp9_half_horiz_vert_variance8x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+void vp9_half_horiz_vert_variance16x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+void vp9_half_horiz_variance8x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+void vp9_half_horiz_variance16x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+void vp9_half_vert_variance8x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+void vp9_half_vert_variance16x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+
+DECLARE_ALIGNED(16, extern short, vp9_bilinear_filters_mmx[16][8]);
+
+unsigned int vp9_variance4x4_wmt(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 4));
+
+}
+
+unsigned int vp9_variance8x8_wmt
+(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
+  *sse = var;
+  return (var - ((avg * avg) >> 6));
+
+}
+
+
+unsigned int vp9_variance16x16_wmt
+(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int sse0;
+  int sum0;
+
+
+  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+  *sse = sse0;
+  return (sse0 - ((sum0 * sum0) >> 8));
+}
+unsigned int vp9_mse16x16_wmt(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+
+  unsigned int sse0;
+  int sum0;
+  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+  *sse = sse0;
+  return sse0;
+
+}
+
+
+unsigned int vp9_variance16x8_wmt
+(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int sse0, sse1, var;
+  int sum0, sum1, avg;
+
+  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+  vp9_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+  var = sse0 + sse1;
+  avg = sum0 + sum1;
+  *sse = var;
+  return (var - ((avg * avg) >> 7));
+
+}
+
+unsigned int vp9_variance8x16_wmt
+(
+  const unsigned char *src_ptr,
+  int  source_stride,
+  const unsigned char *ref_ptr,
+  int  recon_stride,
+  unsigned int *sse) {
+  unsigned int sse0, sse1, var;
+  int sum0, sum1, avg;
+
+  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+  vp9_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
+
+  var = sse0 + sse1;
+  avg = sum0 + sum1;
+  *sse = var;
+  return (var - ((avg * avg) >> 7));
+
+}
+
+unsigned int vp9_sub_pixel_variance4x4_wmt
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum;
+  unsigned int xxsum;
+  vp9_filter_block2d_bil4x4_var_mmx(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line,
+    vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
+    &xsum, &xxsum
+  );
+  *sse = xxsum;
+  return (xxsum - ((xsum * xsum) >> 4));
+}
+
+
+unsigned int vp9_sub_pixel_variance8x8_wmt
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum;
+  unsigned int xxsum;
+
+  if (xoffset == HALFNDX && yoffset == 0) {
+    vp9_half_horiz_variance8x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum, &xxsum);
+  } else if (xoffset == 0 && yoffset == HALFNDX) {
+    vp9_half_vert_variance8x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum, &xxsum);
+  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+    vp9_half_horiz_vert_variance8x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum, &xxsum);
+  } else {
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      xoffset, yoffset,
+      &xsum, &xxsum);
+  }
+
+  *sse = xxsum;
+  return (xxsum - ((xsum * xsum) >> 6));
+}
+
+unsigned int vp9_sub_pixel_variance16x16_wmt
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum0, xsum1;
+  unsigned int xxsum0, xxsum1;
+
+
+  // Note: these if statements could be avoided if the calling function
+  // dispatched directly to the appropriate specialized kernel.
+  if (xoffset == HALFNDX && yoffset == 0) {
+    vp9_half_horiz_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum0, &xxsum0);
+  } else if (xoffset == 0 && yoffset == HALFNDX) {
+    vp9_half_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum0, &xxsum0);
+  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+    vp9_half_horiz_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum0, &xxsum0);
+  } else {
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      xoffset, yoffset,
+      &xsum0, &xxsum0
+    );
+
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr + 8, src_pixels_per_line,
+      dst_ptr + 8, dst_pixels_per_line, 16,
+      xoffset, yoffset,
+      &xsum1, &xxsum1
+    );
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+  }
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+unsigned int vp9_sub_pixel_mse16x16_wmt(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  vp9_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+  return *sse;
+}
+
+unsigned int vp9_sub_pixel_variance16x8_wmt
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum0, xsum1;
+  unsigned int xxsum0, xxsum1;
+
+  if (xoffset == HALFNDX && yoffset == 0) {
+    vp9_half_horiz_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum0, &xxsum0);
+  } else if (xoffset == 0 && yoffset == HALFNDX) {
+    vp9_half_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum0, &xxsum0);
+  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+    vp9_half_horiz_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum0, &xxsum0);
+  } else {
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      xoffset, yoffset,
+      &xsum0, &xxsum0);
+
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr + 8, src_pixels_per_line,
+      dst_ptr + 8, dst_pixels_per_line, 8,
+      xoffset, yoffset,
+      &xsum1, &xxsum1);
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+  }
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp9_sub_pixel_variance8x16_wmt
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum;
+  unsigned int xxsum;
+
+  if (xoffset == HALFNDX && yoffset == 0) {
+    vp9_half_horiz_variance8x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum, &xxsum);
+  } else if (xoffset == 0 && yoffset == HALFNDX) {
+    vp9_half_vert_variance8x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum, &xxsum);
+  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+    vp9_half_horiz_vert_variance8x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum, &xxsum);
+  } else {
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      xoffset, yoffset,
+      &xsum, &xxsum);
+  }
+
+  *sse = xxsum;
+  return (xxsum - ((xsum * xsum) >> 7));
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_h_wmt(
+  const unsigned char *src_ptr,
+  int  src_pixels_per_line,
+  const unsigned char *dst_ptr,
+  int  dst_pixels_per_line,
+  unsigned int *sse) {
+  int xsum0;
+  unsigned int xxsum0;
+
+  vp9_half_horiz_variance16x_h_sse2(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line, 16,
+    &xsum0, &xxsum0);
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_v_wmt(
+  const unsigned char *src_ptr,
+  int  src_pixels_per_line,
+  const unsigned char *dst_ptr,
+  int  dst_pixels_per_line,
+  unsigned int *sse) {
+  int xsum0;
+  unsigned int xxsum0;
+  vp9_half_vert_variance16x_h_sse2(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line, 16,
+    &xsum0, &xxsum0);
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp9_variance_halfpixvar16x16_hv_wmt(
+  const unsigned char *src_ptr,
+  int  src_pixels_per_line,
+  const unsigned char *dst_ptr,
+  int  dst_pixels_per_line,
+  unsigned int *sse) {
+  int xsum0;
+  unsigned int xxsum0;
+
+  vp9_half_horiz_vert_variance16x_h_sse2(
+    src_ptr, src_pixels_per_line,
+    dst_ptr, dst_pixels_per_line, 16,
+    &xsum0, &xxsum0);
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
--- /dev/null
+++ b/vp9/encoder/x86/variance_ssse3.c
@@ -1,0 +1,151 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+#define HALFNDX 8
+
+extern unsigned int vp9_get16x16var_sse2
+(
+  const unsigned char *src_ptr,
+  int source_stride,
+  const unsigned char *ref_ptr,
+  int recon_stride,
+  unsigned int *SSE,
+  int *Sum
+);
+extern void vp9_half_horiz_vert_variance16x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+extern void vp9_half_horiz_variance16x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+extern void vp9_half_vert_variance16x_h_sse2
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int *sum,
+  unsigned int *sumsquared
+);
+extern void vp9_filter_block2d_bil_var_ssse3
+(
+  const unsigned char *ref_ptr,
+  int ref_pixels_per_line,
+  const unsigned char *src_ptr,
+  int src_pixels_per_line,
+  unsigned int Height,
+  int  xoffset,
+  int  yoffset,
+  int *sum,
+  unsigned int *sumsquared
+);
+
+unsigned int vp9_sub_pixel_variance16x16_ssse3
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum0;
+  unsigned int xxsum0;
+
+  // Note: these if statements could be avoided if the calling function
+  // dispatched directly to the appropriate specialized kernel.
+  if (xoffset == HALFNDX && yoffset == 0) {
+    vp9_half_horiz_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum0, &xxsum0);
+  } else if (xoffset == 0 && yoffset == HALFNDX) {
+    vp9_half_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum0, &xxsum0);
+  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+    vp9_half_horiz_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      &xsum0, &xxsum0);
+  } else {
+    vp9_filter_block2d_bil_var_ssse3(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 16,
+      xoffset, yoffset,
+      &xsum0, &xxsum0);
+  }
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+unsigned int vp9_sub_pixel_variance16x8_ssse3
+(
+  const unsigned char  *src_ptr,
+  int  src_pixels_per_line,
+  int  xoffset,
+  int  yoffset,
+  const unsigned char *dst_ptr,
+  int dst_pixels_per_line,
+  unsigned int *sse
+) {
+  int xsum0;
+  unsigned int xxsum0;
+
+  if (xoffset == HALFNDX && yoffset == 0) {
+    vp9_half_horiz_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum0, &xxsum0);
+  } else if (xoffset == 0 && yoffset == HALFNDX) {
+    vp9_half_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum0, &xxsum0);
+  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+    vp9_half_horiz_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      &xsum0, &xxsum0);
+  } else {
+    vp9_filter_block2d_bil_var_ssse3(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 8,
+      xoffset, yoffset,
+      &xsum0, &xxsum0);
+  }
+
+  *sse = xxsum0;
+  return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
--- /dev/null
+++ b/vp9/encoder/x86/x86_csystemdependent.c
@@ -1,0 +1,114 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/x86.h"
+#include "vp9/encoder/variance.h"
+#include "vp9/encoder/onyx_int.h"
+
+
+#if HAVE_MMX
+void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
+  vp9_short_fdct4x4_mmx(input,   output,    pitch);
+  vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch);
+}
+
+int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp9_mbblock_error_mmx(MACROBLOCK *mb, int dc) {
+  short *coeff_ptr =  mb->block[0].coeff;
+  short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
+  return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+int vp9_mbuverror_mmx(MACROBLOCK *mb) {
+  short *s_ptr = &mb->coeff[256];
+  short *d_ptr = &mb->e_mbd.dqcoeff[256];
+  return vp9_mbuverror_mmx_impl(s_ptr, d_ptr);
+}
+
+void vp9_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
+                             short *diff, unsigned char *predictor,
+                             int pitch);
+void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) {
+  unsigned char *z = *(be->base_src) + be->src;
+  unsigned int  src_stride = be->src_stride;
+  short *diff = &be->src_diff[0];
+  unsigned char *predictor = &bd->predictor[0];
+  vp9_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
+}
+
+#endif
+
+#if HAVE_SSE2
+int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp9_mbblock_error_xmm(MACROBLOCK *mb, int dc) {
+  short *coeff_ptr =  mb->block[0].coeff;
+  short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
+  return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+int vp9_mbuverror_xmm(MACROBLOCK *mb) {
+  short *s_ptr = &mb->coeff[256];
+  short *d_ptr = &mb->e_mbd.dqcoeff[256];
+  return vp9_mbuverror_xmm_impl(s_ptr, d_ptr);
+}
+
+void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
+                              short *diff, unsigned char *predictor,
+                              int pitch);
+void vp9_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) {
+  unsigned char *z = *(be->base_src) + be->src;
+  unsigned int  src_stride = be->src_stride;
+  short *diff = &be->src_diff[0];
+  unsigned char *predictor = &bd->predictor[0];
+  vp9_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
+}
+
+#endif
+
+void vp9_arch_x86_encoder_init(VP9_COMP *cpi) {
+#if CONFIG_RUNTIME_CPU_DETECT
+  int flags = x86_simd_caps();
+
+  /* Note:
+   *
+   * This platform can be built without runtime CPU detection as well. If
+   * you modify any of the function mappings present in this file, be sure
+   * to also update them in the static mappings (<arch>/filename_<arch>.h)
+   */
+
+  /* Override default functions with fastest ones for this CPU. */
+#if HAVE_SSE2
+  if (flags & HAS_SSE2) {
+    cpi->rtcd.temporal.apply                 = vp9_temporal_filter_apply_sse2;
+
+  }
+#endif
+
+#if HAVE_SSE3
+  if (flags & HAS_SSE3) {
+    cpi->rtcd.search.full_search             = vp9_full_search_sadx3;
+    cpi->rtcd.search.diamond_search          = vp9_diamond_search_sadx4;
+    cpi->rtcd.search.refining_search         = vp9_refining_search_sadx4;
+  }
+#endif
+
+
+#if HAVE_SSE4_1
+  if (flags & HAS_SSE4_1) {
+    cpi->rtcd.search.full_search             = vp9_full_search_sadx8;
+  }
+#endif
+
+#endif
+}
--- /dev/null
+++ b/vp9/exports_dec
@@ -1,0 +1,2 @@
+data vpx_codec_vp8_dx_algo
+text vpx_codec_vp8_dx
--- /dev/null
+++ b/vp9/exports_enc
@@ -1,0 +1,4 @@
+data vpx_codec_vp8_cx_algo
+text vpx_codec_vp8_cx
+data vpx_codec_vp8x_cx_algo
+text vpx_codec_vp8x_cx
--- /dev/null
+++ b/vp9/vp9_common.mk
@@ -1,0 +1,179 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+VP9_COMMON_SRCS-yes += vp9_common.mk
+VP9_COMMON_SRCS-yes += common/type_aliases.h
+VP9_COMMON_SRCS-yes += common/pragmas.h
+VP9_COMMON_SRCS-yes += common/ppflags.h
+VP9_COMMON_SRCS-yes += common/onyx.h
+VP9_COMMON_SRCS-yes += common/onyxd.h
+VP9_COMMON_SRCS-yes += common/alloccommon.c
+VP9_COMMON_SRCS-yes += common/asm_com_offsets.c
+VP9_COMMON_SRCS-yes += common/blockd.c
+VP9_COMMON_SRCS-yes += common/coefupdateprobs.h
+VP9_COMMON_SRCS-yes += common/debugmodes.c
+VP9_COMMON_SRCS-yes += common/entropy.c
+VP9_COMMON_SRCS-yes += common/entropymode.c
+VP9_COMMON_SRCS-yes += common/entropymv.c
+VP9_COMMON_SRCS-yes += common/extend.c
+VP9_COMMON_SRCS-yes += common/filter.c
+VP9_COMMON_SRCS-yes += common/filter.h
+VP9_COMMON_SRCS-yes += common/findnearmv.c
+VP9_COMMON_SRCS-yes += common/generic/systemdependent.c
+VP9_COMMON_SRCS-yes += common/idctllm.c
+VP9_COMMON_SRCS-yes += common/alloccommon.h
+VP9_COMMON_SRCS-yes += common/blockd.h
+VP9_COMMON_SRCS-yes += common/common.h
+VP9_COMMON_SRCS-yes += common/common_types.h
+VP9_COMMON_SRCS-yes += common/entropy.h
+VP9_COMMON_SRCS-yes += common/entropymode.h
+VP9_COMMON_SRCS-yes += common/entropymv.h
+VP9_COMMON_SRCS-yes += common/extend.h
+VP9_COMMON_SRCS-yes += common/findnearmv.h
+VP9_COMMON_SRCS-yes += common/header.h
+VP9_COMMON_SRCS-yes += common/idct.h
+VP9_COMMON_SRCS-yes += common/invtrans.h
+VP9_COMMON_SRCS-yes += common/loopfilter.h
+VP9_COMMON_SRCS-yes += common/modecont.h
+VP9_COMMON_SRCS-yes += common/mv.h
+VP9_COMMON_SRCS-yes += common/onyxc_int.h
+VP9_COMMON_SRCS-yes += common/pred_common.h
+VP9_COMMON_SRCS-yes += common/pred_common.c
+VP9_COMMON_SRCS-yes += common/quant_common.h
+VP9_COMMON_SRCS-yes += common/reconinter.h
+VP9_COMMON_SRCS-yes += common/reconintra.h
+VP9_COMMON_SRCS-yes += common/reconintra4x4.h
+VP9_COMMON_SRCS-yes += common/rtcd.c
+VP9_COMMON_SRCS-yes += common/rtcd_defs.sh
+VP9_COMMON_SRCS-yes += common/sadmxn.h
+VP9_COMMON_SRCS-yes += common/seg_common.h
+VP9_COMMON_SRCS-yes += common/seg_common.c
+VP9_COMMON_SRCS-yes += common/setupintrarecon.h
+VP9_COMMON_SRCS-yes += common/subpixel.h
+VP9_COMMON_SRCS-yes += common/swapyv12buffer.h
+VP9_COMMON_SRCS-yes += common/systemdependent.h
+VP9_COMMON_SRCS-yes += common/treecoder.h
+VP9_COMMON_SRCS-yes += common/invtrans.c
+VP9_COMMON_SRCS-yes += common/loopfilter.c
+VP9_COMMON_SRCS-yes += common/loopfilter_filters.c
+VP9_COMMON_SRCS-yes += common/mbpitch.c
+VP9_COMMON_SRCS-yes += common/modecont.c
+VP9_COMMON_SRCS-yes += common/modecontext.c
+VP9_COMMON_SRCS-yes += common/mvref_common.c
+VP9_COMMON_SRCS-yes += common/mvref_common.h
+VP9_COMMON_SRCS-yes += common/quant_common.c
+VP9_COMMON_SRCS-yes += common/recon.c
+VP9_COMMON_SRCS-yes += common/reconinter.c
+VP9_COMMON_SRCS-yes += common/reconintra.c
+VP9_COMMON_SRCS-yes += common/reconintra4x4.c
+VP9_COMMON_SRCS-yes += common/setupintrarecon.c
+VP9_COMMON_SRCS-yes += common/swapyv12buffer.c
+VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
+VP9_COMMON_SRCS-yes += common/treecoder.c
+VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/implicit_segmentation.c
+
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/idct_x86.h
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/subpixel_x86.h
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.h
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/postproc_x86.h
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/x86_systemdependent.c
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
+VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h
+VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_8t_ssse3.asm
+VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
+ifeq ($(CONFIG_POSTPROC),yes)
+VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
+endif
+
+# common (c)
+ifeq ($(CONFIG_CSM),yes)
+VP9_COMMON_SRCS-yes += common/maskingmv.c
+VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/mask_sse3.asm
+endif
+
+VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filter_sse4.c
+ifeq ($(HAVE_SSE4_1),yes)
+vp9/common/x86/filter_sse4.c.o: CFLAGS += -msse4
+endif
+
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/filter_sse2.c
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/sadmxn_x86.c
+ifeq ($(HAVE_SSE2),yes)
+vp9/common/x86/filter_sse2.c.o: CFLAGS += -msse2
+vp9/common/x86/loopfilter_x86.c.o: CFLAGS += -msse2
+vp9/common/x86/sadmxn_x86.c.o: CFLAGS += -msse2
+endif
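+# The per-object CFLAGS appends above let the intrinsics files build
+# with -msse2/-msse4 while the rest of the library keeps the baseline
+# ISA; whether that code actually runs is decided at run time (see
+# vp9_arch_x86_encoder_init).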
+
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/arm_systemdependent.c
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/bilinearfilter_arm.c
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/bilinearfilter_arm.h
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/filter_arm.c
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/idct_arm.h
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.c
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.h
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/recon_arm.h
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/reconintra_arm.c
+VP9_COMMON_SRCS-$(ARCH_ARM)  += common/arm/subpixel_arm.h
+
+# common (armv6)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/bilinearfilter_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/copymem8x4_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/copymem8x8_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/copymem16x16_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/dc_only_idct_add_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/iwalsh_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/filter_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/idct_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/loopfilter_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/recon_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/simpleloopfilter_v6$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/sixtappredict8x4_v6$(ASM)
+
+# common (neon)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict4x4_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict8x4_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict8x8_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict16x16_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/copymem8x4_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/copymem8x8_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/copymem16x16_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/dc_only_idct_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/iwalsh_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/loopfilter_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/mbloopfilter_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon2b_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon4b_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/reconb_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/shortidct4x4llm_1_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/shortidct4x4llm_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict4x4_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict8x4_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict8x8_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict16x16_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon16x16mb_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/save_neon_reg$(ASM)
+VP9_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon_neon.c
--- /dev/null
+++ b/vp9/vp9_cx_iface.c
@@ -1,0 +1,1169 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx/vpx_codec.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_version.h"
+#include "vp9/encoder/onyx_int.h"
+#include "vpx/vp8e.h"
+#include "vp9/encoder/firstpass.h"
+#include "vp9/common/onyx.h"
+#include <stdlib.h>
+#include <string.h>
+
+/* This value is a sentinel for determining whether the user has set a mode
+ * directly through the deprecated VP8E_SET_ENCODING_MODE control.
+ */
+#define NO_MODE_SET 255
+
+struct vp8_extracfg {
+  struct vpx_codec_pkt_list *pkt_list;
+  vp8e_encoding_mode          encoding_mode;               /** best, good, realtime */
+  int                         cpu_used;                    /** available cpu percentage in 1/16 */
+  unsigned int                enable_auto_alt_ref;         /** whether the encoder may use an alternate reference frame */
+  unsigned int                noise_sensitivity;
+  unsigned int                Sharpness;
+  unsigned int                static_thresh;
+  unsigned int                token_partitions;
+  unsigned int                arnr_max_frames;    /* alt_ref Noise Reduction Max Frame Count */
+  unsigned int                arnr_strength;    /* alt_ref Noise Reduction Strength */
+  unsigned int                arnr_type;        /* alt_ref filter type */
+  unsigned int                experimental;
+  vp8e_tuning                 tuning;
+  unsigned int                cq_level;         /* constrained quality level */
+  unsigned int                rc_max_intra_bitrate_pct;
+
+};
+
+struct extraconfig_map {
+  int                 usage;
+  struct vp8_extracfg cfg;
+};
+
+static const struct extraconfig_map extracfg_map[] = {
+  {
+    0,
+    {
+      NULL,
+      VP8_BEST_QUALITY_ENCODING,  /* Encoding Mode */
+      0,                          /* cpu_used      */
+      0,                          /* enable_auto_alt_ref */
+      0,                          /* noise_sensitivity */
+      0,                          /* Sharpness */
+      0,                          /* static_thresh */
+      VP8_ONE_TOKENPARTITION,     /* token_partitions */
+      0,                          /* arnr_max_frames */
+      3,                          /* arnr_strength */
+      3,                          /* arnr_type*/
+      0,                          /* experimental mode */
+      0,                          /* tuning*/
+      10,                         /* cq_level */
+      0,                          /* rc_max_intra_bitrate_pct */
+    }
+  }
+};
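+/* The defaults above are keyed on the usage value requested at init
+ * time; presumably the init path scans this map for a matching entry,
+ * as the VP8 wrapper this file derives from does. */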
+
+struct vpx_codec_alg_priv {
+  vpx_codec_priv_t        base;
+  vpx_codec_enc_cfg_t     cfg;
+  struct vp8_extracfg     vp8_cfg;
+  VP9_CONFIG              oxcf;
+  VP9_PTR             cpi;
+  unsigned char          *cx_data;
+  unsigned int            cx_data_sz;
+  vpx_image_t             preview_img;
+  unsigned int            next_frame_flag;
+  vp8_postproc_cfg_t      preview_ppcfg;
+  vpx_codec_pkt_list_decl(64) pkt_list;              // sized to accommodate the maximum number of lagged frames allowed
+  int                         deprecated_mode;
+  unsigned int                fixed_kf_cntr;
+};
+
+
+static vpx_codec_err_t
+update_error_state(vpx_codec_alg_priv_t                 *ctx,
+                   const struct vpx_internal_error_info *error) {
+  vpx_codec_err_t res;
+
+  if ((res = error->error_code))
+    ctx->base.err_detail = error->has_detail
+                           ? error->detail
+                           : NULL;
+
+  return res;
+}
+
+
+#undef ERROR
+#define ERROR(str) do {\
+    ctx->base.err_detail = str;\
+    return VPX_CODEC_INVALID_PARAM;\
+  } while(0)
+
+#define RANGE_CHECK(p,memb,lo,hi) do {\
+    if(!((p)->memb >= (lo) && (p)->memb <= (hi))) \
+      ERROR(#memb " out of range ["#lo".."#hi"]");\
+  } while(0)
+
+#define RANGE_CHECK_HI(p,memb,hi) do {\
+    if(!((p)->memb <= (hi))) \
+      ERROR(#memb " out of range [.."#hi"]");\
+  } while(0)
+
+#define RANGE_CHECK_LO(p,memb,lo) do {\
+    if(!((p)->memb >= (lo))) \
+      ERROR(#memb " out of range ["#lo"..]");\
+  } while(0)
+
+#define RANGE_CHECK_BOOL(p,memb) do {\
+    if(!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
+  } while(0)
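+/* Example: RANGE_CHECK_HI(cfg, rc_max_quantizer, 63) tests
+ * cfg->rc_max_quantizer <= 63 and, on failure, sets err_detail to
+ * "rc_max_quantizer out of range [..63]" and returns
+ * VPX_CODEC_INVALID_PARAM via ERROR(). */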
+
+static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
+                                       const vpx_codec_enc_cfg_t *cfg,
+                                       const struct vp8_extracfg *vp8_cfg) {
+  RANGE_CHECK(cfg, g_w,                   1, 16383); /* 14 bits available */
+  RANGE_CHECK(cfg, g_h,                   1, 16383); /* 14 bits available */
+  RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
+  RANGE_CHECK(cfg, g_timebase.num,        1, cfg->g_timebase.den);
+  RANGE_CHECK_HI(cfg, g_profile,          3);
+  RANGE_CHECK_HI(cfg, rc_max_quantizer,   63);
+  RANGE_CHECK_HI(cfg, rc_min_quantizer,   cfg->rc_max_quantizer);
+  RANGE_CHECK_HI(cfg, g_threads,          64);
+  RANGE_CHECK_HI(cfg, g_lag_in_frames,    MAX_LAG_BUFFERS);
+  RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_CQ);
+  RANGE_CHECK_HI(cfg, rc_undershoot_pct,  1000);
+  RANGE_CHECK_HI(cfg, rc_overshoot_pct,   1000);
+  RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
+  RANGE_CHECK(cfg, kf_mode,               VPX_KF_DISABLED, VPX_KF_AUTO);
+  // RANGE_CHECK_BOOL(cfg,                 g_delete_firstpassfile);
+  RANGE_CHECK_BOOL(cfg,                   rc_resize_allowed);
+  RANGE_CHECK_HI(cfg, rc_dropframe_thresh,   100);
+  RANGE_CHECK_HI(cfg, rc_resize_up_thresh,   100);
+  RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
+  RANGE_CHECK(cfg,        g_pass,         VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
+
+  /* VP8 does not support a lower bound on the keyframe interval in
+   * automatic keyframe placement mode.
+   */
+  if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist
+      && cfg->kf_min_dist > 0)
+    ERROR("kf_min_dist not supported in auto mode, use 0 "
+          "or kf_max_dist instead.");
+
+  RANGE_CHECK_BOOL(vp8_cfg,               enable_auto_alt_ref);
+  RANGE_CHECK(vp8_cfg, cpu_used,           -16, 16);
+
+  RANGE_CHECK(vp8_cfg, encoding_mode,      VP8_BEST_QUALITY_ENCODING, VP8_REAL_TIME_ENCODING);
+  RANGE_CHECK_HI(vp8_cfg, noise_sensitivity,  6);
+
+  RANGE_CHECK(vp8_cfg, token_partitions,   VP8_ONE_TOKENPARTITION, VP8_EIGHT_TOKENPARTITION);
+  RANGE_CHECK_HI(vp8_cfg, Sharpness,       7);
+  RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15);
+  RANGE_CHECK_HI(vp8_cfg, arnr_strength,   6);
+  RANGE_CHECK(vp8_cfg, arnr_type,       1, 3);
+  RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
+
+  if (cfg->g_pass == VPX_RC_LAST_PASS) {
+    size_t           packet_sz = sizeof(FIRSTPASS_STATS);
+    int              n_packets = cfg->rc_twopass_stats_in.sz / packet_sz;
+    FIRSTPASS_STATS *stats;
+
+    if (!cfg->rc_twopass_stats_in.buf)
+      ERROR("rc_twopass_stats_in.buf not set.");
+
+    if (cfg->rc_twopass_stats_in.sz % packet_sz)
+      ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
+
+    if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz)
+      ERROR("rc_twopass_stats_in requires at least two packets.");
+
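+    /* The final packet in the stats buffer is the end-of-stream
+     * summary; its count field records the number of preceding frame
+     * packets and is cross-checked against the buffer size to detect
+     * truncation. */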
+    stats = (void *)((char *)cfg->rc_twopass_stats_in.buf
+                     + (n_packets - 1) * packet_sz);
+
+    if ((int)(stats->count + 0.5) != n_packets - 1)
+      ERROR("rc_twopass_stats_in missing EOS stats packet");
+  }
+
+  return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
+                                    const vpx_image_t    *img) {
+  switch (img->fmt) {
+    case VPX_IMG_FMT_YV12:
+    case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_VPXI420:
+    case VPX_IMG_FMT_VPXYV12:
+      break;
+    default:
+      ERROR("Invalid image format. Only YV12 and I420 images are supported");
+  }
+
+  if ((img->d_w != ctx->cfg.g_w) || (img->d_h != ctx->cfg.g_h))
+    ERROR("Image size must match encoder init configuration size");
+
+  return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t set_vp8e_config(VP9_CONFIG *oxcf,
+                                       vpx_codec_enc_cfg_t cfg,
+                                       struct vp8_extracfg vp8_cfg) {
+  oxcf->Version               = cfg.g_profile;
+  oxcf->Version              |= vp8_cfg.experimental ? 0x4 : 0;
+
+  oxcf->Width                 = cfg.g_w;
+  oxcf->Height                = cfg.g_h;
+  /* Derive the frame rate from the timebase; if the result is
+   * implausibly high (> 180 fps), fall back to 30. */
+  oxcf->frame_rate             = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num);
+
+  if (oxcf->frame_rate > 180) {
+    oxcf->frame_rate = 30;
+  }
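+  /* e.g. an NTSC {1001, 30000} timebase yields 30000 / 1001 = 29.97 fps,
+   * while a microsecond {1, 1000000} timebase would trip the clamp above
+   * and fall back to 30 fps.
+   */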
+
+  switch (cfg.g_pass) {
+    case VPX_RC_ONE_PASS:
+      oxcf->Mode = MODE_BESTQUALITY;
+      break;
+    case VPX_RC_FIRST_PASS:
+      oxcf->Mode = MODE_FIRSTPASS;
+      break;
+    case VPX_RC_LAST_PASS:
+      oxcf->Mode = MODE_SECONDPASS_BEST;
+      break;
+  }
+
+  if (cfg.g_pass == VPX_RC_FIRST_PASS) {
+    oxcf->allow_lag              = 0;
+    oxcf->lag_in_frames           = 0;
+  } else {
+    oxcf->allow_lag              = (cfg.g_lag_in_frames) > 0;
+    oxcf->lag_in_frames           = cfg.g_lag_in_frames;
+  }
+
+  // Only VBR is supported for now.
+  // The CBR code has been deprecated for the experimental phase.
+  // CQ mode is not yet tested.
+  oxcf->end_usage          = USAGE_LOCAL_FILE_PLAYBACK;
+  /*if (cfg.rc_end_usage == VPX_CQ)
+      oxcf->end_usage      = USAGE_CONSTRAINED_QUALITY;
+  else
+      oxcf->end_usage      = USAGE_LOCAL_FILE_PLAYBACK;*/
+
+  oxcf->target_bandwidth       = cfg.rc_target_bitrate;
+  oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
+
+  oxcf->best_allowed_q          = cfg.rc_min_quantizer;
+  oxcf->worst_allowed_q         = cfg.rc_max_quantizer;
+  oxcf->cq_level                = vp8_cfg.cq_level;
+  oxcf->fixed_q = -1;
+
+  oxcf->under_shoot_pct         = cfg.rc_undershoot_pct;
+  oxcf->over_shoot_pct          = cfg.rc_overshoot_pct;
+
+  oxcf->maximum_buffer_size     = cfg.rc_buf_sz;
+  oxcf->starting_buffer_level   = cfg.rc_buf_initial_sz;
+  oxcf->optimal_buffer_level    = cfg.rc_buf_optimal_sz;
+
+  oxcf->two_pass_vbrbias        = cfg.rc_2pass_vbr_bias_pct;
+  oxcf->two_pass_vbrmin_section  = cfg.rc_2pass_vbr_minsection_pct;
+  oxcf->two_pass_vbrmax_section  = cfg.rc_2pass_vbr_maxsection_pct;
+
+  oxcf->auto_key               = cfg.kf_mode == VPX_KF_AUTO
+                                 && cfg.kf_min_dist != cfg.kf_max_dist;
+  // oxcf->kf_min_dist         = cfg.kf_min_dis;
+  oxcf->key_freq               = cfg.kf_max_dist;
+
+  // oxcf->delete_first_pass_file = cfg.g_delete_firstpassfile;
+  // strcpy(oxcf->first_pass_file, cfg.g_firstpass_file);
+
+  oxcf->cpu_used               =  vp8_cfg.cpu_used;
+  oxcf->encode_breakout        =  vp8_cfg.static_thresh;
+  oxcf->play_alternate         =  vp8_cfg.enable_auto_alt_ref;
+  oxcf->noise_sensitivity      =  vp8_cfg.noise_sensitivity;
+  oxcf->Sharpness             =  vp8_cfg.Sharpness;
+
+  oxcf->two_pass_stats_in        =  cfg.rc_twopass_stats_in;
+  oxcf->output_pkt_list         =  vp8_cfg.pkt_list;
+
+  oxcf->arnr_max_frames        =  vp8_cfg.arnr_max_frames;
+  oxcf->arnr_strength          =  vp8_cfg.arnr_strength;
+  oxcf->arnr_type              =  vp8_cfg.arnr_type;
+
+  oxcf->tuning = vp8_cfg.tuning;
+
+#if CONFIG_LOSSLESS
+  oxcf->lossless = cfg.lossless;
+#endif
+
+  /*
+      printf("Current VP8 Settings: \n");
+      printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
+      printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);
+      printf("Sharpness: %d\n",    oxcf->Sharpness);
+      printf("cpu_used: %d\n",  oxcf->cpu_used);
+      printf("Mode: %d\n",     oxcf->Mode);
+      printf("delete_first_pass_file: %d\n",  oxcf->delete_first_pass_file);
+      printf("auto_key: %d\n",  oxcf->auto_key);
+      printf("key_freq: %d\n", oxcf->key_freq);
+      printf("end_usage: %d\n", oxcf->end_usage);
+      printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);
+      printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);
+      printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);
+      printf("optimal_buffer_level: %d\n",  oxcf->optimal_buffer_level);
+      printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);
+      printf("fixed_q: %d\n",  oxcf->fixed_q);
+      printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q);
+      printf("best_allowed_q: %d\n", oxcf->best_allowed_q);
+      printf("two_pass_vbrbias: %d\n",  oxcf->two_pass_vbrbias);
+      printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
+      printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
+      printf("allow_lag: %d\n", oxcf->allow_lag);
+      printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
+      printf("play_alternate: %d\n", oxcf->play_alternate);
+      printf("Version: %d\n", oxcf->Version);
+      printf("encode_breakout: %d\n", oxcf->encode_breakout);
+  */
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t       *ctx,
+                                       const vpx_codec_enc_cfg_t  *cfg) {
+  vpx_codec_err_t res;
+
+  if ((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h))
+    ERROR("Cannot change width or height after initialization");
+
+  /* Prevent increasing lag_in_frames. This check is stricter than it
+   * needs to be -- the real limit is the initial lag_in_frames value,
+   * but we only track the last successful config, not the initial one.
+   */
+  if ((cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames))
+    ERROR("Cannot increase lag_in_frames");
+
+  res = validate_config(ctx, cfg, &ctx->vp8_cfg);
+
+  if (!res) {
+    ctx->cfg = *cfg;
+    set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
+    vp9_change_config(ctx->cpi, &ctx->oxcf);
+  }
+
+  return res;
+}
+
+
+int vp9_reverse_trans(int q);
+
+
+static vpx_codec_err_t get_param(vpx_codec_alg_priv_t *ctx,
+                                 int                   ctrl_id,
+                                 va_list               args) {
+  void *arg = va_arg(args, void *);
+
+#define MAP(id, var) case id: *(RECAST(id, arg)) = var; break
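+  /* Each MAP(id, var) below expands to "case id: *(RECAST(id, arg)) = var;
+   * break;", copying one codec-side value out through the caller's pointer
+   * (RECAST, defined earlier in this file, supplies the control's expected
+   * pointer type).
+   */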
+
+  if (!arg)
+    return VPX_CODEC_INVALID_PARAM;
+
+  switch (ctrl_id) {
+      MAP(VP8E_GET_LAST_QUANTIZER, vp9_get_quantizer(ctx->cpi));
+      MAP(VP8E_GET_LAST_QUANTIZER_64,
+          vp9_reverse_trans(vp9_get_quantizer(ctx->cpi)));
+  }
+
+  return VPX_CODEC_OK;
+#undef MAP
+}
+
+
+static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
+                                 int                   ctrl_id,
+                                 va_list               args) {
+  vpx_codec_err_t     res  = VPX_CODEC_OK;
+  struct vp8_extracfg xcfg = ctx->vp8_cfg;
+
+#define MAP(id, var) case id: var = CAST(id, args); break;
+
+  switch (ctrl_id) {
+      MAP(VP8E_SET_ENCODING_MODE,         ctx->deprecated_mode);
+      MAP(VP8E_SET_CPUUSED,               xcfg.cpu_used);
+      MAP(VP8E_SET_ENABLEAUTOALTREF,      xcfg.enable_auto_alt_ref);
+      MAP(VP8E_SET_NOISE_SENSITIVITY,     xcfg.noise_sensitivity);
+      MAP(VP8E_SET_SHARPNESS,             xcfg.Sharpness);
+      MAP(VP8E_SET_STATIC_THRESHOLD,      xcfg.static_thresh);
+      MAP(VP8E_SET_TOKEN_PARTITIONS,      xcfg.token_partitions);
+
+      MAP(VP8E_SET_ARNR_MAXFRAMES,        xcfg.arnr_max_frames);
+      MAP(VP8E_SET_ARNR_STRENGTH,         xcfg.arnr_strength);
+      MAP(VP8E_SET_ARNR_TYPE,             xcfg.arnr_type);
+      MAP(VP8E_SET_TUNING,                xcfg.tuning);
+      MAP(VP8E_SET_CQ_LEVEL,              xcfg.cq_level);
+      MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct);
+
+  }
+
+  res = validate_config(ctx, &ctx->cfg, &xcfg);
+
+  if (!res) {
+    ctx->vp8_cfg = xcfg;
+    set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
+    vp9_change_config(ctx->cpi, &ctx->oxcf);
+  }
+
+  return res;
+#undef MAP
+}
+
+
+static vpx_codec_err_t vp8e_common_init(vpx_codec_ctx_t *ctx,
+                                        int              experimental) {
+  vpx_codec_err_t        res = VPX_CODEC_OK;
+  struct vpx_codec_alg_priv *priv;
+  vpx_codec_enc_cfg_t       *cfg;
+  unsigned int               i;
+
+  VP9_PTR optr;
+
+  if (!ctx->priv) {
+    priv = calloc(1, sizeof(struct vpx_codec_alg_priv));
+
+    if (!priv) {
+      return VPX_CODEC_MEM_ERROR;
+    }
+
+    ctx->priv = &priv->base;
+    ctx->priv->sz = sizeof(*ctx->priv);
+    ctx->priv->iface = ctx->iface;
+    ctx->priv->alg_priv = priv;
+    ctx->priv->init_flags = ctx->init_flags;
+
+    if (ctx->config.enc) {
+      /* Update the reference to the config structure to an
+       * internal copy.
+       */
+      ctx->priv->alg_priv->cfg = *ctx->config.enc;
+      ctx->config.enc = &ctx->priv->alg_priv->cfg;
+    }
+
+    cfg =  &ctx->priv->alg_priv->cfg;
+
+    /* Select the extra vp8 configuration table based on the current
+     * usage value. If the current usage value isn't found, use the
+     * values for usage case 0.
+     */
+    for (i = 0;
+         extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
+         i++);
+
+    priv->vp8_cfg = extracfg_map[i].cfg;
+    priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
+    priv->vp8_cfg.experimental = experimental;
+
+    priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
+
+    if (priv->cx_data_sz < 4096) priv->cx_data_sz = 4096;
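+    /* cx_data_sz is sized as two uncompressed I420 frames (w * h * 3/2,
+     * doubled): e.g. 320x240 gives 230400 bytes, with a 4096-byte floor
+     * for very small frames.
+     */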
+
+    priv->cx_data = malloc(priv->cx_data_sz);
+
+    if (!priv->cx_data) {
+      return VPX_CODEC_MEM_ERROR;
+    }
+
+    priv->deprecated_mode = NO_MODE_SET;
+
+    vp9_initialize_enc();
+
+    res = validate_config(priv, &priv->cfg, &priv->vp8_cfg);
+
+    if (!res) {
+      set_vp8e_config(&ctx->priv->alg_priv->oxcf,
+                      ctx->priv->alg_priv->cfg,
+                      ctx->priv->alg_priv->vp8_cfg);
+      optr = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
+
+      if (!optr)
+        res = VPX_CODEC_MEM_ERROR;
+      else
+        ctx->priv->alg_priv->cpi = optr;
+    }
+  }
+
+  return res;
+}
+
+
+static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) {
+  return vp8e_common_init(ctx, 0);
+}
+
+
+#if CONFIG_EXPERIMENTAL
+static vpx_codec_err_t vp8e_exp_init(vpx_codec_ctx_t *ctx) {
+  return vp8e_common_init(ctx, 1);
+}
+#endif
+
+
+static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx) {
+
+  free(ctx->cx_data);
+  vp9_remove_compressor(&ctx->cpi);
+  free(ctx);
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
+                                       YV12_BUFFER_CONFIG  *yv12) {
+  vpx_codec_err_t        res = VPX_CODEC_OK;
+  yv12->y_buffer = img->planes[VPX_PLANE_Y];
+  yv12->u_buffer = img->planes[VPX_PLANE_U];
+  yv12->v_buffer = img->planes[VPX_PLANE_V];
+
+  yv12->y_width  = img->d_w;
+  yv12->y_height = img->d_h;
+  yv12->uv_width = (1 + yv12->y_width) / 2;
+  yv12->uv_height = (1 + yv12->y_height) / 2;
+
+  yv12->y_stride = img->stride[VPX_PLANE_Y];
+  yv12->uv_stride = img->stride[VPX_PLANE_U];
+
+  yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
+  yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 ||
+                   img->fmt == VPX_IMG_FMT_VPXYV12); /* REG_YUV == 0 */
+  return res;
+}
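+/* The (1 + y_width) / 2 chroma sizing above rounds up, so odd luma
+ * dimensions still get a full chroma sample: e.g. a 5-pixel-wide luma
+ * plane maps to a 3-pixel-wide chroma plane.
+ */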
+
+static void pick_quickcompress_mode(vpx_codec_alg_priv_t  *ctx,
+                                    unsigned long          duration,
+                                    unsigned long          deadline) {
+  unsigned int new_qc;
+
+  /* Use best quality mode if no deadline is given. */
+  if (deadline)
+    new_qc = MODE_GOODQUALITY;
+  else
+    new_qc = MODE_BESTQUALITY;
+
+  if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
+    new_qc = MODE_FIRSTPASS;
+  else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS)
+    new_qc = (new_qc == MODE_BESTQUALITY)
+             ? MODE_SECONDPASS_BEST
+             : MODE_SECONDPASS;
+
+  if (ctx->oxcf.Mode != new_qc) {
+    ctx->oxcf.Mode = new_qc;
+    vp9_change_config(ctx->cpi, &ctx->oxcf);
+  }
+}
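+/* The mapping above, summarized (pass / deadline -> internal mode):
+ *   one pass,   deadline == 0  -> MODE_BESTQUALITY
+ *   one pass,   deadline != 0  -> MODE_GOODQUALITY
+ *   first pass, any deadline   -> MODE_FIRSTPASS
+ *   last pass,  deadline == 0  -> MODE_SECONDPASS_BEST
+ *   last pass,  deadline != 0  -> MODE_SECONDPASS
+ */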
+
+
+static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t  *ctx,
+                                   const vpx_image_t     *img,
+                                   vpx_codec_pts_t        pts,
+                                   unsigned long          duration,
+                                   vpx_enc_frame_flags_t  flags,
+                                   unsigned long          deadline) {
+  vpx_codec_err_t res = VPX_CODEC_OK;
+
+  if (img)
+    res = validate_img(ctx, img);
+
+  pick_quickcompress_mode(ctx, duration, deadline);
+  vpx_codec_pkt_list_init(&ctx->pkt_list);
+
+  /* Handle Flags */
+  if (((flags & VP8_EFLAG_NO_UPD_GF) && (flags & VP8_EFLAG_FORCE_GF))
+      || ((flags & VP8_EFLAG_NO_UPD_ARF) && (flags & VP8_EFLAG_FORCE_ARF))) {
+    ctx->base.err_detail = "Conflicting flags.";
+    return VPX_CODEC_INVALID_PARAM;
+  }
+
+  if (flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF
+               | VP8_EFLAG_NO_REF_ARF)) {
+    int ref = 7;
+
+    if (flags & VP8_EFLAG_NO_REF_LAST)
+      ref ^= VP9_LAST_FLAG;
+
+    if (flags & VP8_EFLAG_NO_REF_GF)
+      ref ^= VP9_GOLD_FLAG;
+
+    if (flags & VP8_EFLAG_NO_REF_ARF)
+      ref ^= VP9_ALT_FLAG;
+
+    vp9_use_as_reference(ctx->cpi, ref);
+  }
+
+  if (flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF
+               | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF
+               | VP8_EFLAG_FORCE_ARF)) {
+    int upd = 7;
+
+    if (flags & VP8_EFLAG_NO_UPD_LAST)
+      upd ^= VP9_LAST_FLAG;
+
+    if (flags & VP8_EFLAG_NO_UPD_GF)
+      upd ^= VP9_GOLD_FLAG;
+
+    if (flags & VP8_EFLAG_NO_UPD_ARF)
+      upd ^= VP9_ALT_FLAG;
+
+    vp9_update_reference(ctx->cpi, upd);
+  }
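+  /* Both masks above start at 7 -- VP9_LAST_FLAG | VP9_GOLD_FLAG |
+   * VP9_ALT_FLAG, assuming the three flags occupy bits 0..2 -- and each
+   * NO_* request XORs its bit away: e.g. VP8_EFLAG_NO_REF_GF alone
+   * leaves ref == VP9_LAST_FLAG | VP9_ALT_FLAG.
+   */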
+
+  if (flags & VP8_EFLAG_NO_UPD_ENTROPY) {
+    vp9_update_entropy(ctx->cpi, 0);
+  }
+
+  /* Handle fixed keyframe intervals */
+  if (ctx->cfg.kf_mode == VPX_KF_AUTO
+      && ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
+    if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {
+      flags |= VPX_EFLAG_FORCE_KF;
+      ctx->fixed_kf_cntr = 1;
+    }
+  }
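+  /* e.g. kf_min_dist == kf_max_dist == 30 makes the counter above force
+   * VPX_EFLAG_FORCE_KF roughly every 30 frames.
+   */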
+
+  /* Feed the frame (or a flush request, when img is NULL) to the
+   * encoder and drain any compressed packets it produces. */
+  if (!res && ctx->cpi) {
+    unsigned int lib_flags;
+    YV12_BUFFER_CONFIG sd;
+    int64_t dst_time_stamp, dst_end_time_stamp;
+    unsigned long size, cx_data_sz;
+    unsigned char *cx_data;
+
+    /* Set up internal flags */
+    if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
+      ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1;
+
+    // if (ctx->base.init_flags & VPX_CODEC_USE_OUTPUT_PARTITION)
+    //    ((VP9_COMP *)ctx->cpi)->output_partition = 1;
+
+    /* Convert API flags to internal codec lib flags */
+    lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+    /* vp8 uses 10,000,000 ticks/second as its time stamp resolution */
+    dst_time_stamp    = pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
+    dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
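+    /* e.g. with a {1, 30} timebase, pts == 30 converts to
+     * 30 * 10000000 * 1 / 30 = 10000000 ticks, i.e. exactly one second.
+     */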
+
+    if (img != NULL) {
+      res = image2yuvconfig(img, &sd);
+
+      if (vp9_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags,
+                                &sd, dst_time_stamp, dst_end_time_stamp)) {
+        VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
+        res = update_error_state(ctx, &cpi->common.error);
+      }
+
+      /* reset for next frame */
+      ctx->next_frame_flag = 0;
+    }
+
+    cx_data = ctx->cx_data;
+    cx_data_sz = ctx->cx_data_sz;
+    lib_flags = 0;
+
+    while (cx_data_sz >= ctx->cx_data_sz / 2 &&
+           -1 != vp9_get_compressed_data(ctx->cpi, &lib_flags, &size,
+                                         cx_data, &dst_time_stamp,
+                                         &dst_end_time_stamp, !img)) {
+      if (size) {
+        vpx_codec_pts_t    round, delta;
+        vpx_codec_cx_pkt_t pkt;
+        VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
+
+        /* Add the frame packet to the list of returned packets. */
+        round = 1000000 * ctx->cfg.g_timebase.num / 2 - 1;
+        delta = (dst_end_time_stamp - dst_time_stamp);
+        pkt.kind = VPX_CODEC_CX_FRAME_PKT;
+        pkt.data.frame.pts =
+          (dst_time_stamp * ctx->cfg.g_timebase.den + round)
+          / ctx->cfg.g_timebase.num / 10000000;
+        pkt.data.frame.duration =
+          (delta * ctx->cfg.g_timebase.den + round)
+          / ctx->cfg.g_timebase.num / 10000000;
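+        /* This inverts the 10MHz conversion done on input: e.g. with a
+         * {1, 30} timebase and dst_time_stamp == 10000000, round is
+         * 499999 and pts = (10000000 * 30 + 499999) / 1 / 10000000 == 30,
+         * recovering the original timebase-unit timestamp.
+         */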
+        pkt.data.frame.flags = lib_flags << 16;
+
+        if (lib_flags & FRAMEFLAGS_KEY)
+          pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
+
+        if (!cpi->common.show_frame) {
+          pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE;
+
+          // This timestamp should be as close as possible to the prior
+          // frame's PTS, so a decoder that schedules by PTS starts work
+          // right after the previous frame was decoded.
+          // Invisible frames have no duration.
+          pkt.data.frame.pts = ((cpi->last_time_stamp_seen
+                                 * ctx->cfg.g_timebase.den + round)
+                                / ctx->cfg.g_timebase.num / 10000000) + 1;
+          pkt.data.frame.duration = 0;
+        }
+
+        if (cpi->droppable)
+          pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE;
+
+        /*if (cpi->output_partition)
+        {
+            int i;
+            const int num_partitions = 1;
+
+            pkt.data.frame.flags |= VPX_FRAME_IS_FRAGMENT;
+
+            for (i = 0; i < num_partitions; ++i)
+            {
+                pkt.data.frame.buf = cx_data;
+                pkt.data.frame.sz = cpi->partition_sz[i];
+                pkt.data.frame.partition_id = i;
+                // don't set the fragment bit for the last partition
+                if (i == (num_partitions - 1))
+                    pkt.data.frame.flags &= ~VPX_FRAME_IS_FRAGMENT;
+                vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+                cx_data += cpi->partition_sz[i];
+                cx_data_sz -= cpi->partition_sz[i];
+            }
+        }
+        else*/
+        {
+          pkt.data.frame.buf = cx_data;
+          pkt.data.frame.sz  = size;
+          pkt.data.frame.partition_id = -1;
+          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+          cx_data += size;
+          cx_data_sz -= size;
+        }
+
+        // printf("timestamp: %lld, duration: %d\n", pkt->data.frame.pts, pkt->data.frame.duration);
+      }
+    }
+  }
+
+  return res;
+}
+
+
+static const vpx_codec_cx_pkt_t *vp8e_get_cxdata(vpx_codec_alg_priv_t  *ctx,
+                                                 vpx_codec_iter_t      *iter) {
+  return vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter);
+}
+
+static vpx_codec_err_t vp8e_set_reference(vpx_codec_alg_priv_t *ctx,
+                                          int ctr_id,
+                                          va_list args) {
+  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+  if (data) {
+    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+    YV12_BUFFER_CONFIG sd;
+
+    image2yuvconfig(&frame->img, &sd);
+    vp9_set_reference_enc(ctx->cpi, frame->frame_type, &sd);
+    return VPX_CODEC_OK;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp8e_get_reference(vpx_codec_alg_priv_t *ctx,
+                                          int ctr_id,
+                                          va_list args) {
+
+  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+  if (data) {
+    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+    YV12_BUFFER_CONFIG sd;
+
+    image2yuvconfig(&frame->img, &sd);
+    vp9_get_reference_enc(ctx->cpi, frame->frame_type, &sd);
+    return VPX_CODEC_OK;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t vp8e_set_previewpp(vpx_codec_alg_priv_t *ctx,
+                                          int ctr_id,
+                                          va_list args) {
+#if CONFIG_POSTPROC
+  vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
+  (void)ctr_id;
+
+  if (data) {
+    ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data);
+    return VPX_CODEC_OK;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+#else
+  (void)ctx;
+  (void)ctr_id;
+  (void)args;
+  return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+
+static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx) {
+
+  YV12_BUFFER_CONFIG sd;
+  vp9_ppflags_t flags = {0};
+
+  if (ctx->preview_ppcfg.post_proc_flag) {
+    flags.post_proc_flag        = ctx->preview_ppcfg.post_proc_flag;
+    flags.deblocking_level      = ctx->preview_ppcfg.deblocking_level;
+    flags.noise_level           = ctx->preview_ppcfg.noise_level;
+  }
+
+  if (0 == vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags)) {
+
+    /*
+    vpx_img_wrap(&ctx->preview_img, VPX_IMG_FMT_YV12,
+        sd.y_width + 2*VP8BORDERINPIXELS,
+        sd.y_height + 2*VP8BORDERINPIXELS,
+        1,
+        sd.buffer_alloc);
+    vpx_img_set_rect(&ctx->preview_img,
+        VP8BORDERINPIXELS, VP8BORDERINPIXELS,
+        sd.y_width, sd.y_height);
+        */
+
+    ctx->preview_img.bps = 12;
+    ctx->preview_img.planes[VPX_PLANE_Y] = sd.y_buffer;
+    ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer;
+    ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer;
+
+    if (sd.clrtype == REG_YUV)
+      ctx->preview_img.fmt = VPX_IMG_FMT_I420;
+    else
+      ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420;
+
+    ctx->preview_img.x_chroma_shift = 1;
+    ctx->preview_img.y_chroma_shift = 1;
+
+    ctx->preview_img.d_w = sd.y_width;
+    ctx->preview_img.d_h = sd.y_height;
+    ctx->preview_img.stride[VPX_PLANE_Y] = sd.y_stride;
+    ctx->preview_img.stride[VPX_PLANE_U] = sd.uv_stride;
+    ctx->preview_img.stride[VPX_PLANE_V] = sd.uv_stride;
+    ctx->preview_img.w   = sd.y_width;
+    ctx->preview_img.h   = sd.y_height;
+
+    return &ctx->preview_img;
+  } else
+    return NULL;
+}
+
+static vpx_codec_err_t vp8e_update_entropy(vpx_codec_alg_priv_t *ctx,
+                                           int ctr_id,
+                                           va_list args) {
+  int update = va_arg(args, int);
+  vp9_update_entropy(ctx->cpi, update);
+  return VPX_CODEC_OK;
+
+}
+
+static vpx_codec_err_t vp8e_update_reference(vpx_codec_alg_priv_t *ctx,
+                                             int ctr_id,
+                                             va_list args) {
+  int update = va_arg(args, int);
+  vp9_update_reference(ctx->cpi, update);
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_use_reference(vpx_codec_alg_priv_t *ctx,
+                                          int ctr_id,
+                                          va_list args) {
+  int reference_flag = va_arg(args, int);
+  vp9_use_as_reference(ctx->cpi, reference_flag);
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_set_roi_map(vpx_codec_alg_priv_t *ctx,
+                                        int ctr_id,
+                                        va_list args) {
+  vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *);
+
+  if (data) {
+    vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
+
+    if (!vp9_set_roimap(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
+                        roi->delta_q, roi->delta_lf, roi->static_threshold))
+      return VPX_CODEC_OK;
+    else
+      return VPX_CODEC_INVALID_PARAM;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+}
+
+
+static vpx_codec_err_t vp8e_set_activemap(vpx_codec_alg_priv_t *ctx,
+                                          int ctr_id,
+                                          va_list args) {
+  vpx_active_map_t *data = va_arg(args, vpx_active_map_t *);
+
+  if (data) {
+
+    vpx_active_map_t *map = (vpx_active_map_t *)data;
+
+    if (!vp9_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols))
+      return VPX_CODEC_OK;
+    else
+      return VPX_CODEC_INVALID_PARAM;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
+                                          int ctr_id,
+                                          va_list args) {
+
+  vpx_scaling_mode_t *data =  va_arg(args, vpx_scaling_mode_t *);
+
+  if (data) {
+    int res;
+    vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
+    res = vp9_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
+                                scalemode.v_scaling_mode);
+
+    if (!res) {
+      /* Force the next frame to be a key frame so the new scaling
+       * mode takes effect. */
+      ctx->next_frame_flag |= FRAMEFLAGS_KEY;
+      return VPX_CODEC_OK;
+    } else
+      return VPX_CODEC_INVALID_PARAM;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+}
+
+
+static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {
+  {VP8_SET_REFERENCE,                 vp8e_set_reference},
+  {VP8_COPY_REFERENCE,                vp8e_get_reference},
+  {VP8_SET_POSTPROC,                  vp8e_set_previewpp},
+  {VP8E_UPD_ENTROPY,                  vp8e_update_entropy},
+  {VP8E_UPD_REFERENCE,                vp8e_update_reference},
+  {VP8E_USE_REFERENCE,                vp8e_use_reference},
+  {VP8E_SET_ROI_MAP,                  vp8e_set_roi_map},
+  {VP8E_SET_ACTIVEMAP,                vp8e_set_activemap},
+  {VP8E_SET_SCALEMODE,                vp8e_set_scalemode},
+  {VP8E_SET_ENCODING_MODE,            set_param},
+  {VP8E_SET_CPUUSED,                  set_param},
+  {VP8E_SET_NOISE_SENSITIVITY,        set_param},
+  {VP8E_SET_ENABLEAUTOALTREF,         set_param},
+  {VP8E_SET_SHARPNESS,                set_param},
+  {VP8E_SET_STATIC_THRESHOLD,         set_param},
+  {VP8E_SET_TOKEN_PARTITIONS,         set_param},
+  {VP8E_GET_LAST_QUANTIZER,           get_param},
+  {VP8E_GET_LAST_QUANTIZER_64,        get_param},
+  {VP8E_SET_ARNR_MAXFRAMES,           set_param},
+  {VP8E_SET_ARNR_STRENGTH,            set_param},
+  {VP8E_SET_ARNR_TYPE,                set_param},
+  {VP8E_SET_TUNING,                   set_param},
+  {VP8E_SET_CQ_LEVEL,                 set_param},
+  {VP8E_SET_MAX_INTRA_BITRATE_PCT,    set_param},
+  { -1, NULL},
+};
+
+static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {
+  {
+    0,
+    {
+      0,                  /* g_usage */
+      0,                  /* g_threads */
+      0,                  /* g_profile */
+
+      320,                /* g_width */
+      240,                /* g_height */
+      {1, 30},            /* g_timebase */
+
+      0,                  /* g_error_resilient */
+
+      VPX_RC_ONE_PASS,    /* g_pass */
+
+      0,                  /* g_lag_in_frames */
+
+      0,                  /* rc_dropframe_thresh */
+      0,                  /* rc_resize_allowed */
+      60,                 /* rc_resize_down_thresh */
+      30,                 /* rc_resize_up_thresh */
+
+      VPX_VBR,            /* rc_end_usage */
+#if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION)
+      {0},                /* rc_twopass_stats_in */
+#endif
+      256,                /* rc_target_bitrate */
+      4,                  /* rc_min_quantizer */
+      63,                 /* rc_max_quantizer */
+      100,                /* rc_undershoot_pct */
+      100,                /* rc_overshoot_pct */
+
+      6000,               /* rc_buf_sz */
+      4000,               /* rc_buf_initial_sz */
+      5000,               /* rc_buf_optimal_sz */
+
+      50,                 /* rc_2pass_vbr_bias_pct */
+      0,                  /* rc_2pass_vbr_minsection_pct */
+      400,                /* rc_2pass_vbr_maxsection_pct */
+
+      /* keyframing settings (kf) */
+      VPX_KF_AUTO,        /* kf_mode */
+      0,                  /* kf_min_dist */
+      9999,               /* kf_max_dist */
+
+#if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION)
+      1,                  /* g_delete_first_pass_file */
+      "vp8.fpf"           /* first pass filename */
+#endif
+    }
+  },
+  { -1, {NOT_IMPLEMENTED}}
+};
+
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(vpx_codec_vp8_cx) = {
+  "WebM Project VP8 Encoder" VERSION_STRING,
+  VPX_CODEC_INTERNAL_ABI_VERSION,
+  VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR |
+  VPX_CODEC_CAP_OUTPUT_PARTITION,
+  /* vpx_codec_caps_t          caps; */
+  vp8e_init,          /* vpx_codec_init_fn_t       init; */
+  vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
+  vp8e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
+  NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
+  NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
+  {
+    NOT_IMPLEMENTED,    /* vpx_codec_peek_si_fn_t    peek_si; */
+    NOT_IMPLEMENTED,    /* vpx_codec_get_si_fn_t     get_si; */
+    NOT_IMPLEMENTED,    /* vpx_codec_decode_fn_t     decode; */
+    NOT_IMPLEMENTED,    /* vpx_codec_frame_get_fn_t  frame_get; */
+  },
+  {
+    vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    cfg_maps; */
+    vp8e_encode,        /* vpx_codec_encode_fn_t      encode; */
+    vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t get_cx_data; */
+    vp8e_set_config,
+    NOT_IMPLEMENTED,
+    vp8e_get_preview,
+  } /* encoder functions */
+};
+
+
+#if CONFIG_EXPERIMENTAL
+
+CODEC_INTERFACE(vpx_codec_vp8x_cx) = {
+  "VP8 Experimental Encoder" VERSION_STRING,
+  VPX_CODEC_INTERNAL_ABI_VERSION,
+  VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR,
+  /* vpx_codec_caps_t          caps; */
+  vp8e_exp_init,      /* vpx_codec_init_fn_t       init; */
+  vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
+  vp8e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
+  NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
+  NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
+  {
+    NOT_IMPLEMENTED,    /* vpx_codec_peek_si_fn_t    peek_si; */
+    NOT_IMPLEMENTED,    /* vpx_codec_get_si_fn_t     get_si; */
+    NOT_IMPLEMENTED,    /* vpx_codec_decode_fn_t     decode; */
+    NOT_IMPLEMENTED,    /* vpx_codec_frame_get_fn_t  frame_get; */
+  },
+  {
+    vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    cfg_maps; */
+    vp8e_encode,        /* vpx_codec_encode_fn_t      encode; */
+    vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t get_cx_data; */
+    vp8e_set_config,
+    NOT_IMPLEMENTED,
+    vp8e_get_preview,
+  } /* encoder functions */
+};
+#endif
+
+
+/*
+ * BEGIN BACKWARDS COMPATIBILITY SHIM.
+ */
+#define FORCE_KEY   2
+static vpx_codec_err_t api1_control(vpx_codec_alg_priv_t *ctx,
+                                    int                   ctrl_id,
+                                    va_list               args) {
+  vpx_codec_ctrl_fn_map_t *entry;
+
+  switch (ctrl_id) {
+    case VP8E_SET_FLUSHFLAG:
+      /* VP8 sample code did VP8E_SET_FLUSHFLAG followed by
+       * vpx_codec_get_cx_data() rather than vpx_codec_encode().
+       */
+      return vp8e_encode(ctx, NULL, 0, 0, 0, 0);
+    case VP8E_SET_FRAMETYPE:
+      ctx->base.enc.tbd |= FORCE_KEY;
+      return VPX_CODEC_OK;
+  }
+
+  for (entry = vp8e_ctf_maps; entry && entry->fn; entry++) {
+    if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) {
+      return entry->fn(ctx, ctrl_id, args);
+    }
+  }
+
+  return VPX_CODEC_ERROR;
+}
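+/* Under the deprecated API, callers drove the encoder roughly like this
+ * (illustrative sketch; write_packet is a hypothetical sink):
+ *
+ *   vpx_codec_control(&ctx, VP8E_SET_FLUSHFLAG, 1);
+ *   while ((pkt = vpx_codec_get_cx_data(&ctx, &iter)) != NULL)
+ *     write_packet(pkt);
+ *
+ * which is why the shim turns the control call itself into an encode of
+ * a NULL frame.
+ */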
+
+
+static vpx_codec_ctrl_fn_map_t api1_ctrl_maps[] = {
+  {0, api1_control},
+  { -1, NULL}
+};
+
+
+static vpx_codec_err_t api1_encode(vpx_codec_alg_priv_t  *ctx,
+                                   const vpx_image_t     *img,
+                                   vpx_codec_pts_t        pts,
+                                   unsigned long          duration,
+                                   vpx_enc_frame_flags_t  flags,
+                                   unsigned long          deadline) {
+  int force = ctx->base.enc.tbd;
+
+  ctx->base.enc.tbd = 0;
+  return vp8e_encode
+         (ctx,
+          img,
+          pts,
+          duration,
+          flags | ((force & FORCE_KEY) ? VPX_EFLAG_FORCE_KF : 0),
+          deadline);
+}
+
+
+vpx_codec_iface_t vpx_enc_vp8_algo = {
+  "WebM Project VP8 Encoder (Deprecated API)" VERSION_STRING,
+  VPX_CODEC_INTERNAL_ABI_VERSION,
+  VPX_CODEC_CAP_ENCODER,
+  /* vpx_codec_caps_t          caps; */
+  vp8e_init,          /* vpx_codec_init_fn_t       init; */
+  vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
+  api1_ctrl_maps,     /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
+  NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
+  NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
+  {NOT_IMPLEMENTED},  /* decoder functions */
+  {
+    vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    cfg_maps; */
+    api1_encode,        /* vpx_codec_encode_fn_t      encode; */
+    vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t get_cx_data; */
+    vp8e_set_config,
+    NOT_IMPLEMENTED,
+    vp8e_get_preview,
+  } /* encoder functions */
+};
--- /dev/null
+++ b/vp9/vp9_dx_iface.c
@@ -1,0 +1,717 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include <string.h>
+#include "vpx/vpx_decoder.h"
+#include "vpx/vp8dx.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_version.h"
+#include "common/onyxd.h"
+#include "decoder/onyxd_int.h"
+
+#define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
+typedef vpx_codec_stream_info_t  vp8_stream_info_t;
+
+/* Structures for handling memory allocations */
+typedef enum {
+  VP8_SEG_ALG_PRIV     = 256,
+  VP8_SEG_MAX
+} mem_seg_id_t;
+#define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0])))
+
+static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t);
+
+typedef struct {
+  unsigned int   id;
+  unsigned long  sz;
+  unsigned int   align;
+  unsigned int   flags;
+  unsigned long(*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t);
+} mem_req_t;
+
+static const mem_req_t vp8_mem_req_segs[] = {
+  {VP8_SEG_ALG_PRIV,    0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz},
+  {VP8_SEG_MAX, 0, 0, 0, NULL}
+};
+
+struct vpx_codec_alg_priv {
+  vpx_codec_priv_t        base;
+  vpx_codec_mmap_t        mmaps[NELEMENTS(vp8_mem_req_segs) - 1];
+  vpx_codec_dec_cfg_t     cfg;
+  vp8_stream_info_t       si;
+  int                     defer_alloc;
+  int                     decoder_init;
+  VP9D_PTR                pbi;
+  int                     postproc_cfg_set;
+  vp8_postproc_cfg_t      postproc_cfg;
+#if CONFIG_POSTPROC_VISUALIZER
+  unsigned int            dbg_postproc_flag;
+  int                     dbg_color_ref_frame_flag;
+  int                     dbg_color_mb_modes_flag;
+  int                     dbg_color_b_modes_flag;
+  int                     dbg_display_mv_flag;
+#endif
+  vpx_image_t             img;
+  int                     img_setup;
+  int                     img_avail;
+};
+
+static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si,
+                                 vpx_codec_flags_t flags) {
+  /* Although this declaration is constant, we can't use it in the requested
+   * segments list because we want to define the requested segments list
+   * before defining the private type (so that the number of memory maps is
+   * known)
+   */
+  (void)si;
+  return sizeof(vpx_codec_alg_priv_t);
+}
+
+
+static void vp8_mmap_dtor(vpx_codec_mmap_t *mmap) {
+  free(mmap->priv);
+}
+
+static vpx_codec_err_t vp8_mmap_alloc(vpx_codec_mmap_t *mmap) {
+  vpx_codec_err_t  res;
+  unsigned int   align;
+
+  align = mmap->align ? mmap->align - 1 : 0;
+
+  if (mmap->flags & VPX_CODEC_MEM_ZERO)
+    mmap->priv = calloc(1, mmap->sz + align);
+  else
+    mmap->priv = malloc(mmap->sz + align);
+
+  res = (mmap->priv) ? VPX_CODEC_OK : VPX_CODEC_MEM_ERROR;
+  mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align);
+  mmap->dtor = vp8_mmap_dtor;
+  return res;
+}
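+/* e.g. for a requested align of 8 the mask is 7, so a raw allocation at
+ * 0x1003 is rounded up to base == (0x1003 + 7) & ~7 == 0x1008; the extra
+ * bytes added to the request above guarantee the rounded-up pointer
+ * still lies inside the allocation.
+ */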
+
+static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si,
+                                          const vpx_codec_mmap_t *mmaps,
+                                          vpx_codec_flags_t init_flags) {
+  int i;
+  vpx_codec_err_t res = VPX_CODEC_OK;
+
+  for (i = 0; i < NELEMENTS(vp8_mem_req_segs) - 1; i++) {
+    /* Ensure the segment has been allocated */
+    if (!mmaps[i].base) {
+      res = VPX_CODEC_MEM_ERROR;
+      break;
+    }
+
+    /* Verify variable size segment is big enough for the current si. */
+    if (vp8_mem_req_segs[i].calc_sz) {
+      vpx_codec_dec_cfg_t cfg;
+
+      cfg.w = si->w;
+      cfg.h = si->h;
+
+      if (mmaps[i].sz < vp8_mem_req_segs[i].calc_sz(&cfg, init_flags)) {
+        res = VPX_CODEC_MEM_ERROR;
+        break;
+      }
+    }
+  }
+
+  return res;
+}
+
+static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) {
+  int i;
+
+  ctx->priv = mmap->base;
+  ctx->priv->sz = sizeof(*ctx->priv);
+  ctx->priv->iface = ctx->iface;
+  ctx->priv->alg_priv = mmap->base;
+
+  for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++)
+    ctx->priv->alg_priv->mmaps[i].id = vp8_mem_req_segs[i].id;
+
+  ctx->priv->alg_priv->mmaps[0] = *mmap;
+  ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
+  ctx->priv->init_flags = ctx->init_flags;
+
+  if (ctx->config.dec) {
+    /* Update the reference to the config structure to an internal copy. */
+    ctx->priv->alg_priv->cfg = *ctx->config.dec;
+    ctx->config.dec = &ctx->priv->alg_priv->cfg;
+  }
+}
+
+static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id) {
+  int i;
+
+  for (i = 0; i < NELEMENTS(ctx->mmaps); i++)
+    if (ctx->mmaps[i].id == id)
+      return ctx->mmaps[i].base;
+
+  return NULL;
+}
+static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx) {
+  /* nothing to clean up */
+}
+
+static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx) {
+  vpx_codec_err_t        res = VPX_CODEC_OK;
+
+  /* This function only allocates space for the vpx_codec_alg_priv_t
+   * structure. More memory may be required at the time the stream
+   * information becomes known.
+   */
+  if (!ctx->priv) {
+    vpx_codec_mmap_t mmap;
+
+    mmap.id = vp8_mem_req_segs[0].id;
+    mmap.sz = sizeof(vpx_codec_alg_priv_t);
+    mmap.align = vp8_mem_req_segs[0].align;
+    mmap.flags = vp8_mem_req_segs[0].flags;
+
+    res = vp8_mmap_alloc(&mmap);
+
+    if (!res) {
+      vp8_init_ctx(ctx, &mmap);
+
+      ctx->priv->alg_priv->defer_alloc = 1;
+      /*post processing level initialized to do nothing */
+    }
+  }
+
+  return res;
+}
+
+static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) {
+  int i;
+
+  vp9_remove_decompressor(ctx->pbi);
+
+  for (i = NELEMENTS(ctx->mmaps) - 1; i >= 0; i--) {
+    if (ctx->mmaps[i].dtor)
+      ctx->mmaps[i].dtor(&ctx->mmaps[i]);
+  }
+
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8_peek_si(const uint8_t         *data,
+                                   unsigned int           data_sz,
+                                   vpx_codec_stream_info_t *si) {
+  vpx_codec_err_t res = VPX_CODEC_OK;
+
+  if (data + data_sz <= data)
+    res = VPX_CODEC_INVALID_PARAM;
+  else {
+    /* Parse the uncompressed part of the key frame header.
+     * 3 bytes: version, frame type and first-partition offset
+     * 3 bytes: sync code (0x9d, 0x01, 0x2a)
+     * 4 bytes: image width and height, in the lowest 14 bits
+     *          of each 2-byte value.
+     */
+    si->is_kf = 0;
+
+    if (data_sz >= 10 && !(data[0] & 0x01)) { /* I-Frame */
+      const uint8_t *c = data + 3;
+      si->is_kf = 1;
+
+      /* vet via sync code */
+      if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a)
+        res = VPX_CODEC_UNSUP_BITSTREAM;
+
+      si->w = (c[3] | (c[4] << 8)) & 0x3fff;
+      si->h = (c[5] | (c[6] << 8)) & 0x3fff;
+
+      /*printf("w=%d, h=%d\n", si->w, si->h);*/
+      if (!(si->h | si->w))
+        res = VPX_CODEC_UNSUP_BITSTREAM;
+    } else
+      res = VPX_CODEC_UNSUP_BITSTREAM;
+  }
+
+  return res;
+
+}
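+/* e.g. a key frame whose bytes 3..9 read 9d 01 2a 40 01 f0 00 passes the
+ * sync check, then w = (0x40 | (0x01 << 8)) & 0x3fff = 320 and
+ * h = (0xf0 | (0x00 << 8)) & 0x3fff = 240.
+ */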
+
+static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t    *ctx,
+                                  vpx_codec_stream_info_t *si) {
+
+  unsigned int sz;
+
+  if (si->sz >= sizeof(vp8_stream_info_t))
+    sz = sizeof(vp8_stream_info_t);
+  else
+    sz = sizeof(vpx_codec_stream_info_t);
+
+  memcpy(si, &ctx->si, sz);
+  si->sz = sz;
+
+  return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t
+update_error_state(vpx_codec_alg_priv_t                 *ctx,
+                   const struct vpx_internal_error_info *error) {
+  vpx_codec_err_t res;
+
+  if ((res = error->error_code))
+    ctx->base.err_detail = error->has_detail
+                           ? error->detail
+                           : NULL;
+
+  return res;
+}
+
+static void yuvconfig2image(vpx_image_t               *img,
+                            const YV12_BUFFER_CONFIG  *yv12,
+                            void                      *user_priv) {
+  /** vpx_img_wrap() doesn't allow specifying independent strides for
+    * the Y, U, and V planes, nor other alignment adjustments that
+    * might be representable by a YV12_BUFFER_CONFIG, so we just
+    * initialize all the fields.*/
+  img->fmt = yv12->clrtype == REG_YUV ?
+             VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
+  img->w = yv12->y_stride;
+  img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
+  img->d_w = yv12->y_width;
+  img->d_h = yv12->y_height;
+  img->x_chroma_shift = 1;
+  img->y_chroma_shift = 1;
+  img->planes[VPX_PLANE_Y] = yv12->y_buffer;
+  img->planes[VPX_PLANE_U] = yv12->u_buffer;
+  img->planes[VPX_PLANE_V] = yv12->v_buffer;
+  img->planes[VPX_PLANE_ALPHA] = NULL;
+  img->stride[VPX_PLANE_Y] = yv12->y_stride;
+  img->stride[VPX_PLANE_U] = yv12->uv_stride;
+  img->stride[VPX_PLANE_V] = yv12->uv_stride;
+  img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
+  img->bps = 12;
+  img->user_priv = user_priv;
+  img->img_data = yv12->buffer_alloc;
+  img->img_data_owner = 0;
+  img->self_allocd = 0;
+}
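+/* e.g. a 240-line decode with a 32-pixel border (assuming
+ * VP8BORDERINPIXELS == 32) reports img->h = (240 + 64 + 15) & ~15 = 304,
+ * the padded height rounded up to a multiple of 16, while d_h stays at
+ * the visible 240.
+ */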
+
+static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
+                                  const uint8_t         *data,
+                                  unsigned int            data_sz,
+                                  void                    *user_priv,
+                                  long                    deadline) {
+  vpx_codec_err_t res = VPX_CODEC_OK;
+
+  ctx->img_avail = 0;
+
+  /* Determine the stream parameters. Note that we rely on peek_si to
+   * validate that we have a buffer that does not wrap around the top
+   * of the heap.
+   */
+  if (!ctx->si.h)
+    res = ctx->base.iface->dec.peek_si(data, data_sz, &ctx->si);
+
+
+  /* Perform deferred allocations, if required */
+  if (!res && ctx->defer_alloc) {
+    int i;
+
+    for (i = 1; !res && i < NELEMENTS(ctx->mmaps); i++) {
+      vpx_codec_dec_cfg_t cfg;
+
+      cfg.w = ctx->si.w;
+      cfg.h = ctx->si.h;
+      ctx->mmaps[i].id = vp8_mem_req_segs[i].id;
+      ctx->mmaps[i].sz = vp8_mem_req_segs[i].sz;
+      ctx->mmaps[i].align = vp8_mem_req_segs[i].align;
+      ctx->mmaps[i].flags = vp8_mem_req_segs[i].flags;
+
+      if (!ctx->mmaps[i].sz)
+        ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg,
+                                                       ctx->base.init_flags);
+
+      res = vp8_mmap_alloc(&ctx->mmaps[i]);
+    }
+
+    if (!res)
+      vp8_finalize_mmaps(ctx);
+
+    ctx->defer_alloc = 0;
+  }
+
+  /* Initialize the decoder instance on the first frame*/
+  if (!res && !ctx->decoder_init) {
+    res = vp8_validate_mmaps(&ctx->si, ctx->mmaps, ctx->base.init_flags);
+
+    if (!res) {
+      VP9D_CONFIG oxcf;
+      VP9D_PTR optr;
+
+      vp9_initialize_dec();
+
+      oxcf.Width = ctx->si.w;
+      oxcf.Height = ctx->si.h;
+      oxcf.Version = 9;
+      oxcf.postprocess = 0;
+      oxcf.max_threads = ctx->cfg.threads;
+      optr = vp9_create_decompressor(&oxcf);
+
+      /* If postprocessing was enabled by the application and a
+       * configuration has not been provided, default it.
+       */
+      if (!ctx->postproc_cfg_set
+          && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) {
+        ctx->postproc_cfg.post_proc_flag =
+          VP8_DEBLOCK | VP8_DEMACROBLOCK;
+        ctx->postproc_cfg.deblocking_level = 4;
+        ctx->postproc_cfg.noise_level = 0;
+      }
+
+      if (!optr)
+        res = VPX_CODEC_ERROR;
+      else
+        ctx->pbi = optr;
+    }
+
+    ctx->decoder_init = 1;
+  }
+
+  if (!res && ctx->pbi) {
+    YV12_BUFFER_CONFIG sd;
+    int64_t time_stamp = 0, time_end_stamp = 0;
+    vp9_ppflags_t flags = {0};
+
+    if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) {
+      flags.post_proc_flag = ctx->postproc_cfg.post_proc_flag
+#if CONFIG_POSTPROC_VISUALIZER
+                             | ((ctx->dbg_color_ref_frame_flag != 0) ? VP9D_DEBUG_CLR_FRM_REF_BLKS : 0)
+                             | ((ctx->dbg_color_mb_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
+                             | ((ctx->dbg_color_b_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
+                             | ((ctx->dbg_display_mv_flag != 0) ? VP9D_DEBUG_DRAW_MV : 0)
+#endif
+                             ;
+      flags.deblocking_level      = ctx->postproc_cfg.deblocking_level;
+      flags.noise_level           = ctx->postproc_cfg.noise_level;
+#if CONFIG_POSTPROC_VISUALIZER
+      flags.display_ref_frame_flag = ctx->dbg_color_ref_frame_flag;
+      flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
+      flags.display_b_modes_flag  = ctx->dbg_color_b_modes_flag;
+      flags.display_mv_flag       = ctx->dbg_display_mv_flag;
+#endif
+    }
+
+    if (vp9_receive_compressed_data(ctx->pbi, data_sz, data, deadline)) {
+      VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
+      res = update_error_state(ctx, &pbi->common.error);
+    }
+
+    if (!res && 0 == vp9_get_raw_frame(ctx->pbi, &sd, &time_stamp,
+                                       &time_end_stamp, &flags)) {
+      yuvconfig2image(&ctx->img, &sd, user_priv);
+      ctx->img_avail = 1;
+    }
+  }
+
+  return res;
+}
+
+static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t  *ctx,
+                                  vpx_codec_iter_t      *iter) {
+  vpx_image_t *img = NULL;
+
+  if (ctx->img_avail) {
+    /* iter acts as a flip-flop, so an image is only returned on the
+     * first call to get_frame.
+     */
+    if (!(*iter)) {
+      img = &ctx->img;
+      *iter = img;
+    }
+  }
+
+  return img;
+}
+
+
+static
+vpx_codec_err_t vp8_xma_get_mmap(const vpx_codec_ctx_t      *ctx,
+                                 vpx_codec_mmap_t           *mmap,
+                                 vpx_codec_iter_t           *iter) {
+  vpx_codec_err_t     res;
+  const mem_req_t  *seg_iter = *iter;
+
+  /* Get address of next segment request */
+  do {
+    if (!seg_iter)
+      seg_iter = vp8_mem_req_segs;
+    else if (seg_iter->id != VP8_SEG_MAX)
+      seg_iter++;
+
+    *iter = (vpx_codec_iter_t)seg_iter;
+
+    if (seg_iter->id != VP8_SEG_MAX) {
+      mmap->id = seg_iter->id;
+      mmap->sz = seg_iter->sz;
+      mmap->align = seg_iter->align;
+      mmap->flags = seg_iter->flags;
+
+      if (!seg_iter->sz)
+        mmap->sz = seg_iter->calc_sz(ctx->config.dec, ctx->init_flags);
+
+      res = VPX_CODEC_OK;
+    } else
+      res = VPX_CODEC_LIST_END;
+  } while (!mmap->sz && res != VPX_CODEC_LIST_END);
+
+  return res;
+}
+
+static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t         *ctx,
+                                        const vpx_codec_mmap_t  *mmap) {
+  vpx_codec_err_t res = VPX_CODEC_MEM_ERROR;
+  int i, done;
+
+  if (!ctx->priv) {
+    if (mmap->id == VP8_SEG_ALG_PRIV) {
+      if (!ctx->priv) {
+        vp8_init_ctx(ctx, mmap);
+        res = VPX_CODEC_OK;
+      }
+    }
+  }
+
+  done = 1;
+
+  if (!res && ctx->priv->alg_priv) {
+    for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++) {
+      if (ctx->priv->alg_priv->mmaps[i].id == mmap->id)
+        if (!ctx->priv->alg_priv->mmaps[i].base) {
+          ctx->priv->alg_priv->mmaps[i] = *mmap;
+          res = VPX_CODEC_OK;
+        }
+
+      done &= (ctx->priv->alg_priv->mmaps[i].base != NULL);
+    }
+  }
+
+  if (done && !res) {
+    vp8_finalize_mmaps(ctx->priv->alg_priv);
+    res = ctx->iface->init(ctx);
+  }
+
+  return res;
+}
+
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
+                                       YV12_BUFFER_CONFIG  *yv12) {
+  vpx_codec_err_t        res = VPX_CODEC_OK;
+  yv12->y_buffer = img->planes[VPX_PLANE_Y];
+  yv12->u_buffer = img->planes[VPX_PLANE_U];
+  yv12->v_buffer = img->planes[VPX_PLANE_V];
+
+  yv12->y_width  = img->d_w;
+  yv12->y_height = img->d_h;
+  yv12->uv_width = yv12->y_width / 2;
+  yv12->uv_height = yv12->y_height / 2;
+
+  yv12->y_stride = img->stride[VPX_PLANE_Y];
+  yv12->uv_stride = img->stride[VPX_PLANE_U];
+
+  yv12->border  = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
+  yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 ||
+                   img->fmt == VPX_IMG_FMT_VPXYV12);
+
+  return res;
+}
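+/* Note this decoder-side variant halves the chroma dimensions with
+ * truncation (y_width / 2), unlike the encoder-side image2yuvconfig,
+ * which rounds up with (1 + y_width) / 2; the two agree whenever the
+ * luma dimensions are even.
+ */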
+
+
+static vpx_codec_err_t vp9_set_reference(vpx_codec_alg_priv_t *ctx,
+                                         int ctr_id,
+                                         va_list args) {
+
+  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+  if (data) {
+    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+    YV12_BUFFER_CONFIG sd;
+
+    image2yuvconfig(&frame->img, &sd);
+
+    return vp9_set_reference_dec(ctx->pbi, frame->frame_type, &sd);
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp9_get_reference(vpx_codec_alg_priv_t *ctx,
+                                         int ctr_id,
+                                         va_list args) {
+
+  vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+  if (data) {
+    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+    YV12_BUFFER_CONFIG sd;
+
+    image2yuvconfig(&frame->img, &sd);
+
+    return vp9_get_reference_dec(ctx->pbi, frame->frame_type, &sd);
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
+                                        int ctr_id,
+                                        va_list args) {
+#if CONFIG_POSTPROC
+  vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
+
+  if (data) {
+    ctx->postproc_cfg_set = 1;
+    ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data);
+    return VPX_CODEC_OK;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+
+#else
+  return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx,
+                                           int ctrl_id,
+                                           va_list args) {
+#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
+  int data = va_arg(args, int);
+
+#define MAP(id, var) case id: var = data; break;
+
+  switch (ctrl_id) {
+      MAP(VP8_SET_DBG_COLOR_REF_FRAME,   ctx->dbg_color_ref_frame_flag);
+      MAP(VP8_SET_DBG_COLOR_MB_MODES,    ctx->dbg_color_mb_modes_flag);
+      MAP(VP8_SET_DBG_COLOR_B_MODES,     ctx->dbg_color_b_modes_flag);
+      MAP(VP8_SET_DBG_DISPLAY_MV,        ctx->dbg_display_mv_flag);
+  }
+
+  return VPX_CODEC_OK;
+#else
+  return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
+                                                int ctrl_id,
+                                                va_list args) {
+  int *update_info = va_arg(args, int *);
+  VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
+
+  if (update_info) {
+    *update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME
+                   + pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME
+                   + pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME;
+
+    return VPX_CODEC_OK;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+}
+
+
+static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
+                                               int ctrl_id,
+                                               va_list args) {
+
+  int *corrupted = va_arg(args, int *);
+
+  if (corrupted) {
+    VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
+    *corrupted = pbi->common.frame_to_show->corrupted;
+
+    return VPX_CODEC_OK;
+  } else
+    return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_ctrl_fn_map_t ctf_maps[] = {
+  {VP8_SET_REFERENCE,             vp9_set_reference},
+  {VP8_COPY_REFERENCE,            vp9_get_reference},
+  {VP8_SET_POSTPROC,              vp8_set_postproc},
+  {VP8_SET_DBG_COLOR_REF_FRAME,   vp8_set_dbg_options},
+  {VP8_SET_DBG_COLOR_MB_MODES,    vp8_set_dbg_options},
+  {VP8_SET_DBG_COLOR_B_MODES,     vp8_set_dbg_options},
+  {VP8_SET_DBG_DISPLAY_MV,        vp8_set_dbg_options},
+  {VP8D_GET_LAST_REF_UPDATES,     vp8_get_last_ref_updates},
+  {VP8D_GET_FRAME_CORRUPTED,      vp8_get_frame_corrupted},
+  { -1, NULL},
+};
+
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(vpx_codec_vp8_dx) = {
+  "WebM Project VP8 Decoder" VERSION_STRING,
+  VPX_CODEC_INTERNAL_ABI_VERSION,
+  VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC |
+  VPX_CODEC_CAP_INPUT_PARTITION,
+  /* vpx_codec_caps_t          caps; */
+  vp8_init,         /* vpx_codec_init_fn_t       init; */
+  vp8_destroy,      /* vpx_codec_destroy_fn_t    destroy; */
+  ctf_maps,         /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
+  vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t   get_mmap; */
+  vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t   set_mmap; */
+  {
+    vp8_peek_si,      /* vpx_codec_peek_si_fn_t    peek_si; */
+    vp8_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
+    vp8_decode,       /* vpx_codec_decode_fn_t     decode; */
+    vp8_get_frame,    /* vpx_codec_frame_get_fn_t  frame_get; */
+  },
+  {
+    /* encoder functions */
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED
+  }
+};
+
+/*
+ * BEGIN BACKWARDS COMPATIBILITY SHIM.
+ */
+vpx_codec_iface_t vpx_codec_vp8_algo = {
+  "WebM Project VP8 Decoder (Deprecated API)" VERSION_STRING,
+  VPX_CODEC_INTERNAL_ABI_VERSION,
+  VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC,
+  /* vpx_codec_caps_t          caps; */
+  vp8_init,         /* vpx_codec_init_fn_t       init; */
+  vp8_destroy,      /* vpx_codec_destroy_fn_t    destroy; */
+  ctf_maps,         /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
+  vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t   get_mmap; */
+  vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t   set_mmap; */
+  {
+    vp8_peek_si,      /* vpx_codec_peek_si_fn_t    peek_si; */
+    vp8_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
+    vp8_decode,       /* vpx_codec_decode_fn_t     decode; */
+    vp8_get_frame,    /* vpx_codec_frame_get_fn_t  frame_get; */
+  },
+  {
+    /* encoder functions */
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED,
+    NOT_IMPLEMENTED
+  }
+};
--- /dev/null
+++ b/vp9/vp9cx.mk
@@ -1,0 +1,120 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9_common.mk
+
+VP9_CX_EXPORTS += exports_enc
+
+VP9_CX_SRCS-yes += $(VP9_COMMON_SRCS-yes)
+VP9_CX_SRCS-no  += $(VP9_COMMON_SRCS-no)
+VP9_CX_SRCS_REMOVE-yes += $(VP9_COMMON_SRCS_REMOVE-yes)
+VP9_CX_SRCS_REMOVE-no  += $(VP9_COMMON_SRCS_REMOVE-no)
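+# Build-switch idiom: variables like $(HAVE_MMX) below expand to "yes" or
+# "no", so each source lands on either VP9_CX_SRCS-yes or VP9_CX_SRCS-no;
+# only the -yes list is ever used, and the filter-out at the end of this
+# file strips any entries collected in VP9_CX_SRCS_REMOVE-yes.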
+
+ifeq ($(ARCH_ARM),yes)
+  include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9cx_arm.mk
+endif
+
+VP9_CX_SRCS-yes += vp9_cx_iface.c
+
+# encoder
+#INCLUDES += algo/vpx_common/vpx_mem/include
+#INCLUDES += common
+#INCLUDES += common
+#INCLUDES += common
+#INCLUDES += algo/vpx_ref/cpu_id/include
+#INCLUDES += common
+#INCLUDES += encoder
+
+VP9_CX_SRCS-yes += encoder/asm_enc_offsets.c
+VP9_CX_SRCS-yes += encoder/bitstream.c
+VP9_CX_SRCS-yes += encoder/boolhuff.c
+VP9_CX_SRCS-yes += encoder/dct.c
+VP9_CX_SRCS-yes += encoder/encodeframe.c
+VP9_CX_SRCS-yes += encoder/encodeintra.c
+VP9_CX_SRCS-yes += encoder/encodemb.c
+VP9_CX_SRCS-yes += encoder/encodemv.c
+VP9_CX_SRCS-yes += encoder/firstpass.c
+VP9_CX_SRCS-yes += encoder/generic/csystemdependent.c
+VP9_CX_SRCS-yes += encoder/block.h
+VP9_CX_SRCS-yes += encoder/boolhuff.h
+VP9_CX_SRCS-yes += encoder/bitstream.h
+VP9_CX_SRCS-yes += encoder/encodeintra.h
+VP9_CX_SRCS-yes += encoder/encodemb.h
+VP9_CX_SRCS-yes += encoder/encodemv.h
+VP9_CX_SRCS-yes += encoder/firstpass.h
+VP9_CX_SRCS-yes += encoder/lookahead.c
+VP9_CX_SRCS-yes += encoder/lookahead.h
+VP9_CX_SRCS-yes += encoder/mcomp.h
+VP9_CX_SRCS-yes += encoder/modecosts.h
+VP9_CX_SRCS-yes += encoder/onyx_int.h
+VP9_CX_SRCS-yes += encoder/psnr.h
+VP9_CX_SRCS-yes += encoder/quantize.h
+VP9_CX_SRCS-yes += encoder/ratectrl.h
+VP9_CX_SRCS-yes += encoder/rdopt.h
+VP9_CX_SRCS-yes += encoder/tokenize.h
+VP9_CX_SRCS-yes += encoder/treewriter.h
+VP9_CX_SRCS-yes += encoder/variance.h
+VP9_CX_SRCS-yes += encoder/mcomp.c
+VP9_CX_SRCS-yes += encoder/modecosts.c
+VP9_CX_SRCS-yes += encoder/onyx_if.c
+VP9_CX_SRCS-yes += encoder/picklpf.c
+VP9_CX_SRCS-yes += encoder/psnr.c
+VP9_CX_SRCS-yes += encoder/quantize.c
+VP9_CX_SRCS-yes += encoder/ratectrl.c
+VP9_CX_SRCS-yes += encoder/rdopt.c
+VP9_CX_SRCS-yes += encoder/sad_c.c
+VP9_CX_SRCS-yes += encoder/satd_c.c
+VP9_CX_SRCS-yes += encoder/segmentation.c
+VP9_CX_SRCS-yes += encoder/segmentation.h
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/ssim.c
+VP9_CX_SRCS-yes += encoder/tokenize.c
+VP9_CX_SRCS-yes += encoder/treewriter.c
+VP9_CX_SRCS-yes += encoder/variance_c.c
+ifeq ($(CONFIG_POSTPROC),yes)
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
+endif
+VP9_CX_SRCS-yes += encoder/temporal_filter.c
+VP9_CX_SRCS-yes += encoder/temporal_filter.h
+VP9_CX_SRCS-yes += encoder/mbgraph.c
+VP9_CX_SRCS-yes += encoder/mbgraph.h
+
+
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/mcomp_x86.h
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_x86.h
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/temporal_filter_x86.h
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/x86_csystemdependent.c
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_mmx.c
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_impl_mmx.asm
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/sad_mmx.asm
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
+VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
+VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
+VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
+VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
+VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm
+
+
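+# Strip out any generic sources that platform-specific makefiles (e.g.
+# vp9cx_arm.mk) scheduled for replacement via VP9_CX_SRCS_REMOVE.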
+VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
--- /dev/null
+++ b/vp9/vp9cx_arm.mk
@@ -1,0 +1,63 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+# The VP9_CX_SRCS list is extended with platform-specific sources below.
+
+#File list for arm
+# encoder
+VP9_CX_SRCS-$(ARCH_ARM)  += encoder/arm/arm_csystemdependent.c
+
+VP9_CX_SRCS-$(ARCH_ARM)  += encoder/arm/dct_arm.c
+VP9_CX_SRCS-$(ARCH_ARM)  += encoder/arm/dct_arm.h
+VP9_CX_SRCS-$(ARCH_ARM)  += encoder/arm/encodemb_arm.h
+VP9_CX_SRCS-$(ARCH_ARM)  += encoder/arm/quantize_arm.c
+VP9_CX_SRCS-$(ARCH_ARM)  += encoder/arm/quantize_arm.h
+VP9_CX_SRCS-$(ARCH_ARM)  += encoder/arm/variance_arm.c
+VP9_CX_SRCS-$(ARCH_ARM)  += encoder/arm/variance_arm.h
+
+#File list for armv5te
+# encoder
+VP9_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
+VP9_CX_SRCS_REMOVE-$(HAVE_ARMV5TE)  += encoder/boolhuff.c
+VP9_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/boolhuff_armv5te$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_armv5$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_mbrow_armv5$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_partitions_armv5$(ASM)
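+# The generic encoder/boolhuff.c is scheduled for removal above because the
+# ARMv5TE build supplies boolhuff_arm.c plus the assembly implementation;
+# the filter-out step in vp9cx.mk applies the removal.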
+
+#File list for armv6
+# encoder
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_subtract_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance8x8_armv6$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/walsh_v6$(ASM)
+
+#File list for neon
+# encoder
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/fastquantizeb_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/picklpf_arm.c
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/sad8_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/sad16_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/shortfdct_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/subtract_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/variance_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_memcpy_neon$(ASM)
+VP9_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
--- /dev/null
+++ b/vp9/vp9dx.mk
@@ -1,0 +1,71 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9_common.mk
+
+VP9_DX_EXPORTS += exports_dec
+
+VP9_DX_SRCS-yes += $(VP9_COMMON_SRCS-yes)
+VP9_DX_SRCS-no  += $(VP9_COMMON_SRCS-no)
+VP9_DX_SRCS_REMOVE-yes += $(VP9_COMMON_SRCS_REMOVE-yes)
+VP9_DX_SRCS_REMOVE-no  += $(VP9_COMMON_SRCS_REMOVE-no)
+
+ifeq ($(ARCH_ARM),yes)
+  include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9dx_arm.mk
+endif
+
+VP9_DX_SRCS-yes += vp9_dx_iface.c
+
+# decoder
+
+VP9_DX_SRCS-yes += decoder/asm_dec_offsets.c
+VP9_DX_SRCS-yes += decoder/dboolhuff.c
+VP9_DX_SRCS-yes += decoder/decodemv.c
+VP9_DX_SRCS-yes += decoder/decodframe.c
+VP9_DX_SRCS-yes += decoder/dequantize.c
+VP9_DX_SRCS-yes += decoder/detokenize.c
+VP9_DX_SRCS-yes += decoder/dboolhuff.h
+VP9_DX_SRCS-yes += decoder/decodemv.h
+VP9_DX_SRCS-yes += decoder/dequantize.h
+VP9_DX_SRCS-yes += decoder/detokenize.h
+VP9_DX_SRCS-yes += decoder/onyxd_int.h
+VP9_DX_SRCS-yes += decoder/treereader.h
+VP9_DX_SRCS-yes += decoder/onyxd_if.c
+VP9_DX_SRCS-yes += decoder/idct_blk.c
+
+VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
+
+VP9_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/x86_dsystemdependent.c
+VP9_DX_SRCS-$(HAVE_MMX) += decoder/x86/dequantize_mmx.asm
+VP9_DX_SRCS-$(HAVE_MMX) += decoder/x86/idct_blk_mmx.c
+VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/idct_blk_sse2.c
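+# Note: the x86 sources above are appended after the filter-out, so they
+# are never subject to VP9_DX_SRCS_REMOVE.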
--- /dev/null
+++ b/vp9/vp9dx_arm.mk
@@ -1,0 +1,29 @@
+##
+##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+
+# The VP9_DX_SRCS list is extended with platform-specific sources below.
+
+VP9_DX_SRCS-$(ARCH_ARM)  += decoder/arm/dequantize_arm.c
+
+#File list for armv6
+VP9_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequant_idct_v6$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequantize_v6$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/idct_blk_v6.c
+
+#File list for neon
+VP9_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequant_idct_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequantizeb_neon$(ASM)
+VP9_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_blk_neon.c
--- a/vpx/vp8.h
+++ b/vpx/vp8.h
@@ -28,8 +28,8 @@
 /*!\file
  * \brief Provides controls common to both the VP8 encoder and decoder.
  */
-#ifndef VP8_H
-#define VP8_H
+#ifndef VP9_H
+#define VP9_H
 #include "vpx_codec_impl_top.h"
 
 /*!\brief Control functions
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -20,8 +20,8 @@
  * \brief Provides definitions for using the VP8 encoder algorithm within the
  *        vpx Codec Interface.
  */
-#ifndef VP8CX_H
-#define VP8CX_H
+#ifndef VP9CX_H
+#define VP9CX_H
 #include "vpx_config.h"
 #include "vpx_codec_impl_top.h"
 
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -20,8 +20,8 @@
  * \brief Provides definitions for using the VP8 algorithm within the vpx Decoder
  *        interface.
  */
-#ifndef VP8DX_H
-#define VP8DX_H
+#ifndef VP9DX_H
+#define VP9DX_H
 #include "vpx_codec_impl_top.h"
 
 /*!\name Algorithm interface for VP8
--- a/vpx/vp8e.h
+++ b/vpx/vp8e.h
@@ -12,8 +12,8 @@
 /* This file contains backwards compatibility stubs for applications using
  * the VP8 version 1.0 API.
  */
-#ifndef VP8E_H
-#define VP8E_H
+#ifndef VP9E_H
+#define VP9E_H
 #include "vpx_codec_impl_top.h"
 
 #if defined(VPX_CODEC_DISABLE_COMPAT) && VPX_CODEC_DISABLE_COMPAT
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -22,7 +22,7 @@
 #include "vpx_config.h"
 #include "vpx/vpx_decoder.h"
 #include "vpx_ports/vpx_timer.h"
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
 #include "vpx/vp8dx.h"
 #endif
 #if CONFIG_MD5
@@ -56,8 +56,8 @@
    unsigned int             fourcc;
    unsigned int             fourcc_mask;
 } ifaces[] = {
-#if CONFIG_VP8_DECODER
-  {"vp8",  vpx_codec_vp8_dx,   VP8_FOURCC, 0x00FFFFFF},
+#if CONFIG_VP9_DECODER
+  {"vp9",  vpx_codec_vp8_dx,   VP8_FOURCC, 0x00FFFFFF},
 #endif
 };
 
@@ -104,7 +104,7 @@
   NULL
 };
 
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
 static const arg_def_t addnoise_level = ARG_DEF(NULL, "noise-level", 1,
                                                 "Enable VP8 postproc add noise");
 static const arg_def_t deblock = ARG_DEF(NULL, "deblock", 0,
@@ -135,7 +135,7 @@
   fprintf(stderr, "Usage: %s <options> filename\n\n"
           "Options:\n", exec_name);
   arg_show_usage(stderr, all_args);
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
   fprintf(stderr, "\nVP8 Postprocessing Options:\n");
   arg_show_usage(stderr, vp8_pp_args);
 #endif
@@ -684,7 +684,7 @@
   unsigned int            fps_num;
   void                   *out = NULL;
   vpx_codec_dec_cfg_t     cfg = {0};
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
   vp8_postproc_cfg_t      vp8_pp_cfg = {0};
   int                     vp8_dbg_color_ref_frame = 0;
   int                     vp8_dbg_color_mb_modes = 0;
@@ -744,7 +744,7 @@
     else if (arg_match(&arg, &verbosearg, argi))
       quiet = 0;
 
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
     else if (arg_match(&arg, &addnoise_level, argi)) {
       postproc = 1;
       vp8_pp_cfg.post_proc_flag |= VP8_ADDNOISE;
@@ -909,7 +909,7 @@
   if (!quiet)
     fprintf(stderr, "%s\n", decoder.name);
 
-#if CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
 
   if (vp8_pp_cfg.post_proc_flag
       && vpx_codec_control(&decoder, VP8_SET_POSTPROC, &vp8_pp_cfg)) {
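
Stepping back from the hunks above: entries in the renamed ifaces[] table are
selected by masked fourcc comparison. A rough sketch of that selection logic
follows; the helper name is an assumption for illustration, not code from
this patch.

/* Hypothetical helper: find the ifaces[] entry whose masked fourcc
 * matches the stream's fourcc.  A fourcc_mask of 0x00FFFFFF limits
 * the comparison to the low three bytes of the fourcc. */
static int find_iface_index(unsigned int raw_fourcc) {
  int i;
  for (i = 0; i < (int)(sizeof(ifaces) / sizeof(ifaces[0])); i++)
    if ((raw_fourcc & ifaces[i].fourcc_mask) == ifaces[i].fourcc)
      return i;
  return -1;  /* no matching decoder compiled in */
}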
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -82,8 +82,8 @@
   unsigned int             fourcc;
   unsigned int             fourcc_mask;
 } ifaces[] = {
-#if CONFIG_VP8_DECODER
-  {"vp8",  &vpx_codec_vp8_dx,   VP8_FOURCC, 0x00FFFFFF},
+#if CONFIG_VP9_DECODER
+  {"vp9",  &vpx_codec_vp8_dx,   VP8_FOURCC, 0x00FFFFFF},
 #endif
 };
 
@@ -93,8 +93,8 @@
   unsigned int             fourcc;
   unsigned int             fourcc_mask;
 } codecs[] = {
-#if CONFIG_VP8_ENCODER
-  {"vp8",  vpx_codec_vp8x_cx,   VP8_FOURCC, 0x00FFFFFF},
+#if CONFIG_VP9_ENCODER
+  {"vp9",  vpx_codec_vp8x_cx,   VP8_FOURCC, 0x00FFFFFF},
 #endif
 };
 
@@ -1011,7 +1011,7 @@
 };
 
 
-#if CONFIG_VP8_ENCODER
+#if CONFIG_VP9_ENCODER
 static const arg_def_t noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1,
                                             "Noise sensitivity (frames to blur)");
 static const arg_def_t sharpness = ARG_DEF(NULL, "sharpness", 1,
@@ -1020,13 +1020,13 @@
                                                "Motion detection threshold");
 #endif
 
-#if CONFIG_VP8_ENCODER
+#if CONFIG_VP9_ENCODER
 static const arg_def_t cpu_used = ARG_DEF(NULL, "cpu-used", 1,
                                           "CPU Used (-16..16)");
 #endif
 
 
-#if CONFIG_VP8_ENCODER
+#if CONFIG_VP9_ENCODER
 static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1,
                                              "Number of token partitions to use, log2");
 static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1,
@@ -1081,7 +1081,7 @@
   arg_show_usage(stdout, rc_twopass_args);
   fprintf(stderr, "\nKeyframe Placement Options:\n");
   arg_show_usage(stdout, kf_args);
-#if CONFIG_VP8_ENCODER
+#if CONFIG_VP9_ENCODER
   fprintf(stderr, "\nVP8 Specific Options:\n");
   arg_show_usage(stdout, vp8_args);
 #endif
@@ -1659,7 +1659,7 @@
 #endif
 
   /* Handle codec specific options */
-#if CONFIG_VP8_ENCODER
+#if CONFIG_VP9_ENCODER
 
   if (codec->fourcc == VP8_FOURCC) {
     ctrl_args = vp8_args;